In [1]:
# Importing the necessary libraries

import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from keras.models import Model
from keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from keras.preprocessing.text import Tokenizer
from keras.preprocessing import sequence
from keras.utils import to_categorical
from sklearn.metrics import confusion_matrix

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Load the data into Pandas dataframe
df = pd.read_csv('news_data.csv',encoding='latin-1')

# Printing first 5 rows
df.head(5)

Unnamed: 0,text,category
0,There Were 2 Mass Shootings In Texas Last Week...,CRIME
1,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT
2,Hugh Grant Marries For The First Time At Age 5...,ENTERTAINMENT
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT
4,Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT


In [3]:
# Printing dimension of dataset (Rows, Columns)
df.shape

(200853, 2)

In [4]:
# Information about Null Values and data types
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 200853 entries, 0 to 200852
Data columns (total 2 columns):
text        200853 non-null object
category    200853 non-null object
dtypes: object(2)
memory usage: 3.1+ MB


In [5]:
# Filtering out unncessary information from given text
import string
from nltk.corpus import stopwords
from textblob import Word

stop = stopwords.words('english')

# Removing punctuations and all digits from text
filterString = string.punctuation + '“”|”' + string.digits
df['FilterText'] = df['text'].apply(lambda x: x.translate(str.maketrans(filterString,' '*len(filterString),'')))

# Removing all single characters
df['FilterText'] = df['FilterText'].replace('\s+[a-zA-Z]\s+', ' ', regex=True)

# Removing single characters in beginning
df['FilterText'] = df['FilterText'].replace('\^[a-zA-Z]\s+', ' ', regex=True)

# Removing multiple spaces
df['FilterText'] = df['FilterText'].replace('\s+', ' ', regex=True)

# Converting text to lowercase
df['FilterText'] = df['FilterText'].apply(lambda x: x.lower())

# Removing stop words from text
df['FilterText'] = df['FilterText'].str.split(' ').apply(lambda x: ' '.join(k for k in x if k not in stop))

# Lemmatizing all words in the text
df['FilterText'] = df['FilterText'].apply(lambda x: "".join([Word(word).lemmatize() for word in x]))

df.head()

Unnamed: 0,text,category,FilterText
0,There Were 2 Mass Shootings In Texas Last Week...,CRIME,mass shootings texas last week tv left husband...
1,Will Smith Joins Diplo And Nicky Jam For The 2...,ENTERTAINMENT,smith joins diplo nicky jam world cup official...
2,Hugh Grant Marries For The First Time At Age 5...,ENTERTAINMENT,hugh grant marries first time age actor longti...
3,Jim Carrey Blasts 'Castrato' Adam Schiff And D...,ENTERTAINMENT,jim carrey blasts castrato adam schiff democra...
4,Julianna Margulies Uses Donald Trump Poop Bags...,ENTERTAINMENT,julianna margulies uses donald trump poop bags...


In [6]:
# printing all the unique categories in the dataset
print("Categories\n-------------------------------------------------------------------------------------")
print(df['category'].unique())

Categories
-------------------------------------------------------------------------------------
['CRIME' 'ENTERTAINMENT' 'WORLD NEWS' 'IMPACT' 'POLITICS' 'WEIRD NEWS'
 'BLACK VOICES' 'WOMEN' 'COMEDY' 'QUEER VOICES' 'SPORTS' 'BUSINESS'
 'TRAVEL' 'MEDIA' 'TECH' 'RELIGION' 'SCIENCE' 'LATINO VOICES' 'EDUCATION'
 'COLLEGE' 'PARENTS' 'ARTS & CULTURE' 'STYLE' 'GREEN' 'TASTE'
 'HEALTHY LIVING' 'WORLDPOST' 'GOOD NEWS' 'FIFTY' 'ARTS' 'WELLNESS'
 'PARENTING' 'HOME & LIVING' 'STYLE & BEAUTY' 'DIVORCE' 'WEDDINGS'
 'FOOD & DRINK' 'MONEY' 'ENVIRONMENT' 'CULTURE & ARTS']


In [7]:
# printing number of unique categories in the dataset
print("Number of different Category = ",len(df['category'].unique()))

Number of different Category =  40


In [8]:
# Separating the FilterText and category columns

feature=df.FilterText
target=df.category

print("Shape of feature ",feature.shape)
print("Shape of target ",target.shape)

Shape of feature  (200853,)
Shape of target  (200853,)


In [9]:
# Process the labels i.e., assigning the numerical value to words

# create the Label encoder object
le = LabelEncoder()

# convert the categorical columns into numeric by assigning a numerical lebel to categorical label
target = le.fit_transform(target)

# Reshape the target
target = target.reshape(-1,1)
target

array([[ 6],
       [10],
       [10],
       ...,
       [28],
       [28],
       [28]], dtype=int64)

In [10]:
# one-hot encode target column
target = to_categorical(target)
target

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [11]:
# Split the dataset into training and test data
feature_train, feature_test, target_train, target_test = train_test_split(feature,target,test_size=0.2)

In [12]:
# Process the data
# Tokenize the data and convert the text to sequences.

max_words = 1000

# Create Tokenizer
tok = Tokenizer(num_words = max_words) # num_words: the maximum number of words to keep, based on word frequency.

# Train the Tokenizer to the texts
tok.fit_on_texts(feature_train)

# Convert list of strings into list of lists of integers
train_sequences = tok.texts_to_sequences(feature_train)
test_sequences = tok.texts_to_sequences(feature_test)

In order to feed this data into our RNN, all input documents must have the same length. We will limit the maximum review length to max_len by truncating longer reviews and padding shorter reviews with a null value (0).

In [13]:
# Add Padding to ensure that all the sequences have the same shape.

max_len = 150

train_sequences_matrix = sequence.pad_sequences(train_sequences, maxlen=max_len)
test_sequences_matrix = sequence.pad_sequences(test_sequences,maxlen=max_len)

In [14]:
# Defining the RNN structure
def RNN():
    embedding_size = 50
    inputs = Input(name='inputs', shape=[max_len])
    layer = Embedding(max_words, embedding_size, input_length=max_len)(inputs)   # Embedding layer
    layer = Dropout(0.5)(layer)      # Dropout Layer, to reduce overfitting in the LSTM models
    layer = LSTM(64)(layer)     # LSTM Layer with 64 memory units (smart neurons)
    layer = Dropout(0.5)(layer)      # Dropout Layer, to reduce overfitting in the LSTM models
    layer = Dense(256, name='FC1', activation='relu')(layer)   # Dense Layer with 256 neuron and Relu Activation Function
    layer = Dropout(0.5)(layer)      # Dropout Layer, to reduce overfitting in the LSTM models
    layer = Dense(40, name='out_layer', activation='softmax')(layer)    # Dense Layer with a single neuron and Sigmoid Activation
    model = Model(inputs=inputs, outputs=layer)
    return model

In [17]:
# Calling the RNN() function to create model
model = RNN()

In [18]:
# Printing the summery of the model
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
inputs (InputLayer)          (None, 150)               0         
_________________________________________________________________
embedding_3 (Embedding)      (None, 150, 50)           50000     
_________________________________________________________________
dropout_7 (Dropout)          (None, 150, 50)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 64)                29440     
_________________________________________________________________
dropout_8 (Dropout)          (None, 64)                0         
_________________________________________________________________
FC1 (Dense)                  (None, 256)               16640     
_________________________________________________________________
dropout_9 (Dropout)          (None, 256)               0         
__________

The summary is textual and includes information about:

1. The layers and their order in the model.
2. The output shape of each layer.
3. The number of parameters (weights) in each layer.
4. The total number of parameters (weights) in the model.

Next, we need to compile our model. Compiling the model takes three parameters: optimizer, loss and metrics.

In [19]:
# Compile model using accuracy as a measure of model performance
model.compile(loss='binary_crossentropy',optimizer="adam",metrics=['accuracy'])

1. The optimizer controls the learning rate. We will be using ‘adam’ as our optmizer. Adam is generally a good optimizer to use for many cases. The adam optimizer adjusts the learning rate throughout training.
2. The learning rate determines how fast the optimal weights for the model are calculated. A smaller learning rate may lead to more accurate weights (up to a certain point), but the time it takes to compute the weights will be longer.
3. ‘binary_crossentropy’ is used for our loss function. This is the most common choice for classification. A lower score indicates that the model is performing better.
4. ‘accuracy’ metric is used to see the accuracy score on the validation set when we train the model

In [20]:
# Training the model
model.fit(train_sequences_matrix, target_train, batch_size=128, epochs=5, validation_data=(test_sequences_matrix,target_test))

Train on 160682 samples, validate on 40171 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x13013e77320>

To train, we will use the ‘fit()’ function on our model with the following parameters: training data (train_sequences_matrix), target data (target_train), validation data, and the number of epochs.

1. train_sequences_matrix: Features with which we train our model
2. target_train: Target with which we train our model corresponding to featues
3. validation_data: For our validation data, we will use the test set provided to us in our dataset, which we have split into test_sequences_matrix and target_test.
4. epochs: one epoch stands for one complete training of the neural network with all samples.

In [21]:
# Making predictions
preds=model.predict(test_sequences_matrix)
preds

array([[1.2476864e-02, 1.0546276e-02, 1.8356487e-02, ..., 2.1084670e-02,
        1.6219114e-03, 6.5881866e-03],
       [5.9026253e-04, 1.0843971e-03, 1.0219566e-02, ..., 1.0051583e-03,
        1.6580974e-05, 1.4802371e-04],
       [2.5439499e-02, 2.9671442e-02, 6.0585704e-02, ..., 1.3915999e-02,
        1.4159369e-02, 3.2407247e-02],
       ...,
       [4.6463744e-03, 1.7995763e-03, 5.5298987e-03, ..., 7.3921487e-02,
        2.4740695e-04, 1.1034907e-03],
       [1.0140471e-02, 3.4792111e-03, 6.8867858e-03, ..., 1.4516967e-02,
        5.9516253e-03, 2.1710167e-02],
       [1.9800747e-02, 4.3533067e-03, 2.7250056e-03, ..., 2.2361564e-04,
        1.5072875e-03, 9.7633153e-03]], dtype=float32)

In [22]:
# original values in target_test
target_test

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [25]:
# Evaluate the model on the test set
accr = model.evaluate(test_sequences_matrix,target_test)
print('Test set:  Accuracy: {:0.3f} % and Loss: {:0.3f} %'.format(accr[1]*100,accr[0]*100))

Test set:  Accuracy: 97.967 % and Loss: 7.099 %


### Finally we get 97.967 % accuracy