# Using Recurrent Neural Networks to analyze news articles 

### Import libraries

In [52]:
# import general purpose libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# import sklear modules
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder

# import Keras modules
from keras.preprocessing.sequence import pad_sequences
from keras.preprocessing.text import Tokenizer
from keras.models import Sequential
from keras.layers import Dense, LSTM
from keras.layers.embeddings import Embedding
from keras.utils import to_categorical # use for one hot encoding 


# import nltk modules
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))


### Load data

In [2]:
#reading in news articles 

df = pd.read_csv('dataset/bbc-text.csv')

In [3]:
df.head()

Unnamed: 0,category,text
0,tech,tv future in the hands of viewers with home th...
1,business,worldcom boss left books alone former worldc...
2,sport,tigers wary of farrell gamble leicester say ...
3,sport,yeading face newcastle in fa cup premiership s...
4,entertainment,ocean s twelve raids box office ocean s twelve...


In [4]:
df.shape

(2225, 2)

In [6]:
df['category'].value_counts(normalize = True)

sport            0.229663
business         0.229213
politics         0.187416
tech             0.180225
entertainment    0.173483
Name: category, dtype: float64

### Remove Stopwords and create two lists containing articles and labels

In [10]:
articles = [] 
labels = []

for index, row in df.iterrows():
    labels.append(row['category'])
    #label doesn't need pre- processing 
    
    #Space based tokenization on text to remove stopwords
    token_list = [x for x in row['text'].split() if x not in STOPWORDS]
    
    #putting them back togethor 
    articles.append(" ". join(token_list))

In [11]:
df.iloc[0, 1]

'tv future in the hands of viewers with home theatre systems  plasma high-definition tvs  and digital video recorders moving into the living room  the way people watch tv will be radically different in five years  time.  that is according to an expert panel which gathered at the annual consumer electronics show in las vegas to discuss how these new technologies will impact one of our favourite pastimes. with the us leading the trend  programmes and other content will be delivered to viewers via home networks  through cable  satellite  telecoms companies  and broadband service providers to front rooms and portable devices.  one of the most talked-about technologies of ces has been digital and personal video recorders (dvr and pvr). these set-top boxes  like the us s tivo and the uk s sky+ system  allow people to record  store  play  pause and forward wind tv programmes when they want.  essentially  the technology allows for much more personalised tv. they are also being built-in to high

In [12]:
#checking is stop word removal worked 

articles[0]

'tv future hands viewers home theatre systems plasma high-definition tvs digital video recorders moving living room way people watch tv radically different five years time. according expert panel gathered annual consumer electronics show las vegas discuss new technologies impact one favourite pastimes. us leading trend programmes content delivered viewers via home networks cable satellite telecoms companies broadband service providers front rooms portable devices. one talked-about technologies ces digital personal video recorders (dvr pvr). set-top boxes like us tivo uk sky+ system allow people record store play pause forward wind tv programmes want. essentially technology allows much personalised tv. also built-in high-definition tv sets big business japan us slower take europe lack high-definition programming. people forward wind adverts also forget abiding network channel schedules putting together a-la-carte entertainment. us networks cable satellite companies worried means terms a

### Train test split

In [14]:
X_train, X_test, y_train, y_test = train_test_split(articles, labels, random_state = 42, test_size = 0.2,
                                                    stratify = labels)

### Tokenization and word indexing

Going to do another tokenization. 

In [15]:
# Define variables


vocab_size = 5000# Max number of words
embedding_dim = 64 # Word Vector dimension
max_length = 200 # Max length of the sequence
trunc_type = 'post'# Where to chop off the seq
padding_type = 'post'# Where to put padding
oov_tok = '<OOV>' #replaces words which are not in top words with OOV

In [17]:
tokenizer = Tokenizer(num_words = vocab_size, oov_token = oov_tok)

#fit on text 
tokenizer.fit_on_texts(X_train)

In [18]:
tokenizer.word_index

{'<OOV>': 1,
 'said': 2,
 'mr': 3,
 'would': 4,
 'year': 5,
 'also': 6,
 'people': 7,
 'new': 8,
 'us': 9,
 'one': 10,
 'could': 11,
 'last': 12,
 'first': 13,
 'time': 14,
 'two': 15,
 'world': 16,
 'government': 17,
 'uk': 18,
 'years': 19,
 'best': 20,
 'make': 21,
 'told': 22,
 'get': 23,
 'music': 24,
 'like': 25,
 'film': 26,
 'game': 27,
 'made': 28,
 '000': 29,
 'back': 30,
 'many': 31,
 'three': 32,
 'labour': 33,
 'well': 34,
 'set': 35,
 'bbc': 36,
 'number': 37,
 '1': 38,
 'way': 39,
 'next': 40,
 'added': 41,
 'take': 42,
 '2': 43,
 'company': 44,
 'says': 45,
 'market': 46,
 'good': 47,
 'may': 48,
 '2004': 49,
 'home': 50,
 'election': 51,
 'going': 52,
 'england': 53,
 'party': 54,
 'games': 55,
 'work': 56,
 'blair': 57,
 '6': 58,
 'show': 59,
 'much': 60,
 'still': 61,
 'go': 62,
 'think': 63,
 'second': 64,
 'firm': 65,
 'since': 66,
 'win': 67,
 'top': 68,
 'say': 69,
 'play': 70,
 'week': 71,
 'million': 72,
 'part': 73,
 'public': 74,
 'want': 75,
 'use': 76,
 'te

In [19]:
#how many unique words 

len(tokenizer.word_index)

27038

### Convert words to number

In [21]:
#converting words to number 

train_seq = tokenizer.texts_to_sequences(X_train)

### Let's see if text_to_sequence makes sense

In [22]:
X_train[10]

'greek sprinters run careers sprinters kostas kenteris katerina thanou says boss organisation cleared missing drugs test. greek athletics federation boss vassilli sevastis told country parliament: believe kenteris thanou race again. damage commercial interests done added. athletics bosses considering reponse ruling athletes face trial greek court. greek prosecutors brought spearate charges missing drugs test faking motorcycle accident. speaking greek parliament tuesday sevastis said evidence sent international olympic committee athletics governing body iaaf strong enough greek association find sprinters guilty. given task getting snake hole given evidence said. greek hand heart try athletes added. athletes technically free compete iaaf reviews response decision clear kenteris thanou. sevastis said: matter found guilty court arbitration sport current decision reversed.'

In [23]:
#check if its working

print(train_seq[10])

[1747, 3939, 212, 4717, 3939, 4718, 1748, 4959, 2001, 45, 575, 1046, 2846, 1267, 767, 593, 1747, 1445, 2100, 575, 1, 1, 22, 89, 542, 266, 1748, 2001, 493, 1775, 1383, 1240, 1467, 379, 41, 1445, 3250, 1914, 1, 1138, 1953, 287, 574, 1747, 194, 1747, 2037, 724, 1, 898, 1267, 767, 593, 1, 3628, 4103, 876, 1747, 542, 413, 1, 2, 532, 713, 155, 702, 483, 1445, 3137, 877, 1880, 276, 362, 1747, 630, 324, 3939, 1022, 204, 1776, 390, 1, 3138, 204, 532, 2, 1747, 823, 1302, 376, 1953, 41, 1953, 4960, 312, 1241, 1880, 3362, 846, 207, 373, 1748, 2001, 1, 2, 1012, 188, 1022, 194, 4494, 533, 270, 207, 1]


In [24]:
len(train_seq[10])

114

In [25]:
len(train_seq[5])

317

### Let's do padding

When we train neural networks for NLP, we need sequences to be in the same size, that’s why we use padding.

In [26]:
train_padded = pad_sequences(train_seq, maxlen = max_length, padding = padding_type, truncating = trunc_type)

In [28]:
#0 represents padding

train_padded[10]

array([1747, 3939,  212, 4717, 3939, 4718, 1748, 4959, 2001,   45,  575,
       1046, 2846, 1267,  767,  593, 1747, 1445, 2100,  575,    1,    1,
         22,   89,  542,  266, 1748, 2001,  493, 1775, 1383, 1240, 1467,
        379,   41, 1445, 3250, 1914,    1, 1138, 1953,  287,  574, 1747,
        194, 1747, 2037,  724,    1,  898, 1267,  767,  593,    1, 3628,
       4103,  876, 1747,  542,  413,    1,    2,  532,  713,  155,  702,
        483, 1445, 3137,  877, 1880,  276,  362, 1747,  630,  324, 3939,
       1022,  204, 1776,  390,    1, 3138,  204,  532,    2, 1747,  823,
       1302,  376, 1953,   41, 1953, 4960,  312, 1241, 1880, 3362,  846,
        207,  373, 1748, 2001,    1,    2, 1012,  188, 1022,  194, 4494,
        533,  270,  207,    1,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
          0,    0,    0,    0,    0,    0,    0,   

In [29]:
# no 0's because no padding was done as length was over 200. 

train_padded[5]

array([ 831, 2312, 2845,  898,  528,    1,    1,  240,  831, 3243,  899,
       2095,  290, 2035,    1, 2933,  409, 1772,  527,    1, 3243, 4481,
       2095,  732,  899,    1, 2035,  696,    1,    1, 1055,    5,  108,
       2368,  757,  242,    1,    1,    1, 3935,  963,  489, 2201,    1,
       1802, 1913, 2035, 3483,  104,   74,  613,  420,  386,    2,   79,
        528,  409,  337,    1,   95,  662,  924,    1,  925,  386, 1773,
        427, 1833,  898,    1,  119,  560,  409,    1, 2481, 3936,    2,
         14, 4703,  409, 4952, 1214, 2546, 2546,    1,  240,  831, 1214,
       2546,    2,  542,  463,   21, 2934,  240,  831, 3243, 2774, 1214,
       2546,  379,    1, 4704,  912, 4704,   74,  613,  420, 2546, 1022,
        221, 1119,    1,  786,    1,  194, 1157,  223,  386,  167,    2,
          1,    1,  122,    1,    1, 1875,  269,  302,    1, 2312,    2,
          1,  379,  539,  700, 1211, 2249,  649,  329,  420,   25,    1,
         25,  937,   89,   10,  409,  303,    1, 23

In [32]:
#Checking that it all worked

print(len(train_padded[5]))
print(len(train_padded[10]))

200
200


In [34]:
#doing same thing to test data

test_seq = tokenizer.texts_to_sequences(X_test)

test_padded = pad_sequences(test_seq, maxlen =  max_length, padding=padding_type, truncating = trunc_type)

### Encode our labels

In [None]:
#df['label'] = df['label'].map({'sport': 0})

In [35]:
le = LabelEncoder()

y_train = le.fit_transform(y_train)

y_test = le.transform(y_test)

In [36]:
y_train[:5]

array([3, 0, 1, 4, 1])

In [37]:
y_train = to_categorical(y_train, 5)
y_test = to_categorical(y_test, 5)

In [38]:
y_train[:5]

array([[0., 0., 0., 1., 0.],
       [1., 0., 0., 0., 0.],
       [0., 1., 0., 0., 0.],
       [0., 0., 0., 0., 1.],
       [0., 1., 0., 0., 0.]], dtype=float32)

### Define  model

In [46]:
from keras.layers import Bidirectional

In [47]:
model = Sequential()

#when using Rnn and text first layer should be embedding layer 
model.add(Embedding(vocab_size, embedding_dim, input_length = max_length))

#LSTM
model.add(Bidirectional(LSTM(64, return_sequences = True)))#LSTM, with 64 dimension output, returning unflattened. 
model.add(LSTM(32))                          #LSTM goint to convert 64 dimension to 32 dimension output
model.add(Dense(5, activation = 'softmax'))

In [48]:
model.summary()

Model: "sequential_2"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 200, 64)           320000    
_________________________________________________________________
bidirectional (Bidirectional (None, 200, 128)          66048     
_________________________________________________________________
lstm_5 (LSTM)                (None, 32)                20608     
_________________________________________________________________
dense_1 (Dense)              (None, 5)                 165       
Total params: 406,821
Trainable params: 406,821
Non-trainable params: 0
_________________________________________________________________


In [49]:
#compile model 

model.compile(loss = 'categorical_crossentropy', optimizer = 'adam', metrics = ['acc'])

### Fit model

In [51]:
model.fit(train_padded, y_train, validation_data = (test_padded, y_test), batch_size = 128, epochs = 30)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30


<tensorflow.python.keras.callbacks.History at 0x7f961d4601f0>

## Conclusion 
Accuracy around 86%, could be improved by increasing training data amount and  increasing model complexity. This dataset is limited.