In [0]:
# !wget 'https://www.dropbox.com/s/d49goxjvju20n1m/train.csv'

In [0]:
#Importing required packages
import numpy as np
import pandas as pd

In [0]:
train_file= pd.read_csv('/content/train.csv')  #Reading the data

In [4]:
train_file.head()

Unnamed: 0,id,title,author,text,label
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...,0
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ...",1
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...,1
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...,1


In [5]:
train_file.shape

(20800, 5)

In [0]:
#Lets drop NA values...
train_file= train_file.dropna()

In [7]:
train_file.shape

(18285, 5)

In [0]:
X= train_file.drop('label', axis=1)
Y= train_file['label']

In [9]:
X.head()

Unnamed: 0,id,title,author,text
0,0,House Dem Aide: We Didn’t Even See Comey’s Let...,Darrell Lucus,House Dem Aide: We Didn’t Even See Comey’s Let...
1,1,"FLYNN: Hillary Clinton, Big Woman on Campus - ...",Daniel J. Flynn,Ever get the feeling your life circles the rou...
2,2,Why the Truth Might Get You Fired,Consortiumnews.com,"Why the Truth Might Get You Fired October 29, ..."
3,3,15 Civilians Killed In Single US Airstrike Hav...,Jessica Purkiss,Videos 15 Civilians Killed In Single US Airstr...
4,4,Iranian woman jailed for fictional unpublished...,Howard Portnoy,Print \nAn Iranian woman has been sentenced to...


**Here we will consider only one independent column 'title' to predict our Target variable 'label', Hence lets clean our independent variable 'title'**

In [0]:
messages= X.copy()   #Making a copy of our data.

In [11]:
messages['title'][0]

'House Dem Aide: We Didn’t Even See Comey’s Letter Until Jason Chaffetz Tweeted It'

In [0]:
messages.reset_index(inplace=True)   #Resetting the index as we have dropped NA observations earlier.

In [0]:
#Importing required packages
from nltk import  word_tokenize
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
import nltk
import re

#nltk.download('stopwords')
#nltk.download('punkt')

In [0]:
#Removing all the unneccessary characters and keeping only alphabets in the title column. And also removing stopwords
ps = PorterStemmer()
corpus = []
for i in range(0, len(messages)):
    review = re.sub('[^a-zA-Z]', ' ', messages['title'][i])
    review= ' '.join([ps.stem(word) for word in review.lower().split() if word not in stopwords.words('english')])
    corpus.append(review)

In [0]:
#corpus

In [0]:
#Lets find teh length of each row.
sent_lens=[]
for sent in corpus:
  sent_lens.append(len(word_tokenize(sent)))

In [17]:
max(sent_lens)  #We see that max length of all the rows is 

47

In [18]:
np.quantile(sent_lens,0.98)

14.0

**Here we can see that 98% of the reviews have a length of 14. Hence, we would be taking max length as 14**

In [0]:
#Importing required packages...
import tensorflow as tf
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.layers import LSTM, Activation, Dense, Dropout, Input, Embedding
from tensorflow.keras.models import Sequential

from sklearn.model_selection import train_test_split

In [0]:
max_len=14
tok= Tokenizer(char_level=False, split=' ')     
tok.fit_on_texts(corpus)                           #Converting the sentences to words using Tokenizer
sequences_corpus= tok.texts_to_sequences(corpus)   

In [21]:
vocab_len= len(tok.index_word.keys())
vocab_len                                          # Here we see that after tokenizing the max length of the vocabularies is 13931 

13931

In [0]:
sequences_matrix_corpus= sequence.pad_sequences(sequences_corpus, maxlen=max_len)  #We would be padding the sequences after length of 14

In [23]:
sequences_matrix_corpus

array([[   0,    0,    0, ..., 1775, 3009,  420],
       [   0,    0,    0, ...,  150, 1090,    5],
       [   0,    0,    0, ...,  856,   33,   83],
       ...,
       [   0,    0,    0, ...,    1,    3,    2],
       [   0,    0,    0, ..., 4196, 2465, 5024],
       [   0,    0,    0, ...,  197,  137, 1455]], dtype=int32)

In [24]:
sequences_matrix_corpus.shape, Y.shape

((18285, 14), (18285,))

In [0]:
X_Final= sequences_matrix_corpus
Y_Final= Y

In [26]:
X_Final.shape, Y_Final.shape

((18285, 14), (18285,))

**Creating Model**

In [27]:
model= Sequential()
model.add(Embedding(vocab_len+1, 300, input_length=max_len))
model.add(Dropout(0.2))
model.add(LSTM(150))
model.add(Dropout(0.2))
model.add(Dense(1, activation='sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 14, 300)           4179600   
_________________________________________________________________
dropout (Dropout)            (None, 14, 300)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 150)               270600    
_________________________________________________________________
dropout_1 (Dropout)          (None, 150)               0         
_________________________________________________________________
dense (Dense)                (None, 1)                 151       
Total params: 4,450,351
Trainable params: 4,450,351
Non-trainable params: 0
_________________________________________________________________


**Splitting the data into Train and Test**

In [0]:
X_train, X_test, Y_train, Y_test= train_test_split(X_Final, Y_Final, test_size=0.2, stratify=Y_Final, random_state=100)

In [31]:
model.fit(X_train, Y_train, validation_data=(X_test, Y_test), epochs=20, batch_size=200)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


<tensorflow.python.keras.callbacks.History at 0x7f36119cfa20>

**Performancd Metrics and Accuracy**

In [32]:
Y_pred= model.predict_classes(X_test)

Instructions for updating:
Please use instead:* `np.argmax(model.predict(x), axis=-1)`,   if your model does multi-class classification   (e.g. if it uses a `softmax` last-layer activation).* `(model.predict(x) > 0.5).astype("int32")`,   if your model does binary classification   (e.g. if it uses a `sigmoid` last-layer activation).


In [0]:
#Importing required packages...
from sklearn.metrics import confusion_matrix, accuracy_score, roc_auc_score

In [34]:
confusion_matrix(Y_test, Y_pred)

array([[1914,  158],
       [ 124, 1461]])

In [35]:
accuracy_score(Y_test, Y_pred), roc_auc_score(Y_test, Y_pred)

(0.9228876127973749, 0.9227558676296845)