In [46]:
import pandas as pd
import numpy as np
import wordcloud

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import LSTM

In [47]:
cols = ['text', 'label']

In [48]:
df_train = pd.read_csv('data/train.csv', usecols=cols)
df_test = pd.read_csv('data/test.csv', usecols=['text'])
df_train.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [49]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    20761 non-null  object
 1   label   20800 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 325.1+ KB


In [50]:
#drop null values
df_train.dropna(inplace=True)

In [51]:
# Preprocessing
def preprocess(df):
    # Remove punctuation
    df['text'] = df['text'].str.replace('[^\w\s]','')
    # Remove numbers
    df['text'] = df['text'].str.replace('\d+', '')
    #remove special characters
    df['text'] = df['text'].str.replace('[^a-zA-Z]', ' ')
    # Remove stopwords
    stop = stopwords.words('english')
    df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    df['text'] = df['text'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
    return df

In [52]:
df_train_processed = preprocess(df_train)

  df['text'] = df['text'].str.replace('[^\w\s]','')
  df['text'] = df['text'].str.replace('\d+', '')
  df['text'] = df['text'].str.replace('[^a-zA-Z]', ' ')


In [53]:
df_train_processed.head()

Unnamed: 0,text,label
0,House Dem Aide We Didnt Even See Comeys Letter...,1
1,Ever get feeling life circle roundabout rather...,0
2,Why Truth Might Get You Fired October The tens...,1
3,Videos Civilians Killed In Single US Airstrike...,1
4,Print An Iranian woman sentenced six year pris...,1


In [54]:
#check number of distinct words in the dataset
len(set(" ".join(df_train_processed['text']).split()))

201581

In [78]:
vocab_size = 50000 #this number of numbers will be used to represent words
encoded_docs = [one_hot(d, vocab_size) for d in df_train_processed['text']]

In [79]:
# example_sentence = "This is an example sentence!"
# one_hot(example_sentence, 50000 )

In [80]:
#input length is the max length of the sentence
max_length = 50
embeded_vecotr_size = 50
model = Sequential()
model.add(Embedding(vocab_size, embeded_vecotr_size, input_length=max_length))
#lstm layer
model.add(LSTM(100))
model.add(Dense(512, activation='relu'))
model.add(Dense(256, activation='relu'))
model.add(Dense(128, activation='relu'))
#5 is the number of classes
model.add(Dense(5, activation='softmax'))
model.compile(optimizer='adam', loss='sparse_categorical_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_6 (Embedding)     (None, 50, 50)            2500000   
                                                                 
 lstm_6 (LSTM)               (None, 100)               60400     
                                                                 
 dense_24 (Dense)            (None, 512)               51712     
                                                                 
 dense_25 (Dense)            (None, 256)               131328    
                                                                 
 dense_26 (Dense)            (None, 128)               32896     
                                                                 
 dense_27 (Dense)            (None, 5)                 645       
                                                                 
Total params: 2,776,981
Trainable params: 2,776,981
No

In [81]:
#pad the sentences to the max length
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[ 7700 32011 18174 ... 29974 46169 33732]
 [35311 39180 17819 ...  2424  4769    53]
 [30295 18254 28879 ... 29609 42763  2564]
 ...
 [43394 22953 15938 ... 37844  4522 36529]
 [15746  9050 21820 ... 47042 11193  5952]
 [11821 32080 44544 ...  2382 41814 29474]]


In [82]:
X = padded_docs
y = df_train_processed['label']

In [83]:
#split the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [84]:
#train the model
model.fit(X_train, y_train, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f583a038b20>

In [85]:
#evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 90.946305
