In [1]:
import pandas as pd
import numpy as np
import wordcloud

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from nltk.stem import PorterStemmer

from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Flatten
from tensorflow.keras.layers import Embedding

2022-09-15 15:06:37.861342: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2022-09-15 15:06:37.861381: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
cols = ['text', 'label']

In [3]:
df_train = pd.read_csv('data/train.csv', usecols=cols)
df_test = pd.read_csv('data/test.csv', usecols=['text'])
df_train.head()

Unnamed: 0,text,label
0,House Dem Aide: We Didn’t Even See Comey’s Let...,1
1,Ever get the feeling your life circles the rou...,0
2,"Why the Truth Might Get You Fired October 29, ...",1
3,Videos 15 Civilians Killed In Single US Airstr...,1
4,Print \nAn Iranian woman has been sentenced to...,1


In [4]:
df_train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20800 entries, 0 to 20799
Data columns (total 2 columns):
 #   Column  Non-Null Count  Dtype 
---  ------  --------------  ----- 
 0   text    20761 non-null  object
 1   label   20800 non-null  int64 
dtypes: int64(1), object(1)
memory usage: 325.1+ KB


In [5]:
#drop null values
df_train.dropna(inplace=True)

In [6]:
# Preprocessing
def preprocess(df):
    # Remove punctuation
    df['text'] = df['text'].str.replace('[^\w\s]','')
    # Remove numbers
    df['text'] = df['text'].str.replace('\d+', '')
    #remove special characters
    df['text'] = df['text'].str.replace('[^a-zA-Z]', ' ')
    # Remove stopwords
    stop = stopwords.words('english')
    df['text'] = df['text'].apply(lambda x: " ".join(x for x in x.split() if x not in stop))
    # Lemmatization
    lemmatizer = WordNetLemmatizer()
    df['text'] = df['text'].apply(lambda x: " ".join([lemmatizer.lemmatize(word) for word in x.split()]))
    return df

In [7]:
df_train_processed = preprocess(df_train)

In [8]:
df_train_processed.head()

Unnamed: 0,text,label
0,House Dem Aide We Didnt Even See Comeys Letter...,1
1,Ever get feeling life circle roundabout rather...,0
2,Why Truth Might Get You Fired October The tens...,1
3,Videos Civilians Killed In Single US Airstrike...,1
4,Print An Iranian woman sentenced six year pris...,1


In [9]:
vocab_size = 5000
encoded_docs = [one_hot(d, vocab_size) for d in df_train_processed['text']]

In [10]:
#input length is the max length of the sentence
max_length = 50
embeded_vecotr_size = 32
model = Sequential()
model.add(Embedding(vocab_size, embeded_vecotr_size, input_length=max_length))
model.add(Flatten())
model.add(Dense(1, activation='sigmoid'))
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
print(model.summary())

Model: "sequential"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding (Embedding)       (None, 50, 32)            160000    
                                                                 
 flatten (Flatten)           (None, 1600)              0         
                                                                 
 dense (Dense)               (None, 1)                 1601      
                                                                 
Total params: 161,601
Trainable params: 161,601
Non-trainable params: 0
_________________________________________________________________
None


2022-09-15 15:09:09.181463: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcuda.so.1'; dlerror: libcuda.so.1: cannot open shared object file: No such file or directory
2022-09-15 15:09:09.181532: W tensorflow/stream_executor/cuda/cuda_driver.cc:269] failed call to cuInit: UNKNOWN ERROR (303)
2022-09-15 15:09:09.181579: I tensorflow/stream_executor/cuda/cuda_diagnostics.cc:156] kernel driver does not appear to be running on this host (pops): /proc/driver/nvidia/version does not exist
2022-09-15 15:09:09.182568: I tensorflow/core/platform/cpu_feature_guard.cc:193] This TensorFlow binary is optimized with oneAPI Deep Neural Network Library (oneDNN) to use the following CPU instructions in performance-critical operations:  AVX2 FMA
To enable them in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [11]:
#pad the sentences to the max length
padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')
print(padded_docs)

[[3185 3781 2879 ... 1914 2793 3512]
 [1504 2570 3725 ... 2557 4764 4627]
 [1543  519 2545 ... 4583 2322 1756]
 ...
 [4434 1409 2508 ... 2380 1976  509]
 [  84  104  357 ...  753  230 1254]
 [1045 2662  973 ... 2757  456 4295]]


In [12]:
X = padded_docs
y = df_train_processed['label']

In [13]:
#split the data into train and test
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [14]:
#train the model
model.fit(X_train, y_train, epochs=5, verbose=1)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fd1284bf8b0>

In [15]:
#evaluate the model
loss, accuracy = model.evaluate(X_test, y_test, verbose=1)
print('Accuracy: %f' % (accuracy*100))

Accuracy: 90.440643
