# Kaggle Competition using NLP



In [1]:
# import relevant packages

import pandas as pd
import numpy as np

In [2]:
# get data loaded into dataframe
validation_data = pd.read_csv('nlp-getting-started/test.csv')
train_data = pd.read_csv('nlp-getting-started/train.csv')

In [3]:
train_data.head()

Unnamed: 0,id,keyword,location,text,target
0,1,,,Our Deeds are the Reason of this #earthquake M...,1
1,4,,,Forest fire near La Ronge Sask. Canada,1
2,5,,,All residents asked to 'shelter in place' are ...,1
3,6,,,"13,000 people receive #wildfires evacuation or...",1
4,7,,,Just got sent this photo from Ruby #Alaska as ...,1


In [4]:
print(train_data.isna().sum())
print(len(train_data))

id             0
keyword       61
location    2533
text           0
target         0
dtype: int64
7613


In [5]:
# Lets get rid of the keyword and location columns because i don't reckon we need them
train_data = train_data.drop(columns = ['keyword','location'])

In [63]:
# Lets clean up the data in text
# importing all necessary modules
from nltk.tokenize import sent_tokenize, word_tokenize
import warnings
warnings.filterwarnings(action='ignore')
from pathlib import Path
import re
import nltk
import pickle
from tensorflow.keras.preprocessing.text import Tokenizer


# Preparing the dataset
#all_sentences = nltk.sent_tokenize(all_data_string)
#all_words = [nltk.word_tokenize(sent) for sent in all_sentences]

def clean_data(data):
    # This function takes an array of strings and returns an array of cleaned up strings
    cleaned_data = []
    for row,texts in enumerate(data):
        texts = texts.lower()
        # remove special characters
        texts = texts.replace(r"(http|@)\S+", "")
        texts = texts.replace(r'http\S+',"")
        texts = texts.replace(r"::", " ")
        texts = texts.replace(r"’", "")
        texts = texts.replace(r",", " ")
        texts = texts.replace(r"[^a-z\':_]", " ")
        texts = texts.replace("#","")
        texts = texts.replace("!","")
        texts = texts.replace("?","")
        # remove repetition
        # Transform short negation form
        texts = texts.replace(r"(can't|cannot)", 'can not')
        texts = texts.replace(r"n't", ' not')
        # Remove stop words
        stopwords = nltk.corpus.stopwords.words('english')
        stopwords.remove('not')
        stopwords.remove('nor')
        stopwords.remove('no')
        cleaned_line = ''
        for word in texts.split(" "):
            if word not in stopwords:
                cleaned_line = cleaned_line + " " + word
        cleaned_data.append(cleaned_line)
    return cleaned_data

X = clean_data(train_data['text'])
y = np.array(train_data['target'])
#X_test = clean_data(test_data['text'])

In [64]:
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [68]:
X_test

[' new weapon cause un-imaginable destruction.',
 ' f$&amp;@ing things gishwhes got soaked deluge going pads tampons. thx @mishacollins @/@',
 ' dt @georgegalloway: rt @galloway4mayor: \x89ûïthe col police catch pickpocket liverpool stree... http://t.co/vxin1goq4q',
 ' aftershock back school kick great. want thank everyone making possible. great night.',
 ' response trauma children addicts develop defensive self - one decreases vulnerability. (3',
 ' @calum5sos look like got caught rainstorm amazing disgusting time',
 ' favorite lady came volunteer meeting\nhopefully joining youth collision excite http://t.co/ij0wq490cs',
 ' @brianroemmele ux fail emv - people want insert remove quickly like gas pump stripe reader. 1 person told crashed pos',
 ' ca not find ariana grande shirt  fucking tragedy',
 ' murderous story america\x89ûªs first hijacking http://t.co/eyugk6byxr',
 ' akilah world news cop pulls man car avoid ... http://t.co/vn2fnmy7li',
 ' walk plank sinking ship',
 ' @zak_bagans 

# Standard ML models first using Tfidf and count vectorizers


In [73]:
# https://github.com/aditya-xq/Text-Emotion-Detection-Using-NLP

from sklearn.feature_extraction.text import TfidfVectorizer

# Extracting TF-IDF parameters
tfidf = TfidfVectorizer(max_features=1000, analyzer='word',ngram_range=(1,3))
X_train_tfidf = tfidf.fit_transform(X_train)
X_test_tfidf = tfidf.fit_transform(X_test)

In [104]:
from sklearn.feature_extraction.text import CountVectorizer

# Extracting Count Vectors Parameters
count_vect = CountVectorizer(analyzer='word')
count_vect.fit(X_train)
X_train_count =  count_vect.transform(X_train)
X_test_count =  count_vect.transform(X_test)

# Trying four classification models with the TfidfVectorizer

1. Naive Bayes
2. Stochastic Gradient Descent
3. Logistic Regression
4. Random Forest

In [105]:
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score

# Model 1: Multinomial Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train_tfidf, y_train)
y_pred = nb.predict(X_test_tfidf)
print('naive bayes tfidf accuracy %s' % accuracy_score(y_pred, y_test))
# naive bayes tfidf accuracy 0.5289017341040463

naive bayes tfidf accuracy 0.5053720652606446


In [106]:
from sklearn.linear_model import SGDClassifier

# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_tfidf, y_train)
y_pred = lsvm.predict(X_test_tfidf)
print('svm using tfidf accuracy %s' % accuracy_score(y_pred, y_test))

svm using tfidf accuracy 0.536808595304417


In [107]:
from sklearn.linear_model import LogisticRegression

# Model 3: logistic regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_tfidf, y_train)
y_pred = logreg.predict(X_test_tfidf)
print('log reg tfidf accuracy %s' % accuracy_score(y_pred, y_test))

log reg tfidf accuracy 0.52566653402308


In [108]:
from sklearn.ensemble import RandomForestClassifier

# Model 4: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_tfidf, y_train)
y_pred = rf.predict(X_test_tfidf)
print('random forest tfidf accuracy %s' % accuracy_score(y_pred, y_test))

random forest tfidf accuracy 0.5559092717867091


# Try the same models but this time with the count vector

In [109]:
## Building models using count vectors feature
# Model 1: Multinomial Naive Bayes Classifier
nb = MultinomialNB()
nb.fit(X_train_count, y_train)
y_pred = nb.predict(X_test_count)
print('naive bayes count vectors accuracy %s' % accuracy_score(y_pred, y_test))

naive bayes count vectors accuracy 0.7934739355352168


In [110]:
# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_test_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_test))

lsvm using count vectors accuracy 0.7958615200955034


In [111]:
# Model 3: Logistic Regression
logreg = LogisticRegression(C=1)
logreg.fit(X_train_count, y_train)
y_pred = logreg.predict(X_test_count)
print('log reg count vectors accuracy %s' % accuracy_score(y_pred, y_test))

log reg count vectors accuracy 0.7926780740151214


In [17]:
# Model 4: Random Forest Classifier
rf = RandomForestClassifier(n_estimators=500)
rf.fit(X_train_count, y_train)
y_pred = rf.predict(X_test_count)
print('random forest with count vectors accuracy %s' % accuracy_score(y_pred, y_test))

random forest with count vectors accuracy 0.7843215280541186


It seems like the count vectors are way generally way better - like 20% better.

Now lets try something a little bit more advanced...

# Neural network model

In [103]:
# Now that the data is cleaned up we need to tokenize it
from tensorflow.keras.preprocessing.sequence import pad_sequences
num_words = 20000
max_length = 100
padding_type = 'post'
trunc_type = 'post'

tokenizer = Tokenizer(num_words=num_words, lower=True)
tokenizer.fit_on_texts(X_train)
word_index = tokenizer.word_index

train_sequences = tokenizer.texts_to_sequences(X_train)
train_padded = pad_sequences(train_sequences, padding = padding_type, maxlen = max_length, truncating = trunc_type)

test_sequences = tokenizer.texts_to_sequences(X_test)
test_padded = pad_sequences(test_sequences, padding = padding_type, maxlen = max_length, truncating = trunc_type)

In [87]:
training_padded = np.array(train_padded)
training_labels = np.array(y_train)
testing_padded = np.array(test_padded)
testing_labels = np.array(y_test)

training_padded.shape

(5100, 100)

In [94]:
# The data is tokenized - now lets try putting it in the model
import tensorflow as tf

from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession
tf.compat.v1.disable_eager_execution()
config = ConfigProto()
config.gpu_options.allow_growth = True
session = InteractiveSession(config=config)
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Flatten
from keras.layers.convolutional import Conv1D
from keras.layers.convolutional import MaxPooling1D
from keras.layers.embeddings import Embedding
from keras.preprocessing import sequence

max_words = 100
# Building the CNN Model
model = Sequential()      # initilaizing the Sequential nature for CNN model# Adding the embedding layer which will take in maximum of 450 words as input and provide a 32 dimensional output of those words which belong in the top_words dictionary
model.add(Embedding(20000, 32, input_length=max_words))
model.add(Conv1D(32, 3, padding='same', activation='relu'))
model.add(MaxPooling1D())
model.add(Flatten())
model.add(Dense(250, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

model.compile(loss='binary_crossentropy',optimizer='adam',metrics=['accuracy'])


In [95]:
# create model

model.summary()


Model: "sequential_11"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_10 (Embedding)     (None, 100, 32)           640000    
_________________________________________________________________
conv1d_6 (Conv1D)            (None, 100, 32)           3104      
_________________________________________________________________
max_pooling1d_4 (MaxPooling1 (None, 50, 32)            0         
_________________________________________________________________
flatten_4 (Flatten)          (None, 1600)              0         
_________________________________________________________________
dense_16 (Dense)             (None, 250)               400250    
_________________________________________________________________
dense_17 (Dense)             (None, 1)                 251       
Total params: 1,043,605
Trainable params: 1,043,605
Non-trainable params: 0
___________________________________________

In [101]:
num_epochs = 20
history = model.fit(training_padded, training_labels,batch_size=128, epochs=num_epochs, 
                    validation_data=(testing_padded, testing_labels), verbose=2)

Train on 5100 samples, validate on 2513 samples
Epoch 1/20
5100/5100 - 0s - loss: 0.0074 - accuracy: 0.9971 - val_loss: 0.8827 - val_accuracy: 0.7776
Epoch 2/20
5100/5100 - 0s - loss: 0.0065 - accuracy: 0.9969 - val_loss: 0.8530 - val_accuracy: 0.7696
Epoch 3/20
5100/5100 - 0s - loss: 0.0065 - accuracy: 0.9971 - val_loss: 0.8732 - val_accuracy: 0.7736
Epoch 4/20
5100/5100 - 0s - loss: 0.0067 - accuracy: 0.9969 - val_loss: 0.8624 - val_accuracy: 0.7772
Epoch 5/20
5100/5100 - 0s - loss: 0.0059 - accuracy: 0.9967 - val_loss: 0.9273 - val_accuracy: 0.7764
Epoch 6/20
5100/5100 - 0s - loss: 0.0069 - accuracy: 0.9967 - val_loss: 0.8906 - val_accuracy: 0.7756
Epoch 7/20
5100/5100 - 0s - loss: 0.0071 - accuracy: 0.9967 - val_loss: 0.8837 - val_accuracy: 0.7756
Epoch 8/20
5100/5100 - 0s - loss: 0.0068 - accuracy: 0.9973 - val_loss: 0.8520 - val_accuracy: 0.7728
Epoch 9/20
5100/5100 - 0s - loss: 0.0058 - accuracy: 0.9965 - val_loss: 0.9566 - val_accuracy: 0.7724
Epoch 10/20
5100/5100 - 0s - loss:

In [102]:
# Getting score metrics from our model
scores = model.evaluate(testing_padded, y_test, verbose=0)# Displays the accuracy of correct sentiment prediction over test data
print("Accuracy: %.2f%%" % (scores[1]*100))

Accuracy: 77.40%


In [100]:
# Model 2: Linear SVM
lsvm = SGDClassifier(alpha=0.001, random_state=5, max_iter=15, tol=None)
lsvm.fit(X_train_count, y_train)
y_pred = lsvm.predict(X_test_count)
print('lsvm using count vectors accuracy %s' % accuracy_score(y_pred, y_test))

lsvm using count vectors accuracy 0.7958615200955034


Interesting... it turns out that the lvsm model using count vectors has better accuracy than the convolutional neural network model. Next it could be worth trying one a bidirectional neural network so it is better at seeing the context?

Below are the predictions from the CNN model and then the LVSM models respectively

In [114]:
X_test_actual = clean_data(validation_data['text'])
actual_test_sequences = tokenizer.texts_to_sequences(X_test_actual)
actual_test_padded = pad_sequences(actual_test_sequences, padding = padding_type, maxlen = max_length, truncating = trunc_type)

predictions = model.predict(actual_test_padded)
predictions

array([[0.02017093],
       [0.98771703],
       [0.9981112 ],
       ...,
       [0.79837924],
       [0.98159325],
       [0.957903  ]], dtype=float32)

In [115]:
X_test_actual_count_vec =  count_vect.transform(X_test_actual)

predictions = lsvm.predict(X_test_actual_count_vec)
predictions

array([0, 1, 1, ..., 1, 1, 0])