In [1]:
import numpy as np
import pandas as pd
import tensorflow as tf

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from sklearn.model_selection import train_test_split

Intake the training data and only keep the necessary columns

In [2]:
data = pd.read_csv('Twitter_training.csv', names=["Tweet_ID", "Entity", "Sentiment", "Text"])
data = data[['Text','Sentiment']]
data.head()

Unnamed: 0,Text,Sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


Goal is to identify Positive and Negative tweets, drop everything else and keep only valid text

In [3]:
import re
data = data[data.Sentiment != "Neutral"]
data = data[data.Sentiment != "Irrelevant"]
data.Text = data.Text.apply(lambda x: str(x).lower())
data.Text = data.Text.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

Lemmatize words to elimate stopwords that provide no context

In [4]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatiser = WordNetLemmatizer()
stopwords = set(stopwords.words())
def remove_stopwords(ls):
    # Removes stop words and lemmatises
    ls = [lemmatiser.lemmatize(word) for word in ls if word not in (stopwords) and (word.isalpha())]
    
    ls = " ".join(ls)
    return ls

data.Text = data.Text.apply(word_tokenize)
data.Text = data.Text.apply(remove_stopwords)

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\savio\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\savio\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\savio\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Tokenize the words to eliminate variations of words

In [5]:
print(data[data.Sentiment == 'Positive'].size)
print(data[data.Sentiment == 'Negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_features = 1000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data.Text.values)
X = tokenizer.texts_to_sequences(data.Text.values)
X = pad_sequences(X)

41664
45084


Visualization of data after applying transformations

In [6]:
data.head()

Unnamed: 0,Text,Sentiment
0,getting borderland murder,Positive
1,coming border kill,Positive
2,getting borderland kill,Positive
3,coming borderland murder,Positive
4,getting borderland murder,Positive


Here, we used LSTM, a recurrent neural network implmentation, to differentiate and distinguish the context of the content as the method to determining sentiment. 

The LSTM layer only uses the dropout and not the recurrent_dropout parameter in order to accelerate training. Recurrent_dropout is currently not supported by Nvidia CUDNN and will prevent the model from utilizing GPU acceleration.

The Dense layer should only be 2 units as our sentiment has only 2 possible values

In [7]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128
lstm_out = 196

model = Sequential([
     Embedding(max_features, embed_dim, input_length = X.shape[1]),
     SpatialDropout1D(0.4),
     LSTM(lstm_out, dropout=0.2),
     Dense(2, activation='softmax')
])

model.compile(
     loss='categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 99, 128)           128000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 99, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 383,194
Trainable params: 383,194
Non-trainable params: 0
_________________________________________________________________
None


Split the dataset into training and validation

In [8]:
Y = pd.get_dummies(data.Sentiment).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(29060, 99) (29060, 2)
(14314, 99) (14314, 2)


In [9]:
model.fit(X_train, Y_train, epochs = 10, batch_size=32,
    callbacks=[tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=4)], verbose = 1)

Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x19cb316d160>

In [10]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = 32)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

401/401 - 2s - loss: 0.3184 - accuracy: 0.8707
score: 0.32
acc: 0.87


Run tests on validation set

In [11]:
accuracy = {
    'pos_cnt':0, 
    'neg_cnt':0, 
    'pos_correct':0, 
    'neg_correct':0
}

def inc(count):
    count+=1

for x in range(len(X_validate)):
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]), batch_size=1)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        accuracy['neg_correct' if np.argmax(Y_validate[x]) == 0 else 'pos_correct'] +=1
       
    accuracy['neg_cnt' if np.argmax(Y_validate[x]) == 0 else 'pos_cnt'] +=1

print("pos_acc", accuracy['pos_correct']/accuracy['pos_cnt']*100, "%")
print("neg_acc", accuracy['neg_correct']/accuracy['neg_cnt']*100, "%")

pos_acc 90.19337016574586 %
neg_acc 84.27835051546391 %


Vectorize the tweet by the pre-fitted tokenizer instance then pad the tweet to have the same dimensions as the input

In [12]:
def apply_prediction(twt):
    twtData = tokenizer.texts_to_sequences([twt])
    twtData = pad_sequences(twtData, maxlen=28, dtype='int32', value=0)
    print(twtData)
    sentiment = model.predict(twtData,batch_size=1,verbose = 2)[0]
    sentimentValue = "negative" if(np.argmax(sentiment) == 0) else "positive"
    return sentimentValue
    
twt = 'The new CoD is pretty lit'
print(apply_prediction(twt))

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0  10 308 167 902]]
1/1 - 0s
positive
