In [50]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score

import re
import pickle

Intake the training data and only keep the necessary columns

In [3]:
data = pd.read_csv('twitter_training.csv', names=["Tweet_ID", "Entity", "Sentiment", "Text"])
data = data[['Text','Sentiment']]

data.head()

Unnamed: 0,Text,Sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [72]:
data.Sentiment.unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [74]:
data.iloc[425].Text

"@GearboxOfficial @Borderlands I love(d) this game. But've spent 325hrs beating your game twice and farming like crazy to get which way those anoints that I want. And you say that that way that I'm playing wasn't which way it had intended so you weaken my favorite anoint..."

In [75]:
data.iloc[426].Text

'Yasss!!! Co-Stream with @jimmysgotya  twitch.tv/jimmysgotya'

Goal is to identify Positive and Negative tweets, drop everything else and keep only valid text

In [7]:
data = data[data.Sentiment != "Neutral"]
data = data[data.Sentiment != "Irrelevant"]
data.Text = data.Text.apply(lambda x: str(x).lower())
data.Text = data.Text.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

# tfidf stuff 
corpus=data['Text']




In [11]:
tfidfvectorizer= TfidfVectorizer(stop_words='english', max_features=5000, min_df=5)
countvectorizer = CountVectorizer(analyzer='word', stop_words='english')

In [53]:

tfidf_matrix=tfidfvectorizer.fit_transform(corpus)
tfidf_data=tfidf_matrix.toarray()

countvectorizer_matrix = countvectorizer.fit_transform(corpus)
count_data = countvectorizer_matrix.toarray()

In [14]:
tfidf_data.shape

(43374, 5000)

In [13]:
count_data.shape

(43374, 22965)

In [15]:
count_tokens = countvectorizer.get_feature_names()
tfidf_tokens = tfidfvectorizer.get_feature_names()

In [19]:
x=tfidf_data
y= pd.get_dummies(data.Sentiment).values
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(29060, 5000) (29060, 2)
(14314, 5000) (14314, 2)


In [26]:
checkpointer=ModelCheckpoint(filepath='best_weights1.hdf5',verbose=0,save_best_only=True)
monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
model1=Sequential()
model1.add(Dense(25, input_dim=x.shape[1],activation='relu'))
model1.add(Dense(2, activation='softmax'))
model1.compile(loss='categorical_crossentropy',optimizer='adam')



In [28]:


model1.fit(X_train,Y_train,validation_data=(X_test,Y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)

Epoch 1/100
909/909 - 2s - loss: 0.4214 - val_loss: 0.3145
Epoch 2/100
909/909 - 1s - loss: 0.2627 - val_loss: 0.2810
Epoch 3/100
909/909 - 1s - loss: 0.2227 - val_loss: 0.2667
Epoch 4/100
909/909 - 1s - loss: 0.1989 - val_loss: 0.2608
Epoch 5/100
909/909 - 1s - loss: 0.1806 - val_loss: 0.2583
Epoch 6/100
909/909 - 1s - loss: 0.1652 - val_loss: 0.2622
Epoch 7/100
909/909 - 1s - loss: 0.1514 - val_loss: 0.2629
Epoch 00007: early stopping


<tensorflow.python.keras.callbacks.History at 0x7f84a7e64340>

In [52]:
model1.save_weights("model1.hdf5")

In [44]:
pred = model1.predict(X_test)
pred = np.argmax(pred, axis=1)

In [48]:
Y_test = np.argmax(Y_test, axis=1)

In [47]:
Y_test[0]

array([0, 1], dtype=uint8)

In [34]:
class_labels = ['negative', 'positive']
pred_class = class_labels[np.argmax(pred[0])]

In [35]:
pred_class

'positive'

In [37]:
class_labels[np.argmax(Y_test[0])]

'positive'

In [49]:
f1_score(Y_test, pred, average="weighted")

0.8949066131156255

Lemmatize words to elimate stopwords that provide no context

In [81]:
vectorizer2 = TfidfVectorizer(stop_words='english', max_features=5000, min_df=1)

test = pd.DataFrame(['wow I am really happy'], columns=['Text'])
test = vectorizer2.fit_transform(test.Text)
test_arr = test.toarray()

In [73]:
vectorizer2.get_feature_names()

['cool', 'happy', 'really', 'wow']

In [67]:
model1_test=Sequential()
model1_test.add(Dense(25, input_dim=x.shape[1],activation='relu'))
model1_test.add(Dense(2, activation='softmax'))
model1_test.compile(loss='categorical_crossentropy',optimizer='adam')

In [68]:
model1.load_weights('model1.hdf5')

In [78]:
test_pred = model1.predict(test_arr[0])

ValueError: cannot reshape array of size 4 into shape (5000,1)

In [60]:
import nltk
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

lemmatiser = WordNetLemmatizer()
stopwords = set(stopwords.words())
def remove_stopwords(ls):
    # Removes stop words and lemmatises
    ls = [lemmatiser.lemmatize(word) for word in ls if word not in (stopwords) and (word.isalpha())]
    
    ls = " ".join(ls)
    return ls



[nltk_data] Downloading package punkt to
[nltk_data]     /Users/austinwilson/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/austinwilson/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/austinwilson/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [17]:
data.Text = data.Text.apply(word_tokenize)
data.iloc[420].Text

In [61]:
data.iloc[420].Text

"@GearboxOfficial  @Borderlands I love(d) this game. I've spent 325hrs beating the game twice and farming like crazy to get the weapons and anoints that I want. Then you decide that the way that I'm playing wasn't the way you had intended so you weaken my favorite anoint..."

In [None]:
data.Text = data.Text.apply(remove_stopwords)

Tokenize the words to eliminate variations of words

In [41]:
print(data[data.Sentiment == 'Positive'].size)
print(data[data.Sentiment == 'Negative'].size)

for idx,row in data.iterrows():
    row[0] = row[0].replace('rt',' ')
    
max_features = 1000
tokenizer = Tokenizer(num_words=max_features, split=' ')
tokenizer.fit_on_texts(data.Text.values)
X = tokenizer.texts_to_sequences(data.Text.values)
X = pad_sequences(X)

41664
45084


In [42]:
X.shape

(43374, 99)

In [43]:
import pickle
with open('tokenizer.pickle', 'wb') as handle:
    pickle.dump(tokenizer, handle, protocol=pickle.HIGHEST_PROTOCOL)

Visualization of data after applying transformations

In [8]:
data.head()

Unnamed: 0,Text,Sentiment
0,getting borderland murder,Positive
1,coming border kill,Positive
2,getting borderland kill,Positive
3,coming borderland murder,Positive
4,getting borderland murder,Positive


Here, we used LSTM, a recurrent neural network implmentation, to differentiate and distinguish the context of the content as the method to determining sentiment. 

The LSTM layer only uses the dropout and not the recurrent_dropout parameter in order to accelerate training. Recurrent_dropout is currently not supported by Nvidia CUDNN and will prevent the model from utilizing GPU acceleration.

The Dense layer should only be 2 units as our sentiment has only 2 possible values

In [10]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding, LSTM, SpatialDropout1D

embed_dim = 128
lstm_out = 196

model = Sequential([
     Embedding(max_features, embed_dim, input_length = X.shape[1]),
     SpatialDropout1D(0.4),
     LSTM(lstm_out, dropout=0.2),
     Dense(2, activation='softmax')
])

model.compile(
     loss='categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
print(model.summary())

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding (Embedding)        (None, 99, 128)           128000    
_________________________________________________________________
spatial_dropout1d (SpatialDr (None, 99, 128)           0         
_________________________________________________________________
lstm (LSTM)                  (None, 196)               254800    
_________________________________________________________________
dense (Dense)                (None, 2)                 394       
Total params: 383,194
Trainable params: 383,194
Non-trainable params: 0
_________________________________________________________________
None


Split the dataset into training and validation

In [11]:
Y = pd.get_dummies(data.Sentiment).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(29060, 99) (29060, 2)
(14314, 99) (14314, 2)


In [20]:
from tensorflow.keras.callbacks import EarlyStopping, ModelCheckpoint
monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
checkpointer=ModelCheckpoint(filepath='weights2.hdf5',verbose=0,save_best_only=True)
model.fit(X_train, Y_train, epochs = 10, batch_size=32,
    callbacks=[monitor], verbose = 1)
#tf.keras.callbacks.EarlyStopping(monitor='accuracy', patience=4)

Epoch 1/10
Epoch 2/10

KeyboardInterrupt: 

In [13]:
validation_size = 1500

X_validate = X_test[-validation_size:]
Y_validate = Y_test[-validation_size:]
X_test = X_test[:-validation_size]
Y_test = Y_test[:-validation_size]
score,acc = model.evaluate(X_test, Y_test, verbose = 2, batch_size = 32)
print("score: %.2f" % (score))
print("acc: %.2f" % (acc))

401/401 - 8s - loss: 0.3056 - accuracy: 0.8689
score: 0.31
acc: 0.87


Run tests on validation set

In [14]:
accuracy = {
    'pos_cnt':0, 
    'neg_cnt':0, 
    'pos_correct':0, 
    'neg_correct':0
}

def inc(count):
    count+=1

for x in range(len(X_validate)):
    result = model.predict(X_validate[x].reshape(1,X_test.shape[1]), batch_size=1)[0]
   
    if np.argmax(result) == np.argmax(Y_validate[x]):
        accuracy['neg_correct' if np.argmax(Y_validate[x]) == 0 else 'pos_correct'] +=1
       
    accuracy['neg_cnt' if np.argmax(Y_validate[x]) == 0 else 'pos_cnt'] +=1

print("pos_acc", accuracy['pos_correct']/accuracy['pos_cnt']*100, "%")
print("neg_acc", accuracy['neg_correct']/accuracy['neg_cnt']*100, "%")

pos_acc 89.3646408839779 %
neg_acc 84.27835051546391 %


Vectorize the tweet by the pre-fitted tokenizer instance then pad the tweet to have the same dimensions as the input

In [15]:
def apply_prediction(twt):
    twtData = tokenizer.texts_to_sequences([twt])
    twtData = pad_sequences(twtData, maxlen=28, dtype='int32', value=0)
    print(twtData)
    sentiment = model.predict(twtData,batch_size=1,verbose = 2)[0]
    sentimentValue = "negative" if(np.argmax(sentiment) == 0) else "positive"
    return sentimentValue
    
twt = 'The new CoD is pretty lit'
print(apply_prediction(twt))

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0  10 308 167 902]]
1/1 - 0s
positive


In [17]:
model.save_weights(filepath='weights.hdf5')

In [21]:
embed_dim = 128
lstm_out = 196

model2 = Sequential([
     Embedding(max_features, embed_dim, input_length = X.shape[1]),
     SpatialDropout1D(0.4),
     LSTM(lstm_out, dropout=0.2),
     Dense(2, activation='softmax')
])

model2.compile(
     loss='categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)

In [22]:
model2.load_weights('weights.hdf5')

In [44]:
# loading
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)

In [45]:
def apply_prediction2(twt):
    twtData = tokenizer.texts_to_sequences([twt])
    twtData = pad_sequences(twtData, maxlen=99, dtype='int32', value=0)
    print(twtData)
    sentiment = model2.predict(twtData,batch_size=1,verbose = 2)[0]
    sentimentValue = "negative" if(np.argmax(sentiment) == 0) else "positive"
    return sentimentValue
    
twt = 'The new CoD is pretty lit'
print(apply_prediction(twt))

[[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
    0   0   0   0   0   0  10 298 163 879]]
1/1 - 0s
positive


0.3003715

[[4.6424815e-01 5.3575188e-01]
 [3.3016404e-04 9.9966979e-01]
 [9.0963371e-02 9.0903664e-01]
 [1.1913437e-01 8.8086557e-01]
 [9.5881008e-02 9.0411901e-01]]


In [31]:
np.argmax(pred).shape

()

In [30]:
pred.shape

(12814, 2)