In [117]:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.keras.layers import Dense, Activation, Embedding, LSTM, SpatialDropout1D
from tensorflow.keras.models import Sequential

from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.callbacks import EarlyStopping
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import f1_score,plot_precision_recall_curve, precision_score, recall_score

import re
import pickle

Intake the training data and only keep the necessary columns

In [87]:
data = pd.read_csv('twitter_training.csv', names=["Tweet_ID", "Entity", "Sentiment", "Text"])
data = data[['Text','Sentiment']]

data.head()

Unnamed: 0,Text,Sentiment
0,im getting on borderlands and i will murder yo...,Positive
1,I am coming to the borders and I will kill you...,Positive
2,im getting on borderlands and i will kill you ...,Positive
3,im coming on borderlands and i will murder you...,Positive
4,im getting on borderlands 2 and i will murder ...,Positive


In [72]:
data.Sentiment.unique()

array(['Positive', 'Neutral', 'Negative', 'Irrelevant'], dtype=object)

In [74]:
data.iloc[425].Text

"@GearboxOfficial @Borderlands I love(d) this game. But've spent 325hrs beating your game twice and farming like crazy to get which way those anoints that I want. And you say that that way that I'm playing wasn't which way it had intended so you weaken my favorite anoint..."

In [75]:
data.iloc[426].Text

'Yasss!!! Co-Stream with @jimmysgotya  twitch.tv/jimmysgotya'

Goal is to identify Positive and Negative tweets, drop everything else and keep only valid text

In [88]:
data = data[data.Sentiment != "Neutral"]
data = data[data.Sentiment != "Irrelevant"]
data.Text = data.Text.apply(lambda x: str(x).lower())
data.Text = data.Text.apply((lambda x: re.sub('[^a-zA-z0-9\s]','',x)))

# tfidf stuff 
corpus=data['Text']




In [89]:
data.Sentiment.unique()

array(['Positive', 'Negative'], dtype=object)

In [None]:
class_labels = ['negative', 'positive']

In [90]:
tfidfvectorizer= TfidfVectorizer(stop_words='english', max_features=5000, min_df=5)
countvectorizer = CountVectorizer(analyzer='word', stop_words='english')

In [91]:

tfidf_matrix=tfidfvectorizer.fit_transform(corpus)
tfidf_data=tfidf_matrix.toarray()

countvectorizer_matrix = countvectorizer.fit_transform(corpus)
count_data = countvectorizer_matrix.toarray()

In [92]:
tfidf_data.shape

(43374, 5000)

In [93]:
count_data.shape

(43374, 22965)

In [94]:
count_tokens = countvectorizer.get_feature_names()
tfidf_tokens = tfidfvectorizer.get_feature_names()

In [None]:
count_tokens

In [125]:
x=tfidf_data
y= pd.get_dummies(data.Sentiment).values
X_train, X_test, Y_train, Y_test = train_test_split(x,y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape)
print(X_test.shape,Y_test.shape)

(29060, 5000) (29060, 2)
(14314, 5000) (14314, 2)


In [96]:
x2=count_data
y2= pd.get_dummies(data.Sentiment).values
X2_train, X2_test, Y2_train, Y2_test = train_test_split(x,y, test_size = 0.33, random_state = 42)
print(X2_train.shape,Y2_train.shape)
print(X2_test.shape,Y2_test.shape)

(29060, 5000) (29060, 2)
(14314, 5000) (14314, 2)


In [126]:
checkpointer=ModelCheckpoint(filepath='best_weights1.hdf5',verbose=0,save_best_only=True)
monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
model1=Sequential()
model1.add(Dense(25, input_dim=x.shape[1],activation='relu'))
model1.add(Dense(2, activation='softmax'))
model1.compile(loss='categorical_crossentropy',optimizer='adam')




In [127]:
# model 2 
checkpointer=ModelCheckpoint(filepath='best_weights2.hdf5',verbose=0,save_best_only=True)
monitor=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
model2=Sequential()
model2.add(Dense(25, input_dim=x.shape[1],activation='relu'))
model2.add(Dense(50, input_dim=x.shape[1],activation='relu'))
model2.add(Dense(25, input_dim=x.shape[1],activation='relu'))
model2.add(Dense(10, input_dim=x.shape[1],activation='relu'))

model2.add(Dense(2, activation='softmax'))
model2.compile(loss='categorical_crossentropy',optimizer='adam')

In [128]:
# model 3
# model3
embed_dim = 128
lstm_out = 196
max_features = 1000
model3 = Sequential([
     Embedding(max_features, embed_dim, input_length = x.shape[1]),
     SpatialDropout1D(0.4),
     LSTM(lstm_out, dropout=0.2),
     Dense(2, activation='softmax')
])

model3.compile(
     loss='categorical_crossentropy',
     optimizer='adam',
     metrics=['accuracy']
)
print(model3.summary())

Model: "sequential_9"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 5000, 128)         128000    
_________________________________________________________________
spatial_dropout1d_2 (Spatial (None, 5000, 128)         0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_25 (Dense)             (None, 2)                 394       
Total params: 383,194
Trainable params: 383,194
Non-trainable params: 0
_________________________________________________________________
None


In [131]:
# monitor3=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
# checkpointer3=ModelCheckpoint(filepath='weights3.hdf5',verbose=0,save_best_only=True)
# model3.fit(X_train, Y_train, epochs = 10, batch_size=32,validation_data=(X_test,Y_test),
#     callbacks=[monitor3, checkpointer3], verbose = 1)
# model3.save_weights("model3.hdf5")
model3.load_weights("weights.hdf5")

In [132]:
monitor2=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
checkpointer2=ModelCheckpoint(filepath='weights2.hdf5',verbose=0,save_best_only=True)
model2.fit(X_train,Y_train,validation_data=(X_test,Y_test),callbacks=[monitor,checkpointer],verbose=2,epochs=100)
model2.save_weights("model2.hdf5")


Epoch 1/100
909/909 - 2s - loss: 0.3797 - val_loss: 0.2875
Epoch 2/100
909/909 - 1s - loss: 0.1946 - val_loss: 0.2150
Epoch 3/100
909/909 - 1s - loss: 0.1149 - val_loss: 0.2007
Epoch 4/100
909/909 - 1s - loss: 0.0816 - val_loss: 0.2219
Epoch 5/100
909/909 - 1s - loss: 0.0670 - val_loss: 0.2319
Epoch 00005: early stopping


In [133]:
monitor1=EarlyStopping(monitor='val_loss',min_delta=1e-3,patience=2,verbose=2,mode='auto')
checkpointer1=ModelCheckpoint(filepath='weights1.hdf5',verbose=0,save_best_only=True)

model1.fit(X_train,Y_train,validation_data=(X_test,Y_test),callbacks=[monitor1,checkpointer1],verbose=2,epochs=100)
model1.save_weights("model1.hdf5")

Epoch 1/100
909/909 - 2s - loss: 0.4241 - val_loss: 0.3160
Epoch 2/100
909/909 - 1s - loss: 0.2639 - val_loss: 0.2801
Epoch 3/100
909/909 - 1s - loss: 0.2238 - val_loss: 0.2671
Epoch 4/100
909/909 - 1s - loss: 0.2005 - val_loss: 0.2616
Epoch 5/100
909/909 - 1s - loss: 0.1823 - val_loss: 0.2603
Epoch 6/100
909/909 - 1s - loss: 0.1678 - val_loss: 0.2625
Epoch 7/100
909/909 - 1s - loss: 0.1534 - val_loss: 0.2628
Epoch 00007: early stopping


In [140]:
pred1 = np.argmax(model1.predict(X_test), axis=1)
#pred = np.argmax(pred, axis=1)
pred2 = np.argmax(model2.predict(X_test), axis=1)
#pred2 = np.argmax(pred2, axis=1)
#pred3 = np.argmax(model3.predict(X_test), axis=1)

# Y_test = np.argmax(Y_test, axis=1)
Y_test = np.argmax(Y_test, axis=1)

In [141]:
# f1_score(Y_test, pred, average="weighted")
f1_model1 = f1_score(Y_test, pred1, average="weighted")
f1_model2 = f1_score(Y_test, pred2, average="weighted")
#f1_model3 = f1_score(Y_test, pred3, average="weighted")

recall_model1 = recall_score(Y_test, pred1)
recall_model2 = recall_score(Y_test, pred2)
#recall_model3 = recall_score(Y_test, pred3)


precision_model1 = precision_score(Y_test, pred1)
precision_model2 = precision_score(Y_test, pred2)
#precision_model3 = precision_score(Y_test, pred3)





In [142]:
print("f1 scores for all models")
print(f1_model1)
print(f1_model2)
# print(f1_model3)

f1 scores for all models
0.8952003335461977
0.9218411008747404


In [143]:
print('recall scores')
print(recall_model1)
print(recall_model2)
# print(recall_model3)

recall scores
0.915968855589834
0.8946672542970472


In [144]:
print('precision scores')
print(precision_model1)
print(precision_model2)
# print(precision_model3)

precision scores
0.870323841429369
0.938366718027735
