## Sentiment analysis Project by Mohamed Alaa Gaida and Nadine Fakhet

In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from sklearn.model_selection import train_test_split

## Data Preparation :

In [2]:
path = "training.1600000.processed.noemoticon.csv"
data = pd.read_csv(path, encoding='latin-1')

In [3]:
data.columns = ["sentiment", "id", "date", "query", "user", "text"]
data = data.drop(["id", "date", "query", "user"], axis = 1)

In [4]:
data.head()

Unnamed: 0,sentiment,text
0,0,is upset that he can't update his Facebook by ...
1,0,@Kenichan I dived many times for the ball. Man...
2,0,my whole body feels itchy and like its on fire
3,0,"@nationwideclass no, it's not behaving at all...."
4,0,@Kwesidei not the whole crew


In [5]:
y = data.sentiment
X = data.text
y = y/4
X_train, X_test, y_train, y_test = train_test_split(X, y,test_size=0.2, random_state=200)

In [6]:
print(X_train.shape)
print(y_train.shape)
print(X_test.shape)
print(y_test.shape)

(1279999,)
(1279999,)
(320000,)
(320000,)


In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

vocabulary = 100000
tokenizer = Tokenizer(num_words=vocabulary)
tokenizer.fit_on_texts(X_train)
sequences = tokenizer.texts_to_sequences(X_train)

Using TensorFlow backend.


In [9]:
for x in X_train[:5]:
    print (x)

just ate dinner... at 11pm. And now I'm in bed thinking about stuff. 
i had something really good to post earlier... just cant remember what it was now 
@nyc_paris Well... My Uncle Allen thinks I need help 
@BrookeAdamsTBG5 congrats on the win. cant wait to see you this summer ! miss you so much 
Oh hell...I'm 30 today. 


In [10]:
sequences[:5]

[[20, 714, 383, 23, 4877, 6, 27, 19, 11, 141, 398, 61, 293],
 [1, 66, 208, 63, 28, 2, 471, 843, 20, 171, 515, 55, 9, 25, 27],
 [1123, 1572, 74, 5, 2086, 3154, 873, 1, 93, 241],
 [51857, 634, 13, 3, 440, 171, 143, 2, 68, 7, 26, 238, 88, 7, 15, 89],
 [83, 479, 19, 443, 41]]

In [11]:
length = []
for x in X_train:
    length.append(len(x.split()))

In [12]:
max(length)

64

In [13]:
M = sum(length)/len(length)
M

13.176499356640122

In [14]:
LEN = 32

In [15]:
x_train_seq = pad_sequences(sequences, maxlen=LEN)       
x_train_seq = np.matrix(x_train_seq)
print('Shape of data tensor:', x_train_seq.shape)

Shape of data tensor: (1279999, 32)


In [16]:
x_train_seq[:5]

matrix([[    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,    20,   714,   383,    23,  4877,     6,    27,    19,
            11,   141,   398,    61,   293],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     1,
            66,   208,    63,    28,     2,   471,   843,    20,   171,
           515,    55,     9,    25,    27],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,  1123,  1572,    74,     5,  2086,
          3154,   873,     1,    93,   241],
        [    0,     0,     0,     0,     0,     0,     0,     0,     0,
             0,     0,     0,     0,     0,     0,     0, 51857,   634,
            13,     3,   440,   171,   143,     2,    68,     7,    26,
 

In [17]:
sequences_test = tokenizer.texts_to_sequences(X_test)
x_test_seq = pad_sequences(sequences_test, maxlen=LEN)

## Creating the model :

In [18]:
hidden_size = 192

In [19]:
import keras
from keras.models import Sequential, load_model
from keras.layers import Dense, Activation, Embedding, Dropout, TimeDistributed
from keras.layers import LSTM
from keras.optimizers import Adam
from keras.callbacks import ModelCheckpoint

In [20]:
model = Sequential()
model.add(Embedding(vocabulary, hidden_size, input_length=LEN))
model.add(LSTM(1, return_sequences=False))
opt = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False)
model.compile(loss='mean_squared_error', optimizer=opt, metrics=['accuracy'])

In [21]:
print(model.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 32, 192)           19200000  
_________________________________________________________________
lstm_1 (LSTM)                (None, 1)                 776       
Total params: 19,200,776
Trainable params: 19,200,776
Non-trainable params: 0
_________________________________________________________________
None


In [26]:
n_epoch = 5
batch_size = 64
model.fit(x_train_seq, y_train, validation_data=(x_test_seq, y_test), epochs=n_epoch, batch_size=batch_size, verbose=1)

Train on 1279999 samples, validate on 320000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7fb8dd984f28>

In [27]:
model.save_weights('myModel.h5')


In [43]:
model.load_weights('myModel.h5')

## Changing the architecture :
Adding 2 more LSTM Layers and changing the optimizer to RMSprop

In [23]:
model2 = Sequential()
model2.add(Embedding(vocabulary, hidden_size, input_length=LEN))
model2.add(LSTM(hidden_size, return_sequences=True))
model2.add(LSTM(hidden_size//2, return_sequences=True))
model2.add(LSTM(1, return_sequences=False))
opt2 = keras.optimizers.RMSprop(lr=0.001, rho=0.9, epsilon=None, decay=0.0)
model2.compile(loss='mean_squared_error', optimizer=opt2, metrics=['accuracy'])

In [24]:
print(model2.summary())

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_2 (Embedding)      (None, 32, 192)           19200000  
_________________________________________________________________
lstm_2 (LSTM)                (None, 32, 192)           295680    
_________________________________________________________________
lstm_3 (LSTM)                (None, 32, 96)            110976    
_________________________________________________________________
lstm_4 (LSTM)                (None, 1)                 392       
Total params: 19,607,048
Trainable params: 19,607,048
Non-trainable params: 0
_________________________________________________________________
None


In [None]:
n_epoch = 5
batch_size = 64
model2.fit(x_train_seq, y_train, validation_data=(x_test_seq, y_test), epochs=n_epoch, batch_size=batch_size, verbose=1)

Train on 1279999 samples, validate on 320000 samples
Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x7f8f3927fc88>

In [None]:
model2.save_weights('myModel2.h5')


In [26]:
model2.load_weights('myModel2.h5')

## Changing the architecture :
changing the learining rate and batch size

In [29]:
model3 = Sequential()
model3.add(Embedding(vocabulary, hidden_size, input_length=LEN))
model3.add(LSTM(hidden_size, return_sequences=True))
model3.add(LSTM(hidden_size//2, return_sequences=True))
model3.add(LSTM(1, return_sequences=False))
opt3 = keras.optimizers.RMSprop(lr=0.01, rho=0.9, epsilon=None, decay=0.0)
model3.compile(loss='mean_squared_error', optimizer=opt2, metrics=['accuracy'])

In [None]:
n_epoch = 2
batch_size = 32
model3.fit(x_train_seq, y_train, validation_data=(x_test_seq, y_test), epochs=2, batch_size=batch_size, verbose=1)

Train on 1279999 samples, validate on 320000 samples
Epoch 1/2
Epoch 2/2


In [None]:
model3.save_weights('myModel3.h5')

In [32]:
model3.load_weights('myModel3.h5')

## Recap

In [53]:
scores1 = model.evaluate(x_test_seq, y_test, verbose=0)
print("Model 1 : %s: %.2f%%" % (model.metrics_names[1], scores1[1]*100))

scores2 = model2.evaluate(x_test_seq, y_test, verbose=0)
print("Model 2 : %s: %.2f%%" % (model2.metrics_names[1], scores2[1]*100))

scores3 = model3.evaluate(x_test_seq, y_test, verbose=0)
print("Model 3 : %s: %.2f%%" % (model3.metrics_names[1], scores3[1]*100))

Model 1 : acc: 80.72%
Model 2 : acc: 83.27%
Model 3 : acc: 83.12%


## Testing the models

In [45]:
x = [" I love cats "]
x2 = tokenizer.texts_to_sequences(x)
x3 = pad_sequences(x2, maxlen=32)

s1 = model.predict(x3, verbose=2)
s2 = model2.predict(x3, verbose=2)
s3 = model3.predict(x3, verbose=2)

In [46]:
print("Model 1 :  %.2f%%" % (s1[0][0]*100))
print("Model 2 :  %.2f%%" % (s2[0][0]*100))
print("Model 3 :  %.2f%%" % (s3[0][0]*100))

Model 1 :  86.85%
Model 2 :  90.81%
Model 3 :  87.70%
