Basic tutorial to understand some aspects of Sentiment Analysis

In [1]:
#
# Adapted from 
# https://www.kaggle.com/ngyptr/lstm-sentiment-analysis-keras
#
import pandas as pd 
import numpy as np
import keras

from sklearn.feature_extraction.text import CountVectorizer
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.models import Sequential
from keras.layers import Dense, Embedding, LSTM, SpatialDropout1D
from sklearn.model_selection import train_test_split
from keras.utils.np_utils import to_categorical
import re

data = pd.read_csv("Sentiment.csv")
data = data[['text', 'sentiment']]
data = data[data.sentiment != 'Neutral']

Using TensorFlow backend.


In [2]:
data.text = data.text.apply(lambda x: x.lower())
data.text = data.text.apply(lambda x: re.sub('[^a-zA-z0-9\s]','',x))

for idx, row in data.iterrows():
    row[0] = row[0].replace('rt', ' ')

In [3]:
data.text.head()

1      scottwalker didnt catch the full gopdebate l...
3      robgeorge that carly fiorina is trending  ho...
4      danscavino gopdebate w realdonaldtrump deliv...
5      gregabbott_tx tedcruz on my first day i will...
6      warriorwoman91 i liked her and was happy whe...
Name: text, dtype: object

Use a tokenizer to transform word into integer

In [8]:
from nltk.corpus import stopwords
stopWords = set(stopwords.words('english'))

In [9]:
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()

In [10]:
def CleanUp(text):
    tmp = text.strip().split(' ')
    res = []
    for i in range(len(tmp)):
        if tmp != "" and tmp != " ":
            tmp_2 = lmtzr.lemmatize(tmp[i])
            if tmp_2 not in stopWords:
                res.append(tmp_2)
    return res

data.text = data.text.apply(CleanUp)

Tokenizer to turn word into integer

In [11]:
tokenizer = Tokenizer(num_words = max_features, split = " ")
tokenizer.fit_on_texts(data['text'].values)
X = tokenizer.texts_to_sequences(data['text'].values)

In [12]:
X = pad_sequences(X)
print(X.shape)
X[:5,]

(10729, 24)


array([[   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
         271,   61, 1904,  604,    1,   15,   14,  207,  114,  377, 1087,
         687,  624],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,    0,    0,  227,  179,    2,  439,    8,    2,   96,    1,
          43,  605],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0, 1088,    1,  218,    9, 1712, 1440,  133,  584,  111,    8,
         282,  585],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,   65,  175,  265,  211,  336, 1089, 1338,  982, 1713,  126,
           1,   22],
       [   0,    0,    0,    0,    0,    0,    0,    0,    0,    0,    0,
           0,  983,    7, 1165,  502,    7,  115,  127,  688,    1,   18,
           2,  149]])

In [13]:
keyList = list(tokenizer.word_counts.keys())
for i  in range(10):
    print(keyList[i], tokenizer.word_counts[keyList[i]])
### Should have used some kind of lemmatization and stop words

scottwalker 66
didnt 229
catch 6
full 27
gopdebate 6546
last 611
night 634
scott 88
best 145
line 47


In [14]:
print(tokenizer.document_count, X.shape)
print(tokenizer.num_words)

10729 (10729, 24)
2000


In [15]:
embed_dim = 128
lstm_out = 196

model = Sequential()
model.add(Embedding(max_features, embed_dim, input_length = X.shape[1]))
model.add(SpatialDropout1D(0.4))

model.add(LSTM(lstm_out, dropout = 0.2, recurrent_dropout=0.2))
model.add(Dense(2, activation = 'softmax'))
model.compile(loss = 'categorical_crossentropy', optimizer='adam',metrics = ['accuracy'])
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 24, 128)           256000    
_________________________________________________________________
spatial_dropout1d_1 (Spatial (None, 24, 128)           0         
_________________________________________________________________
lstm_1 (LSTM)                (None, 196)               254800    
_________________________________________________________________
dense_1 (Dense)              (None, 2)                 394       
Total params: 511,194
Trainable params: 511,194
Non-trainable params: 0
_________________________________________________________________


Embedding has 24 \* 128 \* 2000 (vocab size) number of parameters. It output 24 (max-size) * 128 (embedding dimensions) <br>
ltsm_1 has 196 cells, for a total of 254800 parameters => each cell has 1300 parameters.

Let's examine the LSTM layers:

In [16]:
lstm = model.layers[2]
print(lstm.input_shape)
print(lstm.output_shape)

(None, 24, 128)
(None, 196)


Each LSTM outputs only 1 value. 

In [17]:
Y = pd.get_dummies(data['sentiment']).values
X_train, X_test, Y_train, Y_test = train_test_split(X,Y, test_size = 0.33, random_state = 42)
print(X_train.shape,Y_train.shape, Y_train[:, 1].sum())
print(X_test.shape,Y_test.shape, Y_test[:,1].sum())

(7188, 24) (7188, 2) 1516
(3541, 24) (3541, 2) 720


In [18]:
batch_size = 32
model.fit(X_train, Y_train, epochs = 7, batch_size=batch_size, verbose = 2)

Epoch 1/7
 - 31s - loss: 0.4464 - acc: 0.8122
Epoch 2/7
 - 19s - loss: 0.3225 - acc: 0.8634
Epoch 3/7
 - 19s - loss: 0.2818 - acc: 0.8826
Epoch 4/7
 - 19s - loss: 0.2522 - acc: 0.8941
Epoch 5/7
 - 17s - loss: 0.2282 - acc: 0.9062
Epoch 6/7
 - 12s - loss: 0.2083 - acc: 0.9121
Epoch 7/7
 - 11s - loss: 0.1864 - acc: 0.9229


<keras.callbacks.History at 0xcd55a20>

In [19]:
from sklearn import metrics
Y_train_score = model.predict_proba(X_train)
print(metrics.roc_auc_score(Y_train[:, 1], Y_train_score[:, 1]))
    
Y_test_score = model.predict_proba(X_test)
print(metrics.roc_auc_score(Y_test[:, 1], Y_test_score[:, 1]))

0.982133163045
0.845789761314


In [20]:
X_train[0]

array([  0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,   0,
        32,  37,  99, 673, 650, 630, 155, 288, 304, 111, 678])

Let's examine the output of the embedding layer (layer 0)

In [25]:
from keras import backend as K

get_1st_output = K.function([model.layers[0].input], [model.layers[1].output])

In [45]:
test = get_1st_output([X_train[0].reshape(1,-1)])
print(np.array(test).shape)
print(test[0][0])

(1, 1, 24, 128)
[[  2.64670756e-02   2.49574725e-02  -4.84679490e-02 ...,   2.40733866e-02
   -6.80007041e-03   1.60450228e-02]
 [  2.64670756e-02   2.49574725e-02  -4.84679490e-02 ...,   2.40733866e-02
   -6.80007041e-03   1.60450228e-02]
 [  2.64670756e-02   2.49574725e-02  -4.84679490e-02 ...,   2.40733866e-02
   -6.80007041e-03   1.60450228e-02]
 ..., 
 [  1.03278260e-04   2.66935006e-02  -1.17941119e-01 ...,  -8.81516933e-02
   -2.38156486e-02  -8.97239055e-03]
 [  1.01476498e-02  -1.47995986e-02   5.72442962e-03 ...,   2.35218704e-02
   -3.68115567e-02   4.38964628e-02]
 [  7.68071339e-02   4.76800241e-02  -4.70870174e-02 ...,  -6.06759861e-02
   -6.72121868e-02   6.10079952e-02]]


The embedding layers turn the input of 24 values into output of shape 24x128. Notice that the first few rows of the word vectors are the same, because they correspond to the same token (" ").