# Sentiment Classification


## Loading the dataset

In [0]:
import numpy as np
np.load.__defaults__=(None, True, True, 'ASCII')

In [2]:

from keras.datasets import imdb

vocab_size = 10000 #vocab size

(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size) # vocab_size is no.of words to consider from the dataset, ordering based on frequency.


Using TensorFlow backend.


Downloading data from https://s3.amazonaws.com/text-datasets/imdb.npz


In [0]:
from keras.preprocessing.sequence import pad_sequences
vocab_size = 10000 #vocab size
maxlen = 20  #number of word used from each review --> making the training faster with 20 words

## Train test split

In [0]:
#load dataset as a list of ints
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=vocab_size)
#make all sequences of the same length
x_train = pad_sequences(x_train, maxlen=maxlen)
x_test =  pad_sequences(x_test, maxlen=maxlen)

In [15]:
#Creating the key value pair for word to id and id to word, so that the test/train data (x) can be viewed meaningfully.
word_to_id = imdb.get_word_index()
word_to_id = {k:v for k,v in word_to_id.items()}

Downloading data from https://s3.amazonaws.com/text-datasets/imdb_word_index.json


In [0]:
id_to_word = {value:key for key,value in word_to_id.items()}

In [60]:
#Check the sample train data
for i in range(0,10):
  print (' '.join(id_to_word[id] for id in x_train[i]) , " ----> ",y_train[i] )

their with her nobody most that with wasn't to with armed acting watch an for with heartfelt film want an  ---->  1
are of ship for with of wild to one is very work dark they don't do dvd with those them  ---->  0
80's was big also light don't and as it in character looked cinematography so stories is far br man acting  ---->  0
br halfway to of took work 20 br similar more he good flower for hit at coming not see reputation  ---->  1
for film's was tale have flash but look part i'm film as to penelope is script hard br only acting  ---->  0
i i slowly lot of above and with connect in of script their that out end his and i i  ---->  0
movies get are and br yes female just its because many br of overly to descent people time very bland  ---->  1
once arts like have then own is ebay has have one is you for off his dutch we they an  ---->  0
that hilarious not was into through to why for as it by br of where suits was one your life  ---->  1
do period it couple in college in viewers get b


## Build Keras Embedding Layer Model
We can think of the Embedding layer as a dicionary that maps a index assigned to a word to a word vector. This layer is very flexible and can be used in a few ways:

* The embedding layer can be used at the start of a larger deep learning model. 
* Also we could load pre-train word embeddings into the embedding layer when we create our model.
* Use the embedding layer to train our own word2vec models.

The keras embedding layer doesn't require us to onehot encode our words, instead we have to give each word a unqiue intger number as an id. For the imdb dataset we've loaded this has already been done, but if this wasn't the case we could use sklearn [LabelEncoder](http://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.LabelEncoder.html).

In [0]:
##Model parameters
max_features = 20000
maxlen = 80
batch_size = 32

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Dense, Embedding
from tensorflow.python.keras.layers import LSTM

#Building Sequential model
model = Sequential()
model.add(Embedding(max_features, 128))
model.add(LSTM(128, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(1, activation='sigmoid'))

In [0]:
model.compile(loss='binary_crossentropy',
              optimizer='adam',
              metrics=['accuracy'])

In [25]:
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=15,
          validation_data=(x_test, y_test))


Epoch 1/15
Epoch 2/15
Epoch 3/15
Epoch 4/15
Epoch 5/15
Epoch 6/15
Epoch 7/15
Epoch 8/15
Epoch 9/15
Epoch 10/15
Epoch 11/15
Epoch 12/15
Epoch 13/15
Epoch 14/15
Epoch 15/15


<tensorflow.python.keras.callbacks.History at 0x7fdadbb0ed30>

In [61]:
score, acc = model.evaluate(x_test, y_test,
                            batch_size=batch_size)
print('Test score:', score)
print('Test accuracy:', acc)

Test score: 1.6945756673812866
Test accuracy: 0.7307199835777283


In [0]:
from sklearn.metrics import confusion_matrix, recall_score, precision_score,accuracy_score, f1_score,roc_auc_score
import pandas as pd

def binary_classification_performance(y_test, y_pred):
    tn, fp, fn, tp = confusion_matrix(y_test, y_pred).ravel()
    accuracy = round(accuracy_score(y_pred = y_pred, y_true = y_test),2)
    precision = round(precision_score(y_pred = y_pred, y_true = y_test),2)
    recall = round(recall_score(y_pred = y_pred, y_true = y_test),2)
    f1_score = round(2*precision*recall/(precision + recall),2)
    specificity = round(tn/(tn+fp),2)
    npv = round(tn/(tn+fn),2)
    auc_roc = round(roc_auc_score(y_score = y_pred, y_true = y_test),2)


    result = pd.DataFrame({'Accuracy' : [accuracy],
                         'Precision or PPV' : [precision],
                         'Recall or senitivity or TPR' : [recall],
                         'f1 score' : [f1_score],
                         'AUC_ROC' : [auc_roc],
                         'Specificty or TNR': [specificity],
                         'NPV' : [npv],
                         'True Positive' : [tp],
                         'True Negative' : [tn],
                         'False Positive':[fp],
                         'False Negative':[fn]})
    return result

In [0]:
y_pred = model.predict(x_test)

In [66]:
y_pred_upd = [0 if val <0.5 else 1 for val in y_pred]

ERROR! Session/line number was not unique in database. History logging moved to new session 66


In [70]:
binary_classification_performance(y_test, y_pred_upd)

Unnamed: 0,Accuracy,Precision or PPV,Recall or senitivity or TPR,f1 score,AUC_ROC,Specificty or TNR,NPV,True Positive,True Negative,False Positive,False Negative
0,0.73,0.72,0.77,0.74,0.73,0.69,0.75,9582,8686,3814,2918


## Retrive the output of each layer in keras for a given single test sample from the trained model you built

In [0]:
from tensorflow.python.keras import backend as K

inp = model.input                                           # input placeholder
outputs = [layer.output for layer in model.layers]          # all layer outputs
functors = [K.function([inp], [out]) for out in outputs]    # evaluation functions



In [52]:
# Testing
layer_outs = [func(x_test) for func in functors]
print (layer_outs[0])

[array([[[ 1.99849814e-01, -2.87888125e-02, -1.28157750e-01, ...,
          1.13628805e-01,  1.60491168e-01,  1.31762773e-01],
        [-5.93710579e-02, -1.11400321e-01, -1.29733920e-01, ...,
         -1.08561909e-03,  3.37580554e-02,  1.01813294e-01],
        [ 1.58350449e-02, -7.62492791e-02, -2.71570459e-02, ...,
          3.66376303e-02,  7.42851570e-02, -3.77052575e-02],
        ...,
        [ 3.96433026e-02,  1.09101467e-01, -4.15450670e-02, ...,
          1.37761965e-01, -3.25513519e-02,  1.14513509e-01],
        [ 3.09907608e-02, -7.10750185e-03, -4.18368615e-02, ...,
          2.18732748e-02, -4.92075905e-02, -1.92552485e-04],
        [-1.20647117e-01, -1.39270589e-01, -5.02942428e-02, ...,
         -1.02081321e-01,  6.09316118e-03, -3.94395553e-04]],

       [[ 5.04734330e-02, -9.61201712e-02, -1.63638815e-02, ...,
         -1.20148470e-03, -3.25324461e-02, -9.80055612e-03],
        [ 5.04734330e-02, -9.61201712e-02, -1.63638815e-02, ...,
         -1.20148470e-03, -3.25324461