# Sentiment Analysis on the rotten tomatoes data set

In [9]:
import pandas as pd
import os
import os.path as path

In [3]:
data = pd.read_csv("mrdata.tsv", delimiter = '\t')
print(data.head())
print(list(data.columns.values))

   PhraseId  SentenceId                                             Phrase  \
0         1           1  A series of escapades demonstrating the adage ...   
1         2           1  A series of escapades demonstrating the adage ...   
2         3           1                                           A series   
3         4           1                                                  A   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  
['PhraseId', 'SentenceId', 'Phrase', 'Sentiment']


## Pre processing the data

In [4]:
data2 = data.apply(lambda x: x.astype(str).str.lower())
data2['PhraseId'] = pd.to_numeric(data2['PhraseId'])
data2['SentenceId'] = pd.to_numeric(data2['SentenceId'])
data2['Sentiment'] = pd.to_numeric(data2['Sentiment'])
print(data2.head())
print(data2.shape)

   PhraseId  SentenceId                                             Phrase  \
0         1           1  a series of escapades demonstrating the adage ...   
1         2           1  a series of escapades demonstrating the adage ...   
2         3           1                                           a series   
3         4           1                                                  a   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  
(156060, 4)


### 1.2 removing Special Characters

In [5]:
#data2['Phrase'] = data2['Phrase'].str.replace('\W', '')
print(data2.head())
print(type(data2['Phrase']))
print(data2.iloc[0,2])

   PhraseId  SentenceId                                             Phrase  \
0         1           1  a series of escapades demonstrating the adage ...   
1         2           1  a series of escapades demonstrating the adage ...   
2         3           1                                           a series   
3         4           1                                                  a   
4         5           1                                             series   

   Sentiment  
0          1  
1          2  
2          2  
3          2  
4          2  
<class 'pandas.core.series.Series'>
a series of escapades demonstrating the adage that what is good for the goose is also good for the gander , some of which occasionally amuses but none of which amounts to much of a story .


###

In [6]:
labels = []
texts = []
#print(len(data2.index))
for i in range(len(data2.index)):
    texts.append(data2.iloc[i,2])
    labels.append(data2.iloc[i,3])

### 1.4 Tokenizing the data

In [7]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
import numpy as np

maxlen = 100
max_words = 15288

tokenizer = Tokenizer(num_words=max_words)
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)
word_index = tokenizer.word_index
print('Found %s unique tokens.' % len(word_index))
data = pad_sequences(sequences, maxlen=maxlen)
labels = np.asarray(labels)
print('Shape of data :', data.shape)
print('Shape of label:', labels.shape)

Using TensorFlow backend.


Found 15288 unique tokens.
Shape of data : (156060, 100)
Shape of label: (156060,)


### 1.5 Parsing the GloVe word-embeddings file

For embedding the data set glove dataset with 600 dimensional is used

In [25]:
glove_dir = 'C:/Users/Navaneeth/Documents/Deep Learning/Assignments/Question3/IMDB'
embeddings_index = {}
f = open(os.path.join(glove_dir, 'glove.6B.100d.txt'), encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


### 1.6 Preparing the GloVe word-embeddings matrix

In [27]:
embedding_dim = 100
embedding_matrix = np.zeros((max_words, embedding_dim))
Accuracy = 49.32456
for word, i in word_index.items():
    if i < max_words:
        embedding_vector = embeddings_index.get(word)
        if embedding_vector is not None:
            embedding_matrix[i] = embedding_vector
print(embedding_matrix.shape)

(15288, 100)


In [12]:
print(data[2])
print(sequences[2])
print(data2.iloc[2, 2])
x = np.random.random((2, 4))
y = np.random.random((4, 2))
print(np.dot(x, y))

[  0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
   0   0   0   0   0   0   0   0   2 323]
[2, 323]
a series
[[0.96875309 1.50331128]
 [0.75704193 1.52072373]]


## 2 Modeling the data

In [14]:
def sigmoid(x):  
    return 1/(1+np.exp(-x))

def sigmoid_der(x):  
    return sigmoid(x) *(1-sigmoid (x))

def softmax(A):  
    expA = np.exp(A)
    return expA / expA.sum(axis=0, keepdims=True)

In [15]:
one_hot_labels = np.zeros((data.shape[0], 5))

for i in range(data.shape[0]):  
    one_hot_labels[i, labels[i]] = 1

In [23]:
length = data.shape[0]
tra_len = int(100)
train = data[:tra_len]
test = data[int(0.95*tra_len):]
train_label = labels[:tra_len]
test_label = labels[int(0.95*tra_len):]

In [19]:
epho = 1
word_dim = 100
output_dim = 10

      #--------------------------------initiallising params of LSTM layer--------------------#

#----------initiallising params for input_activation---------------
W_a = np.random.random((output_dim, word_dim))
U_a = np.random.random((output_dim,output_dim))
b_a = np.random.random((output_dim,))

#----------intilallising params for input_gate---------------------
W_i = np.random.random((output_dim, word_dim))
U_i = np.random.random((output_dim,output_dim))
b_i = np.random.random((output_dim,))

#----------intiallising params for forget_gate---------------------
W_f = np.random.random((output_dim, word_dim))
U_f = np.random.random((output_dim,output_dim))
b_f = np.random.random((output_dim,))

#----------intiallising params for output_gate---------------------
W_o = np.random.random((output_dim, word_dim))
U_o = np.random.random((output_dim,output_dim))
b_o = np.random.random((output_dim,))

      #---------------------------------intiallising params of classification layer------------#

#-------initiallising params for input to hidden--------
hidden_units = 5
W_hi = np.random.random((hidden_units, output_dim)) #'i' indicates input,  'h' hidden units
b_h = np.random.random((hidden_units,))

#-------initiallising params for hidden to classification------
classification_units = 5
W_ch = np.random.random((classification_units, hidden_units)) # 'c' indicates output, 'h' indicates hidden units
b_c = np.random.random((classification_units,))

learning_rate = 0.1
Z_h = []
a_h = []
Z_c = []
a_c = []
feature = []
for i in range(epho):
    Xt = []
    At = []
    It = []
    Ft = []
    Ot = []
    State =[]
    Output =[]
    #feed_forward_propagation
    for j in range(train.shape[0]):
        input = data[j] # The sequence of tokenized input
        #------intiallising initial internal state and output-----------
        state_t = np.zeros((output_dim,))
        output_t = np.zeros((output_dim,))
        Xt.append(j)
        At.append(j)
        It.append(j)
        Ft.append(j)
        Ot.append(j)
        State.append(j)
        Output.append(j)
        
        Xt[j] = []
        At[j] = []
        It[j] = []
        Ft[j] = []
        Ot[j] = []
        State[j] = []
        Output[j] = []
        
        for token_index in input:
            x_t = embedding_matrix[token_index]
            
            a_t = np.tanh(np.dot(W_a, x_t) + np.dot(U_a, output_t) + b_a) # Input activation
            i_t = sigmoid(np.dot(W_i, x_t) + np.dot(U_i, output_t) + b_i) # Input gate
            f_t = sigmoid(np.dot(W_f, x_t) + np.dot(U_f, output_t) + b_f) # forget gate
            o_t = sigmoid(np.dot(W_o, x_t) + np.dot(U_o, output_t) + b_o) # output_gate
            
            Xt[j].append(x_t)
            At[j].append(a_t)
            It[j].append(i_t)
            Ft[j].append(f_t)
            Ot[j].append(o_t)
            State[j].append(state_t)
            Output[j].append(output_t)

            state_t = np.multiply(a_t, i_t) + np.multiply(f_t, state_t) # internal state
            output_t = np.multiply(np.tanh(state_t), o_t) # internal output
            
            
        feature.append(output_t)
        Z_h.append(np.dot(W_hi, output_t) + b_h)
        a_h.append(sigmoid(np.dot(W_hi, output_t) + b_h))
        
        Z_c.append(np.dot(W_ch, sigmoid(np.dot(W_hi, output_t) + b_h)) + b_c)
        a_c.append(softmax(np.dot(W_ch, sigmoid(np.dot(W_hi, output_t) + b_h)) + b_c))
        
    #----------------------- backward_propagation----------------------------
    dcost_dwc = a_c - one_hot_labels[:tra_len] # n*5
    dzc_dwc = a_h # n*h
    dcost_wc = np.dot(dcost_dwc.T, dzc_dwc)
    dcost_bc = dcost_dwc #n*5
    
    dzc_dah = W_ch # 5*h
    dcost_dah = np.dot(dcost_dwc, dzc_dah) # n*h
    
    dah_dzh = np.zeros((train.shape[0], hidden_units))
    for j in range(train.shape[0]):
        dah_dzh[j] = sigmoid_der(Z_h[j]) # n*h
    
    dzh_dwh = feature # n*10
    dcost_wh = np.dot( (dah_dzh * dcost_dah).T , dzh_dwh,) #h*10
    dcost_bh = dcost_dah * dah_dzh #n*h
    
    dzh_df = W_hi # h*10
    dcost_df = np.dot(dah_dzh * dcost_dah, dzh_df) #n*10
    
    #updating classification weights
    W_hi -= learning_rate * dcost_wh
    b_h -= learning_rate * dcost_bh.sum(axis=0)

    W_ch -= learning_rate * dcost_wc
    b_c -= learning_rate * dcost_bc.sum(axis=0)
    
    #-----------back_propagation on LSTM layer-----------------
    delta_Wa = np.zeros((output_dim, word_dim))
    delta_Wi = np.zeros((output_dim, word_dim))
    delta_Wf = np.zeros((output_dim, word_dim))
    delta_Wo = np.zeros((output_dim, word_dim))
    
    delta_Ua = np.zeros((output_dim, output_dim))
    delta_Ui = np.zeros((output_dim, output_dim))
    delta_Uf = np.zeros((output_dim, output_dim))
    delta_Uo = np.zeros((output_dim, output_dim))
    
    delta_ba = np.zeros((output_dim, ))
    delta_bi = np.zeros((output_dim, ))
    delta_bf = np.zeros((output_dim, ))
    delta_bo = np.zeros((output_dim, ))
    
    Delta_t = learning_rate * dcost_df
    
    for j in range(train.shape[0]):
        
        Delta_out = np.zeros((output_dim, ))
        delta_state = np.zeros((output_dim, ))

        for t in reversed(range(data.shape[1])):
            delta_out = Delta_t[j] + Delta_out
            delta_state = delta_out * Ot[j][t] * 1 - (np.tanh(State[j][t])**2)  + delta_state * Ft[j][t]
            delta_at = delta_state * It[j][t] * (1- (At[j][t]**2))
            #print(delta_at.shape)
            delta_it = delta_state * At[j][t] * It[j][t] * (1- (It[j][t]**2))
            #print(delta_it.shape)
            delta_ft = delta_state * State[j][t-1] * Ft[j][t] * (1- Ft[j][t])
            #print(delta_ft.shape)
            delta_ot = delta_out * np.tanh(State[j][t]) * Ot[j][t] * (1- Ot[j][t])
            #print(delta_ot.shape)
            delta_gate = []
            delta_gate.append(delta_at)
            delta_gate.append(delta_it)
            delta_gate.append(delta_ft)
            delta_gate.append(delta_ot)
            #delta_gate = np.asarray(delta_gate)
            #print(U_a.shape)
            Delta_out += np.dot(U_a, delta_at)
            Delta_out += np.dot(U_i,  delta_it)
            Delta_out += np.dot(U_f, delta_ft)
            Delta_out += np.dot(U_o, delta_ot)

            delta_Wa += np.outer(delta_at, Xt[j][t])
            delta_Wi += np.outer(delta_it, Xt[j][t])
            delta_Wf += np.outer(delta_ft, Xt[j][t])
            delta_Wo += np.outer(delta_ot, Xt[j][t])

            delta_Ua += np.outer(delta_at, Output[j][t])
            delta_Ui += np.outer(delta_it, Output[j][t])
            delta_Uf += np.outer(delta_ft, Output[j][t])
            delta_Uo += np.outer(delta_ot, Output[j][t])

            delta_ba += delta_at
            delta_bi += delta_it
            delta_bf += delta_ft
            delta_bo += delta_ot
            
    W_a -= learning_rate*delta_Wa
    W_i -= learning_rate*delta_Wi
    W_f -= learning_rate*delta_Wf
    W_o -= learning_rate*delta_Wo
    
    U_a -= learning_rate*delta_Ua
    U_i -= learning_rate*delta_Ui
    U_f -= learning_rate*delta_Uf
    U_o -= learning_rate*delta_Uo
    
    b_a -= learning_rate*delta_ba
    b_i -= learning_rate*delta_bi
    b_f -= learning_rate*delta_bf
    b_o -= learning_rate*delta_bo
    

## predicting the labels of test data


In [24]:
label_pred = []
for j in range(test.shape[0]):
    input = test[j] # The sequence of tokenized input
    #------initiallising initial internal state and output-----------
    state_t = np.zeros((output_dim,))
    output_t = np.zeros((output_dim,))
    Xt.append(j)
    At.append(j)
    It.append(j)
    Ft.append(j)
    Ot.append(j)
    State.append(j)
    Output.append(j)

    Xt[j] = []
    At[j] = []
    It[j] = []
    Ft[j] = []
    Ot[j] = []
    State[j] = []
    Output[j] = []

    for token_index in input:
        x_t = embedding_matrix[token_index]

        a_t = np.tanh(np.dot(W_a, x_t) + np.dot(U_a, output_t) + b_a) # Input activation
        i_t = sigmoid(np.dot(W_i, x_t) + np.dot(U_i, output_t) + b_i) # Input gate
        f_t = sigmoid(np.dot(W_f, x_t) + np.dot(U_f, output_t) + b_f) # forget gate
        o_t = sigmoid(np.dot(W_o, x_t) + np.dot(U_o, output_t) + b_o) # output_gate
        
        state_t = np.multiply(a_t, i_t) + np.multiply(f_t, state_t) # internal state
        output_t = np.multiply(np.tanh(state_t), o_t) # internal output


    #feature.append(output_t)
    Z_h = np.dot(W_hi, output_t) + b_h
    a_h = sigmoid(np.dot(W_hi, output_t) + b_h)

    Z_c = np.dot(W_ch, sigmoid(np.dot(W_hi, output_t) + b_h)) + b_c
    a_c = softmax(np.dot(W_ch, sigmoid(np.dot(W_hi, output_t) + b_h)) + b_c)
    #print(a_c.shape)
    a_c = np.asarray(a_c)
    label_pred.append(np.argmax(a_c))
       

### 3.1 test_accuracy

In [28]:
accuracy = np.sum(test_label == label_pred) / test.shape[0]
print(accuracy*100)

50.977462892315586
