# Sentiment Analysis for Text User Reviews
### Predicting the user ratings based on the unstructured text data

In [67]:
import os
import sys
import numpy as np
import pandas as pd
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.utils import to_categorical
from keras.layers import Dense, Input, GlobalMaxPooling1D, Dropout, LSTM, Activation, GRU
from keras.layers import Conv1D, MaxPooling1D, Embedding, Flatten
from keras.models import Model
from keras.utils import plot_model
import pydot
from keras.models import model_from_json

### Reading the input dataset
* The input dataset contains two fields
  * Reviews: The user entered mobile device reviews
  * Rating: The user assigned rating to the mobile device 1 to 5 (Transformed to 0 to 4 by subtracting 1)
    

In [5]:
df = pd.read_csv('Mobile Review Rating.csv')
df.head()

Unnamed: 0,Rating,Reviews
0,5,I feel so LUCKY to have found this used (phone...
1,4,"nice phone, nice up grade from my pantach revu..."
2,5,Very pleased
3,4,It works good but it goes slow sometimes but i...
4,4,Great phone to replace my lost phone. The only...


### Processing Input Dataset

In [6]:
print("Shape of the Original Input Dataframe: " + str(df.shape))

# To speed up processing we will only use first 5000 records
df = df.head(5000)

print("Shape of the truncated Input Dataframe: " + str(df.shape))

Shape of the Original Input Dataframe: (413840, 2)
Shape of the truncated Input Dataframe: (5000, 2)


In [7]:
texts = []  # list of text samples
labels = [] # list of label ids

for i in range(df.shape[0]):
    texts.append(str(df['Reviews'][i]))
    labels.append(df['Rating'][i] - 1)
print ('Found %s texts. ' % len(texts))

#Or texts = list(map(str, texts))

Found 5000 texts. 


## Indexing Word Vectors
* Using the GloVe 100 encoding

### Additional Information Embedding Layer 
* The Embedding layer is defined as the first hidden layer of a network. It must specify 3 arguments:
  * Input_dim: This is the size of the vocabulary in the text data. In our case this number is 10K so the size of the vocabulary would be 10K + 1 words.
  * output_dim: This is the size of the vector space in which words will be embedded. In our case this is 100 as we are using GloVe 100.
  * input_length: This is the length of input sequences, in our case this value is 300.

In [8]:
f = open("glove.6B.100d.txt", 'r', encoding="utf8")
embeddings_index = {}
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:], dtype='float32')
    embeddings_index[word] = coefs
f.close()
print("Found %s word vectors." % len(embeddings_index))

Found 400000 word vectors.


## Keras Preprocessing
* Tokenization
* Text cleaning
* Integer encoding of the text data and word index creation to be later utilized in the embedding 
* The maximum sequence length (maxlen) is taken as 300
* The vocab size is fixed to a vocab of 10000 words

In [9]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

#Functions provided by the Tokenizer class
# [tokenizer.word_counts]: Give back the word count for each unique word
# [tokenizer.document_count]: Give back the number of documents (in our case number of elements in the list)
# [tokenizer.word_index]: A dictionary of words and their uniquely assigned integers
# [tokenizer.word_docs]: A dictionary of words and how many documents each appeared in
# num_words will not truncate the words found in the input but it will truncate the usage. 
# num_words is respected in the texts_to_sequences method which turns input into numerical arrays. (During dictionary creation
# the tokenizer creates the complete dictionary. So in a way this is your vocab limit while converting text to sequences and 
# encoding sentences.)

tokenizer = Tokenizer(num_words=10000) # Vocab size 10000
tokenizer.fit_on_texts(texts)
sequences = tokenizer.texts_to_sequences(texts)

word_index = tokenizer.word_index

print('Found %s unique tokens.' % len(word_index))

data = pad_sequences(sequences, maxlen=300) # Maximum sequence length

print ('Shape of data tensor: ', data.shape)

labels = np.asarray(labels)
#np.utils.to_categorical is used to convert array of labeled data(from 0 to nb_classes-1) to one-hot vector.
labels = to_categorical(np.asarray(labels))
print ('Shape of labels tensor: ', labels.shape) #Here in our case ar the ratings are from 1-5 so we are getting one hot encoding 0-5 i.e. 6 classes

#Another method for one hot encoding
#labels = np.eye(5)[labels.reshape(-1)]

Found 9166 unique tokens.
Shape of data tensor:  (5000, 300)
Shape of labels tensor:  (5000, 5)


### Splitting data into training and validation set
* We will use 95% data for training the model
* 5% data will be used as validation set

In [19]:
#Splitting data into training and validation set
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
labels = labels[indices]
df['Reviews'] = df['Reviews'][indices]
nb_validation_samples = int(0.05 * data.shape[0]) #10% Test dataset

x_train = data[:-nb_validation_samples]
y_train = labels[:-nb_validation_samples]

x_val = data[-nb_validation_samples:]
y_val = labels[-nb_validation_samples:]

In [20]:
print ("Shape of training dataset: " + str(x_train.shape))
print ("Shape of test dataset: " + str(x_val.shape))

Shape of training dataset: (4750, 300)
Shape of test dataset: (250, 300)


### Preparing the Embedding Layer
* We will use embedding_index dictionary and word_index to compute embedding matrix

In [21]:
embedding_matrix = np.zeros((len(word_index) + 1, 100)) # 100 is the embedding dimension as we are using Globe 100
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        # Words not found in embedding index will be all zeros
        embedding_matrix[i] = embedding_vector
        
print("Shape of Embedding Matrix :"  + str(embedding_matrix.shape))

Shape of Embedding Matrix :(9167, 100)


### Loading the Embedding Matrix into an Embedding Layer

In [22]:
from keras.layers import Embedding

embedding_layer = Embedding(len(word_index) + 1, #Adding 1 to fit Keras embedding requirements to accomodate unknown word
                           100, #  100 is the embedding dimension as we are using Globe 100
                           weights=[embedding_matrix],
                           input_length=300, # 300 is the maximum sequence length used in the padding function
                           trainable=False) # Trainable false to prevent the weights from being updated during training

In [17]:
#print("Weights[0][1][1]: ", + str(embedding_layer.get_weights()))

* Functions of Embedding Layer
  * Embedding layer is fed sequences of integers i.e. a 2D input of shape (samples, indices)
  * These input sequences ideally should have the same lenght
  * Embedding layes maps the integer inputs to the vectors found at the corresponding index in the embedding matrix
  * Ex: sequence [1,2] will convert to [embeddings[1], embeddings[2]]
  * The output of the embedding layer will be a 3D tensor of shape (batch size, max input length, dimensions of word vector)
  * In our case (batch size, 300, 100)

### Building the Conv Net

In [40]:
sequence_input = Input(shape=(300,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
X = Conv1D(128, 5, activation='relu')(embedded_sequences)
X = MaxPooling1D(5)(X)
X = Conv1D(128, 5, activation='relu')(X)
X = GlobalMaxPooling1D()(X)
X = Dense(5, activation='relu')(X)
X = Dense(5, activation='softmax')(X)
model = Model(inputs=sequence_input, outputs=X)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_8 (InputLayer)         (None, 300)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 100)          916700    
_________________________________________________________________
conv1d_15 (Conv1D)           (None, 296, 128)          64128     
_________________________________________________________________
max_pooling1d_8 (MaxPooling1 (None, 59, 128)           0         
_________________________________________________________________
conv1d_16 (Conv1D)           (None, 55, 128)           82048     
_________________________________________________________________
global_max_pooling1d_8 (Glob (None, 128)               0         
_________________________________________________________________
dense_15 (Dense)             (None, 5)                 645       
__________

In [41]:
model.compile(loss='categorical_crossentropy',
             optimizer='adam',
             metrics=['accuracy'])

In [43]:
model.fit(x_train, y_train, 
          validation_data=(x_val, y_val),
          epochs=25,
          batch_size=64,
          shuffle=True)

Train on 4750 samples, validate on 250 samples
Epoch 1/25
Epoch 2/25
Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25
Epoch 24/25
Epoch 25/25


<keras.callbacks.History at 0x2bf275d50b8>

### Identify Mislabelled Examples

In [48]:
prediction_val = model.predict(x_val[:51])
for i in range(10):
    num = np.argmax(prediction_val[i])
    if (num != np.argmax(y_val[i])):
        print ("Wrong Prediction for: " + df['Reviews'][i])
        print ("Predicted Rating " + str(num))

Wrong Prediction for: Great phone to replace my lost phone. The only thing is the volume up button does not work, but I can still go into settings to adjust. Other than that, it does the job until I am eligible to upgrade my phone again.Thaanks!
Predicted Rating 2
Wrong Prediction for: The charging port was loose. I got that soldered in. Then needed a new battery as well. $100 later (not including cost of purchase) I have a usable phone. The phone should not have been sold in the state it was in.
Predicted Rating 4
Wrong Prediction for: Phone looks good but wouldn't stay charged, had to buy new battery. Still couldn't stay charged long.so I trashed it.MONEY lost, never again will I buy from this person! !!!
Predicted Rating 3


### Testing on Own Input

In [50]:
x_test = np.array(['Stupid waste crap not'])
m = x_test.shape[0]  # Number of example to test
#Initialize matrix to hold embedding values
x_test_indices = np.zeros((m, 300)) #300 is the maximum length
for i in range(m): 
    sentence_words = x_test[i].lower().split()
    j = 0
    # Loop over the words of sentence_words
    for w in sentence_words:
    # Set the (i,j)th entry of X_indices to the index of the correct word.
        x_test_indices[i, j] = word_index[w]
        j = j + 1
print(x_test[0] + ' Predicted Rating: ' + str(np.argmax(model.predict(x_test_indices))))

Stupid waste crap not Predicted Rating: 0


### Building the LSTM RNN

In [57]:
sequence_input = Input(shape=(300,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
X = LSTM(units=128, return_sequences=True)(embedded_sequences)
X = Dropout(0.5)(X)
X = LSTM(units=128, return_sequences=False)(X)
X = Dropout(0.5)(X)
X = Dense(5)(X)
X = Activation('softmax')(X)
model = Model(inputs=sequence_input, outputs=X)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_10 (InputLayer)        (None, 300)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 100)          916700    
_________________________________________________________________
lstm_3 (LSTM)                (None, 300, 128)          117248    
_________________________________________________________________
dropout_3 (Dropout)          (None, 300, 128)          0         
_________________________________________________________________
lstm_4 (LSTM)                (None, 128)               131584    
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_18 (Dense)             (None, 5)                 645       
__________

In [58]:
#loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [63]:
model.fit(x_train, y_train, 
                 epochs=5, 
                 batch_size=64, 
                 shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2bf2edd72e8>

In [65]:
loss, acc = model.evaluate(x_val, y_val)
print("Test Accuracy with LSTM: " + str(acc))

Test Accuracy with LSTM: 0.667999999046


In [66]:
x_test = np.array(['never going to buy this phone again'])
m = x_test.shape[0]  # Number of example to test
#Initialize matrix to hold embedding values
x_test_indices = np.zeros((m, 300)) #300 is the maximum length
for i in range(m): 
    sentence_words = x_test[i].lower().split()
    j = 0
    # Loop over the words of sentence_words
    for w in sentence_words:
    # Set the (i,j)th entry of X_indices to the index of the correct word.
        x_test_indices[i, j] = word_index[w]
        j = j + 1
print(x_test[0] + ' Predicted Rating:' + str(np.argmax(model.predict(x_test_indices))))

never going to buy this phone again Predicted Rating:4


### Building a GRU RNN

In [68]:
sequence_input = Input(shape=(300,), dtype='int32')
embedded_sequences = embedding_layer(sequence_input)
X = GRU(units=128, activation = 'relu', return_sequences=True)(embedded_sequences)
X = Dropout(0.5)(X)
X = GRU(units=128, activation = 'relu', return_sequences=False)(X)
X = Dropout(0.5)(X)
X = Dense(5)(X)
X = Activation('softmax')(X)
model = Model(inputs=sequence_input, outputs=X)
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_11 (InputLayer)        (None, 300)               0         
_________________________________________________________________
embedding_2 (Embedding)      (None, 300, 100)          916700    
_________________________________________________________________
gru_1 (GRU)                  (None, 300, 128)          87936     
_________________________________________________________________
dropout_5 (Dropout)          (None, 300, 128)          0         
_________________________________________________________________
gru_2 (GRU)                  (None, 128)               98688     
_________________________________________________________________
dropout_6 (Dropout)          (None, 128)               0         
_________________________________________________________________
dense_19 (Dense)             (None, 5)                 645       
__________

In [69]:
#loaded_model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])

In [70]:
model.fit(x_train, y_train, 
                 epochs=5, 
                 batch_size=32, 
                 shuffle=True)

Epoch 1/5
Epoch 2/5
Epoch 3/5
Epoch 4/5
Epoch 5/5


<keras.callbacks.History at 0x2bf4ac15160>

In [71]:
loss, acc = model.evaluate(x_val, y_val)
print("Test Accuracy with LSTM: " + str(acc))

Test Accuracy with LSTM: 0.675999999046


In [72]:
x_test = np.array(['never going to buy this phone again'])
m = x_test.shape[0]  # Number of example to test
#Initialize matrix to hold embedding values
x_test_indices = np.zeros((m, 300)) #300 is the maximum length
for i in range(m): 
    sentence_words = x_test[i].lower().split()
    j = 0
    # Loop over the words of sentence_words
    for w in sentence_words:
    # Set the (i,j)th entry of X_indices to the index of the correct word.
        x_test_indices[i, j] = word_index[w]
        j = j + 1
print(x_test[0] + ' Predicted Rating:' + str(np.argmax(model.predict(x_test_indices))))

never going to buy this phone again Predicted Rating:4


#### Storing the Model to the Disk

In [None]:
# serialize model to JSON
model_json = model.to_json()
with open("model.json", "w") as json_file:
    json_file.write(model_json)
# serialize weights to HDF5
model.save_weights("model.h5")
print("Saved model to disk")

#### Loading the Model from the Disk

In [53]:
# load json and create model
json_file = open('model.json', 'r')
loaded_model_json = json_file.read()
json_file.close()
loaded_model = model_from_json(loaded_model_json)
# load weights into new model
loaded_model.load_weights("model.h5")
print("Loaded model from disk")
 
# evaluate loaded model on test data
#loaded_model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
#score = loaded_model.evaluate(X, Y, verbose=0)

Loaded model from disk
