### Sentiment analysis of movie reviews with a neural network

This notebook builds a neural network that takes movie reviews as input and classifies the sentiment of each review as positive or negative.

In [1]:
# imports
from collections import Counter
import numpy as np
import time
import sys

In [2]:
### Step 1 - Reading in the data

# Read one review per line. Only the line terminator is stripped (instead of
# blindly dropping the last character), so a final line that has no trailing
# newline keeps all of its text.
reviews = list()
with open('reviews.txt', 'r') as f:
    for line in f:
        reviews.append(line.rstrip('\n'))

# Read the labels the same way, upper-cased so that they compare equal to the
# 'POSITIVE' literal used further down in the notebook.
labels = list()
with open('labels.txt', 'r') as f1:
    for label in f1:
        labels.append(label.rstrip('\n').upper())
print(len(reviews))
# We have 25,000 reviews and their labels


25000


In [3]:
#### Step2 - Calculate word counts and ratios
# Three tallies: every word overall, plus one tally per sentiment
total_counts = Counter()
positive_counts = Counter()
negative_counts = Counter()

for idx, review in enumerate(reviews):
    # Pick the per-sentiment tally once per review; any label other than
    # 'POSITIVE' is counted as negative
    if labels[idx] == 'POSITIVE':
        sentiment_counts = positive_counts
    else:
        sentiment_counts = negative_counts
    for token in review.split(" "):
        sentiment_counts[token] += 1
        total_counts[token] += 1

## To implement later -  ratio part 

In [4]:
## Implement methods to update input layer

## Vocabulary: the distinct words seen while counting
vocab = set(total_counts)
vocab_size = len(vocab)
print("There are "+str(vocab_size) + " words in the vocabulary")

# layer_0 is the input layer: a single row with one slot per vocabulary word,
# giving every review a fixed-length encoding to feed into the network
layer_0 = np.zeros((1, vocab_size))

## Lookup from each word to its slot in layer_0
word2index = {token: position for position, token in enumerate(vocab)}

There are 74074 words in the vocabulary


In [5]:
# Supporting methods
def get_target_for_label(label):
    """Return the numeric training target for a label string.

    'POSITIVE' maps to 1; every other value maps to 0.
    """
    if label == 'POSITIVE':
        return 1
    return 0

def update_input_layer(review):
    """Encode `review` into the global `layer_0` as a 0/1 word-presence row.

    The review is split on single spaces and each word's slot (looked up in
    the global `word2index`) is set to 1. A word that is missing from
    `word2index` raises KeyError.
    """
    global layer_0
    # Zero the existing array in place so earlier reviews leave no trace
    layer_0 *= 0
    presence_row = layer_0[0]
    for token in review.split(" "):
        presence_row[word2index[token]] = 1

# Smoke test: encode the first review, then show layer_0 as the cell output
update_input_layer(reviews[0])   
layer_0

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.]])

In [87]:
### Step 3 - Building the Neural Network

class NeuralNetwork(object):
    """Bag-of-words sentiment classifier with one linear hidden layer.

    The input layer is a 1 x vocabulary-size vector holding a 1 for every
    vocabulary word that occurs in a review. It feeds a hidden layer with no
    activation function and then a single sigmoid output unit. Outputs above
    0.5 are reported as "POSITIVE", everything else as "NEGATIVE".
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1):
        """Build the vocabulary from `reviews` and allocate the network.

        Args:
            reviews: sequence of review strings whose words are separated by
                single spaces.
            labels: sequence of label strings, one per review.
            hidden_nodes: number of units in the hidden layer.
            learning_rate: step size applied to the weight updates in train().
        """
        # Fixed seed so the random hidden->output weights are reproducible
        np.random.seed(1)
        # Build vocabularies and index lookups first; their size fixes the
        # width of the input layer
        self.preprocess_data(reviews, labels)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def preprocess_data(self, reviews, labels):
        """Collect the word and label vocabularies and their index lookups.

        Reviews are lower-cased before being split because train() and run()
        lower-case every review before encoding it; building the vocabulary
        from raw-case text would leave any word containing an upper-case
        letter permanently unreachable.
        """
        review_vocab = set()
        for review in reviews:
            review_vocab.update(review.lower().split(" "))
        # Sorted so each word gets the same index on every run; plain set
        # iteration order for strings can change between interpreter sessions
        self.review_vocab = sorted(review_vocab)

        self.label_vocab = list(set(labels))

        self.reviews_vocab_size = len(self.review_vocab)
        self.labels_vocab_size = len(self.label_vocab)

        # Word -> position in the input layer
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i

        # Label -> position in label_vocab
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and allocate the weights and the input layer."""
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Input->hidden weights start at zero; hidden->output weights are
        # drawn from a normal distribution with std output_nodes ** -0.5
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))

        self.learning_rate = learning_rate

        # Reusable 1 x input_nodes input vector, overwritten for every review
        self.layer_0 = np.zeros((1, input_nodes))

    def sigmoid(self, x):
        """Logistic function 1 / (1 + e^-x)."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Derivative of the sigmoid expressed in terms of its own output."""
        return output * (1 - output)

    def get_target_for_label(self, label):
        """Return 1 for the label 'POSITIVE' and 0 for any other label."""
        return 1 if label == 'POSITIVE' else 0

    def update_input_layer(self, review):
        """Overwrite layer_0 with the word-presence encoding of `review`.

        Words that are not in the vocabulary are ignored.
        """
        # Clear in place so no entries from the previous review survive
        self.layer_0 *= 0
        for word in review.split(" "):
            if word in self.word2index:
                self.layer_0[0][self.word2index[word]] = 1

    def _forward(self, review):
        """Encode `review` and return the (hidden, output) layer activations."""
        self.update_input_layer(review.lower())
        # The hidden layer is purely linear (no activation function)
        layer_1 = self.layer_0.dot(self.weights_0_1)
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        return layer_1, layer_2

    @staticmethod
    def _reviews_per_second(processed, start):
        """Return the throughput since `start`, or 0.0 if no time has elapsed."""
        elapsed = float(time.time() - start)
        # A coarse clock can report zero elapsed time on the first iterations
        return processed / elapsed if elapsed > 0 else 0.0

    def train(self, training_reviews, training_labels):
        """Run one pass of per-review gradient descent over the training data.

        Updates both weight matrices after every review and writes a progress
        line (speed and running training accuracy) to stdout.
        """
        assert(len(training_reviews) == len(training_labels))

        correct_so_far = 0
        start = time.time()

        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]

            ## Forward pass
            layer_1, layer_2 = self._forward(review)

            ## Backward propagation
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            layer_1_error = layer_2_delta.dot(self.weights_1_2.T)
            # No activation on the hidden layer, so its delta equals its error
            layer_1_delta = layer_1_error

            # Update weights
            self.weights_1_2 -= layer_1.T.dot(layer_2_delta) * self.learning_rate
            self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate

            # An error below 0.5 means the output fell on the correct side
            # of the 0.5 decision threshold
            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1

            reviews_per_second = self._reviews_per_second(i, start)

            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            # Start a fresh line every 2500 reviews to keep a progress history
            if(i % 2500 == 0):
                print("")

    def test(self, testing_reviews, testing_labels):
        """Predict every review and write the running accuracy to stdout."""
        correct = 0
        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            reviews_per_second = self._reviews_per_second(i, start)

            # The speed value is a rate, not a percentage, so no "%" follows it
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def run(self, review):
        """Return "POSITIVE" or "NEGATIVE" for `review` using the current weights."""
        _, layer_2 = self._forward(review)

        # layer_2 is 1 x 1; compare its single value with the 0.5 threshold
        if layer_2[0][0] > 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [88]:
# Build the network (and its vocabulary) from every review except the last
# 1,000, which are left out so they can be used for evaluation
mlp = NeuralNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)

In [90]:
# Train the network: a single pass over every review except the last 1,000
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):67.03 #Correct:1814 #Trained:2501 Training Accuracy:72.5%
Progress:20.8% Speed(reviews/sec):88.40 #Correct:3776 #Trained:5001 Training Accuracy:75.5%
Progress:31.2% Speed(reviews/sec):100.1 #Correct:5867 #Trained:7501 Training Accuracy:78.2%
Progress:41.6% Speed(reviews/sec):107.3 #Correct:8009 #Trained:10001 Training Accuracy:80.0%
Progress:52.0% Speed(reviews/sec):112.3 #Correct:10121 #Trained:12501 Training Accuracy:80.9%
Progress:62.5% Speed(reviews/sec):115.7 #Correct:12247 #Trained:15001 Training Accuracy:81.6%
Progress:72.9% Speed(reviews/sec):118.4 #Correct:14378 #Trained:17501 Training Accuracy:82.1%
Progress:83.3% Speed(reviews/sec):120.4 #Correct:16557 #Trained:20001 Training Accuracy:82.7%
Progress:93.7% Speed(reviews/sec):121.8 #Correct:18734 #Trained:22501 Training Accuracy:83.2%
Progress:99.9% Speed(reviews/sec):122.6 #Correct:20049 #Trained:24000 Training 

In [89]:
# Evaluate on the last 1,000 reviews, which were excluded from training.
# NOTE(review): this cell's execution count (89) is lower than the training
# cell's (90), so the recorded output appears to come from a network that had
# not been trained yet - re-run this cell after training.
mlp.test(reviews[-1000:],labels[-1000:])

Progress:99.9% Speed(reviews/sec):822.1% #Correct:500 #Tested:1000 Testing Accuracy:50.0%