### Sentiment Analysis of movie reviews with neural network 

This notebook contains code for building a neural network that takes reviews and uses a neural network to classify the sentiment.

In [3]:
# imports
from collections import Counter
import numpy as np
import time
import sys

In [4]:
### Step 1 - Reading in the data

# Open file read lines and get it into a list
reviews=list()
with open('reviews.txt', 'r') as f:
    for line in f.readlines():
        reviews.append(line[:-1])

labels=list()
with open('labels.txt', 'r') as f1:
    for label in f1.readlines():
        labels.append(label[:-1].upper())
print(len(reviews))
# We have 25,000 reviews and their labels


25000


In [5]:
#### Step2 - Calculate word counts and ratios
# Using counters to get word counts for all, positive and negative reviews
total_counts=Counter()
positive_counts=Counter()
negative_counts=Counter()
 
for i in range(len(reviews)):
    if labels[i]=='POSITIVE':
        for word in reviews[i].split(" "):
            positive_counts[word] +=1
            total_counts[word] +=1
    else:
        for word in reviews[i].split(" "):
            negative_counts[word] +=1
            total_counts[word] +=1
            
## To implement later -  ratio part 

In [6]:
## Implement methods to update input layer

## Find Length of vocabulary
# using a set to extract unique words
vocab = set(total_counts.keys())
vocab_size = len(vocab)
print("There are "+str(vocab_size) + " words in the vocabulary")

# Initialize a numpy vector/array with same length as vocab. This will serve as layer_0 or the input layer
# This will help create a constant length vector to feed into the Neural network
layer_0 = np.zeros((1, vocab_size))

## Word to index dictionary
word2index={}
for i,word in enumerate(vocab):
    word2index[word]=i

There are 74074 words in the vocabulary


In [7]:
# Supporting methods
def get_target_for_label(label):
    return 1 if label=='POSITIVE' else 0

def update_input_layer(review):
    global layer_0
    # Clearing contents of layer_0
    layer_0 *=0
    for word in review.split(" "):
        layer_0[0][word2index[word]] = 1

# Testing the method
update_input_layer(reviews[0])   
layer_0

array([[ 1.,  0.,  0., ...,  0.,  0.,  0.]])

In [66]:
### Step 3 - Building the Neural Network

class NeuralNetwork(object):

    def __init__(self, reviews,labels,min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):

        # Setting the seed for the random number generator
        np.random.seed(1)
        # Make a call to preprocess
        self.preprocess_data(reviews,labels)
        # Call the constructor
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
    
    def preprocess_data(self,reviews,labels):
        
        # This preforms all the pre processing required to make sure 
        # calculating vocab size
        review_vocab = set()
        for i in range(len(reviews)):
            for word in reviews[i].split(" "):
                review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        self.label_vocab = list(label_vocab)
        
        self.reviews_vocab_size = len(self.review_vocab)
        self.labels_vocab_size = len(self.label_vocab)

        # Initialize a numpy vector/array with same length as vocab. This will serve as layer_0 or the input layer
        # This will help create a constant length vector to feed into the Neural network
        

        ## Word to index dictionary
        self.word2index={}
        for i,word in enumerate(self.review_vocab):
            self.word2index[word]=i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
            
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # initialize the layers
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        
        
        #Initialize the weights with zeros
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes)) # Create a matrix of input_nodes x hidden_nodes
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,(self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1, input_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)  
    
    def get_target_for_label(self, label):
        return 1 if label=='POSITIVE' else 0

    def update_input_layer(self, review):
        # Clearing contents of layer_0
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] = 1
                
                

    def train(self, training_reviews_raw, training_labels):

        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
            
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]

            ## Forward pass            
            
            # Get the input layer set
        #    self.update_input_layer(review.lower())
            
            # Hidden Layer
            #layer_1 = self.layer_0.dot(self.weights_0_1)
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]
           
        
            #Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
            
        ##Backward propagation
            
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error *self.sigmoid_output_2_derivative(layer_2)
            
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) 
            layer_1_delta= layer_1_error # Because we aren't applying any activation function here
            
            #Update weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate 
            
            
            #self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate            
        
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step
           
        # Logging training progress
            
            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
                
    def test(self, testing_reviews, testing_labels):
        correct = 0
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
 
            
    
    def run(self, review):
        # This is to process the data and generate outputs based on trained network

        # This populates layer_0 with input values
        # Input layer
        self.update_input_layer(review.lower())
        
        # Hidden Layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        #Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        
        # Returning the predicted class
        if layer_2[0] > 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [67]:
mlp = NeuralNetwork(reviews[:-1000],labels[:-1000], learning_rate=0.1)

In [68]:
# train the network
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):1930. #Correct:1803 #Trained:2501 Training Accuracy:72.0%
Progress:20.8% Speed(reviews/sec):1917. #Correct:3787 #Trained:5001 Training Accuracy:75.7%
Progress:31.2% Speed(reviews/sec):1824. #Correct:5901 #Trained:7501 Training Accuracy:78.6%
Progress:41.6% Speed(reviews/sec):1712. #Correct:8045 #Trained:10001 Training Accuracy:80.4%
Progress:52.0% Speed(reviews/sec):1631. #Correct:10174 #Trained:12501 Training Accuracy:81.3%
Progress:62.5% Speed(reviews/sec):1597. #Correct:12313 #Trained:15001 Training Accuracy:82.0%
Progress:72.9% Speed(reviews/sec):1563. #Correct:14431 #Trained:17501 Training Accuracy:82.4%
Progress:83.3% Speed(reviews/sec):1562. #Correct:16604 #Trained:20001 Training Accuracy:83.0%
Progress:93.7% Speed(reviews/sec):1574. #Correct:18786 #Trained:22501 Training Accuracy:83.4%
Progress:99.9% Speed(reviews/sec):1569. #Correct:20120 #Trained:24000 Training 

In [44]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:99.9% Speed(reviews/sec):864.7% #Correct:855 #Tested:1000 Testing Accuracy:85.5%

In [45]:
### Changes to add min cutoff and polarity

### Step 3 - Building the Neural Network

class NeuralNetwork(object):

    def __init__(self, reviews,labels,min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):

        # Setting the seed for the random number generator
        np.random.seed(1)
        # Make a call to preprocess
        self.preprocess_data(reviews,min_count,polarity_cutoff)
        # Call the constructor
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
    
    def preprocess_data(self,reviews,polarity_cutoff,min_count):
        
        # This preforms all the pre processing required to make sure 
        
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i].split(" "):
                    positive_counts[word] += 1
                    total_counts[word] += 1
            else:
                for word in reviews[i].split(" "):
                    negative_counts[word] += 1
                    total_counts[word] += 1

        pos_neg_ratios = Counter()

        for term,cnt in list(total_counts.most_common()):
            if(cnt >= 50):
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio

        for word,ratio in pos_neg_ratios.most_common():
            if(ratio > 1):
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
        
         # calculating vocab size
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                if(total_counts[word] > min_count):
                    if(word in pos_neg_ratios.keys()):
                        if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        self.label_vocab = list(label_vocab)
        
        self.reviews_vocab_size = len(self.review_vocab)
        self.labels_vocab_size = len(self.label_vocab)

        # Initialize a numpy vector/array with same length as vocab. This will serve as layer_0 or the input layer
        # This will help create a constant length vector to feed into the Neural network
        

        ## Word to index dictionary
        self.word2index={}
        for i,word in enumerate(self.review_vocab):
            self.word2index[word]=i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
            
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # initialize the layers
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        
        
        #Initialize the weights with zeros
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes)) # Create a matrix of input_nodes x hidden_nodes
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,(self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1, input_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)  
    
    def get_target_for_label(self, label):
        return 1 if label=='POSITIVE' else 0

    def update_input_layer(self, review):
        # Clearing contents of layer_0
        self.layer_0 *= 0
        for word in review.split(" "):
            if(word in self.word2index.keys()):
                self.layer_0[0][self.word2index[word]] = 1
                
                

    def train(self, training_reviews_raw, training_labels):

        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
            
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]

            ## Forward pass            
            
            # Get the input layer set
        #    self.update_input_layer(review.lower())
            
            # Hidden Layer
            #layer_1 = self.layer_0.dot(self.weights_0_1)
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]
           
        
            #Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
            
        ##Backward propagation
            
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error *self.sigmoid_output_2_derivative(layer_2)
            
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) 
            layer_1_delta= layer_1_error # Because we aren't applying any activation function here
            
            #Update weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate 
            
            
            #self.weights_0_1 -= self.layer_0.T.dot(layer_1_delta) * self.learning_rate            
        
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step
           
        # Logging training progress
            
            if(np.abs(layer_2_error) < 0.5):
                correct_so_far += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
                
    def test(self, testing_reviews, testing_labels):
        correct = 0
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            reviews_per_second = i / float(time.time() - start)
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
 
            
    
    def run(self, review):
        # This is to process the data and generate outputs based on trained network

        # This populates layer_0 with input values
        # Input layer
        self.update_input_layer(review.lower())
        
        # Hidden Layer
        layer_1 = self.layer_0.dot(self.weights_0_1)

        #Output layer
        layer_2 = self.sigmoid(layer_1.dot(self.weights_1_2))
        
        # Returning the predicted class
        if layer_2[0] > 0.5:
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [51]:
mlp = NeuralNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.9,learning_rate=0.001)

In [52]:
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Trained:1 Training Accuracy:0.0%
Progress:10.4% Speed(reviews/sec):10766 #Correct:1710 #Trained:2501 Training Accuracy:68.3%
Progress:20.8% Speed(reviews/sec):10658 #Correct:3431 #Trained:5001 Training Accuracy:68.6%
Progress:31.2% Speed(reviews/sec):10421 #Correct:5177 #Trained:7501 Training Accuracy:69.0%
Progress:41.6% Speed(reviews/sec):10819 #Correct:6930 #Trained:10001 Training Accuracy:69.2%
Progress:52.0% Speed(reviews/sec):10874 #Correct:8565 #Trained:12501 Training Accuracy:68.5%
Progress:62.5% Speed(reviews/sec):10841 #Correct:10278 #Trained:15001 Training Accuracy:68.5%
Progress:72.9% Speed(reviews/sec):10849 #Correct:11964 #Trained:17501 Training Accuracy:68.3%
Progress:83.3% Speed(reviews/sec):10868 #Correct:13708 #Trained:20001 Training Accuracy:68.5%
Progress:93.7% Speed(reviews/sec):10889 #Correct:15448 #Trained:22501 Training Accuracy:68.6%
Progress:99.9% Speed(reviews/sec):10889 #Correct:16493 #Trained:24000 Training A

In [69]:
from collections import Counter
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp.word2index.keys():
        most_similar[word] = np.dot(mlp.weights_0_1[mlp.word2index[word]],mlp.weights_0_1[mlp.word2index[focus]])
    
    return most_similar.most_common()

In [71]:
get_most_similar_words("excellent")

[('excellent', 1.6141085995140161),
 ('perfect', 1.4661781128475326),
 ('amazing', 1.1716020230199957),
 ('great', 1.106824937443754),
 ('fun', 1.0916212237728868),
 ('wonderful', 1.0851484649968648),
 ('best', 1.0272721546088022),
 ('definitely', 0.90967917408894217),
 ('today', 0.904832116454372),
 ('enjoyable', 0.88964827608851538),
 ('favorite', 0.85712484782242815),
 ('superb', 0.84508961590762455),
 ('loved', 0.83867219556723294),
 ('highly', 0.80607184795756592),
 ('brilliant', 0.79187103209485932),
 ('liked', 0.78236965983669771),
 ('fantastic', 0.77187509205737781),
 ('job', 0.76432245473640803),
 ('enjoyed', 0.73952444689094921),
 ('rare', 0.70074074748086956),
 ('beautiful', 0.693564721023046),
 ('simple', 0.69350159692886959),
 ('bit', 0.68043470211309942),
 ('gem', 0.67933081912066684),
 ('entertaining', 0.67382816101944898),
 ('surprised', 0.67029706894799179),
 ('especially', 0.65792704921943923),
 ('incredible', 0.64157028789857218),
 ('strong', 0.64113293341181143),
 (