In [1]:
def pretty_print_review_and_label(i):
    """Print the label of review ``i`` next to the first 80 characters of its text.

    Reads the module-level ``labels`` and ``reviews`` lists.

    Args:
        i(int) - Index of the review/label pair to display
    """
    label = labels[i]
    preview = reviews[i][:80]
    print("".join([label, "\t:\t", preview, "..."]))

# What we know!
# One review per line. Only the trailing newline is removed, so a final line
# that lacks a newline keeps its last character.
with open('reviews.txt', 'r') as review_file:
    reviews = [line.rstrip('\n') for line in review_file]

# What we WANT to know!
# One label per line, upper-cased so it can be compared with 'POSITIVE'/'NEGATIVE'.
with open('labels.txt', 'r') as label_file:
    labels = [line.rstrip('\n').upper() for line in label_file]

In [2]:
# TODO: -Copy the SentimentNetwork class from Project 5 lesson
#       -Modify it according to the above instructions 

import time
import sys
import numpy as np
from collections import Counter

# Encapsulate our neural network in a class
class SentimentNetwork:
    """Two-layer network that labels a review as POSITIVE or NEGATIVE.

    The input vocabulary is restricted to words whose log positive/negative
    usage ratio is far enough from the mean log ratio. The hidden layer is
    linear (no activation) and the single output node uses a sigmoid.
    """

    def __init__(self, reviews, labels, hidden_nodes=10, learning_rate=0.1, sig_cut=1, min_count=50):
        """Create a SentimentNetwork with the given settings.

        Args:
            reviews(list) - List of reviews used to build the vocabulary
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
            sig_cut(float) - A word is kept only when its log ratio differs from the
                             mean log ratio by more than sig_cut standard deviations
            min_count(int) - Minimum total number of occurrences a word needs
                             before it is considered for the vocabulary
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducible results during development
        np.random.seed(1)

        # Process the reviews and their associated labels so that everything
        # is ready for training
        self.pre_process_data(reviews, labels, sig_cut, min_count)

        # One input node per vocabulary word, the requested number of hidden
        # nodes, and a single output node.
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)

    def pre_process_data(self, reviews, labels, sig_cut, min_count):
        """Build the review/label vocabularies and their index lookups.

        Args:
            reviews(list) - Review strings; words are separated by single spaces
            labels(list) - Label for each review; 'POSITIVE' marks a positive
                           review, anything else is counted as negative
            sig_cut(float) - Number of standard deviations a word's log ratio
                             must exceed (relative to the mean) to be kept
            min_count(int) - Minimum total occurrences for a word to be rated
        """
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        # Count every word, split by the label of the review it occurs in.
        for n, review in enumerate(reviews):
            words = review.split(' ')  # split once and reuse for all counters
            if labels[n] == 'POSITIVE':
                positive_counts.update(words)
            else:
                negative_counts.update(words)
            total_counts.update(words)

        # Log of the positive/negative usage ratio for sufficiently common words.
        # Only words seen at least once in BOTH classes get a ratio: a zero on
        # either side would give a division by zero or log(0).
        pos_neg_ratios = Counter()
        for word in positive_counts:
            if total_counts[word] >= min_count and negative_counts[word]:
                pos_neg_ratios[word] = np.log(positive_counts[word] / negative_counts[word])

        # Spread of the log ratios; guard the empty case so numpy does not
        # produce NaN (and RuntimeWarnings) when no word qualifies.
        log_ratios = list(pos_neg_ratios.values())
        if log_ratios:
            pos_neg_std = np.std(log_ratios)
            pos_neg_mean = np.mean(log_ratios)
        else:
            pos_neg_std = 0.0
            pos_neg_mean = 0.0
        cutoff = pos_neg_std * sig_cut
        sys.stdout.write("Cutoff:" + str(cutoff))

        # Keep only the words whose log ratio is far enough from the mean,
        # ordered from most positive to most negative.
        self.review_vocab = [word for word, ratio in pos_neg_ratios.most_common()
                             if abs(ratio - pos_neg_mean) > cutoff]

        # Distinct labels; sorted so the label indices do not depend on the
        # interpreter's per-run string hash ordering.
        self.label_vocab = sorted(set(labels))

        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)

        # Dictionaries mapping words / labels to their index positions
        self.word2index = {word: i for i, word in enumerate(self.review_vocab)}
        self.label2index = {label: i for i, label in enumerate(self.label_vocab)}

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        """Store the layer sizes and learning rate and create the weight matrices.

        Args:
            input_nodes(int) - Number of input nodes (one per vocabulary word)
            hidden_nodes(int) - Number of hidden nodes
            output_nodes(int) - Number of output nodes
            learning_rate(float) - Step size used for the weight updates
        """
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Weights between the input layer and the hidden layer start at zero.
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))

        # Weights between the hidden layer and the output layer are drawn from
        # a normal distribution.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5,
                                            (self.hidden_nodes, self.output_nodes))

        # The hidden layer, a two-dimensional matrix with shape 1 x hidden_nodes
        self.layer_1 = np.zeros((1, hidden_nodes))

    def get_target_for_label(self, label):
        """Return 1 for the label 'POSITIVE' and 0 for any other label."""
        if label == 'POSITIVE':
            return 1
        return 0

    def sigmoid(self, x):
        """Return the logistic sigmoid of x."""
        return 1 / (1 + np.exp(-x))

    def sigmoid_output_2_derivative(self, output):
        """Return the sigmoid derivative, given the sigmoid's OUTPUT value."""
        return output * (1 - output)

    def _review_to_indices(self, review, alpha_only=False):
        """Return the sorted, de-duplicated vocabulary indices of a review's words.

        Args:
            review(str) - Review text; words are separated by single spaces
            alpha_only(bool) - When True, skip words for which str.isalpha() is False

        Sorting makes the order in which weight rows are later summed
        independent of set iteration order.
        """
        word2index = self.word2index
        indices = set()
        for word in review.split(' '):
            if alpha_only and not word.isalpha():
                continue
            index = word2index.get(word)
            if index is not None:
                indices.add(index)
        return sorted(indices)

    def _report_progress(self, i, total, start, correct, done_word, phase):
        """Overwrite the current console line with progress, speed and accuracy.

        Args:
            i(int) - Zero-based index of the item that was just processed
            total(int) - Total number of items
            start(float) - time.time() value taken when processing started
            correct(int) - Number of correct predictions so far
            done_word(str) - Word used in the counter label ('Trained' or 'Tested')
            phase(str) - Word used in the accuracy label ('Training' or 'Testing')
        """
        elapsed_time = float(time.time() - start)
        reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0

        sys.stdout.write("\rProgress:" + str(100 * i/float(total))[:4]
                         + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5]
                         + " #Correct:" + str(correct) + " #" + done_word + ":" + str(i+1)
                         + " " + phase + " Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")

    def train(self, training_reviews_raw, training_labels):
        """Run one pass of per-review gradient descent over the given data.

        Args:
            training_reviews_raw(list) - Review strings to train on
            training_labels(list) - 'POSITIVE'/'NEGATIVE' label for each review

        Progress and running training accuracy are written to stdout.
        """
        # Pre-process the reviews so we can deal directly with the indices of
        # the non-zero inputs. Only purely alphabetic vocabulary words are used
        # for training.
        training_reviews = [self._review_to_indices(review, alpha_only=True)
                            for review in training_reviews_raw]

        # make sure we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))

        # Keep track of correct predictions to display accuracy during training
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        total = len(training_reviews)

        # Run a forward and backward pass for every review, updating the
        # weights after each one.
        for i, review in enumerate(training_reviews):
            label = training_labels[i]

            ### Forward pass ###

            # Hidden layer: sum of the weight rows of the words in the review.
            self.layer_1 = np.sum(self.weights_0_1[review], axis=0, keepdims=True)

            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            ### Backward pass ###

            # Output error: difference between actual output and desired target.
            layer_2_error = layer_2 - self.get_target_for_label(label)
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Error propagated to the hidden layer. The hidden layer has no
            # nonlinearity, so its delta equals its error. This must be computed
            # before weights_1_2 is modified below.
            layer_1_delta = layer_2_delta.dot(self.weights_1_2.T)

            # Gradient descent step on the hidden-to-output weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate

            # Only update the input-to-hidden rows that were used in the forward pass
            self.weights_0_1[review] -= layer_1_delta * self.learning_rate

            # Keep track of correct predictions.
            output = layer_2[0, 0]
            if output >= 0.5 and label == 'POSITIVE':
                correct_so_far += 1
            elif output < 0.5 and label == 'NEGATIVE':
                correct_so_far += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the training process.
            self._report_progress(i, total, start, correct_so_far, "Trained", "Training")
            if i % 2500 == 0:
                print("")

    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the testing_labels to calculate the accuracy of those predictions.

        Progress and running accuracy are written to stdout; nothing is returned.
        """
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()
        total = len(testing_reviews)

        # Predict a label for every review and compare it with the expected one.
        for i, review in enumerate(testing_reviews):
            if self.run(review) == testing_labels[i]:
                correct += 1

            # For debug purposes, print out our prediction accuracy and speed
            # throughout the prediction process.
            self._report_progress(i, total, start, correct, "Tested", "Testing")

    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.

        The review is lower-cased before its words are looked up in the vocabulary.
        """
        # Hidden layer: sum of the weight rows of the vocabulary words that
        # occur in the review (all zeros when none do).
        indices = self._review_to_indices(review.lower())
        self.layer_1 = np.sum(self.weights_0_1[indices], axis=0, keepdims=True)

        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

        # Return POSITIVE for values greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if layer_2[0, 0] >= 0.5:
            return "POSITIVE"
        return "NEGATIVE"


In [4]:
# Build the network from, and train it on, everything except the last 1000
# reviews.
mlp = SentimentNetwork(
    reviews[:-1000],
    labels[:-1000],
    min_count=20,
    sig_cut=0.01,
    learning_rate=0.01,
)
mlp.train(reviews[:-1000], labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):5845. #Correct:1964 #Trained:2501 Training Accuracy:78.5%
Progress:20.8% Speed(reviews/sec):5703. #Correct:4013 #Trained:5001 Training Accuracy:80.2%
Progress:31.2% Speed(reviews/sec):5540. #Correct:6128 #Trained:7501 Training Accuracy:81.6%
Progress:41.6% Speed(reviews/sec):5693. #Correct:8276 #Trained:10001 Training Accuracy:82.7%
Progress:52.0% Speed(reviews/sec):5727. #Correct:10433 #Trained:12501 Training Accuracy:83.4%
Progress:62.5% Speed(reviews/sec):5671. #Correct:12570 #Trained:15001 Training Accuracy:83.7%
Progress:72.9% Speed(reviews/sec):5642. #Correct:14670 #Trained:17501 Training Accuracy:83.8%
Progress:83.3% Speed(reviews/sec):5632. #Correct:16844 #Trained:20001 Training Accuracy:84.2%
Progress:93.7% Speed(reviews/sec):5675. #Correct:19008 #Trained:22501 Training Accuracy:84.4%
Progress:99.9% Speed(reviews/sec):5696. #Correct:20328 #Trained:24000 Training

In [5]:
# Evaluate on the last 1000 reviews.
mlp.test(
    reviews[-1000:],
    labels[-1000:],
)

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):877.2 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1406. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1605. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):1894. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):1874. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1405. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):1425. #Correct:7 #Tested:8 Testing Accuracy:87.5%Progress:0.8% Speed(reviews/sec):1468. #Correct:8 #Tested:9 Testing Accuracy:88.8%Progress:0.9% Speed(reviews/sec):1173. #Correct:9 #Tested:10 Testing Accuracy:90.0%Progress:1.0% Speed(reviews/sec):1174. #Correct:10 #Tested:11 Testing Accuracy:90.9%Progress:1.1% Speed(reviews/sec):1225. #Correct:11 #Tested:12 Testing Accuracy:91.6%

Progress:40.7% Speed(reviews/sec):1994. #Correct:355 #Tested:408 Testing Accuracy:87.0%Progress:40.8% Speed(reviews/sec):1994. #Correct:356 #Tested:409 Testing Accuracy:87.0%Progress:40.9% Speed(reviews/sec):1994. #Correct:357 #Tested:410 Testing Accuracy:87.0%Progress:41.0% Speed(reviews/sec):1995. #Correct:358 #Tested:411 Testing Accuracy:87.1%Progress:41.1% Speed(reviews/sec):1995. #Correct:359 #Tested:412 Testing Accuracy:87.1%Progress:41.2% Speed(reviews/sec):1998. #Correct:360 #Tested:413 Testing Accuracy:87.1%Progress:41.3% Speed(reviews/sec):1997. #Correct:361 #Tested:414 Testing Accuracy:87.1%Progress:41.4% Speed(reviews/sec):1999. #Correct:362 #Tested:415 Testing Accuracy:87.2%Progress:41.5% Speed(reviews/sec):2000. #Correct:363 #Tested:416 Testing Accuracy:87.2%Progress:41.6% Speed(reviews/sec):2001. #Correct:364 #Tested:417 Testing Accuracy:87.2%Progress:41.7% Speed(reviews/sec):2001. #Correct:365 #Tested:418 Testing Accuracy:87.3%Progress:41.8% Speed(reviews/se

Progress:91.2% Speed(reviews/sec):2235. #Correct:779 #Tested:913 Testing Accuracy:85.3%Progress:91.3% Speed(reviews/sec):2235. #Correct:779 #Tested:914 Testing Accuracy:85.2%Progress:91.4% Speed(reviews/sec):2233. #Correct:780 #Tested:915 Testing Accuracy:85.2%Progress:91.5% Speed(reviews/sec):2231. #Correct:780 #Tested:916 Testing Accuracy:85.1%Progress:91.6% Speed(reviews/sec):2227. #Correct:780 #Tested:917 Testing Accuracy:85.0%Progress:91.7% Speed(reviews/sec):2227. #Correct:781 #Tested:918 Testing Accuracy:85.0%Progress:91.8% Speed(reviews/sec):2227. #Correct:782 #Tested:919 Testing Accuracy:85.0%Progress:91.9% Speed(reviews/sec):2227. #Correct:783 #Tested:920 Testing Accuracy:85.1%Progress:92.0% Speed(reviews/sec):2227. #Correct:783 #Tested:921 Testing Accuracy:85.0%Progress:92.1% Speed(reviews/sec):2227. #Correct:784 #Tested:922 Testing Accuracy:85.0%Progress:92.2% Speed(reviews/sec):2226. #Correct:785 #Tested:923 Testing Accuracy:85.0%Progress:92.3% Speed(reviews/se