## Exloring the movie review data
In separate sheet to keep sheets running the models from being too cluttered

In [27]:
import numpy as np
import os
import os.path
import glob
import time

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk

[nltk_data] Downloading package punkt to /home/dns/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [28]:
import os
import wget
import tarfile

# By checking if the directory exists first, we allow people to delete the tarfile without the notebook re-downloading it
if os.path.isdir('aclImdb'):
    print("Dataset directory exists, taking no action")
else:    
    if not os.path.isfile('aclImdb_v1.tar.gz'):
        print("Downloading dataset")
        #!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
        wget.download('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
    else:
        print("Dataset already downloaded")
    
    print("Unpacking dataset")
    #!tar -xf aclImdb_v1.tar.gz 
    tar = tarfile.open("aclImdb_v1.tar.gz")
    tar.extractall()
    tar.close()
    print("Dataset unpacked in aclImdb")

Dataset directory exists, taking no action


In [29]:
# configuration
SAMPLE_SIZE=1000


In [30]:
time_beginning_of_notebook = time.time()
positive_file_list = glob.glob(os.path.join('aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_file_list[:SAMPLE_SIZE]

negative_file_list = glob.glob(os.path.join('aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text

In [31]:
positive_strings = [load_doc(x) for x in positive_sample_file_list]
negative_strings = [load_doc(x) for x in negative_sample_file_list]

positive_tokenized = [word_tokenize(s) for s in positive_strings]
negative_tokenized = [word_tokenize(s) for s in negative_strings]

In [32]:
def pretty_print_positive_and_negative(i):
    print(positive_reviews[i][:30] + "\t:\t" + negative_reviews[i][:30] + "...")

In [33]:
print(len(positive_reviews) + len(negative_reviews))
print('\n Positive reviews \n ', positive_reviews[2137][:50])
print('\n Negative reviews \n ', negative_reviews[2137][:50])

25000

 Positive reviews 
  This movie is a journey through the mind of a scre

 Negative reviews 
  While the original First Blood had its far-fetched


In [34]:
print("positive reviews \t : \t negative reviews\n")
pretty_print_positive_and_negative(2137)
pretty_print_positive_and_negative(12444)
pretty_print_positive_and_negative(6267)
pretty_print_positive_and_negative(5297)
pretty_print_positive_and_negative(4998)

positive reviews 	 : 	 negative reviews

This movie is a journey throug	:	While the original First Blood...
are highlights of this 1917 fe	:	OK I had higher hopes for this...
Here's the kind of love story 	:	What a disappointment!  This f...
A small pleasure in life is wa	:	Some nice scenery, but the sto...
This film was amazing. It had 	:	I remember when I first saw th...


In [35]:
from collections import Counter
import numpy as np

In [36]:
# Create three Counter objects to store positive, negative and total counts
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [37]:
# Loop over all the words in all the reviews and increment the counts in the appropriate counter objects
for i in range(len(positive_reviews)):
    for word in positive_reviews[i].split(" "):
        positive_counts[word] += 1
        total_counts[word] += 1
for i in range(len(negative_reviews)):
    for word in negative_reviews[i].split(" "):
        negative_counts[word] += 1
        total_counts[word] += 1


In [38]:
positive_counts.most_common()[:100]

[('the', 148466),
 ('and', 84295),
 ('a', 79438),
 ('of', 75349),
 ('to', 65216),
 ('is', 55366),
 ('in', 45802),
 ('I', 32622),
 ('that', 31948),
 ('', 27700),
 ('it', 26999),
 ('this', 26037),
 ('as', 23934),
 ('with', 22034),
 ('was', 21312),
 ('for', 20874),
 ('The', 20300),
 ('but', 16459),
 ('his', 16203),
 ('on', 15387),
 ('film', 14420),
 ('are', 14397),
 ('movie', 13375),
 ('not', 12493),
 ('you', 12416),
 ('have', 12270),
 ('he', 11771),
 ('be', 11696),
 ('by', 11462),
 ('an', 10794),
 ('one', 10686),
 ('at', 10231),
 ('who', 10152),
 ('from', 10134),
 ('all', 9159),
 ('has', 9032),
 ('her', 8999),
 ('like', 7981),
 ('about', 7829),
 ('very', 7796),
 ('they', 7714),
 ('This', 7437),
 ('so', 7383),
 ('or', 7013),
 ('more', 6825),
 ('out', 6692),
 ('some', 6664),
 ('just', 6533),
 ('It', 6238),
 ('when', 5987),
 ('what', 5903),
 ('their', 5893),
 ('good', 5797),
 ('which', 5645),
 ('she', 5402),
 ("it's", 5313),
 ('can', 5275),
 ('see', 5250),
 ('my', 5226),
 ('would', 5191),
 

In [39]:
negative_counts.most_common()[:100]

[('the', 138707),
 ('a', 75682),
 ('and', 68417),
 ('of', 67636),
 ('to', 67364),
 ('is', 47882),
 ('in', 39790),
 ('I', 37007),
 ('that', 32619),
 ('this', 31208),
 ('', 29753),
 ('it', 27455),
 ('was', 25393),
 ('The', 20694),
 ('for', 20202),
 ('with', 19694),
 ('as', 18587),
 ('but', 17340),
 ('movie', 17140),
 ('on', 15383),
 ('have', 14863),
 ('are', 14106),
 ('be', 13818),
 ('not', 13775),
 ('film', 12994),
 ('you', 12714),
 ('his', 11492),
 ('at', 11071),
 ('like', 10158),
 ('they', 10131),
 ('one', 10010),
 ('by', 9969),
 ('he', 9914),
 ('an', 9833),
 ('just', 9802),
 ('or', 9211),
 ('from', 9112),
 ('so', 8966),
 ('all', 8907),
 ('who', 8691),
 ('about', 8463),
 ('out', 7679),
 ('some', 7553),
 ('has', 7445),
 ('This', 7054),
 ('her', 6833),
 ('would', 6732),
 ('even', 6509),
 ('no', 6412),
 ('only', 6274),
 ('if', 6175),
 ('more', 6128),
 ('had', 5914),
 ('were', 5837),
 ('what', 5788),
 ('It', 5661),
 ('really', 5657),
 ('good', 5647),
 ('up', 5622),
 ('when', 5509),
 ("it'

In [40]:
print(len(positive_counts.items()))
print(len(negative_counts.items()))
print(len(total_counts.items()))
print(len(positive_counts.most_common()))
print(len(negative_counts.most_common()))
print(len(total_counts.most_common()))

169811
167430
265378
169811
167430
265378


In [41]:
pos_neg_ratios = Counter()

# Calculate the ratios of positive and negative uses of the most common words
# Consider words to be "common" if they've been used at least 100 times
for term, count in list(total_counts.most_common()):
    if(count > 100):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

In [42]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 1.0703492228278109
Pos-to-neg ratio for 'amazing' = 3.77720207253886
Pos-to-neg ratio for 'terrible' = 0.23886138613861385


In [43]:
# Convert ratios to logs
for word in pos_neg_ratios:
    pos_neg_ratios[word] = np.log(pos_neg_ratios[word])

In [44]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 0.0679849716991887
Pos-to-neg ratio for 'amazing' = 1.3289835431037726
Pos-to-neg ratio for 'terrible' = -1.4318718696162098


In [45]:
# words most frequently seen in a review with a "POSITIVE" label
pos_neg_ratios.most_common()[:100]

[('7/10', 3.2733640101522705),
 ('8/10', 3.2255203675868693),
 ('Excellent', 3.1986731175506815),
 ('Highly', 2.929287174145838),
 ('9/10', 2.515678308454754),
 ('10/10', 2.4908413853078146),
 ('Matthau', 2.4849066497880004),
 ('Victoria', 2.332890442489375),
 ('perfect,', 2.312535423847214),
 ('superbly', 2.12389330425067),
 ('wonderfully', 2.120263536200091),
 ('amazing.', 2.094945728215801),
 ('superb.', 2.03688192726104),
 ('captures', 2.017566137961748),
 ('refreshing', 1.9387416595767009),
 ('wonderful.', 1.9379419794061366),
 ('Bourne', 1.9307583440347111),
 ('gripping', 1.9252908618525775),
 ('beautifully', 1.8536348729461425),
 ('breathtaking', 1.8495790401168812),
 ('perfect.', 1.8382794848629478),
 ('Powell', 1.807507826196194),
 ('excellent.', 1.8044984950054848),
 ('delightful', 1.7971214123694403),
 ('Nancy', 1.7439688053917064),
 ('brilliant.', 1.7376922479577792),
 ('finest', 1.7197859696029656),
 ('chilling', 1.7100814382137879),
 ('underrated', 1.692552819144607),
 ('

In [46]:
vocab = set(total_counts.keys())

In [47]:
vocab_size = len(vocab)
print(vocab_size)

265378


In [48]:
import random

positive_labels = []
for i in range(len(positive_tokenized)):
    positive_labels.append('POSITIVE')
negative_labels = []
for i in range(len(negative_tokenized)):
    negative_labels.append('NEGATIVE')
    
reviews = positive_tokenized + negative_tokenized
labels = positive_labels + negative_labels
reviews_and_labels = list(zip(reviews, labels))
random.shuffle(reviews_and_labels)
reviews, labels = zip(*reviews_and_labels)

In [49]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    ## added min_count and polarity_cutoff parameters
    def __init__(self, reviews, labels, min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            min_count(int) - Words should only be added to the vocabulary 
                             if they occur more than this many times
            polarity_cutoff(float) - The absolute value of a word's positive-to-negative
                                     ratio must be at least this big to be considered.
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        ## added min_count and polarity_cutoff arguments to pre_process_data call
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    ## added min_count and polarity_cutoff parameters
    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        
        ## Calculate positive-to-negative ratios for words before building vocabulary
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i]:
                    positive_counts[word] += 1
                    total_counts[word] += 1
            else:
                for word in reviews[i]:
                    negative_counts[word] += 1
                    total_counts[word] += 1

        pos_neg_ratios = Counter()

        for term, count in list(total_counts.most_common()):
            if(count >= 50):
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio

        for word, ratio in pos_neg_ratios.most_common():
            if(ratio > 1):
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))

        # populate review_vocab with all of the words in the given reviews
        review_vocab = set()
        for review in reviews:
            for word in review:
                ## only add words that occur at least min_count times
                #                     and for words with pos/neg ratios, only add words
                #                     that meet the polarity_cutoff
                if(total_counts[word] > min_count):
                    if(word in pos_neg_ratios.keys()):
                        if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        # populate label_vocab with all of the words in the given labels.
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # TODO This is already done earlier - can remove duplication
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        ## Removed self.layer_0; added self.layer_1
        # The input layer, a two-dimensional matrix with shape 1 x hidden_nodes
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    ## Removed update_input_layer function
    
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    ## changed name of first parameter form 'training_reviews' 
    #                     to 'training_reviews_raw'
    def train(self, training_reviews_raw, training_labels):

        ## pre-process training reviews so we can deal 
        #                     directly with the indices of non-zero inputs
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review:
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))

        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            ## Removed call to 'update_input_layer' function
            #                     because 'layer_0' is no longer used

            # Hidden layer
            ## Add in only the weights for non-zero items
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            ## changed to use 'self.layer_1' instead of 'local layer_1'
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            
            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            ## changed to use 'self.layer_1' instead of local 'layer_1'
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            ## Only update the weights that were used in the forward pass
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        ## Removed call to update_input_layer function
        #                     because layer_0 is no longer used

        # Hidden layer
        ## Identify the indices used in the review and then add
        #                     just those weights to layer_1 
        self.layer_1 *= 0
        unique_indices = set()
        for word in review:
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        ## changed to use self.layer_1 instead of local layer_1
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
         
        # Return POSITIVE for values above greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [50]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:0.1% Speed(reviews/sec):1277. #Correct:1 #Trained:2 Training Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1589. #Correct:2 #Trained:3 Training Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1928. #Correct:2 #Trained:4 Training Accuracy:50.0%Progress:0.4% Speed(reviews/sec):2243. #Correct:3 #Trained:5 Training Accuracy:60.0%Progress:0.5% Speed(reviews/sec):2462. #Correct:3 #Trained:6 Training Accuracy:50.0%Progress:0.6% Speed(reviews/sec):2630. #Correct:4 #Trained:7 Training Accuracy:57.1%Progress:0.7% Speed(reviews/sec):2773. #Correct:4 #Trained:8 Training Accuracy:50.0%Progress:0.8% Speed(reviews/sec):2942. #Correct:5 #Trained:9 Training Accuracy:55.5%Progress:0.9% Speed(reviews/sec):3047. #Correct:6 #Trained:10 Training Accuracy:60.0%Progress:1.0% Speed(reviews/sec):3119. #Correct:6 #Trained:11 Training Accuracy:54.5%Progress:1.1% Speed(reviews/sec):3167. #Correct:6 #Trained:12 T

In [51]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):770.5 #Correct:2 #Tested:2 Testing Accuracy:100.%Progress:0.2% Speed(reviews/sec):1325. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1549. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):1771. #Correct:4 #Tested:5 Testing Accuracy:80.0%Progress:0.5% Speed(reviews/sec):1778. #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):1850. #Correct:6 #Tested:7 Testing Accuracy:85.7%Progress:0.7% Speed(reviews/sec):1839. #Correct:6 #Tested:8 Testing Accuracy:75.0%Progress:0.8% Speed(reviews/sec):1488. #Correct:7 #Tested:9 Testing Accuracy:77.7%Progress:0.9% Speed(reviews/sec):1543. #Correct:8 #Tested:10 Testing Accuracy:80.0%Progress:1.0% Speed(reviews/sec):1625. #Correct:9 #Tested:11 Testing Accuracy:81.8%Progress:1.1% Speed(reviews/sec):1672. #Correct:10 #Tested:12 Testing Accuracy:83.3%P

In [52]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:0.1% Speed(reviews/sec):2981. #Correct:1 #Trained:2 Training Accuracy:50.0%Progress:0.2% Speed(reviews/sec):4036. #Correct:2 #Trained:3 Training Accuracy:66.6%Progress:0.3% Speed(reviews/sec):5096. #Correct:3 #Trained:4 Training Accuracy:75.0%Progress:0.4% Speed(reviews/sec):6094. #Correct:4 #Trained:5 Training Accuracy:80.0%Progress:0.5% Speed(reviews/sec):6782. #Correct:5 #Trained:6 Training Accuracy:83.3%Progress:0.6% Speed(reviews/sec):7324. #Correct:6 #Trained:7 Training Accuracy:85.7%Progress:0.7% Speed(reviews/sec):7722. #Correct:7 #Trained:8 Training Accuracy:87.5%Progress:0.8% Speed(reviews/sec):8328. #Correct:7 #Trained:9 Training Accuracy:77.7%Progress:0.9% Speed(reviews/sec):8596. #Correct:8 #Trained:10 Training Accuracy:80.0%Progress:1.0% Speed(reviews/sec):8843. #Correct:8 #Trained:11 Training Accuracy:72.7%Progress:1.1% Speed(reviews/sec):9109. #Correct:9 #Trained:12 T

In [53]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):623.1 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1147. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1553. #Correct:2 #Tested:4 Testing Accuracy:50.0%Progress:0.4% Speed(reviews/sec):1923. #Correct:3 #Tested:5 Testing Accuracy:60.0%Progress:0.5% Speed(reviews/sec):2164. #Correct:4 #Tested:6 Testing Accuracy:66.6%Progress:0.6% Speed(reviews/sec):2122. #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):2269. #Correct:5 #Tested:8 Testing Accuracy:62.5%Progress:0.8% Speed(reviews/sec):2102. #Correct:5 #Tested:9 Testing Accuracy:55.5%Progress:0.9% Speed(reviews/sec):2147. #Correct:6 #Tested:10 Testing Accuracy:60.0%Progress:1.0% Speed(reviews/sec):2281. #Correct:6 #Tested:11 Testing Accuracy:54.5%Progress:1.1% Speed(reviews/sec):2398. #Correct:7 #Tested:12 Testing Accuracy:58.3%Pr

In [54]:
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)
mlp_full.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:0.1% Speed(reviews/sec):1003. #Correct:1 #Trained:2 Training Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1059. #Correct:2 #Trained:3 Training Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1262. #Correct:2 #Trained:4 Training Accuracy:50.0%Progress:0.4% Speed(reviews/sec):1463. #Correct:3 #Trained:5 Training Accuracy:60.0%Progress:0.5% Speed(reviews/sec):1604. #Correct:3 #Trained:6 Training Accuracy:50.0%Progress:0.6% Speed(reviews/sec):1649. #Correct:4 #Trained:7 Training Accuracy:57.1%Progress:0.7% Speed(reviews/sec):1730. #Correct:4 #Trained:8 Training Accuracy:50.0%Progress:0.8% Speed(reviews/sec):1846. #Correct:5 #Trained:9 Training Accuracy:55.5%Progress:0.9% Speed(reviews/sec):1924. #Correct:6 #Trained:10 Training Accuracy:60.0%Progress:1.0% Speed(reviews/sec):1966. #Correct:6 #Trained:11 Training Accuracy:54.5%Progress:1.1% Speed(reviews/sec):2011. #Correct:6 #Trained:12 T

In [55]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):967.3 #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1741. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):2130. #Correct:2 #Tested:4 Testing Accuracy:50.0%Progress:0.4% Speed(reviews/sec):2558. #Correct:3 #Tested:5 Testing Accuracy:60.0%Progress:0.5% Speed(reviews/sec):2778. #Correct:4 #Tested:6 Testing Accuracy:66.6%Progress:0.6% Speed(reviews/sec):3030. #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):3142. #Correct:5 #Tested:8 Testing Accuracy:62.5%Progress:0.8% Speed(reviews/sec):2567. #Correct:5 #Tested:9 Testing Accuracy:55.5%Progress:0.9% Speed(reviews/sec):2563. #Correct:6 #Tested:10 Testing Accuracy:60.0%Progress:1.0% Speed(reviews/sec):2703. #Correct:6 #Tested:11 Testing Accuracy:54.5%Progress:1.1% Speed(reviews/sec):2815. #Correct:7 #Tested:12 Testing Accuracy:58.3%Pr

In [56]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]],mlp_full.weights_0_1[mlp_full.word2index[focus]])
    
    return most_similar.most_common()

In [57]:
get_most_similar_words("excellent")[:10]

[('great', 0.015354360420488525),
 ('best', 0.010701113495691146),
 ('both', 0.00913238573908229),
 ('love', 0.008762099196817223),
 ('excellent', 0.00828209417169769),
 ('shows', 0.007747378485183827),
 ('times', 0.007118581855245173),
 ('me', 0.007117547222502743),
 ('favorite', 0.007105204314175147),
 ('genre', 0.007101089080350402)]

In [58]:
get_most_similar_words("terrible")[:10]

[('bad', 0.008959456695939208),
 ('waste', 0.007479145917837915),
 ('worst', 0.006707182153404599),
 ('awful', 0.006656339497681641),
 ('?', 0.006204613639024003),
 ('plot', 0.005718377528498075),
 ('boring', 0.0054134676999341175),
 ('There', 0.005129869151607107),
 ('supposed', 0.00504921098545334),
 ('so', 0.004942955229349824)]

In [59]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [60]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)

In [61]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")

In [62]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [63]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words