### Sentiment analysis of movie (IMDB) reviews using dataset provided by the ACL 2011 paper, see http://ai.stanford.edu/~amaas/data/sentiment/.

#### Dataset can be downloaded separately from http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz, but wont be necessary as the download process has been embedded in the notebook and source file.

In [84]:
# MacOSX: See https://www.mkyong.com/mac/wget-on-mac-os-x/ for wget
import os

print('On the MacOSX, you will need to install wget, see https://www.mkyong.com/mac/wget-on-mac-os-x/')

if not os.path.isfile('aclImdb_v1.tar.gz'):
  !wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz 

if not os.path.isfile('aclImdb'):  
  !tar -xf aclImdb_v1.tar.gz 


On the MacOSX, you will need to install wget, see https://www.mkyong.com/mac/wget-on-mac-os-x/


In [85]:
!pip install nltk
!pip install --upgrade gensim

import numpy as np
import os
import os.path

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk


import glob
from gensim.models import Word2Vec

import time

[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
Requirement already up-to-date: gensim in /Users/michael/miniconda3/lib/python3.6/site-packages (3.6.0)
[33mYou are using pip version 18.0, however version 18.1 is available.
You should consider upgrading via the 'pip install --upgrade pip' command.[0m
[nltk_data] Downloading package punkt to /Users/michael/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [86]:
time_beginning_of_notebook = time.time()
SAMPLE_SIZE=1000
positive_file_list = glob.glob(os.path.join('aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_file_list[:SAMPLE_SIZE]

negative_file_list = glob.glob(os.path.join('aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text


In [87]:
positive_reviews = [load_doc(x) for x in positive_file_list]
negative_reviews = [load_doc(x) for x in negative_file_list]

In [88]:
def pretty_print_positive_and_negative(i):
    print(positive_reviews[i][:30] + "\t:\t" + negative_reviews[i][:30] + "...")

In [96]:
print(len(positive_reviews) + len(all_negative_strings))
print('\n Positive reviews \n ', positive_reviews[2137][:50])
print('\n Negative reviews \n ', negative_reviews[2137][:50])

25000

 Positive reviews 
  Typically, "kids" films have some annoying quality

 Negative reviews 
  It is a great tragedy that both Richard Harris and


In [90]:
print("positive reviews \t : \t negative reviews\n")
pretty_print_positive_and_negative(2137)
pretty_print_positive_and_negative(12444)
pretty_print_positive_and_negative(6267)
pretty_print_positive_and_negative(5297)
pretty_print_positive_and_negative(4998)

positive reviews 	 : 	 negative reviews

Typically, "kids" films have s	:	It is a great tragedy that bot...
Superb cast, more please!  If 	:	I am uncertain what to make of...
This is one of my all-time fav	:	We have an average family. Dad...
Since Douglas MacArthur affect	:	Bela Lugosi plays a doctor who...
I work in a library and expect	:	Originally aired as an ABC Mov...


In [91]:
from collections import Counter
import numpy as np

In [44]:
# Create three Counter objects to store positive, negative and total counts
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [45]:
# Loop over all the words in all the reviews and increment the counts in the appropriate counter objects
for i in range(len(positive_reviews)):
    for word in all_positive_strings[i].split(" "):
        positive_counts[word] += 1
        total_counts[word] += 1
for i in range(len(negative_reviews)):
    for word in all_negative_strings[i].split(" "):
        negative_counts[word] += 1
        total_counts[word] += 1

In [46]:
positive_counts.most_common()

[('the', 148466),
 ('and', 84295),
 ('a', 79438),
 ('of', 75349),
 ('to', 65216),
 ('is', 55366),
 ('in', 45802),
 ('I', 32622),
 ('that', 31948),
 ('', 27700),
 ('it', 26999),
 ('this', 26037),
 ('as', 23934),
 ('with', 22034),
 ('was', 21312),
 ('for', 20874),
 ('The', 20300),
 ('but', 16459),
 ('his', 16203),
 ('on', 15387),
 ('film', 14420),
 ('are', 14397),
 ('movie', 13375),
 ('not', 12493),
 ('you', 12416),
 ('have', 12270),
 ('he', 11771),
 ('be', 11696),
 ('by', 11462),
 ('an', 10794),
 ('one', 10686),
 ('at', 10231),
 ('who', 10152),
 ('from', 10134),
 ('all', 9159),
 ('has', 9032),
 ('her', 8999),
 ('like', 7981),
 ('about', 7829),
 ('very', 7796),
 ('they', 7714),
 ('This', 7437),
 ('so', 7383),
 ('or', 7013),
 ('more', 6825),
 ('out', 6692),
 ('some', 6664),
 ('just', 6533),
 ('It', 6238),
 ('when', 5987),
 ('what', 5903),
 ('their', 5893),
 ('good', 5797),
 ('which', 5645),
 ('she', 5402),
 ("it's", 5313),
 ('can', 5275),
 ('see', 5250),
 ('my', 5226),
 ('would', 5191),
 

In [47]:
negative_counts.most_common()

[('the', 138707),
 ('a', 75682),
 ('and', 68417),
 ('of', 67636),
 ('to', 67364),
 ('is', 47882),
 ('in', 39790),
 ('I', 37007),
 ('that', 32619),
 ('this', 31208),
 ('', 29753),
 ('it', 27455),
 ('was', 25393),
 ('The', 20694),
 ('for', 20202),
 ('with', 19694),
 ('as', 18587),
 ('but', 17340),
 ('movie', 17140),
 ('on', 15383),
 ('have', 14863),
 ('are', 14106),
 ('be', 13818),
 ('not', 13775),
 ('film', 12994),
 ('you', 12714),
 ('his', 11492),
 ('at', 11071),
 ('like', 10158),
 ('they', 10131),
 ('one', 10010),
 ('by', 9969),
 ('he', 9914),
 ('an', 9833),
 ('just', 9802),
 ('or', 9211),
 ('from', 9112),
 ('so', 8966),
 ('all', 8907),
 ('who', 8691),
 ('about', 8463),
 ('out', 7679),
 ('some', 7553),
 ('has', 7445),
 ('This', 7054),
 ('her', 6833),
 ('would', 6732),
 ('even', 6509),
 ('no', 6412),
 ('only', 6274),
 ('if', 6175),
 ('more', 6128),
 ('had', 5914),
 ('were', 5837),
 ('what', 5788),
 ('It', 5661),
 ('really', 5657),
 ('good', 5647),
 ('up', 5622),
 ('when', 5509),
 ("it'

In [63]:
print(len(positive_counts.items()))
print(len(negative_counts.items()))
print(len(total_counts.items()))
print(len(positive_counts.most_common()))
print(len(negative_counts.most_common()))
print(len(total_counts.most_common()))

169811
167430
265378
169811
167430
265378


In [50]:
pos_neg_ratios = Counter()

# Calculate the ratios of positive and negative uses of the most common words
# Consider words to be "common" if they've been used at least 100 times
for term, count in list(total_counts.most_common()):
    if(count > 100):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

In [51]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 1.0703492228278109
Pos-to-neg ratio for 'amazing' = 3.77720207253886
Pos-to-neg ratio for 'terrible' = 0.23886138613861385


In [53]:
# Convert ratios to logs
for word in pos_neg_ratios:
    pos_neg_ratios[word] = np.log(pos_neg_ratios[word])

In [54]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 0.0679849716991887
Pos-to-neg ratio for 'amazing' = 1.3289835431037726
Pos-to-neg ratio for 'terrible' = -1.4318718696162098


In [55]:
# words most frequently seen in a review with a "POSITIVE" label
pos_neg_ratios.most_common()

[('7/10', 3.2733640101522705),
 ('8/10', 3.2255203675868693),
 ('Excellent', 3.1986731175506815),
 ('Highly', 2.929287174145838),
 ('9/10', 2.515678308454754),
 ('10/10', 2.4908413853078146),
 ('Matthau', 2.4849066497880004),
 ('Victoria', 2.332890442489375),
 ('perfect,', 2.312535423847214),
 ('superbly', 2.12389330425067),
 ('wonderfully', 2.120263536200091),
 ('amazing.', 2.094945728215801),
 ('superb.', 2.03688192726104),
 ('captures', 2.017566137961748),
 ('refreshing', 1.9387416595767009),
 ('wonderful.', 1.9379419794061366),
 ('Bourne', 1.9307583440347111),
 ('gripping', 1.9252908618525775),
 ('beautifully', 1.8536348729461425),
 ('breathtaking', 1.8495790401168812),
 ('perfect.', 1.8382794848629478),
 ('Powell', 1.807507826196194),
 ('excellent.', 1.8044984950054848),
 ('delightful', 1.7971214123694403),
 ('Nancy', 1.7439688053917064),
 ('brilliant.', 1.7376922479577792),
 ('finest', 1.7197859696029656),
 ('chilling', 1.7100814382137879),
 ('underrated', 1.692552819144607),
 ('

In [57]:
# words most frequently seen in a review with a "NEGATIVE" label
pos_neg_ratios.most_common()[:-31:-1]

[('Seagal', -4.709530201312334),
 ('Avoid', -4.605170185988091),
 ('4/10', -4.204692619390966),
 ('3/10', -3.713572066704308),
 ('MST3K', -3.545778610473263),
 ('1/10', -3.3586377672433594),
 ('horrible.', -3.283414346005772),
 ('awful.', -3.24918117436353),
 ('costs.', -2.890371757896165),
 ('unfunny', -2.8449093838194073),
 ('Worst', -2.793208009442517),
 ('terrible.', -2.7313070632664775),
 ('waste', -2.689270306450614),
 ('garbage.', -2.6127400212978853),
 ('awful,', -2.550421256898627),
 ('pointless', -2.49764567556543),
 ('crap.', -2.4553061800117097),
 ('redeeming', -2.3809169362639526),
 ('lousy', -2.373354163882253),
 ('poorly', -2.37006739017563),
 ('mess.', -2.324563999712821),
 ('worst', -2.3142244096510645),
 ('laughable.', -2.3116349285139637),
 ('terrible,', -2.2335922215070942),
 ('remotely', -2.1913594578838214),
 ('wasting', -2.189789598848701),
 ('laughable', -2.127195972975736),
 ('insult', -2.108769156774356),
 ('lame', -2.108429078553088),
 ('stupid.', -2.04769284

In [59]:
vocab = set(total_counts.keys())

In [60]:
vocab_size = len(vocab)
print(vocab_size)

265378


In [64]:
# TODO: Create layer_0 matrix with dimensions 1 by vocab_size, initially filled with zeros
layer_0 = np.ones((1, vocab_size))

In [65]:
layer_0.shape

(1, 265378)

In [66]:
# Create a dictionary of words in the vocabulary mapped to index positions
# (to be used in layer_0)
word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i
    
# display the map of words to indices
word2index

{'': 0,
 'USA?': 1,
 'Matlock/Mason': 2,
 '"kewl': 3,
 'verbiage.': 4,
 'homeless.The': 5,
 'Ferguson,': 6,
 'scream,': 7,
 '/Rick': 8,
 'clicked': 9,
 'plan,': 10,
 'trilogy.': 11,
 'Review': 12,
 'shows,': 13,
 'towel,': 14,
 'planted,': 15,
 'just...stupid.': 16,
 'crewmate,': 17,
 'DRESSING)': 18,
 'story/comedy?': 19,
 '(Knin)': 20,
 'Glasses?': 21,
 "Rawhide's": 22,
 'Hallmark,': 23,
 'fit).': 24,
 'FOLLOWING': 25,
 'Quest': 26,
 'Peeps': 27,
 '("teachers"': 28,
 'cute...': 29,
 'wild-style': 30,
 'sleep': 31,
 'seed...': 32,
 'grandiosely': 33,
 'Reds?': 34,
 'Waxwork-Tortured': 35,
 'appoints': 36,
 '"Kali!"': 37,
 'furnace!': 38,
 'Marquise)': 39,
 'equated': 40,
 "'Charlotte'": 41,
 'liiiiiiiiife': 42,
 'lecherous': 43,
 'Bannon': 44,
 'Whoop,': 45,
 "'Go'": 46,
 "Treasure',": 47,
 'tackier': 48,
 'addicting': 49,
 'selectively.': 50,
 'Rowlands.': 51,
 'Americana': 52,
 "deep!'": 53,
 'Doo!!),': 54,
 'toys."': 55,
 "love'-story,": 56,
 'cheesy"': 57,
 'zone': 58,
 'Scrappy-D

In [78]:
def update_input_layer(review):
    """ Modify the global layer_0 to represent the vector form of review.
    The element at a given index of layer_0 should represent
    how many times the given word occurs in the review.
    Args:
        review(string) - the string of the review
    Returns:
        None
    """
    global layer_0
    # clear out previous state by resetting the layer to be all 0s
    layer_0 *= 0
    
    # count how many times each word is used in the given review and store the results in layer_0 
    for word in review.split(" "):
#         print(word)
        layer_0[0][word2index[word]] += 1

In [76]:
word2index['movie']

235601

In [92]:
positive_reviews[0]

'For a movie that gets no respect there sure are a lot of memorable quotes listed for this gem. Imagine a movie where Joe Piscopo is actually funny! Maureen Stapleton is a scene stealer. The Moroni character is an absolute scream. Watch for Alan "The Skipper" Hale jr. as a police Sgt.'

In [93]:
update_input_layer(positive_reviews[0])
[l for l in layer_0[0] if l > 1]
# layer_0[0]

[5.0, 3.0, 2.0, 2.0]

In [98]:
type(positive_reviews)

list

In [112]:
import random

positive_labels = []
for i in range(len(positive_reviews)):
    positive_labels.append('POSITIVE')
negative_labels = []
for i in range(len(negative_reviews)):
    negative_labels.append('NEGATIVE')
reviews = positive_reviews + negative_reviews
labels = positive_labels + negative_labels
reviews_and_labels = list(zip(reviews, labels))
random.shuffle(reviews_and_labels)
reviews, labels = zip(*reviews_and_labels)

In [117]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    ## added min_count and polarity_cutoff parameters
    def __init__(self, reviews, labels, min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            min_count(int) - Words should only be added to the vocabulary 
                             if they occur more than this many times
            polarity_cutoff(float) - The absolute value of a word's positive-to-negative
                                     ratio must be at least this big to be considered.
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        ## added min_count and polarity_cutoff arguments to pre_process_data call
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    ## added min_count and polarity_cutoff parameters
    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        
        ## Calculate positive-to-negative ratios for words before building vocabulary
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i].split(" "):
                    positive_counts[word] += 1
                    total_counts[word] += 1
            else:
                for word in reviews[i].split(" "):
                    negative_counts[word] += 1
                    total_counts[word] += 1

        pos_neg_ratios = Counter()

        for term, count in list(total_counts.most_common()):
            if(count >= 50):
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio

        for word, ratio in pos_neg_ratios.most_common():
            if(ratio > 1):
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))

        # populate review_vocab with all of the words in the given reviews
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                ## only add words that occur at least min_count times
                #                     and for words with pos/neg ratios, only add words
                #                     that meet the polarity_cutoff
                if(total_counts[word] > min_count):
                    if(word in pos_neg_ratios.keys()):
                        if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        # populate label_vocab with all of the words in the given labels.
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        ## Removed self.layer_0; added self.layer_1
        # The input layer, a two-dimensional matrix with shape 1 x hidden_nodes
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    ## Removed update_input_layer function
    
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    ## changed name of first parameter form 'training_reviews' 
    #                     to 'training_reviews_raw'
    def train(self, training_reviews_raw, training_labels):

        ## pre-process training reviews so we can deal 
        #                     directly with the indices of non-zero inputs
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))

        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            ## Removed call to 'update_input_layer' function
            #                     because 'layer_0' is no longer used

            # Hidden layer
            ## Add in only the weights for non-zero items
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            ## changed to use 'self.layer_1' instead of 'local layer_1'
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            
            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            ## changed to use 'self.layer_1' instead of local 'layer_1'
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            ## Only update the weights that were used in the forward pass
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        ## Removed call to update_input_layer function
        #                     because layer_0 is no longer used

        # Hidden layer
        ## Identify the indices used in the review and then add
        #                     just those weights to layer_1 
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        ## changed to use self.layer_1 instead of local layer_1
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
         
        # Return POSITIVE for values above greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

Train the network with a small polarity cutoff.

In [126]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):868.9 #Correct:1890 #Trained:2501 Training Accuracy:75.5%
Progress:20.8% Speed(reviews/sec):844.5 #Correct:3942 #Trained:5001 Training Accuracy:78.8%
Progress:31.2% Speed(reviews/sec):822.5 #Correct:5990 #Trained:7501 Training Accuracy:79.8%
Progress:41.6% Speed(reviews/sec):822.6 #Correct:8085 #Trained:10001 Training Accuracy:80.8%
Progress:52.0% Speed(reviews/sec):816.6 #Correct:10225 #Trained:12501 Training Accuracy:81.7%
Progress:62.5% Speed(reviews/sec):817.2 #Correct:12348 #Trained:15001 Training Accuracy:82.3%
Progress:72.9% Speed(reviews/sec):817.3 #Correct:14491 #Trained:17501 Training Accuracy:82.8%
Progress:83.3% Speed(reviews/sec):815.8 #Correct:16628 #Trained:20001 Training Accuracy:83.1%
Progress:93.7% Speed(reviews/sec):811.6 #Correct:18777 #Trained:22501 Training Accuracy:83.4%
Progress:99.9% Speed(reviews/sec):814.4 #Correct:20073 #Trained:24000 Training

Run the following to test it's performance.

In [127]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):248.8 #Correct:2 #Tested:2 Testing Accuracy:100.%Progress:0.2% Speed(reviews/sec):377.2 #Correct:3 #Tested:3 Testing Accuracy:100.%Progress:0.3% Speed(reviews/sec):484.4 #Correct:4 #Tested:4 Testing Accuracy:100.%Progress:0.4% Speed(reviews/sec):539.2 #Correct:5 #Tested:5 Testing Accuracy:100.%Progress:0.5% Speed(reviews/sec):592.0 #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):657.6 #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):667.2 #Correct:5 #Tested:8 Testing Accuracy:62.5%Progress:0.8% Speed(reviews/sec):719.8 #Correct:6 #Tested:9 Testing Accuracy:66.6%Progress:0.9% Speed(reviews/sec):768.9 #Correct:6 #Tested:10 Testing Accuracy:60.0%Progress:1.0% Speed(reviews/sec):812.6 #Correct:7 #Tested:11 Testing Accuracy:63.6%Progress:1.1% Speed(reviews/sec):855.7 #Correct:8 #Tested:12 Testing Accuracy:66.6%Pr

Progress:20.6% Speed(reviews/sec):1038. #Correct:178 #Tested:207 Testing Accuracy:85.9%Progress:20.7% Speed(reviews/sec):1041. #Correct:179 #Tested:208 Testing Accuracy:86.0%Progress:20.8% Speed(reviews/sec):1043. #Correct:180 #Tested:209 Testing Accuracy:86.1%Progress:20.9% Speed(reviews/sec):1045. #Correct:181 #Tested:210 Testing Accuracy:86.1%Progress:21.0% Speed(reviews/sec):1044. #Correct:182 #Tested:211 Testing Accuracy:86.2%Progress:21.1% Speed(reviews/sec):1045. #Correct:183 #Tested:212 Testing Accuracy:86.3%Progress:21.2% Speed(reviews/sec):1046. #Correct:184 #Tested:213 Testing Accuracy:86.3%Progress:21.3% Speed(reviews/sec):1048. #Correct:185 #Tested:214 Testing Accuracy:86.4%Progress:21.4% Speed(reviews/sec):1049. #Correct:186 #Tested:215 Testing Accuracy:86.5%Progress:21.5% Speed(reviews/sec):1052. #Correct:186 #Tested:216 Testing Accuracy:86.1%Progress:21.6% Speed(reviews/sec):1055. #Correct:187 #Tested:217 Testing Accuracy:86.1%Progress:21.7% Speed(reviews/se

Progress:38.6% Speed(reviews/sec):1074. #Correct:329 #Tested:387 Testing Accuracy:85.0%Progress:38.7% Speed(reviews/sec):1075. #Correct:330 #Tested:388 Testing Accuracy:85.0%Progress:38.8% Speed(reviews/sec):1076. #Correct:331 #Tested:389 Testing Accuracy:85.0%Progress:38.9% Speed(reviews/sec):1078. #Correct:331 #Tested:390 Testing Accuracy:84.8%Progress:39.0% Speed(reviews/sec):1076. #Correct:332 #Tested:391 Testing Accuracy:84.9%Progress:39.1% Speed(reviews/sec):1076. #Correct:333 #Tested:392 Testing Accuracy:84.9%Progress:39.2% Speed(reviews/sec):1077. #Correct:334 #Tested:393 Testing Accuracy:84.9%Progress:39.3% Speed(reviews/sec):1075. #Correct:335 #Tested:394 Testing Accuracy:85.0%Progress:39.4% Speed(reviews/sec):1075. #Correct:336 #Tested:395 Testing Accuracy:85.0%Progress:39.5% Speed(reviews/sec):1076. #Correct:337 #Tested:396 Testing Accuracy:85.1%Progress:39.6% Speed(reviews/sec):1078. #Correct:338 #Tested:397 Testing Accuracy:85.1%Progress:39.7% Speed(reviews/se

Progress:64.6% Speed(reviews/sec):1165. #Correct:553 #Tested:647 Testing Accuracy:85.4%Progress:64.7% Speed(reviews/sec):1164. #Correct:554 #Tested:648 Testing Accuracy:85.4%Progress:64.8% Speed(reviews/sec):1165. #Correct:555 #Tested:649 Testing Accuracy:85.5%Progress:64.9% Speed(reviews/sec):1165. #Correct:555 #Tested:650 Testing Accuracy:85.3%Progress:65.0% Speed(reviews/sec):1166. #Correct:556 #Tested:651 Testing Accuracy:85.4%Progress:65.1% Speed(reviews/sec):1166. #Correct:556 #Tested:652 Testing Accuracy:85.2%Progress:65.2% Speed(reviews/sec):1165. #Correct:557 #Tested:653 Testing Accuracy:85.2%Progress:65.3% Speed(reviews/sec):1166. #Correct:558 #Tested:654 Testing Accuracy:85.3%Progress:65.4% Speed(reviews/sec):1167. #Correct:559 #Tested:655 Testing Accuracy:85.3%Progress:65.5% Speed(reviews/sec):1167. #Correct:560 #Tested:656 Testing Accuracy:85.3%Progress:65.6% Speed(reviews/sec):1166. #Correct:561 #Tested:657 Testing Accuracy:85.3%Progress:65.7% Speed(reviews/se

Train the network with a much larger polarity cutoff.

In [118]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):844.5 #Correct:1890 #Trained:2501 Training Accuracy:75.5%
Progress:20.8% Speed(reviews/sec):827.1 #Correct:3942 #Trained:5001 Training Accuracy:78.8%
Progress:31.2% Speed(reviews/sec):821.2 #Correct:5990 #Trained:7501 Training Accuracy:79.8%
Progress:41.6% Speed(reviews/sec):819.7 #Correct:8085 #Trained:10001 Training Accuracy:80.8%
Progress:52.0% Speed(reviews/sec):814.4 #Correct:10225 #Trained:12501 Training Accuracy:81.7%
Progress:62.5% Speed(reviews/sec):817.3 #Correct:12348 #Trained:15001 Training Accuracy:82.3%
Progress:72.9% Speed(reviews/sec):806.8 #Correct:14491 #Trained:17501 Training Accuracy:82.8%
Progress:83.3% Speed(reviews/sec):807.8 #Correct:16628 #Trained:20001 Training Accuracy:83.1%
Progress:93.7% Speed(reviews/sec):806.7 #Correct:18777 #Trained:22501 Training Accuracy:83.4%
Progress:99.9% Speed(reviews/sec):809.3 #Correct:20073 #Trained:24000 Training

Run the following to test it's performance.

In [121]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):18.91 #Correct:2 #Tested:2 Testing Accuracy:100.%Progress:0.2% Speed(reviews/sec):37.36 #Correct:3 #Tested:3 Testing Accuracy:100.%Progress:0.3% Speed(reviews/sec):55.33 #Correct:4 #Tested:4 Testing Accuracy:100.%Progress:0.4% Speed(reviews/sec):71.09 #Correct:5 #Tested:5 Testing Accuracy:100.%Progress:0.5% Speed(reviews/sec):79.26 #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):93.76 #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):107.0 #Correct:5 #Tested:8 Testing Accuracy:62.5%Progress:0.8% Speed(reviews/sec):121.1 #Correct:6 #Tested:9 Testing Accuracy:66.6%Progress:0.9% Speed(reviews/sec):134.9 #Correct:6 #Tested:10 Testing Accuracy:60.0%Progress:1.0% Speed(reviews/sec):148.7 #Correct:7 #Tested:11 Testing Accuracy:63.6%Progress:1.1% Speed(reviews/sec):162.3 #Correct:8 #Tested:12 Testing Accuracy:66.6%Pr

Progress:13.0% Speed(reviews/sec):864.2 #Correct:111 #Tested:131 Testing Accuracy:84.7%Progress:13.1% Speed(reviews/sec):866.9 #Correct:112 #Tested:132 Testing Accuracy:84.8%Progress:13.2% Speed(reviews/sec):870.2 #Correct:113 #Tested:133 Testing Accuracy:84.9%Progress:13.3% Speed(reviews/sec):873.7 #Correct:113 #Tested:134 Testing Accuracy:84.3%Progress:13.4% Speed(reviews/sec):876.5 #Correct:114 #Tested:135 Testing Accuracy:84.4%Progress:13.5% Speed(reviews/sec):879.4 #Correct:114 #Tested:136 Testing Accuracy:83.8%Progress:13.6% Speed(reviews/sec):883.4 #Correct:115 #Tested:137 Testing Accuracy:83.9%Progress:13.7% Speed(reviews/sec):884.4 #Correct:116 #Tested:138 Testing Accuracy:84.0%Progress:13.8% Speed(reviews/sec):887.1 #Correct:117 #Tested:139 Testing Accuracy:84.1%Progress:13.9% Speed(reviews/sec):890.5 #Correct:118 #Tested:140 Testing Accuracy:84.2%Progress:14.0% Speed(reviews/sec):893.4 #Correct:119 #Tested:141 Testing Accuracy:84.3%Progress:14.1% Speed(reviews/se

Progress:42.7% Speed(reviews/sec):1135. #Correct:364 #Tested:428 Testing Accuracy:85.0%Progress:42.8% Speed(reviews/sec):1136. #Correct:365 #Tested:429 Testing Accuracy:85.0%Progress:42.9% Speed(reviews/sec):1136. #Correct:365 #Tested:430 Testing Accuracy:84.8%Progress:43.0% Speed(reviews/sec):1137. #Correct:366 #Tested:431 Testing Accuracy:84.9%Progress:43.1% Speed(reviews/sec):1138. #Correct:367 #Tested:432 Testing Accuracy:84.9%Progress:43.2% Speed(reviews/sec):1138. #Correct:367 #Tested:433 Testing Accuracy:84.7%Progress:43.3% Speed(reviews/sec):1138. #Correct:368 #Tested:434 Testing Accuracy:84.7%Progress:43.4% Speed(reviews/sec):1139. #Correct:369 #Tested:435 Testing Accuracy:84.8%Progress:43.5% Speed(reviews/sec):1140. #Correct:370 #Tested:436 Testing Accuracy:84.8%Progress:43.6% Speed(reviews/sec):1141. #Correct:371 #Tested:437 Testing Accuracy:84.8%Progress:43.7% Speed(reviews/sec):1143. #Correct:371 #Tested:438 Testing Accuracy:84.7%Progress:43.8% Speed(reviews/se

Execute the full run below:

In [122]:
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)
mlp_full.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):631.7 #Correct:1853 #Trained:2501 Training Accuracy:74.0%
Progress:20.8% Speed(reviews/sec):609.1 #Correct:3889 #Trained:5001 Training Accuracy:77.7%
Progress:31.2% Speed(reviews/sec):605.3 #Correct:5929 #Trained:7501 Training Accuracy:79.0%
Progress:41.6% Speed(reviews/sec):612.9 #Correct:8017 #Trained:10001 Training Accuracy:80.1%
Progress:52.0% Speed(reviews/sec):612.4 #Correct:10173 #Trained:12501 Training Accuracy:81.3%
Progress:62.5% Speed(reviews/sec):615.8 #Correct:12297 #Trained:15001 Training Accuracy:81.9%
Progress:72.9% Speed(reviews/sec):615.6 #Correct:14428 #Trained:17501 Training Accuracy:82.4%
Progress:83.3% Speed(reviews/sec):616.7 #Correct:16557 #Trained:20001 Training Accuracy:82.7%
Progress:93.7% Speed(reviews/sec):614.7 #Correct:18695 #Trained:22501 Training Accuracy:83.0%
Progress:99.9% Speed(reviews/sec):615.4 #Correct:19983 #Trained:24000 Training

In [123]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Tested:1 Testing Accuracy:100.%Progress:0.1% Speed(reviews/sec):241.9 #Correct:2 #Tested:2 Testing Accuracy:100.%Progress:0.2% Speed(reviews/sec):379.4 #Correct:3 #Tested:3 Testing Accuracy:100.%Progress:0.3% Speed(reviews/sec):480.6 #Correct:4 #Tested:4 Testing Accuracy:100.%Progress:0.4% Speed(reviews/sec):503.2 #Correct:5 #Tested:5 Testing Accuracy:100.%Progress:0.5% Speed(reviews/sec):578.8 #Correct:5 #Tested:6 Testing Accuracy:83.3%Progress:0.6% Speed(reviews/sec):654.3 #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):673.8 #Correct:5 #Tested:8 Testing Accuracy:62.5%Progress:0.8% Speed(reviews/sec):735.5 #Correct:6 #Tested:9 Testing Accuracy:66.6%Progress:0.9% Speed(reviews/sec):784.1 #Correct:6 #Tested:10 Testing Accuracy:60.0%Progress:1.0% Speed(reviews/sec):837.3 #Correct:7 #Tested:11 Testing Accuracy:63.6%Progress:1.1% Speed(reviews/sec):868.8 #Correct:8 #Tested:12 Testing Accuracy:66.6%Pr

Progress:14.8% Speed(reviews/sec):1225. #Correct:126 #Tested:149 Testing Accuracy:84.5%Progress:14.9% Speed(reviews/sec):1228. #Correct:127 #Tested:150 Testing Accuracy:84.6%Progress:15.0% Speed(reviews/sec):1230. #Correct:128 #Tested:151 Testing Accuracy:84.7%Progress:15.1% Speed(reviews/sec):1219. #Correct:129 #Tested:152 Testing Accuracy:84.8%Progress:15.2% Speed(reviews/sec):1219. #Correct:130 #Tested:153 Testing Accuracy:84.9%Progress:15.3% Speed(reviews/sec):1223. #Correct:131 #Tested:154 Testing Accuracy:85.0%Progress:15.4% Speed(reviews/sec):1225. #Correct:132 #Tested:155 Testing Accuracy:85.1%Progress:15.5% Speed(reviews/sec):1224. #Correct:133 #Tested:156 Testing Accuracy:85.2%Progress:15.6% Speed(reviews/sec):1224. #Correct:134 #Tested:157 Testing Accuracy:85.3%Progress:15.7% Speed(reviews/sec):1215. #Correct:135 #Tested:158 Testing Accuracy:85.4%Progress:15.8% Speed(reviews/sec):1207. #Correct:136 #Tested:159 Testing Accuracy:85.5%Progress:15.9% Speed(reviews/se

Progress:40.1% Speed(reviews/sec):1212. #Correct:343 #Tested:402 Testing Accuracy:85.3%Progress:40.2% Speed(reviews/sec):1213. #Correct:344 #Tested:403 Testing Accuracy:85.3%Progress:40.3% Speed(reviews/sec):1212. #Correct:345 #Tested:404 Testing Accuracy:85.3%Progress:40.4% Speed(reviews/sec):1212. #Correct:346 #Tested:405 Testing Accuracy:85.4%Progress:40.5% Speed(reviews/sec):1212. #Correct:347 #Tested:406 Testing Accuracy:85.4%Progress:40.6% Speed(reviews/sec):1213. #Correct:347 #Tested:407 Testing Accuracy:85.2%Progress:40.7% Speed(reviews/sec):1214. #Correct:348 #Tested:408 Testing Accuracy:85.2%Progress:40.8% Speed(reviews/sec):1214. #Correct:349 #Tested:409 Testing Accuracy:85.3%Progress:40.9% Speed(reviews/sec):1215. #Correct:350 #Tested:410 Testing Accuracy:85.3%Progress:41.0% Speed(reviews/sec):1215. #Correct:351 #Tested:411 Testing Accuracy:85.4%Progress:41.1% Speed(reviews/sec):1216. #Correct:351 #Tested:412 Testing Accuracy:85.1%Progress:41.2% Speed(reviews/se

In [124]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]],mlp_full.weights_0_1[mlp_full.word2index[focus]])
    
    return most_similar.most_common()

In [125]:
get_most_similar_words("excellent")

[('excellent', 0.07806875299825171),
 ('perfect', 0.06920943806525413),
 ('liked', 0.06430955082767051),
 ('7', 0.061700541512139356),
 ('best', 0.06005744083966053),
 ('amazing', 0.05539974365120676),
 ('enjoyed', 0.054996720805947315),
 ('great', 0.05413447089981331),
 ('wonderful', 0.053781652558586156),
 ('favorite', 0.05299381325059772),
 ('rare', 0.051734286518433925),
 ('job', 0.05142753250055513),
 ('highly', 0.05119084607716673),
 ('definitely', 0.0506399332015941),
 ('8', 0.05005587274714504),
 ('simple', 0.049601438473931676),
 ('easy', 0.048797004065652634),
 ('loved', 0.04821338139896825),
 ('Great', 0.0478888143799176),
 ('love', 0.0478775484870267),
 ('beautiful', 0.04771718735864792),
 ('surprised', 0.04754633151740958),
 ('still', 0.046988937272664254),
 ('strong', 0.046656400482027786),
 ('fantastic', 0.04637645677402626),
 ('great.', 0.04562233970027345),
 ('7/10', 0.04527789836589064),
 ('excellent.', 0.045033392061675726),
 ('right', 0.04457481245483679),
 ('superb

In [128]:
get_most_similar_words("terrible")

[('worst', 0.11549985898816102),
 ('waste', 0.09375479783798141),
 ('boring', 0.06328083589293755),
 ('awful', 0.06149496607432904),
 ('Unfortunately,', 0.05868401317116882),
 ('poorly', 0.057880508352663014),
 ('bad.', 0.05723910045720267),
 ('fails', 0.05686939173341478),
 ('dull', 0.05334604658326441),
 ('poor', 0.05232571053467909),
 ('awful.', 0.05227935886444152),
 ('worse', 0.049896966467670265),
 ('terrible', 0.049691771823763935),
 ('supposed', 0.04903906608124028),
 ('badly', 0.04869879769033016),
 ('annoying', 0.048641010708424456),
 ('lame', 0.04801808057931778),
 ('disappointing', 0.04655672206814589),
 ('stupid', 0.045611073732107196),
 ('bad', 0.044615857693773575),
 ('save', 0.04270327963792701),
 ('lacks', 0.0416255945051155),
 ('nothing', 0.041490994289632696),
 ('cheap', 0.03931590412010229),
 ('money', 0.03890558419812111),
 ('bad,', 0.03748477331453864),
 ('4/10', 0.03674162272166206),
 ('boring.', 0.03628516433591889),
 ('ridiculous', 0.035549859902960494),
 ('hor

In [129]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [130]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)

In [133]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")

In [134]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [135]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words