## Exloring the movie review data
In separate sheet to keep sheets running the models from being too cluttered

In [1]:
import numpy as np
import os
import os.path
import glob
import time

from nltk.tokenize import word_tokenize
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
nltk.download('punkt')
import nltk

[nltk_data] Downloading package punkt to /home/jeremie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
import os
import wget
import tarfile

# By checking if the directory exists first, we allow people to delete the tarfile without the notebook re-downloading it
if os.path.isdir('aclImdb'):
    print("Dataset directory exists, taking no action")
else:    
    if not os.path.isfile('aclImdb_v1.tar.gz'):
        print("Downloading dataset")
        #!wget http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz
        wget.download('http://ai.stanford.edu/~amaas/data/sentiment/aclImdb_v1.tar.gz')
    else:
        print("Dataset already downloaded")
    
    print("Unpacking dataset")
    #!tar -xf aclImdb_v1.tar.gz 
    tar = tarfile.open("aclImdb_v1.tar.gz")
    tar.extractall()
    tar.close()
    print("Dataset unpacked in aclImdb")

Dataset directory exists, taking no action


In [3]:
# configuration
SAMPLE_SIZE=1000


In [4]:
time_beginning_of_notebook = time.time()
positive_file_list = glob.glob(os.path.join('aclImdb/train/pos', "*.txt"))
positive_sample_file_list = positive_file_list[:SAMPLE_SIZE]

negative_file_list = glob.glob(os.path.join('aclImdb/train/neg', "*.txt"))
negative_sample_file_list = negative_file_list[:SAMPLE_SIZE]

import re

# load doc into memory
# regex to clean markup elements 
def load_doc(filename):
    # open the file as read only
    file = open(filename, 'r', encoding='utf8')
    # read all text
    text = re.sub('<[^>]*>', ' ', file.read())
    #text = file.read()
    # close the file
    file.close()
    return text

In [5]:
positive_strings = [load_doc(x) for x in positive_sample_file_list]
negative_strings = [load_doc(x) for x in negative_sample_file_list]

positive_tokenized = [word_tokenize(s) for s in positive_strings]
negative_tokenized = [word_tokenize(s) for s in negative_strings]

In [6]:
def pretty_print_positive_and_negative(i):
    print(positive_strings[i][:30] + "\t:\t" + negative_strings[i][:30] + "...")

In [7]:
print(len(positive_strings) + len(negative_strings))
print('\n Positive reviews \n ', positive_strings[0][:50])
print('\n Negative reviews \n ', negative_strings[0][:50])

2000

 Positive reviews 
  I thought I should qualify my position after readi

 Negative reviews 
  I feel totally ripped off. Someone needs to refund


In [8]:
print("positive reviews \t : \t negative reviews\n")
pretty_print_positive_and_negative(0)
pretty_print_positive_and_negative(250)
pretty_print_positive_and_negative(500)
pretty_print_positive_and_negative(750)
pretty_print_positive_and_negative(999)

positive reviews 	 : 	 negative reviews

I thought I should qualify my 	:	I feel totally ripped off. Som...
In the same vein as Natural Bo	:	The makers of this film have c...
Airwolf The Movie, A variation	:	**SPOILERS** I rented "Tesis" ...
Revenge is the theme of this D	:	this, is NOT one of those film...
"A Girl's Folly" is a sort of 	:	The premise was intriguing, bu...


In [9]:
from collections import Counter
import numpy as np

In [10]:
# Create three Counter objects to store positive, negative and total counts
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()

In [11]:
# Loop over all the words in all the reviews and increment the counts in the appropriate counter objects
for i in range(len(positive_strings)):
    for word in positive_strings[i].split(" "):
        positive_counts[word] += 1
        total_counts[word] += 1
for i in range(len(negative_strings)):
    for word in negative_strings[i].split(" "):
        negative_counts[word] += 1
        total_counts[word] += 1


In [12]:
positive_counts.most_common()[:100]

[('the', 11793),
 ('and', 6690),
 ('a', 6455),
 ('of', 6028),
 ('to', 5149),
 ('is', 4456),
 ('in', 3690),
 ('I', 2697),
 ('that', 2463),
 ('it', 2160),
 ('', 2142),
 ('this', 2062),
 ('as', 1843),
 ('with', 1814),
 ('for', 1718),
 ('was', 1682),
 ('The', 1653),
 ('his', 1301),
 ('but', 1300),
 ('film', 1219),
 ('are', 1210),
 ('on', 1172),
 ('movie', 1076),
 ('not', 1034),
 ('you', 1030),
 ('be', 955),
 ('have', 932),
 ('he', 902),
 ('by', 893),
 ('an', 868),
 ('one', 835),
 ('from', 813),
 ('at', 813),
 ('who', 812),
 ('has', 747),
 ('all', 732),
 ('her', 730),
 ('like', 667),
 ('about', 629),
 ('very', 605),
 ('This', 600),
 ('they', 587),
 ('so', 576),
 ('more', 553),
 ('good', 532),
 ('out', 530),
 ('or', 526),
 ('some', 524),
 ('just', 517),
 ('their', 493),
 ('It', 481),
 ('what', 480),
 ('which', 439),
 ("it's", 432),
 ('will', 425),
 ('when', 416),
 ('great', 415),
 ('can', 413),
 ('see', 411),
 ('up', 410),
 ('she', 408),
 ('really', 405),
 ('would', 400),
 ('than', 394),
 ('

In [13]:
negative_counts.most_common()[:100]

[('the', 10631),
 ('a', 5979),
 ('and', 5438),
 ('to', 5399),
 ('of', 5186),
 ('is', 3823),
 ('in', 3128),
 ('I', 3047),
 ('that', 2631),
 ('this', 2514),
 ('', 2427),
 ('it', 2270),
 ('was', 2109),
 ('The', 1660),
 ('for', 1624),
 ('with', 1517),
 ('as', 1476),
 ('movie', 1438),
 ('but', 1369),
 ('on', 1276),
 ('have', 1168),
 ('be', 1095),
 ('are', 1081),
 ('not', 1057),
 ('film', 979),
 ('you', 944),
 ('at', 921),
 ('his', 898),
 ('an', 826),
 ('one', 822),
 ('by', 807),
 ('just', 806),
 ('like', 795),
 ('they', 780),
 ('he', 754),
 ('or', 746),
 ('from', 711),
 ('all', 696),
 ('so', 679),
 ('who', 674),
 ('about', 660),
 ('out', 621),
 ('some', 597),
 ('has', 591),
 ('This', 577),
 ('would', 546),
 ('even', 528),
 ('only', 517),
 ('her', 508),
 ('more', 506),
 ('no', 505),
 ('if', 501),
 ('had', 472),
 ('It', 471),
 ('were', 466),
 ('when', 458),
 ('what', 456),
 ('really', 448),
 ('up', 445),
 ('there', 444),
 ('very', 443),
 ("it's", 425),
 ('can', 424),
 ('than', 421),
 ('good',

In [14]:
print(len(positive_counts.items()))
print(len(negative_counts.items()))
print(len(total_counts.items()))
print(len(positive_counts.most_common()))
print(len(negative_counts.most_common()))
print(len(total_counts.most_common()))

34325
32292
53557
34325
32292
53557


In [15]:
pos_neg_ratios = Counter()

# Calculate the ratios of positive and negative uses of the most common words
# Consider words to be "common" if they've been used at least 100 times
for term, count in list(total_counts.most_common()):
    if(count > 100):
        pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
        pos_neg_ratios[term] = pos_neg_ratio

In [16]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 1.1091986455981941
Pos-to-neg ratio for 'amazing' = 0
Pos-to-neg ratio for 'terrible' = 0


In [17]:
# Convert ratios to logs
for word in pos_neg_ratios:
    pos_neg_ratios[word] = np.log(pos_neg_ratios[word])

In [18]:
print("Pos-to-neg ratio for 'the' = {}".format(pos_neg_ratios["the"]))
print("Pos-to-neg ratio for 'amazing' = {}".format(pos_neg_ratios["amazing"]))
print("Pos-to-neg ratio for 'terrible' = {}".format(pos_neg_ratios["terrible"]))

Pos-to-neg ratio for 'the' = 0.10363781369688876
Pos-to-neg ratio for 'amazing' = 0
Pos-to-neg ratio for 'terrible' = 0


In [19]:
# words most frequently seen in a review with a "POSITIVE" label
pos_neg_ratios.most_common()[:100]

[('!', 2.363645349755292),
 ('excellent', 1.4469189829363254),
 ('beautiful', 1.3862943611198906),
 ('great', 1.1010248350734937),
 ('definitely', 1.0271533246859648),
 ('His', 0.9985288301111273),
 ('liked', 0.9808292530117262),
 ('enjoy', 0.9227216222044455),
 ('world', 0.9118363815247748),
 ('best', 0.8747310021673603),
 ('well', 0.7689660171172966),
 ('own', 0.7381708619339005),
 ('young', 0.7359671777428735),
 ('role', 0.6931471805599453),
 ('gives', 0.6931471805599453),
 ('job', 0.6811709895132296),
 ('life', 0.6714071939235394),
 ('love', 0.666478933477784),
 ('true', 0.6384887680220812),
 ('John', 0.620387826277517),
 ('plays', 0.6176396280518002),
 ('still', 0.6115485366770659),
 ('different', 0.6074917359814515),
 ('She', 0.6017974019717174),
 ('small', 0.581921545449721),
 ('especially', 0.5717863235556778),
 ('year', 0.5658077581833438),
 ('each', 0.5280674302004967),
 ('New', 0.5212969236332861),
 ('performance', 0.5012561727498399),
 ('Hollywood', 0.49643688631389105),
 (

In [20]:
vocab = set(total_counts.keys())

In [21]:
vocab_size = len(vocab)
print(vocab_size)

53557


In [22]:
import random

positive_labels = []
for i in range(len(positive_tokenized)):
    positive_labels.append('POSITIVE')
negative_labels = []
for i in range(len(negative_tokenized)):
    negative_labels.append('NEGATIVE')
    
reviews = positive_tokenized + negative_tokenized
labels = positive_labels + negative_labels
reviews_and_labels = list(zip(reviews, labels))
random.shuffle(reviews_and_labels)
reviews, labels = zip(*reviews_and_labels)

In [23]:
import time
import sys
import numpy as np

# Encapsulate our neural network in a class
class SentimentNetwork:
    ## added min_count and polarity_cutoff parameters
    def __init__(self, reviews, labels, min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
        """Create a SentimenNetwork with the given settings
        Args:
            reviews(list) - List of reviews used for training
            labels(list) - List of POSITIVE/NEGATIVE labels associated with the given reviews
            min_count(int) - Words should only be added to the vocabulary 
                             if they occur more than this many times
            polarity_cutoff(float) - The absolute value of a word's positive-to-negative
                                     ratio must be at least this big to be considered.
            hidden_nodes(int) - Number of nodes to create in the hidden layer
            learning_rate(float) - Learning rate to use while training
        
        """
        # Assign a seed to our random number generator to ensure we get
        # reproducable results during development 
        np.random.seed(1)

        # process the reviews and their associated labels so that everything
        # is ready for training
        ## added min_count and polarity_cutoff arguments to pre_process_data call
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
        
        # Build the network to have the number of hidden nodes and the learning rate that
        # were passed into this initializer. Make the same number of input nodes as
        # there are vocabulary words and create a single output node.
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)

    ## added min_count and polarity_cutoff parameters
    def pre_process_data(self, reviews, labels, polarity_cutoff, min_count):
        
        ## Calculate positive-to-negative ratios for words before building vocabulary
        positive_counts = Counter()
        negative_counts = Counter()
        total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i]:
                    positive_counts[word] += 1
                    total_counts[word] += 1
            else:
                for word in reviews[i]:
                    negative_counts[word] += 1
                    total_counts[word] += 1

        pos_neg_ratios = Counter()

        for term, count in list(total_counts.most_common()):
            if(count >= 50):
                pos_neg_ratio = positive_counts[term] / float(negative_counts[term]+1)
                pos_neg_ratios[term] = pos_neg_ratio

        for word, ratio in pos_neg_ratios.most_common():
            if(ratio > 1):
                pos_neg_ratios[word] = np.log(ratio)
            else:
                pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))

        # populate review_vocab with all of the words in the given reviews
        review_vocab = set()
        for review in reviews:
            for word in review:
                ## only add words that occur at least min_count times
                #                     and for words with pos/neg ratios, only add words
                #                     that meet the polarity_cutoff
                if(total_counts[word] > min_count):
                    if(word in pos_neg_ratios.keys()):
                        if((pos_neg_ratios[word] >= polarity_cutoff) or (pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)

        # Convert the vocabulary set to a list so we can access words via indices
        self.review_vocab = list(review_vocab)
        
        # populate label_vocab with all of the words in the given labels.
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        # Convert the label vocabulary set to a list so we can access labels via indices
        self.label_vocab = list(label_vocab)
        
        # Store the sizes of the review and label vocabularies.
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        # TODO This is already done earlier - can remove duplication
        # Create a dictionary of words in the vocabulary mapped to index positions
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        # Create a dictionary of labels mapped to index positions
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i

    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Store the learning rate
        self.learning_rate = learning_rate

        # Initialize weights

        # These are the weights between the input layer and the hidden layer.
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))

        # These are the weights between the hidden layer and the output layer.
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        ## Removed self.layer_0; added self.layer_1
        # The input layer, a two-dimensional matrix with shape 1 x hidden_nodes
        self.layer_1 = np.zeros((1,hidden_nodes))
    
    ## Removed update_input_layer function
    
    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    ## changed name of first parameter form 'training_reviews' 
    #                     to 'training_reviews_raw'
    def train(self, training_reviews_raw, training_labels):

        ## pre-process training reviews so we can deal 
        #                     directly with the indices of non-zero inputs
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review:
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))

        # make sure out we have a matching number of reviews and labels
        assert(len(training_reviews) == len(training_labels))
        
        # Keep track of correct predictions to display accuracy during training 
        correct_so_far = 0

        # Remember when we started for printing time statistics
        start = time.time()
        
        # loop through all the given reviews and run a forward and backward pass,
        # updating weights for every item
        for i in range(len(training_reviews)):
            
            # Get the next review and its correct label
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            ## Removed call to 'update_input_layer' function
            #                     because 'layer_0' is no longer used

            # Hidden layer
            ## Add in only the weights for non-zero items
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]

            # Output layer
            ## changed to use 'self.layer_1' instead of 'local layer_1'
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))            
            
            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            ## changed to use 'self.layer_1' instead of local 'layer_1'
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            ## Only update the weights that were used in the forward pass
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            # Keep track of correct predictions.
            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            elif(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the training process. 
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
    
    def test(self, testing_reviews, testing_labels):
        """
        Attempts to predict the labels for the given testing_reviews,
        and uses the test_labels to calculate the accuracy of those predictions.
        """
        
        # keep track of how many correct predictions we make
        correct = 0

        # we'll time how many predictions per second we make
        start = time.time()

        # Loop through each of the given reviews and call run to predict
        # its label. 
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
            
            # For debug purposes, print out our prediction accuracy and speed 
            # throughout the prediction process. 

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        """
        Returns a POSITIVE or NEGATIVE prediction for the given review.
        """
        # Run a forward pass through the network, like in the "train" function.
        
        ## Removed call to update_input_layer function
        #                     because layer_0 is no longer used

        # Hidden layer
        ## Identify the indices used in the review and then add
        #                     just those weights to layer_1 
        self.layer_1 *= 0
        unique_indices = set()
        for word in review:
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        ## changed to use self.layer_1 instead of local layer_1
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
         
        # Return POSITIVE for values above greater-than-or-equal-to 0.5 in the output layer;
        # return NEGATIVE for other values
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"

In [24]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.05,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:99.9% Speed(reviews/sec):2156. #Correct:733 #Trained:1000 Training Accuracy:73.3%

In [25]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:99.9% Speed(reviews/sec):2665. #Correct:771 #Tested:1000 Testing Accuracy:77.1%

In [26]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:99.9% Speed(reviews/sec):3305. #Correct:782 #Trained:1000 Training Accuracy:78.2%

In [27]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:99.9% Speed(reviews/sec):4273. #Correct:782 #Tested:1000 Testing Accuracy:78.2%

In [28]:
mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)
mlp_full.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:99.9% Speed(reviews/sec):1042. #Correct:710 #Trained:1000 Training Accuracy:71.0%

In [29]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:99.9% Speed(reviews/sec):5146. #Correct:782 #Tested:1000 Testing Accuracy:78.2%

In [30]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp_full.word2index.keys():
        most_similar[word] = np.dot(mlp_full.weights_0_1[mlp_full.word2index[word]],mlp_full.weights_0_1[mlp_full.word2index[focus]])
    
    return most_similar.most_common()

In [31]:
get_most_similar_words("excellent")[:10]

[('great', 0.01794088658139491),
 ('love', 0.013920444609607658),
 ('beautiful', 0.011485460046508661),
 ('best', 0.009313694252545914),
 ('world', 0.009157208661895197),
 ('well', 0.008496713215832452),
 ('excellent', 0.008406002277286606),
 ('very', 0.008172055875934203),
 ('everyone', 0.007293758055630865),
 ('think', 0.007066441565353181)]

In [32]:
get_most_similar_words("terrible")[:10]

[('worst', 0.010754856418980665),
 ('bad', 0.008711755262888135),
 ('waste', 0.0071232062821647265),
 ('no', 0.006903202996132405),
 ('?', 0.006470757216700413),
 ('awful', 0.006342002606895748),
 ('nothing', 0.006189831081304518),
 ('poor', 0.005867964223638356),
 ('had', 0.005205513402707153),
 ('instead', 0.0051714503635667776)]

In [33]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [34]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in pos_neg_ratios.most_common(500):
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp_full.word2index.keys()):
        words_to_visualize.append(word)

In [35]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in pos_neg_ratios.keys():
        vectors_list.append(mlp_full.weights_0_1[mlp_full.word2index[word]])
        if(pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")

In [36]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [37]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize,
                                    color=colors_list))

p.scatter(x="x1", y="x2", size=8, source=source, fill_color="color")

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words