In [5]:
from collections import Counter
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()


def pretty_print_review_and_label(i):
    print(labels[i] + "\t:\t" + reviews[i][:80] + "...")

g = open('reviews.txt','r') # What we know!
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open('labels.txt','r') # What we WANT to know!
labels = list(map(lambda x:x[:-1].upper(),g.readlines()))
g.close()

In [2]:
print("labels.txt \t : \t reviews.txt\n")
pretty_print_review_and_label(2137)
pretty_print_review_and_label(12816)
pretty_print_review_and_label(6267)
pretty_print_review_and_label(21934)
pretty_print_review_and_label(5297)
pretty_print_review_and_label(4998)

labels.txt 	 : 	 reviews.txt

NEGATIVE	:	this movie is terrible but it has some good effects .  ...
POSITIVE	:	adrian pasdar is excellent is this film . he makes a fascinating woman .  ...
NEGATIVE	:	comment this movie is impossible . is terrible  very improbable  bad interpretat...
POSITIVE	:	excellent episode movie ala pulp fiction .  days   suicides . it doesnt get more...
NEGATIVE	:	if you haven  t seen this  it  s terrible . it is pure trash . i saw this about ...
POSITIVE	:	this schiffer guy is a real genius  the movie is of excellent quality and both e...


# Reducing Noise by Strategically Reducing the Vocabulary

In [9]:
import time
import sys
import numpy as np
from collections import Counter

# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
    def __init__(self, reviews,labels,min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
       
        np.random.seed(1)
    
        self.pre_process_data(reviews, polarity_cutoff, min_count)
        
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
        
    def pre_process_data(self,reviews, polarity_cutoff,min_count):
        
        self.positive_counts = Counter()
        self.negative_counts = Counter()
        self.total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i].split(" "):
                    self.positive_counts[word] += 1
                    self.total_counts[word] += 1
            else:
                for word in reviews[i].split(" "):
                    self.negative_counts[word] += 1
                    self.total_counts[word] += 1

        self.pos_neg_ratios = Counter()

        for term,cnt in list(self.total_counts.most_common()):
            if(cnt >= 50):
                pos_neg_ratio = self.positive_counts[term] / float(self.negative_counts[term]+1)
                self.pos_neg_ratios[term] = pos_neg_ratio

        for word,ratio in self.pos_neg_ratios.most_common():
            if(ratio > 1):
                self.pos_neg_ratios[word] = np.log(ratio)
            else:
                self.pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                if(self.total_counts[word] > min_count):
                    if(word in self.pos_neg_ratios.keys()):
                        if((self.pos_neg_ratios[word] >= polarity_cutoff) or (self.pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1,input_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            self.layer_0[0][self.word2index[word]] = 1

    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def train(self, training_reviews_raw, training_labels):
        
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer

            # Hidden layer
#             layer_1 = self.layer_0.dot(self.weights_0_1)
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]
            
            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            if(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            elapsed = float(time.time() - start)
            if elapsed == 0: continue
            reviews_per_second = i / elapsed
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
        
    
    def test(self, testing_reviews, testing_labels):
        
        correct = 0
        
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1
                elapsed = float(time.time() - start)
                if elapsed == 0:
                    continue
                reviews_per_second = i / elapsed
            
                sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        
        # Input Layer


        # Hidden layer
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"
        

In [10]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)

In [11]:

hist, edges = np.histogram(list(map(lambda x:x[1],mlp.pos_neg_ratios.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="Word Positive/Negative Affinity Distribution")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)


In [12]:

frequency_frequency = Counter()

for word, cnt in mlp.total_counts.most_common():
    frequency_frequency[cnt] += 1
hist, edges = np.histogram(list(map(lambda x:x[1],frequency_frequency.most_common())), density=True, bins=100, normed=True)

p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="The frequency distribution of the words in our corpus")
p.quad(top=hist, bottom=0, left=edges[:-1], right=edges[1:], line_color="#555555")
show(p)


In [30]:

mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.hidden_nodes = 1000
mlp.train(reviews[:-1000],labels[:-1000])

Progress:99.9% Speed(reviews/sec):5004. #Correct:20552 #Trained:24000 Training Accuracy:85.6%

In [31]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:1.1% Speed(reviews/sec):1099.% #Correct:10 #Tested:12 Testing Accuracy:83.3%Progress:1.3% Speed(reviews/sec):1299.% #Correct:11 #Tested:14 Testing Accuracy:78.5%Progress:1.5% Speed(reviews/sec):1499.% #Correct:12 #Tested:16 Testing Accuracy:75.0%Progress:1.6% Speed(reviews/sec):1599.% #Correct:13 #Tested:17 Testing Accuracy:76.4%Progress:1.7% Speed(reviews/sec):1699.% #Correct:14 #Tested:18 Testing Accuracy:77.7%Progress:1.8% Speed(reviews/sec):1799.% #Correct:15 #Tested:19 Testing Accuracy:78.9%Progress:1.9% Speed(reviews/sec):1899.% #Correct:16 #Tested:20 Testing Accuracy:80.0%Progress:2.0% Speed(reviews/sec):1999.% #Correct:17 #Tested:21 Testing Accuracy:80.9%Progress:2.1% Speed(reviews/sec):2099.% #Correct:18 #Tested:22 Testing Accuracy:81.8%Progress:2.2% Speed(reviews/sec):2199.% #Correct:19 #Tested:23 Testing Accuracy:82.6%Progress:2.3% Speed(reviews/sec):2299.% #Correct:20 #Tested:24 Testing Accuracy:83.3%Progress:2.4% Speed(reviews/sec):2399.% #Correct:21 #

# Analysis: What's Going on in the Weights?

In [20]:
#mlp_full = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=0,polarity_cutoff=0,learning_rate=0.01)

In [21]:
#mlp_full.train(reviews[:-1000],labels[:-1000])

In [57]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp.word2index.keys():
        most_similar[word] = np.dot(mlp.weights_0_1[mlp.word2index[word]],mlp.weights_0_1[mlp.word2index[focus]])
    
    return most_similar.most_common()

In [35]:
get_most_similar_words("excellent")[:15]

[('excellent', 0.071711070102026334),
 ('perfect', 0.071562291008163512),
 ('wonderful', 0.06600021330193355),
 ('amazing', 0.060391296623674491),
 ('funniest', 0.057971307369994417),
 ('today', 0.057658446436200564),
 ('favorite', 0.057134793322710153),
 ('fantastic', 0.05320013000254227),
 ('refreshing', 0.052506049184346183),
 ('heart', 0.05237692444528777),
 ('gem', 0.051247702292890995),
 ('wonderfully', 0.048625868548358819),
 ('superb', 0.047317151396895331),
 ('rare', 0.046476520843334597),
 ('awesome', 0.044850408022960329)]

In [36]:
get_most_similar_words("terrible")[:15]

[('worst', 0.07693698285517088),
 ('waste', 0.074425364899813634),
 ('awful', 0.071640101471608181),
 ('poorly', 0.061800899600302231),
 ('fails', 0.055480982994218705),
 ('terrible', 0.053540037229526681),
 ('dull', 0.053343734187143159),
 ('horrible', 0.052471776783600925),
 ('mess', 0.052195290542196446),
 ('wasted', 0.051385043875082033),
 ('disappointment', 0.050746457854144382),
 ('lacks', 0.049779829098423461),
 ('disappointing', 0.0492577582133851),
 ('worse', 0.044512177605133756),
 ('avoid', 0.043639136618360733)]

In [46]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in mlp.pos_neg_ratios.most_common(500):
    if(word in mlp.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(mlp.pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp.word2index.keys()):
        words_to_visualize.append(word)

In [47]:
pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in mlp.pos_neg_ratios.keys():
        vectors_list.append(mlp.weights_0_1[mlp.word2index[word]])
        if(mlp.pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")
    

In [48]:
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)

In [49]:
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize))

p.scatter(x="x1", y="x2", size=8, source=source,color=colors_list)

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words

Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
