In [1]:
from bokeh.models import ColumnDataSource, LabelSet
from bokeh.plotting import figure, show, output_file
from bokeh.io import output_notebook
output_notebook()

In [2]:
g = open('finefoods/foods.txt','r') # What we know!
reviews_txt = list(map(lambda x:x[:-1],g.readlines()))
g.close()


In [3]:
RECORD_LENGTH = 9
for i in range(RECORD_LENGTH):
    print (reviews_txt[i])

product/productId: B001E4KFG0
review/userId: A3SGXH7AUHU8GW
review/profileName: delmartian
review/helpfulness: 1/1
review/score: 5.0
review/time: 1303862400
review/summary: Good Quality Dog Food
review/text: I have bought several of the Vitality canned dog food products and have found them all to be of good quality. The product looks more like a stew than a processed meat and it smells better. My Labrador is finicky and she appreciates this product better than  most.



In [4]:
import re
class my_records:

    RECORD_LENGTH = 9
    def __init__(self):
        self.pid = []
        self.uid = []
        self.helpful = []
        self.score = []
        self.summary = []
        self.text = []

    def parse(self, reviews, num_rec=1):
        
        i = 0
        for ln in reviews:
            #print (ln)
            if ln == '' : 
                i = 0
                continue
            ln = ln.split(": ")            
            if i == 0:
                if len(self.pid) == num_rec:
                    return
                self.pid.append(ln[1])
            elif i == 4:
                self.score.append('POSITIVE' if float(ln[1])>=4.0 else 'NEGATIVE')
            elif i == 6: 
                #remove new-line
                txt = re.sub('\<br \/\>', ' ', ln[1])
                self.summary.append(txt)
            elif i == 7:
                txt = re.sub('\<br \/\>', ' ', ln[1])
                self.text.append(txt)
            i += 1

In [5]:

reviews = my_records()
reviews.parse(reviews_txt, 80000)


In [6]:
print (len(reviews.pid))
print (reviews.score.count('POSITIVE'))

80000
61609


In [7]:
i = 1100
print (reviews.text[i])
print (reviews.score[i])

We have ordered Uncle Rays BBQ chips many times before. Always been pleased, except this order!  Just NOT the same Seasoning as usual. Generally lacked much for taste! Not sure if they have reduced the seasoning, in their processing? But out of case we bought, we have opened and thrown away 3 bags already, VERY disappointed with this order!
NEGATIVE


In [8]:
import pickle
g = open('./reviews.pkl','wb')
pickle.dump(reviews, g)
g.close()

g = open ('./reviews.pkl', 'rb')
test_reviews = pickle.load(g)
g.close()

In [24]:
import time
import sys
import numpy as np
from collections import Counter

# Let's tweak our network from before to model these phenomena
class SentimentNetwork:
    def __init__(self, reviews,labels,min_count = 10,polarity_cutoff = 0.1,hidden_nodes = 10, learning_rate = 0.1):
       
        np.random.seed(1)
    
        self.pre_process_data(reviews, labels, polarity_cutoff, min_count)
        
        self.init_network(len(self.review_vocab),hidden_nodes, 1, learning_rate)
        
        
    def pre_process_data(self,reviews, labels, polarity_cutoff,min_count):
        
        self.positive_counts = Counter()
        self.negative_counts = Counter()
        self.total_counts = Counter()

        for i in range(len(reviews)):
            if(labels[i] == 'POSITIVE'):
                for word in reviews[i].split(" "):
                    self.positive_counts[word] += 1
                    self.total_counts[word] += 1
            else:
                for word in reviews[i].split(" "):
                    self.negative_counts[word] += 1
                    self.total_counts[word] += 1

        self.pos_neg_ratios = Counter()

        for term,cnt in list(self.total_counts.most_common()):
            if(cnt >= 50):
                pos_neg_ratio = self.positive_counts[term] / float(self.negative_counts[term]+1)
                self.pos_neg_ratios[term] = pos_neg_ratio

        for word,ratio in self.pos_neg_ratios.most_common():
            if(ratio > 1):
                self.pos_neg_ratios[word] = np.log(ratio)
            else:
                self.pos_neg_ratios[word] = -np.log((1 / (ratio + 0.01)))
        
        review_vocab = set()
        for review in reviews:
            for word in review.split(" "):
                if(self.total_counts[word] > min_count):
                    if(word in self.pos_neg_ratios.keys()):
                        if((self.pos_neg_ratios[word] >= polarity_cutoff) or (self.pos_neg_ratios[word] <= -polarity_cutoff)):
                            review_vocab.add(word)
                    else:
                        review_vocab.add(word)
        self.review_vocab = list(review_vocab)
        
        label_vocab = set()
        for label in labels:
            label_vocab.add(label)
        
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i, word in enumerate(self.review_vocab):
            self.word2index[word] = i
        
        self.label2index = {}
        for i, label in enumerate(self.label_vocab):
            self.label2index[label] = i
         
        
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        # Set number of nodes in input, hidden and output layers.
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes

        # Initialize weights
        self.weights_0_1 = np.zeros((self.input_nodes,self.hidden_nodes))
    
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, 
                                                (self.hidden_nodes, self.output_nodes))
        
        self.learning_rate = learning_rate
        
        self.layer_0 = np.zeros((1,input_nodes))
        self.layer_1 = np.zeros((1,hidden_nodes))
        
    def sigmoid(self,x):
        return 1 / (1 + np.exp(-x))
    
    
    def sigmoid_output_2_derivative(self,output):
        return output * (1 - output)
    
    def update_input_layer(self,review):

        # clear out previous state, reset the layer to be all 0s
        self.layer_0 *= 0
        for word in review.split(" "):
            self.layer_0[0][self.word2index[word]] = 1

    def get_target_for_label(self,label):
        if(label == 'POSITIVE'):
            return 1
        else:
            return 0
        
    def train(self, training_reviews_raw, training_labels):
        
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if(word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        
        assert(len(training_reviews) == len(training_labels))
        
        correct_so_far = 0
        
        start = time.time()
        
        for i in range(len(training_reviews)):
            
            review = training_reviews[i]
            label = training_labels[i]
            
            #### Implement the forward pass here ####
            ### Forward pass ###

            # Input Layer

            # Hidden layer
#             layer_1 = self.layer_0.dot(self.weights_0_1)
            self.layer_1 *= 0
            for index in review:
                self.layer_1 += self.weights_0_1[index]
            
            # Output layer
            layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))

            #### Implement the backward pass here ####
            ### Backward pass ###

            # Output error
            layer_2_error = layer_2 - self.get_target_for_label(label) # Output layer error is the difference between desired target and actual output.
            layer_2_delta = layer_2_error * self.sigmoid_output_2_derivative(layer_2)

            # Backpropagated error
            layer_1_error = layer_2_delta.dot(self.weights_1_2.T) # errors propagated to the hidden layer
            layer_1_delta = layer_1_error # hidden layer gradients - no nonlinearity so it's the same as the error

            # Update the weights
            self.weights_1_2 -= self.layer_1.T.dot(layer_2_delta) * self.learning_rate # update hidden-to-output weights with gradient descent step
            
            for index in review:
                self.weights_0_1[index] -= layer_1_delta[0] * self.learning_rate # update input-to-hidden weights with gradient descent step

            if(layer_2 >= 0.5 and label == 'POSITIVE'):
                correct_so_far += 1
            if(layer_2 < 0.5 and label == 'NEGATIVE'):
                correct_so_far += 1
            elapsed = float(time.time() - start)
            if elapsed == 0: continue
            reviews_per_second = i / elapsed
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
        
    
    def test(self, testing_reviews, testing_labels=None):
        
        correct = 0
        
        start = time.time()
        
        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if not testing_labels:
                if pred == "POSITIVE":
                    print (pred, " You are welcome.")
                else:
                    print (pred, " Well, that's your problem.")
                continue
            if(pred == testing_labels[i]):
                correct += 1
                elapsed = float(time.time() - start)
                if elapsed == 0:
                    continue
                reviews_per_second = i / elapsed
            
                sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                            + "% #Correct:" + str(correct) + " #Tested:" + str(i+1) + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
    
    def run(self, review):
        
        # Input Layer


        # Hidden layer
        self.layer_1 *= 0
        unique_indices = set()
        for word in review.lower().split(" "):
            if word in self.word2index.keys():
                unique_indices.add(self.word2index[word])
        for index in unique_indices:
            self.layer_1 += self.weights_0_1[index]
        
        # Output layer
        layer_2 = self.sigmoid(self.layer_1.dot(self.weights_1_2))
        
        if(layer_2[0] >= 0.5):
            return "POSITIVE"
        else:
            return "NEGATIVE"
        

In [26]:
given_reviews = reviews.summary
given_labels = reviews.score

In [27]:
mlp = SentimentNetwork(given_reviews[:-1000],given_labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)

In [28]:
mlp.train(given_reviews[:-1000],given_labels[:-1000])

Progress:99.9% Speed(reviews/sec):7140. #Correct:64936 #Trained:79000 Training Accuracy:82.1%

In [13]:
mlp.test(given_reviews[-1000:],given_labels[-1000:])

Progress:31.8% Speed(reviews/sec):20384% #Correct:259 #Tested:319 Testing Accuracy:81.1%Progress:31.9% Speed(reviews/sec):20448% #Correct:260 #Tested:320 Testing Accuracy:81.2%Progress:32.0% Speed(reviews/sec):20512% #Correct:261 #Tested:321 Testing Accuracy:81.3%Progress:32.1% Speed(reviews/sec):20576% #Correct:262 #Tested:322 Testing Accuracy:81.3%Progress:32.2% Speed(reviews/sec):20640% #Correct:263 #Tested:323 Testing Accuracy:81.4%Progress:32.3% Speed(reviews/sec):20704% #Correct:264 #Tested:324 Testing Accuracy:81.4%Progress:32.4% Speed(reviews/sec):20768% #Correct:265 #Tested:325 Testing Accuracy:81.5%Progress:32.5% Speed(reviews/sec):20833% #Correct:266 #Tested:326 Testing Accuracy:81.5%Progress:32.6% Speed(reviews/sec):20897% #Correct:267 #Tested:327 Testing Accuracy:81.6%Progress:32.8% Speed(reviews/sec):21025% #Correct:268 #Tested:329 Testing Accuracy:81.4%Progress:32.9% Speed(reviews/sec):21089% #Correct:269 #Tested:330 Testing Accuracy:81.5%Progress:33.1% Speed

In [14]:
mlp.pos_neg_ratios.most_common()


[('Yummy!', 5.6167710976665717),
 ('Delicious!', 5.3729609095438029),
 ('Fantastic', 4.7999142627806028),
 ('best!', 4.7361984483944957),
 ('BEST', 4.6759392568762985),
 ('Perfect', 4.6653241088078383),
 ('yum', 4.4659081186545837),
 ('Awesome!', 4.4543472962535073),
 ('Excellent!', 4.4426512564903167),
 ('Wonderful', 4.4379342666121779),
 ('YUM!', 4.4308167988433134),
 ('Tasty!', 4.3694478524670215),
 ('Excellent', 4.3274384443894789),
 ('Yum!', 4.214593690373678),
 ('Fantastic!', 4.2046926193909657),
 ('delicious!', 4.1743872698956368),
 ('Best', 4.1554048189833468),
 ('YUMMY', 4.0775374439057197),
 ('Yum', 4.0718717063700423),
 ('Wonderful!', 4.0430512678345503),
 ('YUM', 4.0430512678345503),
 ('Wow!', 4.0253516907351496),
 ('snack!', 4.0253516907351496),
 ('Amazing!', 4.0253516907351496),
 ('Addictive', 4.0073331852324712),
 ('AWESOME', 4.0073331852324712),
 ('Best!', 3.9512437185814275),
 ('delicious', 3.9164576024960116),
 ('Good!', 3.912023005428146),
 ('these!', 3.9080149840306

In [15]:
def get_most_similar_words(focus = "satisfied"):
    most_similar = Counter()

    for word in mlp.word2index.keys():
        most_similar[word] = np.dot(mlp.weights_0_1[mlp.word2index[word]],mlp.weights_0_1[mlp.word2index[focus]])
    
    return most_similar.most_common()


In [16]:
get_most_similar_words('salad')[0:20]

[('Excellent', 0.021075473515649226),
 ('Best', 0.02081135336928926),
 ('Delicious', 0.02003095571394618),
 ('Great', 0.019047161114435728),
 ('Love', 0.018543625717277086),
 ('Delicious!', 0.01812228316964952),
 ('Yummy', 0.016945824060413121),
 ('Awesome', 0.016492054546826999),
 ('Wonderful', 0.01647227497590914),
 ('Perfect', 0.016370972775579414),
 ('love', 0.016002303764483874),
 ('Yummy!', 0.015810671117659266),
 ('delicious', 0.015649951854218098),
 ('GREAT', 0.014457511925015296),
 ('LOVE', 0.013951401949071076),
 ('Fantastic', 0.013731439983960377),
 ('Yum!', 0.01365039726463217),
 ('Amazing', 0.013580921464495306),
 ('BEST', 0.013538614912961128),
 ('Tasty', 0.013506646893745788)]

In [17]:
#most_used = Counter()
#print (mlp.weights_0_1.shape)
#for word in mlp.word2index.keys():
#    most_used[word] = mlp.weights_0_1.T[1][mlp.word2index[word]]
#most_used.most_common()

In [18]:
import matplotlib.colors as colors

words_to_visualize = list()
for word, ratio in mlp.pos_neg_ratios.most_common(500):
    if(word in mlp.word2index.keys()):
        words_to_visualize.append(word)
    
for word, ratio in list(reversed(mlp.pos_neg_ratios.most_common()))[0:500]:
    if(word in mlp.word2index.keys()):
        words_to_visualize.append(word)

pos = 0
neg = 0

colors_list = list()
vectors_list = list()
for word in words_to_visualize:
    if word in mlp.pos_neg_ratios.keys():
        vectors_list.append(mlp.weights_0_1[mlp.word2index[word]])
        if(mlp.pos_neg_ratios[word] > 0):
            pos+=1
            colors_list.append("#00ff00")
        else:
            neg+=1
            colors_list.append("#000000")
from sklearn.manifold import TSNE
tsne = TSNE(n_components=2, random_state=0)
words_top_ted_tsne = tsne.fit_transform(vectors_list)
p = figure(tools="pan,wheel_zoom,reset,save",
           toolbar_location="above",
           title="vector T-SNE for most polarized words")

source = ColumnDataSource(data=dict(x1=words_top_ted_tsne[:,0],
                                    x2=words_top_ted_tsne[:,1],
                                    names=words_to_visualize))

p.scatter(x="x1", y="x2", size=8, source=source,color=colors_list)

word_labels = LabelSet(x="x1", y="x2", text="names", y_offset=6,
                  text_font_size="8pt", text_color="#555555",
                  source=source, text_align='center')
p.add_layout(word_labels)

show(p)

# green indicates positive words, black indicates negative words

Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)
Supplying a user-defined data source AND iterable values to glyph methods is deprecated.

See https://github.com/bokeh/bokeh/issues/2056 for more information.

  warn(message)


In [38]:
chef = mlp


In [37]:
chef.test(['what a wonderful salad'])

POSITIVE  You are welcome.


In [32]:
chef.test(['It is terrible'])

NEGATIVE  Well, that's your problem.


In [31]:
chef.test(['It is unbelievable bad'])

POSITIVE  You are welcome.


In [35]:
chef.test(['I like it'])

POSITIVE  You are welcome.


In [36]:
chef.test(['I hate it'])

NEGATIVE  Well, that's your problem.
