# Reviews Word-by-word
### Feedforward Neural Network for sentiment analysis of movie reviews

This neural network gives predictions of movie reviews' sentiments based on the words they contain. It was composed in response to Udacity's Deep Learning Foundations course, specifically the lessons implemented by Andrew Trask. It was designed as a practice in both bare-bones neural networks (built without the aid of frameworks like Tensorflow) and practice in neural networks structured for natural language processing.

## Imports

In [1]:
%matplotlib inline
%config InlineBackend.figure_format = "retina"

import sys
import time
import numpy as np
from collections import Counter

## Loading

In [2]:
g = open("reviews.txt","r")
reviews = list(map(lambda x:x[:-1],g.readlines()))
g.close()

g = open("labels.txt","r")
labels = list(map(lambda x:x[:-1].lower(),g.readlines()))
g.close()

## Exploration

In [3]:
positive_counts = Counter()
negative_counts = Counter()
total_counts = Counter()
for i in range(len(labels)):
    for word in reviews[i].split(" "):
        if labels[i] == "positive":
            positive_counts[word] += 1
        elif labels[i] == "negative":
            negative_counts[word] += 1
        total_counts[word] += 1
        
pos_neg_ratios = Counter()
for word,count in list(total_counts.most_common()):
    if count > 100:
        pos_neg_ratios[word] = float((positive_counts[word]+1) / (negative_counts[word]+1))
for word in pos_neg_ratios:
    pos_neg_ratios[word] = np.log(pos_neg_ratios[word])
    
print("Positive \t \t \t \t Negative")
for i in range(10):
    print("{} \t \t {}".format(
        pos_neg_ratios.most_common()[i],
        list(reversed(pos_neg_ratios.most_common()))[i]))

Positive 	 	 	 	 Negative
('edie', 4.7004803657924166) 	 	 ('boll', -4.2766661190160553)
('paulie', 4.0859763125515842) 	 	 ('uwe', -3.9318256327243257)
('felix', 3.1612467120315646) 	 	 ('seagal', -3.4210000089583352)
('polanski', 2.8332133440562162) 	 	 ('unwatchable', -3.0349529867072724)
('matthau', 2.8134107167600364) 	 	 ('stinker', -2.9856819377004897)
('victoria', 2.6855773452501515) 	 	 ('mst', -2.8449093838194073)
('mildred', 2.6119063405493077) 	 	 ('incoherent', -2.8033603809065348)
('gandhi', 2.5477075510270306) 	 	 ('unfunny', -2.635081181235619)
('flawless', 2.4595888418037104) 	 	 ('waste', -2.6093342281630525)
('superbly', 2.2686835413183641) 	 	 ('blah', -2.5014359517392109)


## Preprocessing

In [4]:
vocab = set()
for i in range(len(reviews)):
    vocab |= set(reviews[i].split(" "))
vocab_size = len(vocab)

word2index = {}
for i,word in enumerate(vocab):
    word2index[word] = i

## Layers

In [5]:
layer_0 = np.zeros((1, vocab_size))

In [6]:
def update_input_layer(review):
    global layer_0
    layer_0 *= 0
    
    for word in review.split(" "):
        layer_0[:, word2index[word]] += 1
        
def get_target_for_label(label):
    if label == "negative":
        return 0
    elif label == "positive":
        return 1

## Network

In [7]:
class SentimentNetwork(object):
    def __init__(self, reviews, labels, hidden_nodes = 10, learning_rate = 0.1, min_count = 50, polarity_cutoff = 3):
        np.random.seed(1)
        self.pre_process_data(reviews, labels, min_count, polarity_cutoff)
        self.init_network(len(self.review_vocab), hidden_nodes, 1, learning_rate)
        
    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))
    
    def sigmoid_output_2_derivative(self, x):
        return x * (1 - x)
        
    def pre_process_data(self, reviews, labels, min_count, pol_cutoff):
        pos_counts = Counter()
        neg_counts = Counter()
        total_counts = Counter()
        for i in range(len(labels)):
            for word in reviews[i].split(" "):
                if labels[i] == "positive":
                    pos_counts[word] += 1
                elif labels[i] == "negative":
                    neg_counts[word] += 1
                total_counts[word] += 1
        
        pos_neg_ratios = Counter()
        for word,count in list(total_counts.most_common()):
            if count >= min_count:
                pos_neg_ratios[word] = float((positive_counts[word]+1) / (negative_counts[word]+1))
        trimmed_reviews = list()
        for word in pos_neg_ratios:
            pos_neg_ratios[word] = np.log(pos_neg_ratios[word])
            if abs(pos_neg_ratios[word]) >= pol_cutoff:
                trimmed_reviews.append(word)
        
        #Set vocab
        review_vocab = list()
        for word in trimmed_reviews:
            review_vocab.append(word)
        self.review_vocab = review_vocab
        
        label_vocab = set()
        label_vocab |= set(labels)
        self.label_vocab = list(label_vocab)
        
        self.review_vocab_size = len(self.review_vocab)
        self.label_vocab_size = len(self.label_vocab)
        
        self.word2index = {}
        for i,word in enumerate(self.review_vocab):
            self.word2index[word] = i
            
        self.label2index = {}
        for i,label in enumerate(self.label_vocab):
            self.label2index[label] = i
            
    def init_network(self, input_nodes, hidden_nodes, output_nodes, learning_rate):
        self.input_nodes = input_nodes
        self.hidden_nodes = hidden_nodes
        self.output_nodes = output_nodes
        self.learning_rate = learning_rate
        
        self.weights_0_1 = np.zeros((self.input_nodes, self.hidden_nodes))
        self.weights_1_2 = np.random.normal(0.0, self.output_nodes**-0.5, (self.hidden_nodes, self.output_nodes)) #np.zeros((self.hidden_nodes, self.output_nodes))
        
        self.layer_1 = np.zeros((1, self.hidden_nodes))
            
    def get_target_for_layer(self, label):
        if label == "positive":
            return 1
        else:
            return 0
    
    def train(self, training_reviews_raw, training_labels):        
        assert(len(training_reviews_raw) == len(training_labels))
        correct_so_far = 0
        start = time.time()
        
        training_reviews = list()
        for review in training_reviews_raw:
            indices = set()
            for word in review.split(" "):
                if (word in self.word2index.keys()):
                    indices.add(self.word2index[word])
            training_reviews.append(list(indices))
        
        for i in range(len(training_reviews)):
            review = training_reviews[i]
            label = training_labels[i]
            
            self.layer_1 *= 0
            
            for index in review:
                self.layer_1 += self.weights_0_1[index]
            layer_2 = self.sigmoid(np.dot(self.layer_1, self.weights_1_2))

            error_2 = layer_2 - self.get_target_for_layer(label)
            error_del_2 = error_2 * self.sigmoid_output_2_derivative(layer_2)
            error_del_1 = np.dot(error_del_2, self.weights_1_2.T)
            
            self.weights_1_2 -= self.learning_rate * np.dot(self.layer_1.T, error_del_2)
            for index in review:
                self.weights_0_1[index] -= self.learning_rate * error_del_1[0]
            
            #Progress tracking
            if layer_2 >= 0.5 and label == "positive":
                correct_so_far += 1
            elif layer_2 < 0.5 and label == "negative":
                correct_so_far += 1
                
            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(training_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct_so_far) + " #Trained:" + str(i+1) \
                             + " Training Accuracy:" + str(correct_so_far * 100 / float(i+1))[:4] + "%")
            if(i % 2500 == 0):
                print("")
        
    def test(self, testing_reviews, testing_labels):
        correct = 0
        start = time.time()

        for i in range(len(testing_reviews)):
            pred = self.run(testing_reviews[i])
            if(pred == testing_labels[i]):
                correct += 1

            elapsed_time = float(time.time() - start)
            reviews_per_second = i / elapsed_time if elapsed_time > 0 else 0
            
            sys.stdout.write("\rProgress:" + str(100 * i/float(len(testing_reviews)))[:4] \
                             + "% Speed(reviews/sec):" + str(reviews_per_second)[0:5] \
                             + " #Correct:" + str(correct) + " #Tested:" + str(i+1) \
                             + " Testing Accuracy:" + str(correct * 100 / float(i+1))[:4] + "%")
 
    def run(self, review):
        self.layer_1 *= 0
        
        un_indices = set()
        for word in review.split(" "):
            if (word in self.word2index.keys()):
                un_indices.add(self.word2index[word])
        
        for index in un_indices:
            self.layer_1 += self.weights_0_1[index]
        
        layer_2 = self.sigmoid(np.dot(self.layer_1, self.weights_1_2))
        
        if layer_2[0] >= 0.5:
            return "positive"
        else:
            return "negative"

## Training

In [8]:
mlp = SentimentNetwork(reviews[:-1000],labels[:-1000],min_count=20,polarity_cutoff=0.8,learning_rate=0.01)
mlp.train(reviews[:-1000],labels[:-1000])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:1 #Trained:1 Training Accuracy:100.%
Progress:10.4% Speed(reviews/sec):794.7 #Correct:2119 #Trained:2501 Training Accuracy:84.7%
Progress:20.8% Speed(reviews/sec):1391. #Correct:4235 #Trained:5001 Training Accuracy:84.6%
Progress:31.2% Speed(reviews/sec):1861. #Correct:6372 #Trained:7501 Training Accuracy:84.9%
Progress:41.6% Speed(reviews/sec):2260. #Correct:8520 #Trained:10001 Training Accuracy:85.1%
Progress:52.0% Speed(reviews/sec):2461. #Correct:10663 #Trained:12501 Training Accuracy:85.2%
Progress:62.5% Speed(reviews/sec):2641. #Correct:12818 #Trained:15001 Training Accuracy:85.4%
Progress:72.9% Speed(reviews/sec):2787. #Correct:14933 #Trained:17501 Training Accuracy:85.3%
Progress:83.3% Speed(reviews/sec):2962. #Correct:17119 #Trained:20001 Training Accuracy:85.5%
Progress:93.7% Speed(reviews/sec):2994. #Correct:19307 #Trained:22501 Training Accuracy:85.8%
Progress:99.9% Speed(reviews/sec):3051. #Correct:20599 #Trained:24000 Training

## Testing

In [9]:
mlp.test(reviews[-1000:],labels[-1000:])

Progress:0.0% Speed(reviews/sec):0.0 #Correct:0 #Tested:1 Testing Accuracy:0.0%Progress:0.1% Speed(reviews/sec):1003. #Correct:1 #Tested:2 Testing Accuracy:50.0%Progress:0.2% Speed(reviews/sec):1613. #Correct:2 #Tested:3 Testing Accuracy:66.6%Progress:0.3% Speed(reviews/sec):1859. #Correct:3 #Tested:4 Testing Accuracy:75.0%Progress:0.4% Speed(reviews/sec):2208. #Correct:3 #Tested:5 Testing Accuracy:60.0%Progress:0.5% Speed(reviews/sec):2332. #Correct:4 #Tested:6 Testing Accuracy:66.6%Progress:0.6% Speed(reviews/sec):2405. #Correct:5 #Tested:7 Testing Accuracy:71.4%Progress:0.7% Speed(reviews/sec):2320. #Correct:6 #Tested:8 Testing Accuracy:75.0%Progress:0.8% Speed(reviews/sec):2204. #Correct:7 #Tested:9 Testing Accuracy:77.7%Progress:0.9% Speed(reviews/sec):2150. #Correct:8 #Tested:10 Testing Accuracy:80.0%Progress:1.0% Speed(reviews/sec):2222. #Correct:9 #Tested:11 Testing Accuracy:81.8%Progress:1.1% Speed(reviews/sec):2366. #Correct:10 #Tested:12 Testing Accuracy:83.3%Pr

Progress:87.4% Speed(reviews/sec):4754. #Correct:744 #Tested:875 Testing Accuracy:85.0%Progress:87.5% Speed(reviews/sec):4756. #Correct:745 #Tested:876 Testing Accuracy:85.0%Progress:87.6% Speed(reviews/sec):4760. #Correct:746 #Tested:877 Testing Accuracy:85.0%Progress:87.7% Speed(reviews/sec):4751. #Correct:747 #Tested:878 Testing Accuracy:85.0%Progress:87.8% Speed(reviews/sec):4753. #Correct:748 #Tested:879 Testing Accuracy:85.0%Progress:87.9% Speed(reviews/sec):4754. #Correct:749 #Tested:880 Testing Accuracy:85.1%Progress:88.0% Speed(reviews/sec):4755. #Correct:750 #Tested:881 Testing Accuracy:85.1%Progress:88.1% Speed(reviews/sec):4757. #Correct:751 #Tested:882 Testing Accuracy:85.1%Progress:88.2% Speed(reviews/sec):4758. #Correct:752 #Tested:883 Testing Accuracy:85.1%Progress:88.3% Speed(reviews/sec):4759. #Correct:753 #Tested:884 Testing Accuracy:85.1%Progress:88.4% Speed(reviews/sec):4761. #Correct:754 #Tested:885 Testing Accuracy:85.1%Progress:88.5% Speed(reviews/se

## Results

In [16]:
def get_most_similar_words(focus = "horrible"):
    most_similar = Counter()

    for word in mlp.word2index.keys():
        most_similar[word] = np.dot(mlp.weights_0_1[mlp.word2index[word]], mlp.weights_0_1[mlp.word2index[focus]])
    
    return most_similar.most_common()

In [17]:
get_most_similar_words("excellent")

[('perfect', 0.070885996535763884),
 ('excellent', 0.068866727278969753),
 ('wonderful', 0.064598107226035095),
 ('amazing', 0.06111004916951393),
 ('today', 0.057809137269058709),
 ('favorite', 0.056718105823209082),
 ('funniest', 0.056322477114252277),
 ('refreshing', 0.05294556236635517),
 ('fantastic', 0.051150388636075031),
 ('heart', 0.050247808806010236),
 ('gem', 0.050047286767065401),
 ('wonderfully', 0.047124570333045901),
 ('rare', 0.047052476720145675),
 ('superb', 0.04649555166524115),
 ('awesome', 0.045094533664493161),
 ('perfectly', 0.044345296645669607),
 ('pleasantly', 0.044337886895825199),
 ('great', 0.043515819827257912),
 ('enjoyed', 0.042706513917913524),
 ('recommended', 0.042447967652317141),
 ('fascinating', 0.042044182105765771),
 ('captures', 0.04122109463785846),
 ('incredible', 0.040861814110923107),
 ('touching', 0.040475197541518342),
 ('solid', 0.039308358244672215),
 ('vhs', 0.038899239145900856),
 ('subtle', 0.038006523297370143),
 ('powerful', 0.0370

In [18]:
get_most_similar_words("terrible")

[('worst', 0.076047724453443696),
 ('waste', 0.073361135025049998),
 ('awful', 0.07213094012507755),
 ('poorly', 0.061225444698427139),
 ('fails', 0.056344946253459635),
 ('terrible', 0.055027863946259775),
 ('mess', 0.054012112690089771),
 ('dull', 0.053400637212056783),
 ('disappointment', 0.051836588490703751),
 ('horrible', 0.05140440172779609),
 ('wasted', 0.050626174553732591),
 ('lacks', 0.049058006050984322),
 ('disappointing', 0.048268121324392163),
 ('worse', 0.045630873346125159),
 ('avoid', 0.043736471424284988),
 ('pointless', 0.041987979261186227),
 ('mediocre', 0.041307376423670036),
 ('pathetic', 0.041204828646392633),
 ('redeeming', 0.041074567661535402),
 ('annoying', 0.040126808268247431),
 ('boring', 0.039812199513047389),
 ('ridiculous', 0.039456571470721229),
 ('laughable', 0.039208895052872866),
 ('badly', 0.039113357901201108),
 ('wooden', 0.037894624710613417),
 ('unfunny', 0.03785654021844069),
 ('poor', 0.037833217204623446),
 ('lame', 0.036117625798810532),
