In [4]:
import numpy as np

onehots = {}
onehots['cat'] = np.array([1, 0, 0, 0])
onehots['the'] = np.array([0, 1, 0, 0])
onehots['dog'] = np.array([0, 0, 1, 0])
onehots['sat'] = np.array([0, 0, 0, 1])

sentence = ['the', 'cat', 'sat']

x = onehots[sentence[0]] + onehots[sentence[1]] + onehots[sentence[2]]

print(f"Sent Encoding: {x}")

Sent Encoding: [1 1 0 1]


In [15]:
# Encoding

import sys

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

f = open('labels.txt')
raw_labels = f.readlines()
f.close()

tokens = list(map(lambda x:set(x.split(" ")), raw_reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if(len(word)>0):
            vocab.add(word)

vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

input_dataset = list()
for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ""
    input_dataset.append(list(set(sent_indices)))

target_dataset = list()
for label in raw_labels:
    if label == 'positive\n':
        target_dataset.append(1)
    else:
        target_dataset.append(0)


In [21]:
import numpy as np
np.random.seed(1)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

alpha, iterations = (0.01, 2)
hidden_size = 100

print(f"len(vocab): {len(vocab)}")
print(f"len(input_dataset): {len(input_dataset)}")
print(f"len(target_dataset): {len(target_dataset)}")
print(f"input_dataset[0]: {input_dataset[0]}")
print(f"target_dataset[0]: {target_dataset[0]}")

weight_0_1 = 0.2 * np.random.random((len(vocab), hidden_size)) - 0.1
weight_1_2 = 0.2 * np.random.random((hidden_size, 1)) - 0.1

correct, total = (0, 0)
for iter in range(iterations):
    for i in range(len(input_dataset) - 1000):
        x, y = (input_dataset[i], target_dataset[i])
        layer_1 = sigmoid(np.sum(weight_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weight_1_2))

        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weight_1_2.T)

        weight_0_1[x] -= layer_1_delta * alpha
        weight_1_2 -= np.outer(layer_1, layer_2_delta) * alpha

        if(np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1

        if(i % 10 == 9):
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter: '+str(iter) \
                             + ' Progress: '+progress[2:4] \
                                +'.'+progress[4:6] \
                                    +' % Training Accuracy: '\
                                        + str(correct/float(total))+'%')
    print()
correct, total = (0, 0)
for i in range(len(input_dataset) - 1000, len(input_dataset)):
    x = input_dataset[i]
    y = target_dataset[i]

    layer_1 = sigmoid(np.sum(weight_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weight_1_2))

    if(np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy: "+str(correct/float(total)))

len(vocab): 74074
len(input_dataset): 25000
len(target_dataset): 25000
input_dataset[0]: [64512, 3075, 17923, 53769, 27659, 7194, 51740, 9758, 39460, 42534, 37417, 26158, 63030, 49209, 73281, 37956, 9797, 37960, 20554, 32845, 59474, 43097, 57440, 14944, 12408, 33400, 43642, 10365, 57472, 44168, 42132, 55957, 49823, 49836, 6316, 28846, 60081, 16052, 47288, 58555, 24270, 10449, 9442, 30948, 64229, 38119, 11498, 55024, 43249, 53497, 4345, 70405, 61195, 17684, 18710, 19230, 28454, 29479, 71988, 27966, 832, 59716, 35141, 58692, 59210, 66385, 44890, 73056, 23905, 354, 72548, 49518, 68978, 19316, 1909, 62330, 15759, 64911, 17304, 13210, 50592, 5540, 17831, 41394, 35775, 59850, 19929, 63455, 16352, 50159, 37875, 49146, 70655]
target_dataset[0]: 1
Iter: 0 Progress: 95.99 % Training Accuracy: 0.83225%1271363068%%
Iter: 1 Progress: 95.99 % Training Accuracy: 0.8665%38570535528%
Test Accuracy: 0.853


In [22]:
from collections import Counter
import math

def similar(target = 'beautiful'):
    target_index = word2index[target]
    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weight_0_1[index] - (weight_0_1[target_index])
        square_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(square_difference))

    return scores.most_common(10)

print(similar('beautiful'))
print(similar('terrible'))

[('beautiful', -0.0), ('best', -0.7806569350918381), ('powerful', -0.7830514231713361), ('will', -0.7945192127457067), ('fun', -0.8036350314292777), ('appreciate', -0.8044728604591667), ('subtle', -0.8055962618144833), ('enjoy', -0.8083514421986505), ('deserves', -0.8132001234598726), ('underrated', -0.8183699051883105)]
[('terrible', -0.0), ('worse', -0.6655898503422804), ('disappointment', -0.7494341421601955), ('horrible', -0.7942725922976795), ('dull', -0.7947739792479398), ('poor', -0.8002903175914787), ('fails', -0.8006564398577826), ('badly', -0.804005098396182), ('disappointing', -0.8115300735464647), ('bad', -0.8156775855086102)]


In [32]:
import sys, random, math
from collections import Counter
import numpy as np

np.random.seed(1)
random.seed(1)

f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x: (x.split(" ")), raw_reviews))

wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1

vocab = list(set(map(lambda x: x[0], wordcnt.most_common())))

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

concatenated = list()
input_dataset = list()

for sent in tokens:
    sent_indices = list()
    for word in sent:
        try:
            sent_indices.append(word2index[word])
            concatenated.append(word2index[word])
        except:
            ""
    input_dataset.append(sent_indices)
concatenated = np.array(concatenated)
random.shuffle(input_dataset)

alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

weight_0_1 = (np.random.rand(len(vocab), hidden_size) - 0.5) * 0.2
weight_1_2 = np.random.rand(len(vocab), hidden_size) * 0

layer_2_target = np.zeros(negative+1)
layer_2_target[0] = 1

def similar(target='beautiful'):
    target_index = word2index[target]

    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weight_0_1[index] - (weight_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

for rev_i, review in enumerate(input_dataset * iterations):
    for target_i in range(len(review)):

        target_samples = [review[target_i]] + list(concatenated[(np.random.rand(negative) * len(concatenated)).astype('int').tolist()])

        left_context = review[max(0, target_i - window): target_i]
        right_context = review[target_i+1: min(len(review), target_i+window)]

        layer_1 = np.mean(weight_0_1[left_context+right_context], axis=0)
        layer_2 = sigmoid(layer_1.dot(weight_1_2[target_samples].T))
        layer_2_delta = layer_2 - layer_2_target
        layer_1_delta = layer_2_delta.dot(weight_1_2[target_samples])

        weight_0_1[left_context+right_context] -= layer_1_delta * alpha
        weight_1_2[target_samples] -= np.outer(layer_2_delta, layer_1) * alpha

    if(rev_i % 250 == 0):
        sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)) + "   " + str(similar('terrible')))

    sys.stdout.write('\rProgress:'+str(rev_i/float(len(input_dataset)
        *iterations)))

print(similar('terrible'))

Progress:0.99998 [('terrible', -0.0), ('horrible', -2.762726716928402), ('brilliant', -3.024791215455538), ('phenomenal', -3.526713519394144), ('pathetic', -3.596528053636521), ('marvelous', -3.8258569079399107), ('masterful', -3.870744676472927), ('superb', -3.880766088331156), ('miserable', -3.9039319837915523), ('terrific', -3.9585539296027745)]3)]5)]]]6)]][('terrible', -0.0), ('horrible', -2.7755246900173525), ('brilliant', -3.276813903181487), ('pathetic', -3.5996119023941135), ('phenomenal', -3.6558890017015946), ('bad', -3.872238000997129), ('marvelous', -3.957775280623674), ('masterful', -4.024629238649957), ('miserable', -4.082535113342476), ('dreadful', -4.138937666104752)]


In [33]:
def analogy(positive=['terrible', 'good'], negative=['bad']):
    norms = np.sum(weight_0_1 * weight_0_1, axis=1)
    norms.resize(norms.shape[0], 1)

    normed_weights = weight_0_1 * norms

    query_vect = np.zeros(len(weight_0_1[0]))
    for word in positive:
        query_vect += normed_weights[word2index[word]]

    for word in negative:
        query_vect -= normed_weights[word2index[word]]

    scores = Counter()
    for word, index in word2index.items():
        raw_difference = weight_0_1[index] - query_vect
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))

    return scores.most_common(10)[1:]

In [34]:
print(analogy(['terrible', 'good'], ['bad']))

[('superb', -212.77893303172317), ('decent', -213.11131920628588), ('terrific', -213.13783761791217), ('fine', -213.21595751836736), ('worth', -213.35394466329433), ('perfect', -213.36565675658474), ('nice', -213.40455609620668), ('brilliant', -213.48774533085336), ('outstanding', -213.6879190094353)]


In [35]:
print(analogy(['elizabeth', 'he'], ['she']))

[('christopher', -188.26883458111715), ('william', -188.3849814564768), ('david', -188.39227767170277), ('peter', -188.4292561104581), ('tom', -188.4374089864225), ('simon', -188.62456547433578), ('him', -188.66948853622625), ('been', -188.68776612766194), ('dan', -188.70029218496893)]
