In [11]:
import sys, random, math
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt

np.random.seed(1)
random.seed(1)

In [12]:
f = open('reviews.txt')
raw_reviews = f.readlines()
f.close()

tokens = list(map(lambda x: (x.split(" ")), raw_reviews))

wordcnt = Counter()
for sent in tokens:
    for word in sent:
        wordcnt[word] -= 1

vocab = list(set(map(lambda x: x[0], wordcnt.most_common())))

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i

print(word2index)



In [13]:
alpha, iterations = (0.05, 2)
hidden_size, window, negative = (50, 2, 5)

weights_0_1 = (np.random.rand(len(vocab), hidden_size) - 0.5) * 0.2
weights_1_2 = np.random.rand(len(vocab), hidden_size) * 0

# print(weights_0_1[:1])
norms = np.sum(weights_0_1 * weights_0_1, axis=1)
print(norms)
print(norms.shape)

norms.resize(norms.shape[0], 1)
normed_weights = weights_0_1 * norms
print(normed_weights.shape)
print(normed_weights[:1])

[0.18857182 0.15891503 0.22144939 ... 0.1831402  0.13476361 0.16948323]
(74075,)
(74075, 50)
[[-0.00312946  0.0083094  -0.01885287 -0.0074549  -0.01332238 -0.01537469
  -0.0118325  -0.00582458 -0.00389335  0.00146395 -0.00304753  0.00698544
  -0.0111464   0.01426046 -0.01782428  0.00642907 -0.0031188   0.00221345
  -0.01356258 -0.01138591  0.01134239  0.01766019 -0.00703659  0.00725332
   0.01419528  0.01488234 -0.01564979 -0.01738426 -0.01245214  0.0142614
  -0.01514809 -0.00297538  0.01726901  0.00125081  0.00723652 -0.00695771
   0.00703376  0.01262019 -0.01816745  0.00943403  0.01843708  0.00935941
  -0.00828041  0.01090999 -0.01496408 -0.00196516  0.01540992 -0.00778371
  -0.00800392 -0.01395324]]


In [14]:
def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x], filter(lambda x: x in word2index, words)))
    return np.mean(normed_weights[indices], axis=0)

reviews2vectors = list()
for review in tokens:
    reviews2vectors.append(make_sent_vect(review))

reviews2vectors = np.array(reviews2vectors)
print(reviews2vectors)

[[-0.0011266  -0.00087934  0.00046491 ...  0.00322687 -0.00076521
  -0.0002159 ]
 [-0.00047019  0.00085039  0.00033806 ...  0.00087971 -0.00082544
   0.00176484]
 [-0.00252075  0.00068792 -0.00222042 ...  0.00305061 -0.00214706
   0.00045303]
 ...
 [-0.00360131  0.0003417  -0.00162273 ...  0.00160478 -0.00311688
   0.00093489]
 [-0.00256729  0.0017996  -0.00220445 ...  0.00272988 -0.002597
   0.00133951]
 [-0.00302836  0.00088439 -0.00139157 ...  0.00116375 -0.00233056
   0.00047722]]


In [19]:
print(tokens[:1][0])
print(len(tokens[:1][0]))
v = make_sent_vect(tokens[:1][0])
print(np.array(v).shape)

['bromwell', 'high', 'is', 'a', 'cartoon', 'comedy', '.', 'it', 'ran', 'at', 'the', 'same', 'time', 'as', 'some', 'other', 'programs', 'about', 'school', 'life', '', 'such', 'as', '', 'teachers', '', '.', 'my', '', '', 'years', 'in', 'the', 'teaching', 'profession', 'lead', 'me', 'to', 'believe', 'that', 'bromwell', 'high', '', 's', 'satire', 'is', 'much', 'closer', 'to', 'reality', 'than', 'is', '', 'teachers', '', '.', 'the', 'scramble', 'to', 'survive', 'financially', '', 'the', 'insightful', 'students', 'who', 'can', 'see', 'right', 'through', 'their', 'pathetic', 'teachers', '', 'pomp', '', 'the', 'pettiness', 'of', 'the', 'whole', 'situation', '', 'all', 'remind', 'me', 'of', 'the', 'schools', 'i', 'knew', 'and', 'their', 'students', '.', 'when', 'i', 'saw', 'the', 'episode', 'in', 'which', 'a', 'student', 'repeatedly', 'tried', 'to', 'burn', 'down', 'the', 'school', '', 'i', 'immediately', 'recalled', '.', '.', '.', '.', '.', '.', '.', '.', '.', 'at', '.', '.', '.', '.', '.', '.

In [25]:
def most_similar_reviews(review):
    v = make_sent_vect(review)
    scores = Counter()
    for i, val in enumerate(reviews2vectors.dot(v)):
        scores[i] = val
    most_similar = list()

    for idx, score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:40])
    return most_similar

most_similar_reviews(['terrible', 'bad'])

['this movie is perfect for families to wa',
 'the characters are unlikeable and the sc',
 'a milestone in eastern european film mak']

In [1]:
import numpy as np

a = np.array([1, 2, 3])
b = np.array([0.1, 0.2, 0.3])
c = np.array([-1, -0.5, 0])
d = np.array([0, 0, 0])

identity = np.eye(3)
print(identity)

[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]


In [2]:
print(a.dot(identity))
print(b.dot(identity))
print(c.dot(identity))
print(d.dot(identity))

[1. 2. 3.]
[0.1 0.2 0.3]
[-1.  -0.5  0. ]
[0. 0. 0.]


In [4]:
this = np.array([2, 4, 6])
movie = np.array([10, 10, 10])
rocks = np.array([1, 1, 1])

print(this + movie + rocks)
print((this.dot(identity) + movie).dot(identity) + rocks)
print(this.dot(identity) + movie.dot(identity) + rocks.dot(identity))

[13 15 17]
[13. 15. 17.]
[13. 15. 17.]


In [7]:
import numpy as np
def softmax(x_):
    x = np.atleast_2d(x_)
    temp = np.exp(x)
    return temp / np.sum(temp, axis=1, keepdims=True)

word_vects = {}
word_vects['yankees'] = np.array([[0.,0.,0.]])
word_vects['bears'] = np.array([[0.,0.,0.]])
word_vects['braves'] = np.array([[0.,0.,0.]])
word_vects['red'] = np.array([[0.,0.,0.]])
word_vects['sox'] = np.array([[0.,0.,0.]])
word_vects['lose'] = np.array([[0.,0.,0.]])
word_vects['defeat'] = np.array([[0.,0.,0.]])
word_vects['beat'] = np.array([[0.,0.,0.]])
word_vects['tie'] = np.array([[0.,0.,0.]])

sent2output = np.random.rand(3, len(word_vects))
print(sent2output.shape)
identity = np.eye(3)

layer_0 = word_vects['red']
layer_1 = layer_0.dot(identity) + word_vects['sox']
layer_2 = layer_1.dot(identity) + word_vects['defeat']

pred = softmax(layer_2.dot(sent2output))
print(pred)

(3, 9)
[[0.11111111 0.11111111 0.11111111 0.11111111 0.11111111 0.11111111
  0.11111111 0.11111111 0.11111111]]


In [8]:
y = np.array([1,0,0,0,0,0,0,0,0])

pred_delta = pred - y
layer_2_delta = pred_delta.dot(sent2output.T)
defeat_delta = layer_2_delta * 1
layer_1_delta = layer_2_delta.dot(identity.T)
sox_delta = layer_1_delta * 1
layer_0_delta = layer_1_delta.dot(identity.T)

alpha = 0.01
word_vects['red'] -= layer_0_delta * alpha
word_vects['sox'] -= sox_delta * alpha
word_vects['defeat'] -= defeat_delta * alpha

identity -= np.outer(layer_0,layer_1_delta) * alpha
identity -= np.outer(layer_1,layer_2_delta) * alpha
sent2output -= np.outer(layer_2,pred_delta) * alpha
print(sent2output)

[[0.9622665  0.15135229 0.7360912  0.38534863 0.38324159 0.60023218
  0.68042376 0.91969805 0.62172191]
 [0.25879367 0.05658805 0.72200372 0.30802339 0.74077488 0.51300932
  0.04912222 0.96232369 0.28402187]
 [0.8103251  0.09531825 0.50272952 0.77304342 0.73242155 0.596089
  0.51464199 0.74892664 0.31344868]]


In [9]:
import sys, random, math
from collections import Counter
import numpy as np

f = open('assets/qa1_single-supporting-fact_train.txt', 'r')
raw = f.readlines()
f.close()

tokens = list()
for line in raw[0:1000]:
    tokens.append(line.lower().replace("\n", "").split(" ")[1:])

print(tokens[0:3])

[['mary', 'moved', 'to', 'the', 'bathroom.'], ['john', 'went', 'to', 'the', 'hallway.'], ['where', 'is', 'mary?', '\tbathroom\t1']]
