In [1]:
cd /Users/mac/Desktop/

/Users/mac/Desktop


In [2]:
import os
import numpy as np
import sys
from collections import Counter

train_neg_data_path = 'IMDB/aclImdb/train/neg'
train_pos_data_path = 'IMDB/aclImdb/train/pos'

seed = 77
reviews = []
labels = []
         
for file in os.listdir(train_neg_data_path):
    with open(f'{train_neg_data_path}/{file}') as f:
        rev = f.readline()
        for sign in ("<br />","/","\\",")","(",";",":","?","!","<",">","#","$",",","]","["):
            rev = rev.replace(sign, '')
            rev = rev.replace('.', ' ')
            rev = rev.replace("'", ' ')
        reviews.append(rev.lower())
    labels.append('negative\n')

for file in os.listdir(train_pos_data_path):
    with open(f'{train_pos_data_path}/{file}') as f:
        rev = f.readline()
        for sign in ("<br />","/","\\",")","(",";",":","?","!","<",">","#","$",",","]","["):
            rev = rev.replace(sign, '')
            rev = rev.replace('.', ' ') 
            rev = rev.replace("'", ' ') 
        reviews.append(rev.lower())
    labels.append('positive\n')
print('done')
    
np.random.seed(seed)
np.random.shuffle(reviews)
np.random.seed(seed)
np.random.shuffle(labels) 

done


In [None]:
tokens = list(map(lambda x:x.split(' '), reviews))

vocab = set()
for sent in tokens:
    for word in sent:
        if (len(word) > 0):
            vocab.add(word)
vocab = list(vocab)

word2index = {}
for i, word in enumerate(vocab):
    word2index[word] = i
    
input_dataset = []

for sent in tokens:
    sent_indices = []
    
    for word in sent:
        try:
            sent_indices.append(word2index[word])
        except:
            ''
    input_dataset.append(list(set(sent_indices)))

target_dataset = []

for line in labels:

    if line == 'negative\n':
        target_dataset.append(0)
    elif line == 'positive\n':
        target_dataset.append(1)

def sigmoid(x):
    return 1/(1 + np.exp(-x))

alpha, iterations = (0.01, 10)
hidden_size = 100

weights_0_1 = 0.2*np.random.random((len(vocab), hidden_size)) - 0.1
weights_1_2 = 0.2*np.random.random((hidden_size, 1)) - 0.1

correct, total = (0,0)
for iter in range(iterations):
    
    for i in range(len(input_dataset) - 1000):
        x,y = (input_dataset[i], target_dataset[i])
        layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
        layer_2 = sigmoid(np.dot(layer_1, weights_1_2))
        
        layer_2_delta = layer_2 - y
        layer_1_delta = layer_2_delta.dot(weights_1_2.T)
        
        weights_0_1[x] -= layer_1_delta*alpha
        weights_1_2    -= np.outer(layer_1, layer_2_delta) * alpha
        
        if (np.abs(layer_2_delta) < 0.5):
            correct += 1
        total += 1
        if (i%1000 == 0):
            #print(layer_1)
            #print(np.dot(layer_1, weights_1_2))
            #print(layer_2)
            #print(layer_2_delta)
            progress = str(i/float(len(input_dataset)))
            sys.stdout.write('\rIter:'+str(iter) + ' Progress: ' + progress[2:4] + '.' + progress[4:6] \
                             + '% Training Accuracy: ' + str(correct/float(total)) + '%')
    print()

correct, total = (0,0)

for i in range(len(input_dataset) - 1000, len(input_dataset)):
    
    x = input_dataset[i]
    y = target_dataset[i]
        
    layer_1 = sigmoid(np.sum(weights_0_1[x], axis=0))
    layer_2 = sigmoid(np.dot(layer_1, weights_1_2))  

    if (np.abs(layer_2 - y) < 0.5):
        correct += 1
    total += 1
print("Test Accuracy: " + str(correct/float(total)))

Iter:0 Progress: 92.% Training Accuracy: 0.8304421546889266%
Iter:1 Progress: 92.% Training Accuracy: 0.8673219718729389%
Iter:2 Progress: 92.% Training Accuracy: 0.8873536992436726%
Iter:3 Progress: 92.% Training Accuracy: 0.9015799833685961%
Iter:4 Progress: 92.% Training Accuracy: 0.912580566549861%%
Iter:5 Progress: 52.% Training Accuracy: 0.9183915910406689%

In [21]:
from collections import Counter
import math

def similar(target='terrible'):
    target_index = word2index[target]
    scores = Counter()
    
    for word, index in word2index.items():
        raw_difference = weights_0_1[index] - (weights_0_1[target_index])
        squared_difference = raw_difference * raw_difference
        scores[word] = -math.sqrt(sum(squared_difference))
        
    return scores.most_common(10)
similar()


[('terrible', -0.0),
 ('gain', -0.6783184302663342),
 ('redeeming', -0.7227914850411765),
 ('pointless', -0.7258882705450946),
 ('kitchen', -0.746662505410141),
 ('horrible', -0.7570797594971905),
 ('pass', -0.7649519293351765),
 ('lame', -0.7686034708950499),
 ('connect', -0.7746148238557031),
 ('excitement', -0.7784793712546751)]

In [23]:
#Сначала поэлементное умножение, потом сумма в строку
norms = np.sum(weights_0_1 * weights_0_1, axis=1)
#print(weight_0_1.shape)   - (123950, 50)
#print(norms.shape)        - (123950,) 

norms.resize(norms.shape[0], 1)
#print(norms.shape) (123950, 1)

normed_weights = weights_0_1 * norms   #(123950, 1)
#print(normed_weights.shape) (123950, 50)


#  indices = list(map(lambda x:word2index[x], filter(lambda x:x in word2index, ['working',
# 'with', '123131313'])))
# Если слова из списка есть в словаре word2index, то выражение "x('one') in word2index" становится True,
# и это слово передается в lambda x:word2index[x], после чего превращается в индекс слова и попадает в лист.
#То есть подсчитывается среднее всех слов по обзору

def make_sent_vect(words):
    indices = list(map(lambda x:word2index[x], filter(lambda x:x in word2index, words)))
    return np.mean(normed_weights[indices], axis=0)
    #Все слова суммируем и превращаем в средний вектор

#make_sent_vect(['bad', 'lame']).shape (50,)

reviews2vectors = list()
for review in tokens:
    reviews2vectors.append(make_sent_vect(review))
reviews2vectors = np.array(reviews2vectors)
#Все обзоры в средних векторах
#print('rev2vec', reviews2vectors.shape) (25000, 50)

def most_similar_reviews(review):
    v = make_sent_vect(review) #превращаем поданный обзор в средний вектор
    scores = Counter()
    
    for i, val in enumerate(reviews2vectors.dot(v)): #перемножаем все вектора и заданный, и пишем в счетчик
        scores[i] = val                              #метрика сходства
        
    #print(scores.most_common(3)) [(4780, 31047.867819076473), (3376, 31025.682831544942), (5615, 30938.59066475642)]
    most_similar = list()
    
    for idx, score in scores.most_common(3):
        most_similar.append(raw_reviews[idx][0:200])
    return most_similar
most_similar_reviews(['beautiful', 'amazing'])


['he-he-hello this is a really fun movie  basically in party girl you have your fun-lovin  independent early 90 s new yorker chick  along with her party friends she meets a mature turkish vendor  it is ',
 '"it wasn t me it was er my twin brother rupert" bobby says to dugan when confronted about being over at sally s place  i have used this line dozens of times over the years no one has yet to believe it',
 'the fiendish plot of dr  fu manchu starring peter sellers in a spoof of the characters created by sax rohmer is an injustice to the end of sellers  career  the plot was very simplistic and if done the']