In [19]:
from torch.utils.data import DataLoader
import numpy as np
import random
import sys

# Review text file: imdb_reviews.parse_reviews reads it line by line and splits
# each line on whitespace into tokens. The path is relative to the notebook's
# working directory.
reviews_path = '../datasets/IMDB/reviews.txt'
# Label file, indexed in parallel with the reviews: an entry equal to
# 'positive' becomes class 1, anything else class 0 (see imdb_reviews.__getitem__).
labels_path = '../datasets/IMDB/labels.txt'

class imdb_reviews:
    """Map-style dataset over a reviews text file and a parallel labels file.

    The vocabulary is built from *all* reviews in the file, so the train and
    test instances share the same word <-> index mapping. The last
    ``test_num`` samples form the test split; everything before them is the
    train split.

    Args:
        reviews_path: path of the review file (one review per line, tokens
            separated by whitespace).
        labels_path: path of the label file (one label per line).
        train: True to expose the train split, False for the test split.
        test_num: number of trailing samples held out as the test split.

    Call ``parse_reviews()`` once after construction to load the data.
    """

    def __init__(self, reviews_path, labels_path, train, test_num=1000):
        self.reviews_path = reviews_path
        self.labels_path = labels_path
        self.is_train = train
        self.test_num = test_num

        self.reviews = []        # tokenised reviews of the selected split
        self.labels = []         # label strings of the selected split
        self.word2ind = []       # list of every vocabulary index (sampling pool)
        self.vocab = {}          # word  -> index
        self.vocab_reverse = {}  # index -> word

    def _select_split(self, items):
        """Return the train part (all but the last test_num) or the test part of items."""
        # An explicit boundary avoids the [:-0] / [-0:] pitfall, where a
        # test_num of 0 would empty the train split and fill the test split.
        boundary = max(0, len(items) - self.test_num)
        return items[:boundary] if self.is_train else items[boundary:]

    def parse_reviews(self):
        """Read both files, build the vocabulary and keep the selected split."""
        # Context managers close the files even if reading raises.
        with open(self.reviews_path) as f:
            raw_reviews = [line.split() for line in f]
        with open(self.labels_path) as f:
            # strip() instead of [:-1]: the last line may lack a trailing
            # newline, in which case [:-1] would cut off a real character.
            raw_labels = [line.strip() for line in f]

        # Indices are assigned in order of first appearance over the whole file.
        for tokens in raw_reviews:
            for w in tokens:
                if w not in self.vocab:
                    ind = len(self.vocab)
                    self.vocab[w] = ind
                    self.vocab_reverse[ind] = w

        self.word2ind = list(self.vocab_reverse.keys())
        self.reviews = self._select_split(raw_reviews)
        self.labels = self._select_split(raw_labels)

    def get_vocab_len(self):
        """Return the number of distinct words seen in the review file."""
        return len(self.vocab)

    def get_word_ind(self, word):
        """Return the index of word; raises KeyError for an unknown word."""
        return self.vocab[word]

    def get_ind_word(self, ind):
        """Return the word stored at index ind; raises KeyError if absent."""
        return self.vocab_reverse[ind]

    def gen_random_word(self, win=2):
        """Draw ``win`` vocabulary indices uniformly at random, with replacement.

        No index is excluded, so the result may contain duplicates or the
        caller's own target word.
        """
        targets = random.choices(self.word2ind, k=win)
        return targets

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return (array of word indices for review idx, 1 if 'positive' else 0)."""
        r = np.array([self.vocab[w] for w in self.reviews[idx]])
        l = 1 if self.labels[idx] == 'positive' else 0

        return r, l

In [20]:
# The training loop reads only element 0 of every batch, so keep batch_size at 1.
batch_size = 1
train_ds = imdb_reviews(reviews_path, labels_path, True)
train_ds.parse_reviews()
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)  #24000

test_ds = imdb_reviews(reviews_path, labels_path, False)
test_ds.parse_reviews()
test_dataloader = DataLoader(test_ds, batch_size=batch_size, shuffle=True)    #1000

# Count samples on the datasets themselves: len(dataloader) is the number of
# batches, which only equals the number of samples while batch_size is 1.
print('we have %d train samples, and %d test samples' % (len(train_ds), len(test_ds)))

we have 24000 train samples, and 1000 test samples


In [21]:
# --- training hyper-parameters -------------------------------------------
alpha = 0.05         # multiplier applied to every weight update in step()
win = 2              # context words taken on each side of the target word
negative = 5         # random words drawn per target (negative sampling)

# Label row aligned with [true target, negative samples...]:
# a single 1 followed by `negative` zeros.
target_y = np.concatenate([np.ones((1, 1)), np.zeros((1, negative))], axis=1)

hidden_size = 128    # width of each embedding row
vocab_len = train_ds.get_vocab_len()

# Two weight tables with one row per vocabulary entry, drawn from a normal
# distribution (mean 0, std 0.02) and then shifted down by 0.01.
embed_shape = (vocab_len, hidden_size)
layer0_w = np.random.normal(0, 0.02, embed_shape) - 0.01
layer1_w = np.random.normal(0, 0.02, embed_shape) - 0.01
print('length of vocab is', vocab_len)

length of vocab is 74073


In [22]:
def sigmoid(x):
    """Element-wise logistic function 1 / (1 + exp(-x)).

    Computed as exp(-log(1 + exp(-x))) through np.logaddexp, so large
    negative inputs do not overflow in exp() the way the direct formula does.

    Args:
        x: scalar or numpy array.

    Returns:
        Value(s) in [0, 1] with the same shape as x.
    """
    return np.exp(-np.logaddexp(0, -x))

def calc_acc(pred, gt):
    """Fraction of entries whose thresholded prediction equals the label.

    Args:
        pred: array of probabilities; values >= 0.5 count as class 1.
        gt: array of 0/1 labels, broadcastable against pred.

    Returns:
        Mean of the element-wise matches, as a float in [0, 1].
    """
    # The builtin int replaces the np.int alias, which NumPy has removed.
    y = (pred >= 0.5).astype(int)
    return np.mean(y == gt)

def cross_entropy_loss(pred, gt, eps=1e-7):
    """Mean binary cross-entropy between predictions and 0/1 labels.

    Args:
        pred: array of predicted probabilities.
        gt: array of labels, same shape as pred.
        eps: predictions are clipped to [eps, 1 - eps] before the logarithm,
            so exact 0 or 1 cannot produce log(0) = -inf and the resulting
            0 * -inf = nan terms.

    Returns:
        The loss averaged over all entries.
    """
    pred = np.clip(pred, eps, 1 - eps)
    loss = -gt*np.log(pred)-(1-gt)*np.log(1-pred)
    return np.mean(loss)

def cross_entropy_loss2(pred, gt):
    """Binary cross-entropy over the first row, safe for predictions of exactly 0 or 1.

    A prediction that would hit log(0) is nudged by 1e-7 first.

    Args:
        pred: 2-D array of probabilities; only row 0 is read.
        gt: 2-D array of labels; only row 0 is read. Entries equal to 1 are
            positives, everything else is treated as a negative.

    Returns:
        The loss *summed* over the entries of row 0.
        NOTE(review): the final np.mean runs on an already-summed scalar and
        is therefore a no-op; it is kept so callers see unchanged values.
    """
    eps = 1e-7
    loss = 0.0
    for one_pred, one_gt in zip(pred[0], gt[0]):
        if one_gt == 1:
            if one_pred == 0:
                # Was `+= 1e+7`, which produced -log(1e7) < 0, i.e. a large
                # *negative* loss for a maximally wrong prediction.
                one_pred += eps
            cur_loss = -1*np.log(one_pred)
        else:
            if 1-one_pred == 0:
                one_pred -= eps
            cur_loss = -1*np.log(1-one_pred)
        loss += cur_loss

    return np.mean(loss)

In [23]:
def forward(left, right, target_sample):
    """Score candidate words against the averaged context embedding.

    Args:
        left: list of word indices before the target position.
        right: list of word indices after the target position.
        target_sample: list of candidate word indices to score.

    Returns:
        (pred, word_embed): pred has shape (1, len(target_sample)) and holds
        sigmoid(word_embed . layer1_w[candidate]); word_embed has shape
        (1, hidden) and is the mean of the context rows of layer0_w.
    """
    context = left + right
    if len(context) == 0:
        # Averaging zero rows would yield NaN and, through step(), poison
        # layer1_w. A zero embedding gives pred == 0.5 and zero gradients
        # for layer1_w, so the update becomes a no-op.
        word_embed = np.zeros((1, layer0_w.shape[1]))
    else:
        word_embed = np.mean(layer0_w[context], axis=0, keepdims=True)

    pred = word_embed.dot(layer1_w[target_sample].T)
    pred = sigmoid(pred)

    return pred, word_embed

def backward(x, word_embed, pred, gt):
    """Gradients of the loss w.r.t. the sampled output rows and the context embedding.

    Args:
        x: list of word indices that were scored (rows of layer1_w).
        word_embed: (1, hidden) averaged context embedding from forward().
        pred: (1, len(x)) predicted probabilities.
        gt: (1, len(x)) target labels.

    Returns:
        (layer1_grad, embed_grad): layer1_grad has shape (len(x), hidden),
        one row per entry of x; embed_grad has shape (1, hidden).
    """
    rows = pred.shape[0]
    err = (pred - gt) / rows                      # 1 x len(x)

    # Product of embedding and error, transposed so each row lines up with
    # the matching row of layer1_w[x].
    layer1_grad = np.dot(word_embed.T, err).T     # len(x) x hidden
    embed_grad = np.dot(err, layer1_w[x])         # 1 x hidden

    return layer1_grad, embed_grad

def step(layer1_delta, w_delta, left, right, target_sample):
    """Apply one gradient update, in place, to layer1_w and layer0_w.

    Args:
        layer1_delta: (len(target_sample), hidden) gradient rows for layer1_w.
        w_delta: (1, hidden) gradient for every context row of layer0_w.
        left, right: lists of context word indices.
        target_sample: list of scored word indices.

    np.subtract.at is used instead of ``w[idx] -= g`` because fancy-index
    assignment applies only one update per repeated index. A word that occurs
    twice in the context, or a negative sample that repeats, would otherwise
    lose part of its gradient.
    """
    # Explicit integer dtype keeps an empty context a valid (empty) index array.
    sample_idx = np.asarray(target_sample, dtype=np.intp)
    context_idx = np.asarray(left + right, dtype=np.intp)

    # Both tables are module-level arrays mutated in place.
    np.subtract.at(layer1_w, sample_idx, alpha*layer1_delta)
    np.subtract.at(layer0_w, context_idx, alpha*w_delta)
        
def test():
    """Evaluate the negative-sampling objective on the test reviews.

    The previous body called forward(x) with a single argument (forward takes
    left, right and target_sample) and compared the output with sentiment
    labels, which this model does not predict. This version mirrors the
    training loop without updating any weights: for every word position it
    builds the context window, samples negatives and scores them.

    Returns:
        (loss, accuracy) averaged over all evaluated word positions, where
        loss is the value returned by cross_entropy_loss2 and accuracy the
        value returned by calc_acc against target_y. Both are nan when no
        position could be evaluated.
    """
    loss = 0.0
    accuracy = 0.0
    positions = 0
    for x, _label in test_dataloader:
        # batch_size is 1: element 0 is the whole review as a list of indices
        tokens = x.cpu().numpy().tolist()[0]

        for i, target_word in enumerate(tokens):
            left = tokens[max(0, i-win):i]
            right = tokens[i+1:i+1+win]
            if not left and not right:
                # a one-word review has no context to predict from
                continue
            target_sample = [target_word] + test_ds.gen_random_word(negative)

            pred, _ = forward(left, right, target_sample)
            loss += cross_entropy_loss2(pred, target_y)
            accuracy += calc_acc(pred, target_y)
            positions += 1

    if positions == 0:
        return float('nan'), float('nan')
    return loss/positions, accuracy/positions

In [24]:
epochs = 2
train_ds_num = len(train_dataloader)
total_reviews = epochs * train_ds_num
for epoch in range(epochs):
    for idx, (x, _label) in enumerate(train_dataloader):
        # batch_size is 1: element 0 is the whole review as a list of word
        # indices. The sentiment label is not used by this model.
        x = x.cpu().numpy().tolist()[0]

        for i, target_word in enumerate(x):
            # Up to `win` words on each side; slicing clamps at the list ends.
            left = x[max(0, i-win):i]
            right = x[i+1:i+1+win]
            if not left and not right:
                # One-word review: averaging an empty context yields NaN,
                # which step() would then write into layer1_w.
                continue
            target_sample = [target_word]+train_ds.gen_random_word(negative) # negative sampling

            pred, word_embed = forward(left, right, target_sample)

            layer1_delta, w_delta = backward(target_sample, word_embed, pred, target_y)
            step(layer1_delta, w_delta, left, right, target_sample)

        if (idx+1) % 200 == 0:
            # idx+1 reviews are finished in this epoch, so the last report is exactly 1.0
            done = idx + 1 + epoch*train_ds_num
            sys.stdout.write('\rCurrent Progress:'+str(done/total_reviews))

# `pred` and `target_sample` stay bound to the last training step and are
# inspected in the following cells.

Current Progress:0.999979166666666645

In [25]:
# Probabilities from the last training step: column 0 belongs to the true
# target word, the remaining columns to the randomly sampled words.
pred

array([[0.99976412, 0.02828265, 0.00983641, 0.01533648, 0.14435772,
        0.02797571]])

In [26]:
# Word indices scored in the last training step: the true target first,
# followed by the randomly sampled words.
target_sample

[6, 48064, 37975, 29847, 31874, 10907]

In [27]:
def calc_similar(w_vec1, words_embed, topk=10):
    """Return the topk rows of words_embed closest to w_vec1 (Euclidean distance).

    All distances are computed in one vectorised pass instead of a Python
    loop over every row.

    Args:
        w_vec1: query vector of shape (hidden,) or (1, hidden).
        words_embed: (vocab, hidden) embedding table.
        topk: number of neighbours to return.

    Returns:
        List of (row_index, distance) tuples sorted by ascending distance;
        ties keep ascending row order.
    """
    words_embed = np.asarray(words_embed)
    distances = np.sqrt(np.sum((words_embed - w_vec1) ** 2, axis=1))
    # A stable sort keeps equal distances in row order.
    nearest = np.argsort(distances, kind='stable')[:topk]
    return [(int(ind), distances[ind]) for ind in nearest]

In [34]:
# Nearest neighbours of one query word in the layer0_w embedding table.
word = 'beautiful'
word_ind = train_ds.get_word_ind(word)
word_vec = layer0_w[word_ind]
similar_topk = calc_similar(word_vec, layer0_w, topk=10)
print(word,'\' similar top 10 is:')
for one_similar in similar_topk:
    neighbour_ind, distance = one_similar
    print(train_ds.get_ind_word(neighbour_ind),':',distance)

beautiful ' similar top 10 is:
beautiful : 0.0
creepy : 5.483081724418267
gorgeous : 5.526548973779048
simple : 5.693494642706597
cute : 5.783632045118047
hilarious : 5.872267160125815
amazing : 5.938913670649891
laughable : 5.95729117686135
bizarre : 6.009242231104356
design : 6.042151794856582


In [29]:
# Same nearest-neighbour query as for 'beautiful', for a negative adjective.
word = 'terrible'
word_ind = train_ds.get_word_ind(word)
word_vec = layer0_w[word_ind]
similar_topk = calc_similar(word_vec, layer0_w, topk=10)
print(word,'\' similar top 10 is:')
for one_similar in similar_topk:
    neighbour_ind, distance = one_similar
    print(train_ds.get_ind_word(neighbour_ind),':',distance)

terrible ' similar top 10 is:
terrible : 0.0
horrible : 3.6985681519161497
ridiculous : 4.113903726515322
fantastic : 4.274580778626832
laughable : 4.309029905230709
unbelievable : 4.70128648631396
amazing : 4.753286227954823
weak : 4.78175429792215
ok : 4.806621719440567
amateurish : 4.823310725853438


# What is King - Man + Woman?

In [31]:
def analogy(words_embed, positive=['terrible','good'], negative=['bad'], topk=10):
    """Find the rows nearest to sum(positive vectors) - sum(negative vectors).

    Args:
        words_embed: (vocab, hidden) embedding table.
        positive: words whose vectors are added to the query.
        negative: words whose vectors are subtracted from the query.
        topk: number of neighbours to return.

    Returns:
        List of (row_index, distance) tuples sorted by ascending Euclidean
        distance to the query; ties keep ascending row order. The query words
        themselves are not excluded from the result.

    Raises:
        KeyError: if a word is not in train_ds's vocabulary.
    """
    words_embed = np.asarray(words_embed)
    # Size the query from the table itself rather than a global, so any
    # embedding width works.
    query_word_embed = np.zeros((1, words_embed.shape[1]))
    for one_word in positive:
        query_word_embed += words_embed[train_ds.get_word_ind(one_word)]

    for one_word in negative:
        query_word_embed -= words_embed[train_ds.get_word_ind(one_word)]

    # One vectorised pass over the table instead of a Python loop per row.
    distances = np.sqrt(np.sum((words_embed - query_word_embed) ** 2, axis=1))
    nearest = np.argsort(distances, kind='stable')[:topk]
    return [(int(ind), distances[ind]) for ind in nearest]


# terrible+good-bad = ?

In [32]:
# Rank the vocabulary by distance to vec('terrible') + vec('good') - vec('bad').
similar = analogy(layer0_w, positive=['terrible','good'], negative=['bad'])
for one in similar:
    word = train_ds.get_ind_word(one[0])
    padding = ' '*(15-len(word))   # pads the word column
    print(word, padding, ':  ', one[1])

good             :   9.078256428583233
great            :   10.527742974507317
fine             :   10.558165287023504
wonderful        :   10.842303564946338
interesting      :   10.993378847357336
excellent        :   11.06400691229927
amazing          :   11.171037603024756
fantastic        :   11.33046015876567
strong           :   11.366604138898591
nice             :   11.380902566844185


# Elizabeth + he - she = ?

In [33]:
# Rank the vocabulary by distance to vec('elizabeth') + vec('he') - vec('she').
similar = analogy(layer0_w, positive=['elizabeth','he'], negative=['she'])
for one in similar:
    word = train_ds.get_ind_word(one[0])
    padding = ' '*(15-len(word))   # pads the word column
    print(word, padding, ':  ', one[1])

simon            :   8.932276197819354
dylan            :   8.937588886962676
hawke            :   8.949760044293816
perry            :   9.040179995689215
wallace          :   9.04926009813364
vincent          :   9.063156965830961
maggie           :   9.066293520456885
hamilton         :   9.069684751450241
hudson           :   9.103670036152934
bacon            :   9.104474143165621


# I think simon is a good choice