In [62]:
from torch.utils.data import DataLoader
import numpy as np

reviews_path = '../reviews.txt'
labels_path = '../labels.txt'

class imdb_reviews:
    """Map-style dataset over a reviews file and a parallel labels file.

    Each line of the reviews file is one review, tokenised by splitting on
    whitespace; each line of the labels file is the label of the review on
    the same line.  The last ``test_num`` lines form the test split and the
    remaining lines form the train split.

    The vocabulary is always built from *all* reviews (train and test), so a
    train instance and a test instance created from the same file share the
    same word -> index mapping.

    Args:
        reviews_path: path of the reviews text file.
        labels_path: path of the labels text file.
        train: True to expose the train split, False for the test split.
        test_num: number of trailing lines held out as the test split.
    """

    def __init__(self, reviews_path, labels_path, train, test_num=1000):
        self.reviews_path = reviews_path
        self.labels_path = labels_path
        self.is_train = train
        self.test_num = test_num

        self.reviews = []          # list of token lists for the selected split
        self.labels = []           # list of label strings for the selected split
        self.vocab = {}            # word -> integer index
        self.vocab_reverse = {}    # integer index -> word

    def _select_split(self, items):
        """Return the train or test part of ``items`` (test = last ``test_num``)."""
        # An explicit split index avoids the ``[:-0]`` pitfall, which would make
        # the train split empty when test_num is 0.
        split = max(len(items) - self.test_num, 0)
        return items[:split] if self.is_train else items[split:]

    def parse_reviews(self):
        """Read both files, build the vocabulary and keep the selected split."""
        with open(self.reviews_path) as f:
            raw_reviews = [line.split() for line in f]

        with open(self.labels_path) as f:
            # Strip only the newline: slicing off the last character would
            # corrupt the final label when the file has no trailing newline.
            raw_labels = [line.rstrip('\n') for line in f]

        # Continue numbering after any existing entries so that calling this
        # method again can never hand out an index that is already in use.
        ind = len(self.vocab)
        for r in raw_reviews:
            for w in r:
                if w not in self.vocab:
                    self.vocab[w] = ind
                    self.vocab_reverse[ind] = w
                    ind += 1

        self.reviews = self._select_split(raw_reviews)
        self.labels = self._select_split(raw_labels)

    def get_vocab_len(self):
        """Return the number of distinct words seen in the reviews file."""
        return len(self.vocab)

    def get_word_ind(self, word):
        """Return the index of ``word``; raises KeyError for unknown words."""
        return self.vocab[word]

    def get_ind_word(self, ind):
        """Return the word stored at index ``ind``; raises KeyError if absent."""
        return self.vocab_reverse[ind]

    def __len__(self):
        return len(self.labels)

    def __getitem__(self, idx):
        """Return ``(indices, label)`` for review ``idx``.

        ``indices`` is an array of the distinct vocabulary indices occurring in
        the review (duplicates removed, order unspecified).  ``label`` is 1 when
        the label string is exactly 'positive' and 0 otherwise.
        """
        unique_inds = {self.vocab[w] for w in self.reviews[idx]}
        # Explicit integer dtype so an empty review still yields an index array
        # (np.array([]) would default to float and cannot be used as an index).
        r = np.array(list(unique_inds), dtype=np.int64)
        l = 1 if self.labels[idx] == 'positive' else 0

        return r, l

In [63]:
# Each sample is an index array whose length depends on the review, so the
# default DataLoader collation can only batch one review at a time.
# NOTE(review): raising batch_size would need a custom collate_fn - confirm
# before changing.
batch_size = 1

# Both datasets parse the full reviews file, so they build the same vocabulary;
# only the slice of reviews/labels they expose differs.
train_ds = imdb_reviews(reviews_path, labels_path, True)
train_ds.parse_reviews()
train_dataloader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)

test_ds = imdb_reviews(reviews_path, labels_path, False)
test_ds.parse_reviews()
test_dataloader = DataLoader(test_ds, batch_size=batch_size, shuffle=True)

# len(DataLoader) is the number of batches, which only equals the number of
# samples while batch_size == 1; report the dataset sizes instead.
print('we have %d train samples, and %d test samples' % (len(train_ds), len(test_ds)))

we have 24000 train samples, and 1000 test samples


In [64]:
# Step size applied to every weight update in step().
alpha = 0.1
hidden_size = 128
vocab_len = train_ds.get_vocab_len()

# Weights are drawn from N(0, 0.02) and then shifted down by 0.01.
# layer0_w holds one hidden_size-wide row per vocabulary word;
# layer1_w maps the hidden vector to a single output value.
layer0_w = np.random.normal(loc=0, scale=0.02, size=(vocab_len, hidden_size)) - 0.01
layer1_w = np.random.normal(loc=0, scale=0.02, size=(hidden_size, 1)) - 0.01
print('length of vocab is', vocab_len)

length of vocab is 74073


In [65]:
def sigmoid(x):
    """Return the logistic function 1 / (1 + exp(-x)) of ``x``, element-wise."""
    decay = np.exp(x * -1)
    return 1 / (decay + 1)

def calc_acc(pred, gt):
    """Return the fraction of predictions that match the ground truth.

    A prediction counts as class 1 when it is >= 0.5, otherwise class 0.

    Args:
        pred: array-like of probabilities, e.g. shape (num, 1) or (num,).
        gt: array-like of 0/1 labels with the same number of elements.

    Returns:
        Mean of the element-wise matches as a float in [0, 1].
    """
    # Flatten both sides so a (num, 1) prediction column is compared
    # element-by-element with a (num,) label vector instead of broadcasting
    # into a (num, num) matrix.
    pred_flat = np.asarray(pred).reshape(-1)
    gt_flat = np.asarray(gt).reshape(-1)
    # The np.int alias was removed from NumPy; the builtin int is equivalent.
    y = (pred_flat >= 0.5).astype(int)
    return np.mean(y == gt_flat)

def cross_entropy_loss(pred, gt, eps=1e-7):
    """Return the mean binary cross-entropy between ``pred`` and ``gt``.

    Args:
        pred: array-like of predicted probabilities.
        gt: array-like of 0/1 targets with the same number of elements.
        eps: predictions are clipped to [eps, 1 - eps] before the logarithm.

    Returns:
        The mean of -gt*log(pred) - (1-gt)*log(1-pred) over all elements.
    """
    # Clipping keeps log() finite when a prediction is exactly 0 or 1;
    # without it the loss becomes inf, or nan through 0 * log(0).
    p = np.clip(np.asarray(pred, dtype=float).reshape(-1), eps, 1 - eps)
    # Flattening avoids broadcasting (num, 1) against (num,) into (num, num).
    g = np.asarray(gt).reshape(-1)
    loss = -g*np.log(p)-(1-g)*np.log(1-p)
    return np.mean(loss)

# Numerically safe binary cross-entropy: log(0) is avoided when a prediction
# is exactly 0 or 1.
def cross_entropy_loss2(pred, gt):
    """Return the mean binary cross-entropy over the samples in ``pred``.

    For a target of 1 the per-sample loss is -log(pred); for any other target
    it is -log(1 - pred).  Predictions are clipped to [1e-7, 1 - 1e-7] so the
    logarithm is always finite.  ``pred`` is never modified.

    Args:
        pred: array-like of probabilities, e.g. shape (num, 1) or (num,).
        gt: array-like of targets with the same number of elements.

    Returns:
        The mean per-sample loss, or 0.0 when ``pred`` is empty.
    """
    eps = 1e-7
    # np.clip returns a new array, so the caller's predictions stay untouched.
    p = np.clip(np.asarray(pred, dtype=float).reshape(-1), eps, 1 - eps)
    g = np.asarray(gt).reshape(-1)
    if p.size == 0:
        return 0.0

    per_sample = np.where(g == 1, -np.log(p), -np.log(1 - p))
    # Average over samples so the value does not grow with the batch size.
    return np.mean(per_sample)

In [66]:
def forward(x):
    """Run the two-layer model on a batch of word-index arrays.

    For every entry of ``x`` the selected rows of ``layer0_w`` are averaged
    into one hidden vector; the stacked hidden vectors are multiplied by
    ``layer1_w`` and squashed with ``sigmoid``.

    Returns:
        (pred, word_embed): the sigmoid outputs and the stacked hidden vectors.
    """
    word_embed = np.stack([np.mean(layer0_w[indices], axis=0) for indices in x])
    pred = sigmoid(word_embed.dot(layer1_w))

    return pred, word_embed

def backward(x, word_embed, pred, gt):
    """Compute the update terms for both layers from one forward pass.

    Args:
        x: the batch of word-index arrays (not used in the computation).
        word_embed: hidden vectors returned by forward().
        pred: sigmoid outputs returned by forward(), one row per sample.
        gt: targets; reshaped to a column with one row per sample.

    Returns:
        (layer1_delta, w_delta): the term subtracted from ``layer1_w`` and the
        per-sample term subtracted from the used rows of ``layer0_w``.
    """
    batch = pred.shape[0]
    # Output error, averaged over the batch.
    delta = (pred - gt.reshape(batch, 1)) / batch

    # Error pushed back through layer1_w to the hidden vectors.
    w_delta = delta.dot(layer1_w.T)
    layer1_delta = word_embed.T.dot(delta)

    return layer1_delta, w_delta

def step(layer1_delta, w_delta, x):
    """Apply one gradient-descent update, scaled by ``alpha``, to both layers.

    ``layer1_w`` is rebound to a new array; the rows of ``layer0_w`` listed in
    each entry of ``x`` are updated with that sample's row of ``w_delta``.
    """
    global layer0_w, layer1_w
    layer1_w = layer1_w - alpha * layer1_delta

    for row, indices in enumerate(x):
        sample_update = alpha * w_delta[row]
        layer0_w[indices] = layer0_w[indices] - sample_update
        
def test():
    """Evaluate the current weights on ``test_dataloader``.

    Returns:
        (loss, accuracy), each averaged over the number of batches.
    """
    total_loss = 0
    total_acc = 0
    for x, y in test_dataloader:
        # The loader yields tensors; the model works on NumPy arrays.
        x, y = x.cpu().numpy(), y.cpu().numpy()

        pred, _ = forward(x)
        total_loss += cross_entropy_loss2(pred, y)
        total_acc += calc_acc(pred, y)

    n_batches = len(test_dataloader)
    return total_loss/n_batches, total_acc/n_batches

In [67]:
# Training loop: for every epoch, run one forward/backward/update pass per
# batch of train_dataloader, then evaluate on the test split and print the
# running averages.
epochs = 15
for epoch in range(epochs):
    # Running sums over the batches of this epoch.
    loss = 0
    accuracy = 0
    for idx, (x,y) in enumerate(train_dataloader):
        # The loader yields tensors; the model works on NumPy arrays.
        x = x.cpu().numpy()
        y = y.cpu().numpy()
        
        pred, word_embed = forward(x)
        # Metrics are taken before the weights are updated for this batch.
        loss += cross_entropy_loss2(pred, y)
        accuracy += calc_acc(pred, y)
        
        layer1_delta, w_delta = backward(x, word_embed, pred, y)
        step(layer1_delta, w_delta, x)
    
    # do test
    test_loss, test_acc = test()
    # idx is the index of the last batch, so idx+1 is the number of batches.
    print('in epoch %d, train loss: %.4f, train acc: %.4f, test loss: %.4f, test acc: %.4f' % \
          (epoch, loss/(idx+1), accuracy/(idx+1), test_loss, test_acc))

in epoch 0, train loss: 0.3681, train acc: 0.8356, test loss: 0.3564, test acc: 0.8530
in epoch 1, train loss: 0.2500, train acc: 0.9025, test loss: 0.3596, test acc: 0.8510
in epoch 2, train loss: 0.2124, train acc: 0.9213, test loss: 0.3709, test acc: 0.8560
in epoch 3, train loss: 0.1836, train acc: 0.9334, test loss: 0.4059, test acc: 0.8470
in epoch 4, train loss: 0.1595, train acc: 0.9446, test loss: 0.4139, test acc: 0.8490
in epoch 5, train loss: 0.1400, train acc: 0.9530, test loss: 0.4390, test acc: 0.8420
in epoch 6, train loss: 0.1199, train acc: 0.9609, test loss: 0.4669, test acc: 0.8400
in epoch 7, train loss: 0.1039, train acc: 0.9667, test loss: 0.4882, test acc: 0.8450
in epoch 8, train loss: 0.0891, train acc: 0.9715, test loss: 0.6039, test acc: 0.8280
in epoch 9, train loss: 0.0763, train acc: 0.9755, test loss: 0.5948, test acc: 0.8360
in epoch 10, train loss: 0.0640, train acc: 0.9792, test loss: 0.6537, test acc: 0.8300
in epoch 11, train loss: 0.0554, train acc

In [68]:
def calc_similar(w_vec1, words_embed, topk=10):
    """Return the ``topk`` rows of ``words_embed`` closest to ``w_vec1``.

    Closeness is the Euclidean distance, so smaller means more similar.

    Returns:
        A list of (row_index, distance) pairs sorted by increasing distance;
        ties keep their original row order.
    """
    ranked = [
        (row, np.sum((w_vec1 - vec) ** 2) ** 0.5)
        for row, vec in enumerate(words_embed)
    ]
    ranked.sort(key=lambda pair: pair[1])
    return ranked[:topk]

In [69]:
# Look up the learned layer0_w row for one word and list the words whose rows
# are closest to it (Euclidean distance, via calc_similar).
# NOTE(review): this cell is repeated below with only `word` changed; a small
# helper taking the word as a parameter would remove the duplication.
word = 'beautiful'
word_ind = train_ds.get_word_ind(word)
word_vec = layer0_w[word_ind,:]
# The whole weight matrix is searched, so the query word itself is included.
similar_topk = calc_similar(word_vec, layer0_w)
print(word,'\' similar top 10 is:')
# Each entry is (vocabulary index, distance); map the index back to its word.
for one_similar in similar_topk:
    print(train_ds.get_ind_word(one_similar[0]),':',one_similar[1])

beautiful ' similar top 10 is:
beautiful : 0.0
madness : 0.3410572914219501
complaint : 0.37086161466842504
cleverly : 0.38595111735481274
episodes : 0.3860301084633669
rapid : 0.38606041506936145
deserves : 0.393883906669571
enjoyed : 0.3973891232859879
deanna : 0.41394740062047597
bumped : 0.414539686895638


In [70]:
# Look up the learned layer0_w row for one word and list the words whose rows
# are closest to it (Euclidean distance, via calc_similar).
# NOTE(review): same steps as the 'beautiful' cell above with a different
# `word`; a small helper taking the word as a parameter would avoid the copy.
word = 'terrible'
word_ind = train_ds.get_word_ind(word)
word_vec = layer0_w[word_ind,:]
# The whole weight matrix is searched, so the query word itself is included.
similar_topk = calc_similar(word_vec, layer0_w)
print(word,'\' similar top 10 is:')
# Each entry is (vocabulary index, distance); map the index back to its word.
for one_similar in similar_topk:
    print(train_ds.get_ind_word(one_similar[0]),':',one_similar[1])

terrible ' similar top 10 is:
terrible : 0.0
horrible : 0.37675105548173093
supposedly : 0.4239087393245637
christmas : 0.45122855728386285
confusing : 0.4881462609517464
junk : 0.5213723338226643
unintentional : 0.5332199904651174
incoherent : 0.5621633789855115
wayans : 0.568229124288795
flimsy : 0.6304759367890448
