In [2]:
path = '../datasets/qa1_single-supporting-fact_train.txt'
f = open(path)
raw = f.readlines()
f.close()

In [3]:
def remove_noneed(r):
    no_needs = ['0','1','2','3','4','5','6','7','8','9','\n','\t1', '\t2','\t3','\t4','\t5''\t6','\t7','\t8','\t9', '.','?','\t']
    for n in no_needs:
        r = r.replace(n, '')
    return r

In [4]:
input_datasets = [remove_noneed(r).split(' ')[1:] for r in raw]

In [6]:
print(input_datasets[:10])
print(input_datasets[-10:])
print(len(input_datasets))

[['Mary', 'moved', 'to', 'the', 'bathroom'], ['John', 'went', 'to', 'the', 'hallway'], ['Where', 'is', 'Mary', 'bathroom'], ['Daniel', 'went', 'back', 'to', 'the', 'hallway'], ['Sandra', 'moved', 'to', 'the', 'garden'], ['Where', 'is', 'Daniel', 'hallway'], ['John', 'moved', 'to', 'the', 'office'], ['Sandra', 'journeyed', 'to', 'the', 'bathroom'], ['Where', 'is', 'Daniel', 'hallway'], ['Mary', 'moved', 'to', 'the', 'hallway']]
[['Where', 'is', 'Sandra', 'bedroom'], ['Mary', 'journeyed', 'to', 'the', 'kitchen'], ['John', 'went', 'back', 'to', 'the', 'bedroom'], ['Where', 'is', 'Daniel', 'office'], ['Daniel', 'travelled', 'to', 'the', 'kitchen'], ['Sandra', 'travelled', 'to', 'the', 'kitchen'], ['Where', 'is', 'John', 'bedroom'], ['Sandra', 'travelled', 'to', 'the', 'hallway'], ['Daniel', 'went', 'to', 'the', 'garden'], ['Where', 'is', 'Daniel', 'garden']]
3000


In [8]:
raw[:10]

['1 Mary moved to the bathroom.\n',
 '2 John went to the hallway.\n',
 '3 Where is Mary? \tbathroom\t1\n',
 '4 Daniel went back to the hallway.\n',
 '5 Sandra moved to the garden.\n',
 '6 Where is Daniel? \thallway\t4\n',
 '7 John moved to the office.\n',
 '8 Sandra journeyed to the bathroom.\n',
 '9 Where is Daniel? \thallway\t4\n',
 '10 Mary moved to the hallway.\n']

In [9]:
import numpy as np
from torch.utils.data import DataLoader

ModuleNotFoundError: No module named 'numpy'

In [9]:
class QA_Dataset:
    def __init__(self, path):
        self.path = path
        self.start_token = '<START>'
        self.word2ind = {}
        self.ind2word = {}
        self.corpus = []
    
    def remove_noneed(self, r):
        no_needs = ['0','1','2','3','4','5','6','7','8','9','\n','\t1', '\t2','\t3','\t4','\t5''\t6','\t7','\t8','\t9', '.','?','\t']
        for n in no_needs:
            r = r.replace(n, '')
        return r

    def parse(self):
        f = open(path)
        raw = f.readlines()
        f.close()
        
        self.corpus = [remove_noneed(self, r).split(' ')[1:] for r in raw]
        ind = 0
        # add start token
        self.word2ind[self.start_token] = ind
        self.ind2word[ind] = self.start_token
        ind = 1
        
        for sent in self.corpus:
            for w in sent:
                if w not in self.word2ind:
                    self.word2ind[w] = ind
                    self.ind2word[ind] = w
                    ind += 1
        np.random.shuffle(self.corpus)
        
    def __len__(self):
        return len(self.corpus)

    def __getitem__(self, idx):
        return np.array([self.word2ind[self.start_token]]+[self.word2ind[w] for w in self.corpus[idx]])
    
    def get_sent(self, idx):
        return self.corpus[idx]
    
    def get_vob_len(self):
        return len(self.word2ind)
    
    def get_start_token(self):
        return self.start_token


In [10]:
path = '../datasets/qa1_single-supporting-fact_train.txt'
qa_ds = QA_Dataset(path)
qa_ds.parse()
batch_size = 1
train_dataloader = DataLoader(qa_ds, batch_size=batch_size, shuffle=True)  #20000
print(qa_ds.get_sent(0), 'ind:', qa_ds[0])
print('we have', len(qa_ds),'training samples.')

['Daniel', 'moved', 'to', 'the', 'bedroom'] ind: [ 0 11  2  3  4 18]
we have 3000 training samples.


In [11]:
def softmax(pred):
    y = np.exp(pred).sum(axis=1, keepdims=True)
    return np.exp(pred)/y

def cross_entropy_loss(pred, gt):
    num = gt.shape[0]
    loss = -np.log(pred[np.arange(num), gt])
    return np.sum(loss)

# previous one may have problem, when pred = 0 or 1
def cross_entropy_loss2(pred, gt):
    loss = 0.0
    num = gt.shape[0]
    for one_pred, one_gt in zip(pred, gt):
        cur_loss = 0
        p = pred[one_gt]
        if p == 0:
            p += 1e+7
            cur_loss = -1*np.log(p)
        loss += cur_loss

    return np.mean(loss)

def calc_acc(pred, gt):
    y = pred.argmax(axis=1)
    return np.mean(y == gt)

In [17]:
alpha = 0.001
hidden_size = 10
vocab = qa_ds.get_vob_len()

word_embed = np.random.normal(0, 0.1, (vocab, hidden_size))-0.05
transition_layer = np.eye(hidden_size)
output_layer = np.random.normal(0,0.1, (hidden_size, vocab))-0.05
h0 = np.zeros((1, hidden_size))
print('we have words', vocab)

we have words 20


In [18]:
def forward(x, h=None):
    preds = []
    hidden_state = []
    
    if h is None:   # h0
        h = np.zeros((1,hidden_size))
    hidden_state.append(h)
    
    for word in x:
        word_vec = word_embed[[word]]
        h = h.dot(transition_layer)+word_vec
        pred = softmax(h.dot(output_layer))
        
        preds.append(pred)
        hidden_state.append(h)
        
    return np.concatenate(preds), np.concatenate(hidden_state)

def backward(preds, hidden_state, target):
    num = preds.shape[0]
    vocab_len = preds.shape[1]
    hidden_size = hidden_state.shape[1]
    
    output_layer_delta = np.zeros((hidden_size, vocab_len))
    transition_layer_delta = np.zeros((hidden_size, hidden_size))
    word_embed_delta = np.zeros((num, hidden_size))
    h0_delta = np.zeros((1, hidden_size))
    
    for ind in reversed(range(num)):
        pred = preds[[ind]]
        word = target[ind]
        y = np.zeros((1,vocab_len))
        prev_h = hidden_state[[ind]]   # to keep the hidden state's shape to 1*hidden_size, use [ind] not ind
        cur_h = hidden_state[[ind+1]]  # to keep the hidden state's shape to 1*hidden_size, use [ind+1] not ind+1
        y[0, word] = 1
        delta = pred - y
        output_layer_delta += cur_h.T.dot(delta)   # shape = hidden_size * vocab
        
        transition_layer_delta += prev_h.T.dot(delta.dot(output_layer.T))  # shape = hidden_size*hidden_size
        word_embed_delta[[ind]] += delta.dot(output_layer.T)                    # shape = 1*hidden_size
        
        if ind>0:
            # for previous layers' gradient
            # because previous hidden state(h) & input word do contribute to current prediction
            prev_gradient = delta.dot(output_layer.T)   # shape=1*hidden_size
            for prev_ind in reversed(range(ind)):
                prev_h = hidden_state[[prev_ind]]
                cur_gradient = prev_gradient.dot(transition_layer.T)
                transition_layer_delta += prev_h.T.dot(cur_gradient)              # shape = hidden_size*hidden_size
                word_embed_delta[[prev_ind]] += cur_gradient                      # shape = 1*hidden_size
                prev_gradient = cur_gradient
        
        h0_delta += prev_gradient.dot(transition_layer.T)
        
    return output_layer_delta, transition_layer_delta, word_embed_delta, h0_delta

def step(output_layer_delta, transition_layer_delta, word_embed_delta, h0_delta, x):
    global output_layer, transition_layer, word_embed, h0
    num = x.shape[0]
    output_layer -= alpha*output_layer_delta/num
    transition_layer -= alpha*transition_layer_delta/num
    word_embed[x] -= alpha*word_embed_delta/num
    h0 -= h0_delta/num

In [19]:
epoches = 20
for epoch in range(epoches):
    loss = 0.0
    accuracy = 0.0
    for x in train_dataloader:
        sent = x.cpu().numpy()[0]
    
        x = sent[:-1]
        y = sent[1:]
    
        preds, hidden_state = forward(x, h0)
        cur_loss = cross_entropy_loss(preds, y)
        loss += cur_loss
        accuracy += calc_acc(preds, y)
        output_layer_delta, transition_layer_delta, word_embed_delta, h0_delta = backward(preds, hidden_state, y)
        step(output_layer_delta, transition_layer_delta, word_embed_delta, h0_delta, x)
    
    print("Perplexity:", np.exp(cur_loss/len(y)))
    print('In epoch %d, train loss:%.4f, acc:%.4f' % (epoch, loss/len(train_dataloader), accuracy/len(train_dataloader)))

Perplexity: 6.951310559181079
In epoch 0, train loss:10.9838, acc:0.3187
Perplexity: 7.686951881924321
In epoch 1, train loss:9.4104, acc:0.4002
Perplexity: 12.413094807987672
In epoch 2, train loss:9.1740, acc:0.4033
Perplexity: 7.8934299815769515
In epoch 3, train loss:9.0597, acc:0.4067
Perplexity: 5.552090386425283
In epoch 4, train loss:9.0259, acc:0.4097
Perplexity: 9.631119372252218
In epoch 5, train loss:8.8700, acc:0.4106
Perplexity: 8.372487023931402
In epoch 6, train loss:9.1166, acc:0.3641
Perplexity: 3.819681996947627
In epoch 7, train loss:8.4240, acc:0.4082
Perplexity: 7.237313426697132
In epoch 8, train loss:8.3644, acc:0.4174
Perplexity: 3.6175414171901585
In epoch 9, train loss:7.1224, acc:0.4539
Perplexity: 3.627590472900009
In epoch 10, train loss:6.2953, acc:0.4843
Perplexity: 4.247551120181662
In epoch 11, train loss:5.9678, acc:0.5017
Perplexity: 2.906866548008806
In epoch 12, train loss:5.7426, acc:0.5156
Perplexity: 3.425728584616724
In epoch 13, train loss:5.5

In [20]:
transition_layer

array([[ 9.75226863e-01,  5.82407737e-02,  5.66514107e-02,
         7.81298138e-02,  2.36640783e-01,  1.22063891e-01,
         2.03308131e-01, -9.55908078e-03,  9.99884053e-02,
         2.17839597e-01],
       [-4.32038916e-02,  7.68571418e-01, -1.19955967e-01,
        -2.49706395e-02, -6.36187641e-02, -9.20372391e-02,
         1.27189003e-01,  2.08306325e-01,  2.05126403e-01,
        -5.00746898e-02],
       [ 7.93179958e-02,  7.97998891e-02,  7.30468257e-01,
         9.88133628e-02, -2.66347505e-02, -3.33391351e-02,
        -2.10883846e-01, -3.87870576e-01, -4.03085882e-01,
        -2.97779078e-02],
       [ 8.87196829e-02, -1.27084121e-01,  1.92931664e-01,
         7.34578959e-01,  7.29772396e-02, -1.44326803e-01,
         2.14297255e-01,  4.21087971e-01,  3.42861821e-01,
         1.60882723e-02],
       [ 6.47908043e-02, -1.11352852e-01,  1.27028656e-01,
        -2.48142272e-01,  8.47570100e-01, -2.16393152e-01,
        -2.87956710e-04,  2.38027919e-01,  4.45441411e-02,
        -6.

## check the word prediction

In [21]:
def predict_for_test(num=5):
    for i in range(num):
        sent_index = np.random.choice(len(qa_ds))
        
        preds,_ = forward(qa_ds[sent_index][:-1], h0)
        print(qa_ds.get_sent(sent_index))
        for pred, input_ind, target_ind in zip(preds,qa_ds[sent_index][:-1],qa_ds[sent_index][1:]):
            input = qa_ds.ind2word[input_ind]
            target = qa_ds.ind2word[target_ind]
            pred = qa_ds.ind2word[pred.argmax()]
            print("Prev Input:", input, ' '*(20-len(input)), "Target:", target, ' '*(20-len(target)), "Pred:", pred)
        print("")

In [22]:
predict_for_test()

['John', 'went', 'back', 'to', 'the', 'office']
Prev Input: <START>               Target: John                  Pred: Where
Prev Input: John                  Target: went                  Pred: went
Prev Input: went                  Target: back                  Pred: to
Prev Input: back                  Target: to                    Pred: to
Prev Input: to                    Target: the                   Pred: the
Prev Input: the                   Target: office                Pred: bathroom

['John', 'went', 'to', 'the', 'bathroom']
Prev Input: <START>               Target: John                  Pred: Where
Prev Input: John                  Target: went                  Pred: went
Prev Input: went                  Target: to                    Pred: to
Prev Input: to                    Target: the                   Pred: the
Prev Input: the                   Target: bathroom              Pred: garden

['Daniel', 'journeyed', 'to', 'the', 'garden']
Prev Input: <START>               Ta