In [10]:
import encoder
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
from scipy.stats import bernoulli as bern
import heapq
import time
import torch.optim as optim
from nltk.translate.bleu_score import sentence_bleu, corpus_bleu, SmoothingFunction
import pickle

In [11]:
# get vocab
# Directory name handed to the project-local `encoder` module to build the tokenizer.
model_dir = 'gpt_vocab'
enc = encoder.get_encoder(model_dir)
# Prefer the first CUDA device when one is available, otherwise fall back to the CPU.
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
# define parameters
class HParams():
    """Shared settings read by the model classes and the training loop in this notebook."""
    # presumably the size of the tokenizer vocabulary — TODO confirm against `enc`
    n_vocab = 50257
    # presumably the width of the conversation/tuple embeddings — TODO confirm
    n_embed = 768
    # token id fed to the decoder as its first input in Decoder.decode
    start_token = 50256
    # number of examples per training batch
    batch_size = 32
    # number of tuples the attention module attends over (stored as Attn.k)
    n_tuples = 2000
    # the torch.device selected above
    device = device

hparams = HParams()

In [12]:
class Attn(nn.Module):
    """Attention of each conversation vector over one shared set of tuple vectors.

    Every (tuple, conversation) pair is scored by a small feed-forward net, the
    scores are soft-maxed over the tuples, and the tuples are averaged with
    those weights.

    Args:
        hparams: settings object; ``n_tuples`` is stored as ``self.k`` for
            reference only, the real sizes are taken from the inputs.
        input_dim: width of the tuple and conversation vectors.
        hid_dim: hidden width of the scoring network.
    """

    def __init__(self, hparams, input_dim = 768, hid_dim = 256):
        super(Attn,self).__init__()
        self.input_dim = input_dim
        self.hid_dim = hid_dim
        self.hparams = hparams
        # Nominal number of tuples; forward() reads the actual count from h_k.
        self.k = hparams.n_tuples
        # NOTE(review): two Linear layers with no activation in between compose
        # to a single affine map — confirm whether a non-linearity was intended.
        self.attn = nn.Sequential(nn.Linear(input_dim*2,hid_dim), nn.Linear(hid_dim,1))

    def forward(self, h_k, h_c):
        """Return the attention-weighted tuple summary for every batch row.

        Args:
            h_k: tuple vectors, ``[k, input_dim]`` (shared by the whole batch).
            h_c: conversation vectors, ``[batch_size, input_dim]``.

        Returns:
            Tensor ``[batch_size, input_dim]``.
        """
        # Sizes come from the inputs so a short final batch or a different
        # number of tuples does not break the concatenation below.
        n_tuples = h_k.shape[0]
        batch_size = h_c.shape[0]

        # expand() creates broadcast views instead of the materialised copies
        # that torch.cat([x] * n) produced.
        h_c_exp = h_c.unsqueeze(1).expand(-1, n_tuples, -1)     #[batch_size, k, input_dim]
        h_k_exp = h_k.unsqueeze(0).expand(batch_size, -1, -1)   #[batch_size, k, input_dim]
        h_comb = torch.cat((h_k_exp, h_c_exp), dim=2)           #[batch_size, k, input_dim*2]

        attn_logits = self.attn(h_comb).squeeze(2)  #[batch_size, k]
        # Normalise over the tuple axis; dim is explicit because the implicit
        # choice is deprecated.
        attn_weight = F.softmax(attn_logits, dim=1)
        # [batch_size, k] @ [k, input_dim] -> [batch_size, input_dim]; the same
        # weighted sum as the former bmm, without a per-row copy of h_k.
        return torch.matmul(attn_weight, h_k)

In [13]:
class DecoderLSTM(nn.Module):
    """One decoding step: token embedding -> dropout -> stacked LSTM -> vocabulary projection."""

    def __init__(self,embedding_size = 256, num_units = 768, vocab_size = 50257, dropout_p = 0.1, num_layers = 2):
        super().__init__()
        # Configuration is kept on the instance; Decoder reads num_units and vocab_size.
        self.embedding_size, self.num_units = embedding_size, num_units
        self.vocab_size, self.dropout_p = vocab_size, dropout_p

        # Sub-modules are created in the same order as before so that seeded
        # initialisation draws the same random numbers.
        self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_size)
        self.lstm = nn.LSTM(
            input_size=embedding_size,
            hidden_size=num_units,
            num_layers=num_layers,
            batch_first=True,
        )
        self.Linear = nn.Linear(in_features=num_units, out_features=vocab_size)
        self.dropout = nn.Dropout(p=dropout_p)

    def forward(self, input, hidden, cell):
        """Advance the LSTM by one token.

        input  = [batch_size] token ids
        hidden = [num_layers, batch_size, num_units]
        cell   = [num_layers, batch_size, num_units]

        Returns (logits [batch_size, 1, vocab_size], hidden, cell) with the
        states in the same layout as they were passed in.
        """
        step_ids = input.unsqueeze(1)                 #[batch_size, 1]
        embedded = self.embedding(step_ids)           #[batch_size, 1, embedding_size]
        embedded = self.dropout(embedded)

        lstm_out, state = self.lstm(embedded, (hidden, cell))   #[batch_size, 1, num_units]
        next_hidden, next_cell = state

        return self.Linear(lstm_out), next_hidden, next_cell



class Decoder(nn.Module):
    """Autoregressive decoder built on DecoderLSTM.

    The two LSTM layers start from the conversation vector (layer 0) and the
    attended tuple vector (layer 1); the cell state starts at zero.

    Args:
        hparams: settings object providing ``start_token``, ``batch_size`` and
            ``device``.
        embedding_size, num_units, vocab_size, dropout_p: forwarded to
            DecoderLSTM.
        seq_len, batch_size: accepted for backward compatibility; not used.
        teacher_forcing_ratio: probability, per step, of feeding the ground
            truth token instead of the model's own prediction.
    """

    def __init__(self, hparams, embedding_size = 256, num_units = 768, vocab_size = 50257, dropout_p = 0.1, seq_len = 100, batch_size = 32, teacher_forcing_ratio = 0.5):
        super(Decoder,self).__init__()

        self.LSTM = DecoderLSTM(embedding_size, num_units,vocab_size,dropout_p)
        self.start_token = hparams.start_token
        # Kept as an attribute for existing callers; forward()/decode() size
        # their buffers from the actual inputs instead.
        self.batch_size = hparams.batch_size
        self.teacher_forcing_ratio = teacher_forcing_ratio
        self.hparams = hparams


    def forward(self, trg, h_c, h_k):
        """Teacher-forced decoding of a target batch.

        Args:
            trg: target token ids, ``[batch_size, seq_len]``; column 0 is the
                first decoder input.
            h_c: ``[batch_size, num_units]`` initial hidden state of layer 0.
            h_k: ``[batch_size, num_units]`` initial hidden state of layer 1.

        Returns:
            Logits ``[batch_size, seq_len - 1, vocab_size]``.
        """
        # Use the real batch size so a short final batch works.
        batch_size = trg.shape[0]

        input = trg[:,0]
        hidden = torch.stack((h_c,h_k),dim=0)   #[2, batch_size, num_units]
        # new_zeros keeps the cell state on the same device/dtype as the hidden state.
        cell = hidden.new_zeros(2, batch_size, self.LSTM.num_units)

        # Collect per-step logits and concatenate once; re-concatenating the
        # growing tensor every step copied it quadratically.
        step_logits = []
        for t in range(1, trg.shape[1]):
            output, hidden, cell = self.LSTM(input,hidden,cell)   #[batch_size, 1, vocab_size]
            step_logits.append(output)

            top1 = output.squeeze(1).argmax(1)

            # One draw per step decides between ground truth and prediction.
            use_teacher = np.random.random() < self.teacher_forcing_ratio
            input = trg[:,t] if use_teacher else top1

        if not step_logits:
            # A one-token target yields no steps; keep the former empty-shaped result.
            return hidden.new_zeros(batch_size, 0, self.LSTM.vocab_size)
        return torch.cat(step_logits, dim=1)


    def decode(self, h_c, h_k, seq_len):
        """Greedy decoding from the start token, without gradient tracking.

        Dropout stays active unless the module has been put in eval mode, so
        call ``eval()`` first for deterministic output.

        Returns:
            LongTensor ``[batch_size, seq_len - 1]`` of generated token ids, or
            ``None`` when ``seq_len - 1 <= 0`` (unchanged from before).
        """
        batch_size = h_c.shape[0]
        with torch.no_grad():
            input = torch.full((batch_size,), self.hparams.start_token, dtype=torch.long, device=h_c.device)
            hidden = torch.stack((h_c,h_k),dim=0)
            cell = hidden.new_zeros(2, batch_size, self.LSTM.num_units)

            generated = []
            for _ in range(seq_len - 1):
                output, hidden, cell = self.LSTM(input,hidden,cell)
                input = output.squeeze(1).argmax(1)   #[batch_size]
                generated.append(input)

            return torch.stack(generated, dim=1) if generated else None

In [19]:
class Hashmodel(nn.Module):
    """Full model: attend over the tuples, normalise, then decode the response.

    Args:
        hparams: settings object forwarded to Attn and Decoder.
    """

    def __init__(self,hparams):
        super(Hashmodel,self).__init__()
        self.attn = Attn(hparams)
        self.decoder = Decoder(hparams)
        self.hparams = hparams
        # NOTE(review): this single BatchNorm is applied to both the conversation
        # vector and the attended vector, so they share affine parameters and
        # running statistics — confirm that is intended.
        self.normal = nn.BatchNorm1d(768)


    def forward(self,conv,tuples,trg):
        """Compute decoder logits for a batch.

        Args:
            conv: conversation vectors, ``[batch_size, 768]``.
            tuples: tuple vectors, ``[n_tuples, 768]``.
            trg: target token ids, ``[batch_size, seq_len]``.

        Returns:
            Whatever the decoder returns for ``(trg, conv, after_attn)``.
        """
        # Attention uses the un-normalised conversation vector.
        after_attn = self.attn(tuples, conv)

        # normalize
        conv = self.normal(conv)
        after_attn = self.normal(after_attn)

        # trg holds integer token ids and is passed through untouched. The
        # previous code ran it through a freshly created, unregistered
        # BatchNorm1d on every call; normalised ids can no longer be used as
        # embedding indices by the decoder.
        return self.decoder(trg,conv,after_attn)

In [20]:
# Build the full model and move its parameters to the selected device.
model = Hashmodel(hparams).to(device)
# init weights
def init_weights(m):
    """Overwrite every parameter reachable from ``m`` with draws from U(-0.8, 0.8)."""
    low, high = -0.8, 0.8
    for _name, weight in m.named_parameters():
        nn.init.uniform_(weight.data, low, high)

# Module.apply calls init_weights on the model and on each of its sub-modules.
# init_weights itself walks named_parameters(), so nested parameters are
# re-drawn more than once; the result is still U(-0.8, 0.8), only redundant.
model.apply(init_weights)

# calculate the number of trainable parameters in the model
def count_parameters(model):
    """Return the total number of elements across parameters that require gradients."""
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

# optimizer
# Adam with its default hyper-parameters over every model parameter.
optimizer = optim.Adam(model.parameters())

# index of <pad>
# The '<|endoftext|>' token is used as the padding id by pad_text.
PAD_ID = enc.encoder['<|endoftext|>']
# criterion
# we ignore the loss whenever the target token is a padding token
# pad_text fills with PAD_ID, so the ignored index is PAD_ID itself rather than
# hparams.start_token, which only matches while the two ids happen to be equal.
criterion = nn.CrossEntropyLoss(ignore_index = PAD_ID)

In [21]:
def train(model, criterion, optimizer, tuples_emb, conv_emb, trg_tokens, hparams, n_epochs = 10):
    '''
    Train the model and return the mean batch loss of the last epoch.

    tuples_emb is the embedding of all the tuples
    conv_emb is the embedding of all the conversations
    trg_tokens is the list of target responses (token-id lists)
    hparams provides batch_size and device
    n_epochs is the number of passes over the data (default 10, as before)

    Only full batches are used; a trailing partial batch is skipped, as before.
    Returns 0.0 when there is not a single full batch.
    '''
    model.train()
    # The tuple embeddings are shared by every batch, so they are moved once.
    tuples_input = torch.FloatTensor(tuples_emb).to(hparams.device)
    n_batches = len(trg_tokens) // hparams.batch_size

    epoch_loss = 0.0
    for epoch in range(n_epochs):
        # Accumulate inside the loops: previously the accumulation, the epoch
        # print and the return sat after both loops, so only the very last
        # batch loss was reported and zero batches raised on an unbound `loss`.
        running_loss = 0.0

        for batch_idx in range(n_batches):
            batched_data = get_batched_data(trg_tokens,conv_emb, hparams.batch_size, batch_idx)
            conv_input = torch.FloatTensor(batched_data['conv_emb']).to(hparams.device)
            trg_input = torch.LongTensor(batched_data['trg_tokens']).to(hparams.device)

            optimizer.zero_grad()

            # Call the module itself rather than .forward so hooks are honoured.
            outputs = model(conv_input, tuples_input, trg_input)

            # Flatten the logits to [N, vocab]; the targets drop their first
            # column (the decoder's first input) and flatten to [N].
            output = outputs.reshape(-1, outputs.shape[-1])
            trg = trg_input[:,1:].reshape(-1)
            loss = criterion(output, trg)

            loss.backward()

            torch.nn.utils.clip_grad_norm_(model.parameters(), 1)

            optimizer.step()

            loss_value = loss.item()
            running_loss += loss_value
            print("iter %d loss: %f"%(batch_idx,loss_value))
            torch.cuda.empty_cache()

        epoch_loss = running_loss / max(n_batches, 1)
        print("epoch %d loss: %f"%(epoch,epoch_loss))

    return epoch_loss

In [22]:
def EmbLoader(kg_path, qs_path):
    """Load question embeddings and tuple embeddings from two pickle files.

    Args:
        kg_path: path of the pickle holding the tuple embeddings.
        qs_path: path of the pickle holding alternating entries, of which the
            1st, 3rd, 5th, ... are kept as questions and the others skipped.

    Returns:
        ``(questions, tuples)`` as numpy arrays.

    SECURITY: pickle.load can execute arbitrary code; only point this at files
    from a trusted source.
    """
    # `with` closes the files even when unpickling raises.
    with open(qs_path,'rb') as list_file:
        embeddings = pickle.load(list_file)

    # Entries alternate question / response; only the questions are returned.
    questions = [entry for index, entry in enumerate(embeddings) if index % 2 == 0]

    with open(kg_path,'rb') as list_file:
        tuples = pickle.load(list_file)

    return np.array(questions), np.array(tuples)
    
def TextLoader(trg_path,size):
  """Read responses from a text file and encode them with the module-level `enc`.

  The file is read as alternating lines: the 1st, 3rd, ... lines are skipped
  and the 2nd, 4th, ... lines are kept as responses, until `size` responses
  have been collected or the file ends.

  Args:
    trg_path: path of the text file. It is now actually opened; previously
      the argument was ignored and "source_.txt" was hard-coded.
    size: maximum number of responses to read.

  Returns:
    A list with one `enc.encode(...)` result per response.
  """
  response_lst = []
  with open(trg_path,'r') as fin:
    for line_no, line in enumerate(fin):
      # Even-indexed lines (0, 2, ...) are the non-response half of each pair.
      if line_no % 2 == 0:
        continue
      response_lst.append(line.strip())
      if len(response_lst) >= size:
        break
  response_token = [enc.encode(response) for response in response_lst]
  return response_token

def get_batched_data(tokens,conv_emb,batch_size,iter_num):
  """Slice out batch number `iter_num` and pad its token lists.

  Returns a dict with 'trg_tokens' (the result of pad_text on the batch, with
  a pad width of the longest sequence plus 2) and 'conv_emb' (the matching
  slice of `conv_emb`).
  """
  assert(len(tokens) == len(conv_emb))
  start = batch_size * iter_num
  # Clamp the end of the window to the data length.
  stop = min(batch_size * (iter_num + 1), len(tokens))

  batch_tokens = tokens[start:stop]
  longest = max(len(sequence) for sequence in batch_tokens)
  return {
    'trg_tokens': pad_text(batch_tokens, longest + 2),
    'conv_emb': conv_emb[start:stop],
  }

def pad_text(text,max_len):
  """Prefix each token list with PAD_ID and right-fill it with PAD_ID.

  Every row ends up `max_len + 1` long: one leading PAD_ID, the tokens, then
  `max_len - len(tokens)` trailing PAD_IDs. The rows are returned as a numpy
  array.
  """
  padded_rows = []
  for line in text:
    filler = [PAD_ID] * (max_len - len(line))
    padded_rows.append([PAD_ID] + line + filler)
  return np.array(padded_rows)


In [23]:
# Load the question embeddings ('embeddings.pickle') and the tuple embeddings ('tuples.pickle').
questions_emb, tuples_emb = EmbLoader('tuples.pickle', 'embeddings.pickle')
# Tokenise up to 5000 responses; get_batched_data asserts that this count equals len(questions_emb).
tokens = TextLoader("source_.txt",5000)
# Run the training loop on the loaded data.
train(model, criterion, optimizer, tuples_emb, questions_emb, tokens, hparams)

torch.Size([32, 29])


RuntimeError: CUDA out of memory. Tried to allocate 112.00 MiB (GPU 0; 4.00 GiB total capacity; 2.67 GiB already allocated; 2.29 MiB free; 2.91 GiB reserved in total by PyTorch)

In [1]:
!nvidia-smi

Tue Jul 07 14:26:43 2020       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 451.48       Driver Version: 451.48       CUDA Version: 11.0     |
|-------------------------------+----------------------+----------------------+
| GPU  Name            TCC/WDDM | Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|   0  GeForce GTX 1050   WDDM  | 00000000:01:00.0 Off |                  N/A |
| N/A   63C    P8    N/A /  N/A |     75MiB /  4096MiB |      0%      Default |
+-------------------------------+----------------------+----------------------+
                                                                               
+-----------------------------------------------------------------------------+
| Processes:                                                                  |
|  GPU   GI   CI        PID   Type   Process name                  GPU Memory |
|       