<a href="https://colab.research.google.com/github/ninalzr/nlg/blob/master/GPT2EncoderGPT2Decoder.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [0]:
!pip install transformers

In [0]:
import os, sys, json
from datetime import datetime

In [0]:
#TODO: Adjust the class for other tokenizers
class Lookup():
    def __init__(self, model_class, file_prefix = None):

        self.model_class = model_class

        self.bos_token = None
        self.eos_token = None
        self.unk_token = None
        self.sep_token = None
        self.pad_token = None
        self.cls_token = None
        self.mask_token = None

        if model_class == 'gpt2':
            from transformers import GPT2Tokenizer
            self._tokenizer = GPT2Tokenizer.from_pretrained(model_class)
            self._tokenizer.add_special_tokens({'pad_token': '<PAD>'})
            if self._tokenizer._bos_token:
                self.bos_token = self._tokenizer.bos_token
            if self._tokenizer._eos_token:
                self.eos_token = self._tokenizer.eos_token
            if self._tokenizer._unk_token:                
                self.unk_token = self._tokenizer.unk_token
            if self._tokenizer._sep_token:
                self.sep_token = self._tokenizer.sep_token
            if self._tokenizer._pad_token:
                self.pad_token = self._tokenizer.pad_token
            if self._tokenizer._cls_token:
                self.cls_token = self._tokenizer.cls_token
            if self._tokenizer._mask_token:
                self.mask_token = self._tokenizer.mask_token 
        else:
            print("You need to load a tokenizer from https://huggingface.co/transformers/main_classes/tokenizer.html#")
        
        if file_prefix:
            self.load(file_prefix)

        def save_special_tokens(self, file_prefix):
            if self.model_class == "gpt2":
                special_tokens = {}
            if self.bos_token:
                special_tokens['bos_token'] = self.bos_token
            if self.eos_token:
                special_tokens['eos_token'] = self.eos_token
            if self.unk_token:
                special_tokens['unk_token'] = self.unk_token
            if self.sep_token:
                special_tokens['sep_token'] = self.sep_token
            if self.pad_token:
                special_tokens['pad_token'] = self.pad_token
            if self.cls_token:
                special_tokens['cls_token'] = self.cls_token
            if self.mask_token:
                special_tokens['mask_token'] = self.mask_token            
            json.dump(special_tokens, open(file_prefix+".special_tokens","w",encoding="utf8"), indent=4, sort_keys=True)            
            self._tokenizer.add_special_tokens(special_tokens)  
        
        def load(self, file_prefix):
            if os.path.exists(file_prefix+".special_tokens"):
                special_tokens = json.load(open(file_prefix+".special_tokens","r",encoding="utf8"))            
            if 'bos_token' in special_tokens:
                self.bos_token = special_tokens['bos_token']
            if 'eos_token' in special_tokens:
                self.eos_token = special_tokens['eos_token']
            if 'unk_token' in special_tokens:
                self.unk_token = special_tokens['unk_token']
            if 'sep_token' in special_tokens:
                self.sep_token = special_tokens['sep_token']
            if 'pad_token' in special_tokens:
                self.pad_token = special_tokens['pad_token']
            if 'cls_token' in special_tokens:
                self.cls_token = special_tokens['cls_token']
            if 'mask_token' in special_tokens:
                self.mask_token = special_tokens['mask_token']
            self._tokenizer.add_special_tokens(special_tokens)      

    def tokenize(self, text):
        return self._tokenizer.tokenize(text)

    def convert_tokens_to_ids(self, tokens):
        return self._tokenizer.convert_tokens_to_ids(tokens)

    def convert_ids_to_tokens(self, token_ids):
        return self._tokenizer.convert_ids_to_tokens(token_ids)

    def convert_tokens_to_string(self, tokens):
        return self._tokenizer.convert_tokens_to_string(tokens)

    def encode(self, text, add_bos_eos_tokens = False):
        tokens = self.tokenize(text)
        if add_bos_eos_tokens:
            return [self.convert_tokens_to_ids(self.bos_token)] + self.convert_tokens_to_ids(tokens) + [self.convert_tokens_to_ids(self.eos_token)]
        else:
            return self.convert_tokens_to_ids(tokens)

    def decode(self, token_ids, skip_bos_eos_tokens = False):
        if skip_bos_eos_tokens:
            if len(token_ids)>0:
                if token_ids[0] == self.convert_tokens_to_ids(self.bos_token):
                    token_ids = token_ids[1:]
            if len(token_ids)>0:
                if token_ids[-1] == self.convert_tokens_to_ids(self.eos_token):
                    token_ids = token_ids[:-1]   
        if len(token_ids) > 0:
            tokens = self.convert_ids_to_tokens(token_ids)
            return self.convert_tokens_to_string(tokens)
        return ""

    def __len__(self):          
        return len(self._tokenizer)

In [0]:
model = 'gpt2'
lookup = Lookup(model)
text = "Daisy, Daisy, Give me your answer, do!"
print("\n1. String to tokens (tokenize):")
tokens = lookup.tokenize(text)
print(tokens)

print("\n2. Tokens to ints (convert_tokens_to_ids):")
ids = lookup.convert_tokens_to_ids(tokens)
print(ids)

print("\n2.5 Token to int (convert_tokens_to_ids with a single str):")
id = lookup.convert_tokens_to_ids(tokens[0])
print(id)

print("\n3. Ints to tokens (convert_ids_to_tokens):")
tokens = lookup.convert_ids_to_tokens(ids)
print(tokens)

print("\n3.5 Int to token (convert_ids_to_tokens with a single int):")
token = lookup.convert_ids_to_tokens(id)
print(token)

print("\n4. Tokens to string (convert_tokens_to_string):")
recreated_text = lookup.convert_tokens_to_string(tokens)
print(recreated_text)

print("\n5. String to ints (encode):")
ids = lookup.encode(text)
print(ids)

print("\n6. Ints to string (decode):")
recreated_text = lookup.decode(ids)
print(recreated_text)

print("\n7. Encode adding special tokens:")
ids = lookup.encode(text, add_bos_eos_tokens=True)
print(ids)
print("How it looks like with tokens: {}".format(lookup.convert_ids_to_tokens(ids)))
    
print("\n8. Decode skipping special tokens:")
recreated_text = lookup.decode(ids, skip_bos_eos_tokens=True)
print(recreated_text)

print("\n9. Vocabulary size:")
vocab_size = lookup.__len__()
print(vocab_size)

In [0]:
import os, sys, json, random
import numpy as np
import torch
import torch as nn
import torch.utils.data

from functools import partial

In [0]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

In [0]:
#Ignore slots for now.
#TODO: Figure out what to do with slots
#Remember to change load from file X, y
def loader(data_folder, batch_size, src_lookup, tgt_lookup, min_seq_len_X = 5, max_seq_len_X = 1000, min_seq_len_y = 5,
           max_seq_len_y = 1000, MEI = ""):
    MEI = MEI.replace(" ","_")
    pad_id = tgt_lookup.convert_tokens_to_ids(tgt_lookup.pad_token)
    
    train_loader = torch.utils.data.DataLoader(
        MyDataset(data_folder, "train", min_seq_len_X, max_seq_len_X, min_seq_len_y, max_seq_len_y, MEI),
        num_workers=0,
        batch_size=batch_size,
        collate_fn=partial(paired_collate_fn, padding_idx = pad_id),
        shuffle=True)

    valid_loader = torch.utils.data.DataLoader(
        MyDataset(data_folder, "dev", min_seq_len_X, max_seq_len_X, min_seq_len_y, max_seq_len_y, MEI),
        num_workers=0,
        batch_size=batch_size,
        collate_fn=partial(paired_collate_fn, padding_idx = pad_id))
    
    return train_loader, valid_loader

def paired_collate_fn(insts, padding_idx):
    # insts contains a batch_size number of (x, y) elements    
    src_insts, tgt_insts = list(zip(*insts))   
    
    src_max_len = max(len(inst) for inst in src_insts) # determines max size for all examples
    
    src_seq_lengths = torch.tensor(list(map(len, src_insts)), dtype=torch.long)    
    src_seq_tensor = torch.tensor(np.array( [ inst + [padding_idx] * (src_max_len - len(inst)) for inst in src_insts ] ), dtype=torch.long)
    src_seq_mask = torch.tensor(np.array( [ [1] * len(inst) + [0] * (src_max_len - len(inst)) for inst in src_insts ] ), dtype=torch.long)
    
    src_seq_lengths, perm_idx = src_seq_lengths.sort(0, descending=True)
    src_seq_tensor = src_seq_tensor[perm_idx]   
    src_seq_mask = src_seq_mask[perm_idx]
    tgt_max_len = max(len(inst) for inst in tgt_insts)
    
    tgt_seq_lengths = torch.tensor(list(map(len, tgt_insts)), dtype=torch.long)    
    tgt_seq_tensor = torch.tensor(np.array( [ inst + [padding_idx] * (tgt_max_len - len(inst)) for inst in tgt_insts ] ), dtype=torch.long)
    tgt_seq_mask = torch.tensor(np.array( [ [1] * len(inst) + [0] * (tgt_max_len - len(inst)) for inst in tgt_insts ] ), dtype=torch.long)
    
    tgt_seq_lengths = tgt_seq_lengths[perm_idx]
    tgt_seq_tensor = tgt_seq_tensor[perm_idx]      
    tgt_seq_mask = tgt_seq_mask[perm_idx]   
      
    return ((src_seq_tensor, src_seq_lengths, src_seq_mask), (tgt_seq_tensor, tgt_seq_lengths, tgt_seq_mask))   
class MyDataset(torch.utils.data.Dataset):
    def __init__(self, root_dir, type, min_seq_len_X, max_seq_len_X, min_seq_len_y, max_seq_len_y, MEI):  
        self.root_dir = root_dir

        self.X = [] # this will store joined sentences
        self.y = [] # this will store the output

    
        with open(os.path.join(root_dir, type, MEI + '_output.txt'), 'r') as f:
            y = [lookup.encode(y.strip(), add_bos_eos_tokens=True)  for y in f]
        with open(os.path.join(root_dir, type, MEI + '_sentences.txt'), 'r') as g:
            X = [lookup.encode(x.strip(), add_bos_eos_tokens=True)  for x in g]   
                    
        cut_over_X = 0
        cut_under_X = 0
        cut_over_y = 0
        cut_under_y = 0
        
        # max len
        for (sx, sy) in zip(X, y):
            if len(sx) > max_seq_len_X:
                cut_over_X += 1
            elif len(sx) < min_seq_len_X+2:                
                cut_under_X += 1
            elif len(sy) > max_seq_len_y:
                cut_over_y += 1
            elif len(sy) < min_seq_len_y+2:                
                cut_under_y += 1
            else:
                self.X.append(sx)
                self.y.append(sy)         

        c = list(zip(self.X, self.y))
        random.shuffle(c)
        self.X, self.y = zip(*c)
        self.X = list(self.X)
        self.y = list(self.y)
        print(X)
        print(y)
                    
        print("Dataset [{}] loaded with {} out of {} ({}%) instances.".format(type, len(self.X), len(X), float(100.*len(self.X)/len(X)) ) )
        print("\t\t For X, {} are over max_len {} and {} are under min_len {}.".format(cut_over_X, max_seq_len_X, cut_under_X, min_seq_len_X))
        print("\t\t For y, {} are over max_len {} and {} are under min_len {}.".format(cut_over_y, max_seq_len_y, cut_under_y, min_seq_len_y))
        
        assert(len(self.X)==len(self.y))
        
    def __len__(self):
        return len(self.X)

    def __getitem__(self, idx):        
        return self.X[idx], self.y[idx]

In [31]:
from google.colab import drive
drive.mount('/content/drive')
data_path = 'drive/My Drive/nlg/tiny'
src_lookup = Lookup(model)
tgt_lookup = Lookup(model)
batch_size = 4    
min_seq_len_X = 10
max_seq_len_X = 1000
min_seq_len_y = min_seq_len_X
max_seq_len_y = max_seq_len_X 
MEI = "Management Overview"
model = 'gpt2'
lookup = Lookup(model)

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3aietf%3awg%3aoauth%3a2.0%3aoob&response_type=code&scope=email%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdocs.test%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive%20https%3a%2f%2fwww.googleapis.com%2fauth%2fdrive.photos.readonly%20https%3a%2f%2fwww.googleapis.com%2fauth%2fpeopleapi.readonly

Enter your authorization code:
··········
Mounted at /content/drive


In [0]:
train_loader, valid_loader = loader(data_path, batch_size, src_lookup, tgt_lookup, min_seq_len_X, max_seq_len_X, min_seq_len_y, max_seq_len_y, MEI = MEI)

In [0]:
import os, sys

import torch
import torch.nn as nn
from transformers import GPT2Model, GPT2Config

class Encoder(nn.Module):
    def __init__(self, vocab_size, device):       
        super().__init__()
        
        self.hidden_size = 768

        configuration = GPT2Config()
        configuration.output_attentions = True

        self.gpt2model = GPT2Model(configuration)
        self.gpt2model.resize_token_embeddings(vocab_size)
        
        for param in self.gpt2model.parameters():
            param.requires_grad = False
        
        self.device = device
        self.to(device)

    def forward(self, input_tuple):
        """
        Args:
            input_tuple (tensor): The input of the encoder. On the first position it must be a 2-D tensor of integers, padded. The second is the lenghts of the first.
                Shape: ([batch_size, seq_len_enc], [batch_size], [att_mask]])

        Returns:
            Output shape: [batch_size, seq_len_enc, 768]
            (tuple) Past shape: ((2, batch_size, num_heads, sequence_length, embed_size_per_head),(2, batch_size, num_heads, sequence_length, embed_size_per_head))
            (tuple) Att shape: ((batch_size, num_heads, sequence_length, sequence_length), (batch_size, num_heads, sequence_length, sequence_length))

        
        """
        self.gpt2model.eval()
        X, X_lengths, X_att_mask = input_tuple[0], input_tuple[1], input_tuple[2]
        batch_size = X.size(0)
        seq_len = X.size(1)
        print(seq_len)
        
        output = torch.zeros(batch_size, seq_len, self.hidden_size).to(self.device)
        output.requires_grad = False

        
        with torch.no_grad(): # hack ?? documentation is not clear on padding, so, skipping it with this hack
            hidden_states, past, att   = self.gpt2model(X, attention_mask = X_att_mask)  
            for i in range(batch_size):
                output[i:i+1, 0:X_lengths[i], :] = hidden_states[i:i+1, 0:X_lengths[i], :]
            
        return {'output':output, 'past':past, 'att': att}

In [35]:
for i, t in enumerate(train_loader):
    sentences = t[0][0]
    seq_len = t[0][1]
    sen_mask = t[0][2]
    y = t[1][0]
    y_seq_len = t[1][1]
    y_mask = t[1][2]
    print(seq_len)
    print(y_seq_len)
    break

tensor([89, 82, 81, 77])
tensor([90, 86, 88, 88])


In [0]:
encoder = Encoder(vocab_size=vocab_size, device = device)
encoder_out = encoder.forward((sentences, seq_len, sen_mask))
print(encoder_out['output'].shape, encoder_out['past'][0].shape, encoder_out['att'][1].shape)

In [0]:
from transformers import GPT2Tokenizer, GPT2Model, GPT2Config
import torch

configuration = GPT2Config()
#print(configuration)
model = GPT2Model(configuration)
model.config.output_attentions = True

In [0]:
tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
configuration = GPT2Config()
configuration.output_attentions = True
model = GPT2Model(configuration)

input_ids = torch.tensor(tokenizer.encode("Hello, my dog is cute", add_special_tokens=True)).unsqueeze(0)  # Batch size 1
outputs, past, att = model(input_ids)

In [0]:
print(outputs)

In [0]:
class DecoderGPT2(nn.Module):
    def __init__(self, hidden_size, vocab_size, device = device):
        super(DecoderGPT2, self).__init__()
        self.hidden_size = hidden_size
        self.vocab_size = vocab_size

        self.gpt2model = GPT2Model.from_pretrained('gpt2')
        self.gpt2model.resize_token_embeddings(vocab_size) #resize the size of vocab to include new tokens 
        for param in self.gpt2model.parameters():
            param.requires_grad = False
        
        self.lin_out = nn.Linear(hidden_size, vocab_size)
        self.softmax = nn.LogSoftmax(dim = 1)

        self.device = device
        self.to(device)

    def forward(self, y_tuple):
        y = y_tuple[0]
        y_lenghts = y_tuple[1]
        batch_size = y.size(0)
        y_seq_len = y.size(1)

        output = torch.zeros(batch_size, y_seq_len, self.hidden_size).to(self.device)

        output.requires_grad = False
        with torch.no_grad():
            hidden, past = self.gpt2model(y)  
            for i in range(batch_size):
                output[i:i+1, 0:y_lenghts[i], :] = hidden[i:i+1, 0:y_lenghts[i], :]

        out_lin = self.lin_out(output)
        output = self.softmax(out_lin)

        return output     


In [82]:
encoder = Encoder(vocab_size=vocab_size, device = device)
encoder_out = encoder.forward((sentences, seq_len))
print(encoder_out['output'].shape, encoder_out['past'][1].shape)

82
torch.Size([4, 82, 768]) torch.Size([2, 4, 12, 82, 64])


In [85]:
hidden_size = 768
decoder = DecoderGPT2(hidden_size=hidden_size, vocab_size=vocab_size, device = device)
decoder_out = decoder.forward((y, y_seq_len))
print(decoder_out.shape)

torch.Size([4, 88, 50258])
tensor(6.9024, grad_fn=<NllLossBackward>)


In [0]:
criterion = nn.NLLLoss()
loss = criterion(decoder_out.view(-1, vocab_size), y.contiguous().flatten())
print(loss)

In [40]:
# prep inputs
batch_size = 2
seq_len = 10
enc_size = 4
dec_layers = 5
dec_size = 3

encoder_outputs = torch.tensor(np.random.rand(batch_size, seq_len, enc_size), dtype=torch.float)
decoder_hidden_state = torch.tensor(np.random.rand(dec_layers*1, batch_size, dec_size), dtype=torch.float) # 1 for unidirectional

#type = "additive"    
type = "general"    
att = Attention(enc_size, dec_size, device, type)

# run
context, attention_weights = att(encoder_outputs, decoder_hidden_state)
print("Output is:")
print(context)
print("Attention weights size:" + str(attention_weights.size()))

Output is:
tensor([[-0.1195,  0.0557, -0.3561,  0.0497],
        [-0.1426,  0.0300, -0.3922,  0.0362]], grad_fn=<SumBackward1>)
Attention weights size:torch.Size([2, 10, 1])


In [104]:
for i, t in enumerate(train_loader):
    sentences = t[0][0]
    seq_len = t[0][1]
    sen_mask = t[0][2]
    y = t[1][0]
    y_seq_len = t[1][1]
    y_mask = t[1][2]
    print(sentences.size())
    print(y.size())
    break

torch.Size([4, 89])
torch.Size([4, 90])


In [105]:
vocab_size = lookup.__len__()
print(vocab_size)

50258


In [0]:
encoder = Encoder(vocab_size=vocab_size, device = device)

In [109]:
encoder = Encoder(vocab_size=vocab_size, device = device)
encoder_out = encoder.forward((sentences, seq_len))
print(encoder_out['output'].shape)

torch.Size([4, 89, 768])


In [0]:
input_size = encoder_out['output'].size(2)
decoder = Decoder(vocab_size=vocab_size, input_size=input_size, top_k=0, top_p=0.0, device=device )

In [0]:
from torch.autograd import Variable
hidden = Variable(torch.rand(batch_size, 1, decoder.hidden_dim),requires_grad=False)
cell = Variable(torch.rand(batch_size, 1, decoder.hidden_dim),requires_grad=False)
dec_states = (hidden.zero_().permute(1, 0, 2), cell.zero_().permute(1, 0, 2))
decoder_out = decoder.forward((sentences, seq_len, sen_mask), (y, y_seq_len, y_mask), encoder_out, dec_states, teacher_forcing_ratio=0.)

In [68]:

print(dec_states)

(tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]]), tensor([[[0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.],
         [0., 0., 0.,  ..., 0., 0., 0.]]]))


In [64]:
a = torch.rand([2, 4,2])
print(a.size())

torch.Size([2, 4, 2])
