In [None]:
!wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
!tar -xf  simple-examples.tgz
!mkdir data
!mv /content/simple-examples/data/ptb.train.txt data/
!mv /content/simple-examples/data/ptb.valid.txt data/
!mv /content/simple-examples/data/ptb.test.txt data/
!rm -rf ./simple_examples

--2021-03-21 07:16:52--  http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
Resolving www.fit.vutbr.cz (www.fit.vutbr.cz)... 147.229.9.23, 2001:67c:1220:809::93e5:917
Connecting to www.fit.vutbr.cz (www.fit.vutbr.cz)|147.229.9.23|:80... connected.
HTTP request sent, awaiting response... 200 OK
Length: 34869662 (33M) [application/x-gtar]
Saving to: ‘simple-examples.tgz’


2021-03-21 07:17:06 (2.60 MB/s) - ‘simple-examples.tgz’ saved [34869662/34869662]



In [None]:
!pip install transformers
!wget https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json

--2021-03-21 08:07:29--  https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json
Resolving s3.amazonaws.com (s3.amazonaws.com)... 52.216.92.101
Connecting to s3.amazonaws.com (s3.amazonaws.com)|52.216.92.101|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 209850483 (200M) [application/json]
Saving to: ‘personachat_self_original.json’


2021-03-21 08:07:35 (41.3 MB/s) - ‘personachat_self_original.json’ saved [209850483/209850483]



In [None]:
from torch.utils.data import Dataset, DataLoader
from transformers import EncoderDecoderModel, BertTokenizer, BertModel, BertConfig
import torch
from transformers import AdamW
from tqdm import tqdm
import json
import random 
from sklearn.model_selection import train_test_split
from datetime import datetime

import logging
# Configure the root logger at DEBUG level. This also surfaces debug records
# emitted by third-party libraries (e.g. the matplotlib line in this cell's output).
logging.basicConfig(level=logging.DEBUG)

DEBUG:matplotlib.pyplot:Loaded backend module://ipykernel.pylab.backend_inline version unknown.


In [None]:
# Marker strings inserted into the flattened dialogue text.
# NOTE(review): SPECIAL_TOKENS is not referenced by the other visible cells —
# confirm whether it is still needed.
SPECIAL_TOKENS = ["<bos>", "<eos>", "<persona>", "<speaker1>", "<speaker2>", "<pad>"]

# Mapping handed to `tokenizer.add_special_tokens(...)` so the markers above are
# registered as special tokens of the tokenizer.
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>', '<persona>']}

def read_data(data_json_path='/content/personachat_self_original.json'):
  """Load a JSON file and return the parsed object.

  Args:
    data_json_path: Path of the JSON file to read.

  Returns:
    Whatever `json.load` produces for the file (the other cells index the
    result with the keys 'train' and 'valid').
  """
  # Decode explicitly as UTF-8 instead of relying on the platform's default
  # locale encoding, which is not UTF-8 everywhere.
  with open(data_json_path, encoding='utf-8') as json_file:
    return json.load(json_file)

def data_to_samples(data_dict, test=False):
  """Flatten dialogues into one sample per utterance.

  Args:
    data_dict: Mapping with 'train' and 'valid' entries, each a list of
      dialogues carrying 'personality' (list of sentences) and 'utterances'
      (list of dicts with 'history' and 'candidates').
    test: When truthy, read the 'valid' split; otherwise read 'train'.

  Returns:
    List of dicts with the keys 'persona', 'history' and 'response'.
  """
  split = 'valid' if test else 'train'
  samples = []
  for dialogue in data_dict[split]:
    persona = dialogue['personality']
    for item in dialogue['utterances']:
      # Rotate the persona sentences right by one for every utterance so the
      # sentence order differs between samples of the same dialogue. Slicing
      # (instead of indexing [-1]) keeps an empty personality from raising.
      persona = persona[-1:] + persona[:-1]
      samples.append({
          'persona': persona,
          'history': item['history'],
          # The last candidate is used as the target reply.
          # NOTE(review): presumably the ground-truth response — confirm
          # against the dataset format.
          'response': item['candidates'][-1],
      })
  return samples
  
def bertified(samples):
  """Render every sample as marker-delimited text.

  For each sample the persona sentences are joined with ' <persona> ' behind a
  leading '<bos> ', the history turns are prefixed with alternating speaker
  markers (the most recent turn gets '<speaker1>'), and the response is wrapped
  as '<speaker2> ... <eos>'. Each piece has double spaces collapsed once.

  Returns:
    List of dicts with the keys 'persona', 'history', 'input' and 'response';
    'input' is the persona text, a single space, then the history text.
  """
  speaker_tags = (" <speaker1> ", " <speaker2> ")

  def squeeze(text):
    # Single-pass collapse of double spaces.
    return text.replace('  ', ' ')

  converted = []
  for sample in samples:
    persona_text = squeeze('<bos> ' + ' <persona> '.join(sample['persona']))

    # Walk the history newest-first so the tag alternation is anchored on the
    # latest turn, then restore chronological order when concatenating.
    tagged_turns = [
        speaker_tags[offset % 2] + utterance
        for offset, utterance in enumerate(sample['history'][::-1])
    ]
    history_text = squeeze(''.join(tagged_turns[::-1]))

    response_text = squeeze('<speaker2> ' + sample['response'] + ' <eos>')

    converted.append({
        'persona': persona_text,
        'history': history_text,
        'input': persona_text + ' ' + history_text,
        'response': response_text,
    })
  return converted


In [None]:
class PersonaDataset_v1(Dataset):
  '''
      Thin `Dataset` view over an in-memory sequence of samples.
      Items are returned exactly as stored; no tensor conversion happens here.
  '''
  def __init__(self, samples):
    # The length is captured once at construction time.
    self.samples, self.n_samples = samples, len(samples)

  def __len__(self):
    # Number of samples recorded when the dataset was built.
    return self.n_samples

  def __getitem__(self, index):
    # Plain positional lookup into the wrapped sequence.
    return self.samples[index]


class PTBDataset(Dataset):
  '''
      Line-based text dataset: every line of the file at `path` becomes one
      item, with leading/trailing whitespace stripped.
  '''
  def __init__(self, path):
    # Read as UTF-8 explicitly so the result does not depend on the
    # platform's default locale encoding.
    with open(path, 'r', encoding='utf-8') as fin:
      self.texts = [line.strip() for line in fin]
    self.n_samples = len(self.texts)

  def __getitem__(self, index):
    # returns specific item
    return self.texts[index]

  def __len__(self):
    # returns dataset length
    return self.n_samples





In [None]:
class TrainingLoop:
  '''
  Everything related to model training: optimizer construction, the
  train/eval epochs, sample generation and checkpoint save/load.
  '''

  # Token id handed to `generate` as the first decoder token.
  # NOTE(review): presumably the id assigned to '<speaker2>' once the special
  # tokens are added to the bert-base-uncased vocabulary — confirm.
  DECODER_START_TOKEN_ID = 30526

  def __init__( self, model, tokenizer, optimizer, freezeemb=True, 
                epochs=6, save_path='./models/', **kw):
    '''
    Args:
      model: Module exposing `named_parameters()`; called with encoder/decoder
        keyword arguments during training.
      tokenizer: Callable turning a list of strings into `input_ids` and
        `attention_mask`.
      optimizer: Optimizer class/factory, called as `optimizer(params, **kw)`.
      freezeemb: When True, word-embedding weights are not given to the optimizer.
      epochs: Number of passes over the training dataloader.
      save_path: Directory that receives the per-epoch checkpoints.
      **kw: Extra keyword arguments forwarded to the optimizer factory.
    '''
    self.model = model
    # Match on a substring rather than the prefix "bert.embeddings...": in an
    # encoder-decoder wrapper the parameter names carry an outer prefix
    # (e.g. "encoder." / "decoder.bert."), so a prefix test never matched and
    # `freezeemb=True` silently froze nothing.
    params = [
        param for paramname, param in self.model.named_parameters()
        if not (freezeemb and "embeddings.word_embeddings" in paramname)
    ]
    self.optimizer = optimizer(params, **kw)
    self.tokenizer = tokenizer
    self.epochs = epochs
    self.save_path = save_path
    self.predicts = None

  @staticmethod
  def _mean(values):
    # Average of a list of floats; NaN (instead of ZeroDivisionError) when empty.
    return sum(values) / len(values) if values else float('nan')

  def _encode_batch(self, batch, device):
    # Tokenize one batch and build the keyword arguments for the model call.
    source = self.tokenizer(batch['input'], add_special_tokens=True, max_length=512,
                            truncation=True, padding=True, return_tensors='pt')
    target = self.tokenizer(batch['response'], add_special_tokens=True, max_length=512,
                            truncation=True, padding=True, return_tensors='pt')
    decoder_input_ids = target['input_ids'].to(device)
    decoder_attention_mask = target['attention_mask'].to(device)
    # Padding positions must not contribute to the loss: -100 is the index
    # ignored by the cross-entropy loss.
    labels = decoder_input_ids.masked_fill(decoder_attention_mask == 0, -100)
    # NOTE(review): whether `labels` are shifted internally depends on the
    # installed transformers version — confirm that passing the same ids as
    # `decoder_input_ids` matches that version's contract.
    return {
        'input_ids': source['input_ids'].to(device),
        # Without the masks the model attends to padding tokens.
        'attention_mask': source['attention_mask'].to(device),
        'decoder_input_ids': decoder_input_ids,
        'decoder_attention_mask': decoder_attention_mask,
        'labels': labels,
    }

  def train(self, dataloader, eval_dataloader, test_dataloader):
    '''
    Run `self.epochs` training epochs. After every epoch a few generations
    are logged from `test_dataloader`, the eval loss is logged from
    `eval_dataloader`, and a checkpoint is written under `self.save_path`.
    '''
    import os

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    self.model.to(device)
    for epoch in range(self.epochs):
      self.model.train()
      losses = []

      for batch in tqdm(dataloader, position=0, leave=True, desc=f"Train Epoch Number {epoch+1}"):
        self.model.zero_grad()
        outputs = self.model(**self._encode_batch(batch, device))
        # Store a Python float: keeping the loss tensor would keep device
        # memory (and its autograd references) alive for the whole epoch.
        losses.append(outputs.loss.item())
        outputs.loss.backward()
        self.optimizer.step()
      logging.info(f'Epoch number: {epoch+1} Train Loss is equal: {self._mean(losses)}') 
      self.random_predict(test_dataloader, device, number_of_samples=10)
      self.eval(eval_dataloader, epoch, device)
      # Honour the configured checkpoint directory instead of a hard-coded one.
      checkpoint_name = f"autoencoder_{epoch}_{datetime.today().strftime('%Y-%m-%d')}.pt"
      self.save(os.path.join(self.save_path, checkpoint_name))

  def eval(self, dataloader, epoch, device):
    '''Log the mean loss over `dataloader` without updating any weights.'''
    self.model.eval()
    losses = []
    with torch.no_grad():
      for batch in tqdm(dataloader, position=0, leave=True, desc=f"Eval Epoch Number {epoch+1}"):
        outputs = self.model(**self._encode_batch(batch, device))
        losses.append(outputs.loss.item())
    logging.info(f'Epoch number: {epoch+1} Eval Loss is equal: {self._mean(losses)}')

  def save(self, save_path='./models/autoencoder.pt'):
    '''Serialize the whole model object to `save_path`, creating parent directories.'''
    import os

    logging.info('Saving model ...')
    # torch.save does not create missing directories; without this the first
    # checkpoint fails when the target directory does not exist yet.
    directory = os.path.dirname(save_path)
    if directory:
      os.makedirs(directory, exist_ok=True)
    torch.save(self.model, save_path)

  def load(self, save_path='./models/autoencoder.pt'):
    '''Replace `self.model` with the object stored at `save_path`.'''
    logging.info('Loading model ...')
    # SECURITY: torch.load unpickles the file; only load checkpoints you trust.
    self.model = torch.load(save_path)

  def random_predict(self, dataloader, device, number_of_samples=10):
    '''
    Generate a reply for up to `number_of_samples` batches of `dataloader`
    and log it next to the reference response. Leaves the model in eval mode.
    '''
    self.model = self.model.to(device)
    # Generation must not run with dropout active (this is called right
    # after a training epoch).
    self.model.eval()
    for counter, sample in enumerate(dataloader):
      if counter >= number_of_samples:
        break
      encoded = self.tokenizer(sample['input'], add_special_tokens=True, max_length=512,
                               truncation=True, padding=True, return_tensors='pt')
      generated = self.model.generate(
          encoded['input_ids'].to(device),
          attention_mask=encoded['attention_mask'].to(device),
          decoder_start_token_id=self.DECODER_START_TOKEN_ID)
      logging.info('Real: '+ sample['response'][0])
      # Use the tokenizer owned by this instance, not a notebook-level global.
      logging.info(self.tokenizer.convert_ids_to_tokens(generated[0]))


In [5]:


# Build a BERT2BERT encoder-decoder and register the dialogue markers as
# special tokens; both embedding matrices are resized to the new vocabulary.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("bert-base-uncased", "bert-base-uncased")
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
model.get_encoder().resize_token_embeddings(len(tokenizer))
model.get_decoder().resize_token_embeddings(len(tokenizer))

# The optimizer class is passed uninstantiated; TrainingLoop builds it.
optimizer = AdamW
kw = {'lr':0.0002, 'weight_decay':0.1}
# freezeemb=False: the word embeddings are handed to the optimizer as well.
tl = TrainingLoop(model, tokenizer, optimizer, False, **kw)

# Train/validation come from the 'train' split of the JSON file.
data = read_data()
data_samples = data_to_samples(data)
bertified_data = bertified(data_samples)
train, valid = train_test_split(bertified_data, test_size=0.15, random_state=99)
# The 'valid' split lives in the same JSON file, so reuse the parsed dict
# instead of reading and parsing the ~200 MB file a second time.
test_data = data
test_data_samples = data_to_samples(test_data, True)
test = bertified(test_data_samples)

train_dataset = PersonaDataset_v1(train)
valid_dataset = PersonaDataset_v1(valid)
test_dataset = PersonaDataset_v1(test)

train_dataloader = DataLoader(train_dataset, batch_size=6, shuffle=True)
valid_dataloader = DataLoader(valid_dataset, batch_size=6, shuffle=False)
# batch_size=1 + shuffle: used to draw individual random samples for generation.
test_dataloader = DataLoader(test_dataset, batch_size=1, shuffle=True)


device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
tl.train(train_dataloader, valid_dataloader, test_dataloader)



# tl.save()
# ##################################################
# tl.load()
# tl.random_predict(test_dataloader, device, number_of_samples=10)
# ##################################################
# tl.readable_predict(device, print_result=True)

KeyboardInterrupt: ignored