<a href="https://colab.research.google.com/github/meti-94/TextGeneration/blob/main/bert_autoencoder_personachat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!wget http://www.fit.vutbr.cz/~imikolov/rnnlm/simple-examples.tgz
!tar -xf  simple-examples.tgz
!mkdir data
!mv /content/simple-examples/data/ptb.train.txt data/
!mv /content/simple-examples/data/ptb.valid.txt data/
!mv /content/simple-examples/data/ptb.test.txt data/
!rm -rf ./simple_examples

In [1]:
!pip install transformers
!wget https://s3.amazonaws.com/datasets.huggingface.co/personachat/personachat_self_original.json
!mkdir models

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/d8/b2/57495b5309f09fa501866e225c84532d1fd89536ea62406b2181933fb418/transformers-4.5.1-py3-none-any.whl (2.1MB)
[K     |████████████████████████████████| 2.1MB 23.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/75/ee/67241dc87f266093c533a2d4d3d69438e57d7a90abb216fa076e7d475d4a/sacremoses-0.0.45-py3-none-any.whl (895kB)
[K     |████████████████████████████████| 901kB 51.8MB/s 
Collecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/ae/04/5b870f26a858552025a62f1649c20d29d2672c02ff3c3fb4c688ca46467a/tokenizers-0.10.2-cp37-cp37m-manylinux2010_x86_64.whl (3.3MB)
[K     |████████████████████████████████| 3.3MB 50.2MB/s 
Installing collected packages: sacremoses, tokenizers, transformers
Successfully installed sacremoses-0.0.45 tokenizers-0.10.2 transformers-4.5.1
--2021-04-20 14:04:40--  https://s3.amazonaws.com/datasets.huggin

In [2]:
# Mount Google Drive at /content/drive (google.colab is only importable on Colab).
# NOTE(review): none of the visible cells read from or write to the mount - confirm it is still needed.
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
from torch.utils.data import Dataset, DataLoader
from transformers import EncoderDecoderModel, BertTokenizer, BertModel, BertConfig
import torch
from transformers import AdamW
from tqdm import tqdm
import json
import random 
from sklearn.model_selection import train_test_split
from datetime import datetime
from transformers import Seq2SeqTrainer
from transformers import Seq2SeqTrainingArguments
from tqdm import tqdm_notebook as tqdm
import logging
# Root logger at DEBUG: debug records from third-party libraries are emitted too
# (this is where the "DEBUG:urllib3.connectionpool" lines in the cell outputs come from).
logging.basicConfig(level=logging.DEBUG)

In [10]:
# Dialogue markers inserted into the flattened text by the preprocessing helpers.
SPECIAL_TOKENS = ["<bos>", "<eos>", "<persona>", "<speaker1>", "<speaker2>", "<pad>"]

# Same markers, keyed the way tokenizer.add_special_tokens() expects them.
ATTR_TO_SPECIAL_TOKEN = {'bos_token': '<bos>', 'eos_token': '<eos>', 'pad_token': '<pad>',
                         'additional_special_tokens': ['<speaker1>', '<speaker2>', '<persona>']}

# Fixed lengths every tokenized sample is truncated/padded to.
# The encoder length must not exceed the checkpoint's number of position
# embeddings (512 for prajjwal1/bert-tiny, as for standard BERT); inputs padded
# to 600 tokens cannot be embedded by the model.
encoder_max_length=512
decoder_max_length=128

def read_data(data_json_path='/content/personachat_self_original.json'):
  """Load a JSON file from disk and return its parsed content.

  Args:
    data_json_path: Path of the JSON file to read.

  Returns:
    Whatever `json.load` produces for the file (the callers in this notebook
    index the result with the 'train' and 'valid' keys).
  """
  # JSON text is UTF-8; do not depend on the platform's default codec
  # (e.g. cp1252 on Windows), which can mis-decode or reject non-ASCII text.
  with open(data_json_path, encoding='utf-8') as json_file:
    return json.load(json_file)

def data_to_samples(data_dict, test=False):
  """Flatten dialogues into one sample per utterance.

  Args:
    data_dict: Mapping with 'train' and 'valid' lists of dialogues; each
      dialogue has a 'personality' list and an 'utterances' list whose items
      carry 'history' and 'candidates'.
    test: When equal to False the 'train' split is used, otherwise 'valid'.

  Returns:
    A list of dicts with keys 'persona', 'history' and 'response'. The persona
    list is rotated right by one position for every successive utterance of a
    dialogue, and the response is the last entry of 'candidates'.
  """
  split_name = 'train' if test == False else 'valid'
  flattened = []
  for dialogue in data_dict[split_name]:
    rotated_persona = dialogue['personality']
    for turn in dialogue['utterances']:
      # Move the last persona sentence to the front (builds a new list each time).
      rotated_persona = [rotated_persona[-1]] + rotated_persona[:-1]
      flattened.append({
          'persona': rotated_persona,
          'history': turn['history'],
          'response': turn['candidates'][-1],
      })
  return flattened
  
def bertified(samples):
  """Serialise samples into flat, marker-annotated strings.

  Args:
    samples: Iterable of dicts with 'persona' (list of str), 'history'
      (list of str) and 'response' (str).

  Returns:
    A list of dicts with keys 'persona', 'history', 'input' and 'response'.
    'persona' is '<bos> ' followed by the persona sentences joined with
    ' <persona> '; 'history' prefixes each turn with a speaker marker, the most
    recent turn getting '<speaker1>' and markers alternating backwards;
    'input' is persona + ' ' + history; 'response' is wrapped as
    '<speaker2> ... <eos>'. Each piece has one pass of double-space collapsing
    applied.
  """
  speaker_tags = (" <speaker1> ", " <speaker2> ")
  records = []
  for sample in samples:
    persona_text = ('<bos> ' + ' <persona> '.join(sample['persona'])).replace('  ', ' ')

    # Tag turns newest-first so the alternation is anchored on the latest turn,
    # then restore chronological order when joining.
    newest_first = sample['history'][::-1]
    tagged = [speaker_tags[pos % 2] + turn for pos, turn in enumerate(newest_first)]
    history_text = ''.join(reversed(tagged)).replace('  ', ' ')

    response_text = ('<speaker2> ' + sample['response'] + ' <eos>').replace('  ', ' ')

    records.append({
        'persona': persona_text,
        'history': history_text,
        'input': persona_text + ' ' + history_text,
        'response': response_text,
    })
  return records

def bertified_to_model_food(bertified_data, tokenizer):
    """Tokenize serialised samples into fixed-length model inputs.

    Args:
        bertified_data: Iterable of dicts with 'input' and 'response' strings.
        tokenizer: Callable tokenizer returning an object with `input_ids` and
            `attention_mask` lists for a single string.

    Returns:
        A list of dicts with keys 'input_ids', 'attention_mask',
        'decoder_input_ids', 'decoder_attention_mask' and 'labels'. Encoder
        fields are padded/truncated to `encoder_max_length`, decoder fields to
        `decoder_max_length`. In 'labels', padded positions are set to -100.
    """
    model_food = []
    pbar = tqdm(bertified_data)
    pbar.set_description('Tokenizing Data Sample ...')
    for item in pbar:
        inputs = tokenizer(item['input'], add_special_tokens=True, max_length=encoder_max_length, truncation=True, padding="max_length")
        outputs = tokenizer(item['response'], add_special_tokens=True, max_length=decoder_max_length, truncation=True, padding="max_length")
        _item = {}
        _item["input_ids"] = inputs.input_ids
        _item["attention_mask"] = inputs.attention_mask
        _item["decoder_input_ids"] = outputs.input_ids
        _item["decoder_attention_mask"] = outputs.attention_mask
        # Mask padding with -100 so the loss ignores it; a plain copy of the
        # padded ids would train the model to predict padding. compute_metrics
        # in this notebook already maps -100 back to the pad id before decoding.
        _item["labels"] = [
            token_id if is_real_token else -100
            for token_id, is_real_token in zip(outputs.input_ids, outputs.attention_mask)
        ]
        model_food.append(_item)
    return model_food

In [11]:
class PersonaDataset_v3(Dataset):
  """Map-style Dataset that serves items of an in-memory sequence unchanged."""

  def __init__(self, samples):
    """Keep a reference to `samples` and record its length once."""
    self.samples = samples
    self.n_samples = len(samples)

  def __len__(self):
    """Return the sample count recorded at construction time."""
    return self.n_samples

  def __getitem__(self, index):
    """Return the stored sample at position `index`."""
    return self.samples[index]

In [14]:
# Encoder-decoder model with prajjwal1/bert-tiny on both sides, plus its tokenizer.
model = EncoderDecoderModel.from_encoder_decoder_pretrained("prajjwal1/bert-tiny", "prajjwal1/bert-tiny")
tokenizer = BertTokenizer.from_pretrained("prajjwal1/bert-tiny")
# Registering the dialogue markers grows the vocabulary, so both embedding
# tables are resized to the new tokenizer length.
tokenizer.add_special_tokens(ATTR_TO_SPECIAL_TOKEN)
model.get_encoder().resize_token_embeddings(len(tokenizer))
model.get_decoder().resize_token_embeddings(len(tokenizer))

# Load the dump once from read_data's default location (where the setup cell
# downloads it on Colab) instead of a machine-specific absolute Windows path.
data = read_data()
# Same parsed content serves the held-out split; the name is kept for any later cell using it.
test_data = data

# 'train' split -> train/validation samples (85% / 15%, fixed seed).
data_samples = data_to_samples(data)
bertified_data = bertified(data_samples)
train, valid = train_test_split(bertified_data, test_size=0.15, random_state=99)
train = bertified_to_model_food(train, tokenizer)
valid = bertified_to_model_food(valid, tokenizer)

# 'valid' split of the dump -> test samples.
test_data_samples = data_to_samples(test_data, True)
test = bertified(test_data_samples)
test = bertified_to_model_food(test, tokenizer)

train_dataset = PersonaDataset_v3(train)
valid_dataset = PersonaDataset_v3(valid)
test_dataset = PersonaDataset_v3(test)



DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /prajjwal1/bert-tiny/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /prajjwal1/bert-tiny/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /prajjwal1/bert-tiny/resolve/main/config.json HTTP/1.1" 200 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): huggingface.co:443
DEBUG:urllib3.connectionpool:https://huggingface.co:443 "HEAD /prajjwal1/bert-tiny/resolve/main/pytorch_model.bin HTTP/1.1" 302 0
Some weights of the model checkpoint at prajjwal1/bert-tiny were not used when initializing BertLMHeadModel: ['cls.seq_relationship.weight', 'cls.seq_relationsh

HBox(children=(FloatProgress(value=0.0, max=111722.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=19716.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=7801.0), HTML(value='')))




In [15]:
def compute_metrics(pred):
    """Score generated sequences against the references with ROUGE-2.

    Args:
        pred: Object exposing `label_ids` and `predictions` (token-id arrays).

    Returns:
        Dict with 'rouge2_precision', 'rouge2_recall' and 'rouge2_fmeasure',
        each rounded to 4 decimals, taken from the `mid` entry of the
        'rouge2' result.

    Relies on the module-level `tokenizer` and `rouge` objects.
    NOTE(review): `rouge` is not defined in any visible cell - confirm it is
    created before evaluation runs, otherwise this raises NameError.
    """
    reference_ids = pred.label_ids
    generated_ids = pred.predictions

    generated_text = tokenizer.batch_decode(generated_ids, skip_special_tokens=True)
    # -100 entries are swapped for the pad id so they can be decoded; the
    # assignment writes into the array held by `pred` itself.
    reference_ids[reference_ids == -100] = tokenizer.pad_token_id
    reference_text = tokenizer.batch_decode(reference_ids, skip_special_tokens=True)

    all_scores = rouge.compute(
        predictions=generated_text,
        references=reference_text,
        rouge_types=["rouge2"],
    )
    rouge2_mid = all_scores["rouge2"].mid

    metrics = {}
    metrics["rouge2_precision"] = round(rouge2_mid.precision, 4)
    metrics["rouge2_recall"] = round(rouge2_mid.recall, 4)
    metrics["rouge2_fmeasure"] = round(rouge2_mid.fmeasure, 4)
    return metrics

In [16]:
# Special-token ids on the combined config, all taken from the tokenizer:
# decoding starts from [CLS], [SEP] acts as end-of-sequence, and padding uses
# the tokenizer's current pad token.
model.config.decoder_start_token_id = tokenizer.cls_token_id
model.config.eos_token_id = tokenizer.sep_token_id
model.config.pad_token_id = tokenizer.pad_token_id
# Top-level vocab size mirrors the encoder's configured vocabulary size.
model.config.vocab_size = model.config.encoder.vocab_size

# Generation length limits are deliberately left unset (disabled below).
# model.config.max_length = 142
# model.config.min_length = 56
# Remaining generation-related settings; presumably consumed by generate()
# when evaluation runs with predict_with_generate - TODO confirm.
model.config.no_repeat_ngram_size = 3
model.config.early_stopping = True
model.config.length_penalty = 2.0
model.config.num_beams = 4

In [18]:
# Smoke-test sized run: single-sample batches with logging, evaluation and
# checkpointing every few steps. Larger values previously noted for a longer
# run: logging_steps=1000, save_steps=500, eval_steps=7500, warmup_steps=2000,
# save_total_limit=3. fp16 is left disabled.
batch_size = 1
training_args = Seq2SeqTrainingArguments(
    output_dir="./",
    per_device_train_batch_size=batch_size,
    per_device_eval_batch_size=batch_size,
    evaluation_strategy="steps",
    eval_steps=4,
    logging_steps=2,
    save_steps=10,
    predict_with_generate=True,
)

# Wire model, arguments, datasets and the metric callback together, then train.
trainer = Seq2SeqTrainer(
    args=training_args,
    model=model,
    train_dataset=train_dataset,
    eval_dataset=valid_dataset,
    compute_metrics=compute_metrics,
)
trainer.train()

NameError: name 'train_dataset' is not defined