In [3]:
import pandas as pd
import numpy as np

from tqdm.notebook import tqdm

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, DataCollatorForLanguageModeling, TrainingArguments, Trainer
from datasets import load_dataset

- <h3>Prepare dataset</h3>

In [4]:
data_files = {
  "train": "./data/da_prompts/train.csv",
  "val": "./data/da_prompts/val.csv",
  "test": "./data/da_prompts/test.csv",
}
dd = load_dataset("csv", data_files=data_files, sep=';')
dd

  utils.DeprecatedIn35,
Using custom data configuration default-d87bd31fe63ad7c9
Reusing dataset csv (/home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

DatasetDict({
    train: Dataset({
        features: ['history', 'response', 'da'],
        num_rows: 76052
    })
    val: Dataset({
        features: ['history', 'response', 'da'],
        num_rows: 7069
    })
    test: Dataset({
        features: ['history', 'response', 'da'],
        num_rows: 6740
    })
})

Load the tokenizer and add special tokens:

In [8]:
toker = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
toker.add_special_tokens({
  "additional_special_tokens": ["[DA_1]", "[DA_2]", "[DA_3]", "[DA_4]"],
  "pad_token": "[PAD]"
})
toker.all_special_ids, toker.vocab_size

([50256, 50261, 50257, 50258, 50259, 50260], 50257)

In [9]:
def tokenize(example):
  return toker(
      [x + " " + y for x, y in zip(example['history'], example['response'])],
      padding=True,
      truncation=True,
      max_length=128
  )

In [10]:
enc_dd = dd.map(
  tokenize,
  batched=True,
  num_proc=4,
  remove_columns=dd["train"].column_names
)

 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-6d6464599fdf2082.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-854fba7d9f3ab7d7.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-d68fff71b174c7a1.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-751ee030686a17e0.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-ac81e407e5b9bf56.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-89332f75ff668091.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-3158c138ae918001.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-b6bfe82ff436d58b.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-cc3e3566247cb0e7.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-046bd62f8e0bd3b5.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-ea2b2ed59753aa41.arrow


 

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d87bd31fe63ad7c9/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-57edcedbab4a9301.arrow


Now we need a data collator to pad data and get lm_labels:

In [11]:
collator = DataCollatorForLanguageModeling(
  tokenizer=toker,
  mlm=False,
  pad_to_multiple_of=128
)

Test collator:

In [12]:
tmp1 = next(iter(enc_dd["train"]))
out = collator([tmp1])
out["input_ids"], out["labels"]

(tensor([[50259, 13816,   837,  5395,   837,   703,   546,  1016,   329,   257,
           1178, 16800,   706,  8073,  5633,   220, 50256,   220, 50260,   220,
            921,   760,   326,   318, 29850,   475,   318,  1107,   407,   922,
            329,   674, 13547,   764,   220, 50256, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50

Load the model and resize it's embedding space as we added 5 special tokens:

In [21]:
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
model.resize_token_embeddings(toker.vocab_size + 5)

  utils.DeprecatedIn35,


Embedding(50262, 768)

- <h3>Train the model:</h3>

In [25]:
train_args = TrainingArguments(
    output_dir="./models/results3",
    evaluation_strategy="epoch",
    learning_rate=1e-5,
    weight_decay=0.01,
    save_steps=4000,
    per_device_eval_batch_size=32,
    per_device_train_batch_size=8
)

trainer = Trainer(
    model=model,
    args=train_args,
    train_dataset=enc_dd["train"],
    eval_dataset=enc_dd["test"],
    data_collator=collator
)

In [27]:
trainer.train()

***** Running training *****
  Num examples = 76052
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 28521


Epoch,Training Loss,Validation Loss
1,2.1527,2.170343
2,1.9894,2.119049
3,1.9353,2.109156


Saving model checkpoint to ./models/results3/checkpoint-4000
Configuration saved in ./models/results3/checkpoint-4000/config.json
Model weights saved in ./models/results3/checkpoint-4000/pytorch_model.bin
Saving model checkpoint to ./models/results3/checkpoint-8000
Configuration saved in ./models/results3/checkpoint-8000/config.json
Model weights saved in ./models/results3/checkpoint-8000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6740
  Batch size = 32
Saving model checkpoint to ./models/results3/checkpoint-12000
Configuration saved in ./models/results3/checkpoint-12000/config.json
Model weights saved in ./models/results3/checkpoint-12000/pytorch_model.bin
Saving model checkpoint to ./models/results3/checkpoint-16000
Configuration saved in ./models/results3/checkpoint-16000/config.json
Model weights saved in ./models/results3/checkpoint-16000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6740
  Batch size = 32
Saving model checkpoint to ./mod

TrainOutput(global_step=28521, training_loss=2.1548138021239933, metrics={'train_runtime': 5346.1678, 'train_samples_per_second': 42.677, 'train_steps_per_second': 5.335, 'total_flos': 1.4903836213248e+16, 'train_loss': 2.1548138021239933, 'epoch': 3.0})

In [28]:
trainer.save_state()
trainer.save_model("./models/results3/baseline")

Saving model checkpoint to ./models/results3/baseline
Configuration saved in ./models/results3/baseline/config.json
Model weights saved in ./models/results3/baseline/pytorch_model.bin


In [29]:
toker.save_pretrained("./models/results3/baseline/tokenizer")

tokenizer config file saved in ./models/results3/baseline/tokenizer/tokenizer_config.json
Special tokens file saved in ./models/results3/baseline/tokenizer/special_tokens_map.json


('./models/results3/baseline/tokenizer/tokenizer_config.json',
 './models/results3/baseline/tokenizer/special_tokens_map.json',
 './models/results3/baseline/tokenizer/vocab.json',
 './models/results3/baseline/tokenizer/merges.txt',
 './models/results3/baseline/tokenizer/added_tokens.json',
 './models/results3/baseline/tokenizer/tokenizer.json')

- <h3>Evaluate the model:</h3>

In [13]:
model_2 = AutoModelForCausalLM.from_pretrained("./models/results3/baseline")
model_2

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [14]:
toker_2 = AutoTokenizer.from_pretrained("./models/results3/baseline/tokenizer")
toker_2

PreTrainedTokenizerFast(name_or_path='./models/results3/baseline/tokenizer', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '[PAD]', 'additional_special_tokens': ['[DA_1]', '[DA_2]', '[DA_3]', '[DA_4]']})

In [42]:
def eval_model(net, eval_batches, eval_len):
  net.eval()
  net.cuda()
  eval_loss = 0.0
  nb_eval_steps = 0
  eval_batch_size = 32
  for i in tqdm(range(0, eval_len, eval_batch_size)):
    X, y = eval_batches["input_ids"][i: i +
                                    eval_batch_size], eval_batches["labels"][i: i + eval_batch_size]
    with torch.no_grad():
      output = net(X.cuda(), labels=y.cuda())
      lm_loss = output[0]
      eval_loss += lm_loss.mean().item()
    nb_eval_steps += 1
  eval_loss = eval_loss / nb_eval_steps
  ppl = torch.exp(torch.tensor(eval_loss))
  return eval_loss, ppl

In [43]:
test_batches = collator([x for x in enc_dd["val"]])
test_len = len(test_batches[next(iter(test_batches.keys()))])
test_len


7069

In [44]:
eval_model(model, test_batches, test_len)

  0%|          | 0/221 [00:00<?, ?it/s]

(2.1092151721678167, tensor(8.2418))

In [45]:
model.cpu()
eval_model(model_2, test_batches, test_len)

  0%|          | 0/221 [00:00<?, ?it/s]

(2.1092151721678167, tensor(8.2418))

In [47]:
model_2.cpu()

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50262, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


In [15]:
def gen_response(net, tokenizer, inp):
  net.cuda()
  net.eval()
  input_ids = tokenizer.encode(inp, return_tensors='pt')
  response_ids = net.generate(
    input_ids.cuda(),
    max_length=500,
    pad_token_id=tokenizer.pad_token_id,
    top_p=0.92,
    top_k=50
  )
  response = tokenizer.decode(
    response_ids[:, input_ids.shape[-1]:][0],
    skip_special_tokens=True
  )
  net.cpu()
  return response

In [16]:
gen_response(model_2, toker_2, f"[DA_1] hello{toker_2.eos_token}[DA_2] Hi, how are you ?{toker_2.eos_token}[DA_2] ")

" I'm fine. How about yourself? "

Preporcess the validation set to evaulate generation:

In [18]:
PROMPT_LEN = 7

def move_prefixes(batch):
  histories = batch['history']
  responses = batch['response']
  for i in range(len(histories)):
    histories[i] = "".join([histories[i], responses[i][:PROMPT_LEN]])
    responses[i] = responses[i][PROMPT_LEN:]
  batch["history"] = histories
  batch["response"] = responses
  return batch

In [19]:
val_set = dd["val"].map(
  move_prefixes,
  batched=True
)
val_set

  0%|          | 0/8 [00:00<?, ?ba/s]

Dataset({
    features: ['history', 'response', 'da'],
    num_rows: 7069
})

In [29]:
val_set.num_rows

7069

In [73]:
df = pd.DataFrame(val_set)
df.iloc[1]["response"]

" Well , that's too far.Can you change some money for me ? <|endoftext|>"

In [70]:
tmp_df = df.head(5)


In [74]:
def save_model_answers(df: pd.DataFrame, output_path, model=model_2, toker=toker_2):
  model_answers = []
  for i in tqdm(range(df.shape[0])):
    model_answers.append(gen_response(
      model, 
      toker,
      df.iloc[i]["history"]
    ))
  df["model_answer"] = model_answers
  df.rename(columns={"response": "gold_answer", "da": "meta"}, inplace=True)
  print(f"Saving to {output_path}")
  with open(output_path, "w+") as f:
    df.to_csv(f)

In [75]:
save_model_answers(df, "./results/baseline3.csv")

  0%|          | 0/7069 [00:00<?, ?it/s]

Input length of input_ids is 502, but ``max_length`` is set to 500.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Input length of input_ids is 576, but ``max_length`` is set to 500.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Input length of input_ids is 599, but ``max_length`` is set to 500.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Input length of input_ids is 659, but ``max_length`` is set to 500.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Input length of input_ids is 677, but ``max_length`` is set to 500.This can lead to unexpected behavior. You should consider increasing ``config.max_length`` or ``max_length``.
Input length of input_ids is 697, but ``max_length`` is set to 500.This can lead to unexpected behavior. You should

Saving to ./results/baseline3.csv


In [76]:
df

Unnamed: 0,history,gold_answer,meta,model_answer
0,"[DA_2] Good morning , sir . Is there a bank ne...",There is one . 5 blocks away from here ? <|en...,1,"Yes, there is."
1,"[DA_2] Good morning , sir . Is there a bank ne...","Well , that's too far.Can you change some mon...",3,"Yes, please. I'd like to withdraw some money."
2,"[DA_2] Good morning , sir . Is there a bank ne...","Surely , of course . What kind of currency ha...",2,Sure. What kind of money do you want?
3,"[DA_2] Good morning , sir . Is there a bank ne...",RIB . <|endoftext|>,1,I have a US dollar note.
4,"[DA_2] Good morning , sir . Is there a bank ne...",How much would you like to change ? <|endofte...,2,How much is it?
...,...,...,...,...
7064,"[DA_2] Welcome , sir . What can I do for you ?...","Oh , it must be very precious . Is it breakab...",2,What is the name of the tea set?
7065,"[DA_2] Welcome , sir . What can I do for you ?...","No , if you take some care when you use them ...",1,"Yes, sir."
7066,"[DA_2] Welcome , sir . What can I do for you ?...",How much is it ? <|endoftext|>,2,What about the handle?
7067,"[DA_2] Welcome , sir . What can I do for you ?...",Two thousand . <|endoftext|>,1,It's only $ 2.50.
