In [1]:
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from datasets import load_dataset

In [2]:
model = AutoModelForCausalLM.from_pretrained("microsoft/DialoGPT-small")
model

  utils.DeprecatedIn35,


GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
      (1): GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )


- <h3>Prepare dataset:</h3>

In [3]:
data_files = {
  "train": "./data/da_prompts_train.tsv",
  "val": "./data/da_prompts_val.tsv",
  "test": "./data/da_prompts_test.tsv",
}
daily_dialog = load_dataset("csv", data_files=data_files, sep='\t')

Using custom data configuration default-d2143c68bd759019
Reusing dataset csv (/home/admin/.cache/huggingface/datasets/csv/default-d2143c68bd759019/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519)


  0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
next(iter(daily_dialog["val"]))

{'history': '[DA_3] Good morning , sir . Is there a bank near here ?<|endoftext|>There is one . 5 blocks away from here ?<|endoftext|>',
 'response': "Well , that's too far.Can you change some money for me ?<|endoftext|>",
 'sent': 2,
 'da': 3}

In [4]:
toker = AutoTokenizer.from_pretrained("microsoft/DialoGPT-small")
toker

PreTrainedTokenizerFast(name_or_path='microsoft/DialoGPT-small', vocab_size=50257, model_max_len=1024, is_fast=True, padding_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)})

In [5]:
toker.add_special_tokens({
  "additional_special_tokens": ["[DA_1]", "[DA_2]", "[DA_3]", "[DA_4]"],
  "pad_token": "[PAD]"
})
toker.all_special_ids, toker.vocab_size

([50256, 50261, 50257, 50258, 50259, 50260], 50257)

In [21]:
toker.encode("[DA_1]hello<|endoftext|>[PAD]"), toker.pad_token_id

([50257, 31373, 50256, 50261], 50261)

In [12]:
sentence = next(iter(daily_dialog["train"]))
sentence = "".join([sentence['history'], sentence['response']])
sentence

'[DA_2] Say , Jim , how about going for a few beers after dinner ?<|endoftext|>You know that is tempting but is really not good for our fitness .<|endoftext|>What do you mean ? It will help us to relax .<|endoftext|>'

In [22]:
encoded = toker([sentence], padding=True, truncation=True, max_length=128)
encoded

{'input_ids': [[50258, 13816, 837, 5395, 837, 703, 546, 1016, 329, 257, 1178, 16800, 706, 8073, 5633, 50256, 1639, 760, 326, 318, 29850, 475, 318, 1107, 407, 922, 329, 674, 13547, 764, 50256, 2061, 466, 345, 1612, 5633, 632, 481, 1037, 514, 284, 8960, 764, 50256]], 'attention_mask': [[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]]}

In [25]:
len(encoded['input_ids'][0])

44

In [6]:
model.resize_token_embeddings(toker.vocab_size + 5)

Embedding(50262, 768)

In [27]:
toker.pad_token_id is None

False

In [7]:
def preprocess(example):
  return toker(
    [x + " " + y for x, y in zip(example['history'], example['response'])],
    padding=True,
    truncation=True,
    max_length=128
  )

In [8]:
tokenized_dd = daily_dialog.map(
  preprocess,
  batched=True,
  remove_columns=daily_dialog["train"].column_names
)

  0%|          | 0/77 [00:00<?, ?ba/s]

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d2143c68bd759019/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-ae7b1046e168ca21.arrow
Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-d2143c68bd759019/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-a0ecb63512b2c1b4.arrow


In [8]:
block_size = 128

def group_texts(examples):
  cct_samples = {k: sum(examples[k], []) for k in examples.keys()}
  total_len = len(cct_samples[next(iter(examples.keys()))])
  result = {
    k: [t[i : i + block_size] for i in range(0, total_len, block_size)]
    for k, t in cct_samples.items()
  }
  return result

In [9]:
lm_data = tokenized_dd.map(group_texts, batched=True)

Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-05c39de4a71b388e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-c4ad94436343b6a6.arrow
Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-05c39de4a71b388e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-33cf261542d469d4.arrow
Loading cached processed dataset at /home/admin/.cache/huggingface/datasets/csv/default-05c39de4a71b388e/0.0.0/433e0ccc46f9880962cc2b12065189766fbb2bee57a221866138fb9203c83519/cache-3ba8af8b373b401c.arrow


In [12]:
tokenized_dd["train"]

Dataset({
    features: ['input_ids', 'attention_mask'],
    num_rows: 76051
})

In [34]:
tmp1 = next(iter(tokenized_dd["train"]))
# tmp2 = next(iter(tokenized_dd["train"]))
len(tmp1["input_ids"])

128

In [10]:
from transformers import DataCollatorForLanguageModeling

In [11]:
collator = DataCollatorForLanguageModeling(
  tokenizer=toker, 
  mlm=False, 
  pad_to_multiple_of=128
)

In [38]:
out = collator([tmp1])

In [39]:
out["input_ids"], out["labels"]

(tensor([[50258, 13816,   837,  5395,   837,   703,   546,  1016,   329,   257,
           1178, 16800,   706,  8073,  5633, 50256,  1639,   760,   326,   318,
          29850,   475,   318,  1107,   407,   922,   329,   674, 13547,   764,
          50256,  1867,   466,   345,  1612,  5633,   632,   481,  1037,   514,
            284,  8960,   764, 50256, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261, 50261,
          50261, 50261, 50261, 50261, 50

- <h3>Train the model:</h3>

In [12]:
from transformers import TrainingArguments, Trainer

In [13]:
train_args = TrainingArguments(
  output_dir="./models/results2",
  evaluation_strategy="epoch",
  learning_rate=1e-5,
  weight_decay=0.01,
  save_steps=3000,
  per_device_eval_batch_size=32,
  per_device_train_batch_size=8
)

trainer = Trainer(
  model=model,
  args=train_args,
  train_dataset=tokenized_dd["train"],
  eval_dataset=tokenized_dd["test"],
  data_collator=collator
)

In [14]:
trainer.train(resume_from_checkpoint=True)

Loading model from ./models/results2/checkpoint-6000).
***** Running training *****
  Num examples = 76051
  Num Epochs = 3
  Instantaneous batch size per device = 8
  Total train batch size (w. parallel, distributed & accumulation) = 8
  Gradient Accumulation steps = 1
  Total optimization steps = 28521
  Continuing training from checkpoint, will skip to saved global_step
  Continuing training from epoch 0
  Continuing training from global step 6000
  Will skip the first 0 epochs then the first 6000 batches in the first epoch. If this takes a lot of time, you can add the `--ignore_data_skip` flag to your launch command, but you will resume the training on data already seen by your model.


  0%|          | 0/6000 [00:00<?, ?it/s]

Didn't find an RNG file, if you are resuming a training that was launched in a distributed fashion, reproducibility is not guaranteed.


Epoch,Training Loss,Validation Loss
1,2.4962,2.536964
2,2.3077,2.498218
3,2.2526,2.489703


Saving model checkpoint to ./models/results2/checkpoint-9000
Configuration saved in ./models/results2/checkpoint-9000/config.json
Model weights saved in ./models/results2/checkpoint-9000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6739
  Batch size = 32
Saving model checkpoint to ./models/results2/checkpoint-12000
Configuration saved in ./models/results2/checkpoint-12000/config.json
Model weights saved in ./models/results2/checkpoint-12000/pytorch_model.bin
Saving model checkpoint to ./models/results2/checkpoint-15000
Configuration saved in ./models/results2/checkpoint-15000/config.json
Model weights saved in ./models/results2/checkpoint-15000/pytorch_model.bin
Saving model checkpoint to ./models/results2/checkpoint-18000
Configuration saved in ./models/results2/checkpoint-18000/config.json
Model weights saved in ./models/results2/checkpoint-18000/pytorch_model.bin
***** Running Evaluation *****
  Num examples = 6739
  Batch size = 32
Saving model checkpoint to ./

TrainOutput(global_step=28521, training_loss=1.859443076851199, metrics={'train_runtime': 4252.1278, 'train_samples_per_second': 53.656, 'train_steps_per_second': 6.707, 'total_flos': 1.4903640244224e+16, 'train_loss': 1.859443076851199, 'epoch': 3.0})

In [17]:
trainer.save_state()

In [23]:
toker.save_pretrained("./models/results2/baseline")

tokenizer config file saved in ./models/results2/baseline/tokenizer_config.json
Special tokens file saved in ./models/results2/baseline/special_tokens_map.json


('./models/results2/baseline/tokenizer_config.json',
 './models/results2/baseline/special_tokens_map.json',
 './models/results2/baseline/vocab.json',
 './models/results2/baseline/merges.txt',
 './models/results2/baseline/added_tokens.json',
 './models/results2/baseline/tokenizer.json')

- <h3>Evaluate the model:</h3>

Load pretrained the model:

In [14]:
model = AutoModelForCausalLM.from_pretrained("./models/results2/baseline")
model.config

GPT2Config {
  "_name_or_path": "./models/results/baseline",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 768,
  "n_head": 12,
  "n_inner": null,
  "n_layer": 12,
  "n_positions": 1024,
  "resid_pdrop": 0.1,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "conversational": {
      "max_length": 1000
    }
  },
  "torch_dtype": "float32",
  "transformers_version": "4.11.3",
  "use_cache": true,
  "vocab_size": 50261
}

In [18]:
from tqdm.notebook import tqdm

In [19]:
model.cuda()
model.eval()
test_batches = collator([x for x in tokenized_dd["val"]])
test_len = len(test_batches[next(iter(test_batches.keys()))])
test_len

7068

In [20]:
eval_loss = 0.0
nb_eval_steps = 0
eval_batch_size = 32
for i in tqdm(range(0, test_len, eval_batch_size)):
  X, y = test_batches["input_ids"][i : i + eval_batch_size], test_batches["labels"][i : i + eval_batch_size]
  with torch.no_grad():
    output = model(X.cuda(), labels=y.cuda())
    lm_loss = output[0]
    eval_loss += lm_loss.mean().item()
  nb_eval_steps += 1
eval_loss = eval_loss / nb_eval_steps
ppl = torch.exp(torch.tensor(eval_loss))

  0%|          | 0/221 [00:00<?, ?it/s]

In [21]:
eval_loss, ppl

(2.4791028526573697, tensor(11.9306))

In [24]:
%reload_ext autoreload
%autoreload 2

In [25]:
from gpt2_training.eval_utils import eval_model_all_outputs
from data_loader import DynamicBatchingLoader

In [47]:
import importlib
import gpt2_training.eval_utils
importlib.reload(gpt2_training)

<module 'gpt2_training' (namespace)>

In [26]:
class Args:
  def __init__(self):
    self.device = "cuda:0"
    self.temperature = 1.0
args = Args()

In [27]:
eval_input_file = './data/da_prompts_val.tsv'
eval_dataloader = DynamicBatchingLoader(
  eval_input_file, 
  toker, 
  True,
  1,
  128
)
eval_out_path = "./results/baseline.csv"

In [30]:
print(f'Evaluating to {eval_out_path}')
eval_details = eval_model_all_outputs(model, toker, eval_dataloader, 0,
                       eval_out_path, args)

Evaluating to ./results/baseline.csv


100%|██████████| 7069/7069 [23:35<00:00,  4.99it/s]


In [31]:
import pandas as pd

In [32]:
df = pd.DataFrame(eval_details)
df.to_csv(eval_out_path)

In [58]:
with torch.no_grad():
  output1 = model(X.cuda())

In [59]:
output1.keys()

odict_keys(['logits', 'past_key_values'])

In [66]:
for step in range(600):
    # encode the new user input, add the eos_token and return a tensor in Pytorch
    new_user_input_ids = toker.encode(
        input(">> User: ") + toker.eos_token, return_tensors='pt')
    # print(new_user_input_ids)

    # append the new user input tokens to the chat history
    bot_input_ids = torch.cat(
        [chat_history_ids, new_user_input_ids], dim=-1) if step > 0 else new_user_input_ids

    # generated a response while limiting the total chat history to 1000 tokens,
    chat_history_ids = model.generate(
        bot_input_ids.cuda(), max_length=500,
        pad_token_id=toker.eos_token_id,
        top_p=0.92, top_k=50
    )

    # pretty print last ouput tokens from bot
    print("DialoGPT: {}".format(toker.decode(
        chat_history_ids[:, bot_input_ids.shape[-1]:][0], skip_special_tokens=True)))


DialoGPT:  Hello, I'm Mary. I'm calling to tell you about our new project.    I'm Mary. I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you  I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you about our new project.  I'm calling to tell you about the project. I'm calling to tell me about the project.  I'm calling to tell you about the project.  I'm calling to tell you.  I' I' I' I' I' I' I' I' I' I' I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I'I

KeyboardInterrupt: Interrupted by user

'hello'

In [79]:
a = "tring"
"".join([a, "eos"])

'tringeos'