# **Install Required Libraries (Google Colab)**

In [None]:
!pip install transformers datasets sentencepiece accelerate

# **Import Libraries**

In [2]:
import torch

from datasets import load_dataset

from transformers import (
AutoTokenizer,
AutoModelForCausalLM,
TrainingArguments,
Trainer,
DataCollatorForLanguageModeling
)

# **Load Dataset**

In [3]:
dataset = load_dataset("blended_skill_talk")

dataset = dataset["train"]

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/train-00000-of-00001.parquet:   0%|          | 0.00/5.88M [00:00<?, ?B/s]

data/validation-00000-of-00001.parquet:   0%|          | 0.00/2.62M [00:00<?, ?B/s]

data/test-00000-of-00001.parquet:   0%|          | 0.00/2.40M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/4819 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/1009 [00:00<?, ? examples/s]

Generating test split:   0%|          | 0/980 [00:00<?, ? examples/s]

# **Format Conversation**

In [4]:
def format_conversation(example):

    context = example["previous_utterance"]

    response = example["free_messages"]

    text = ""

    for user, bot in zip(context, response):

        text += f"User: {user}\n"
        text += f"Bot: {bot}\n"

    return {"text": text}

dataset = dataset.map(format_conversation)

Map:   0%|          | 0/4819 [00:00<?, ? examples/s]

# **Remove unused Columns**

In [5]:
dataset = dataset.remove_columns(
    [
        'free_messages',
        'guided_messages',
        'suggestions',
        'personas',
        'context',
        'additional_context',
        'previous_utterance'
    ]
)

# **Load Tokenizer**

In [6]:
model_name = "gpt2"

tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token



config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

# **Tokenize Dataset**

In [7]:
def tokenize_function(example):

    tokens = tokenizer(

        example["text"],

        padding="max_length",

        truncation=True,

        max_length=128

    )

    tokens["labels"] = tokens["input_ids"].copy()

    return tokens


tokenized_dataset = dataset.map(tokenize_function)


tokenized_dataset = tokenized_dataset.remove_columns(["text"])

Map:   0%|          | 0/4819 [00:00<?, ? examples/s]

# **Load Model**

In [8]:
model = AutoModelForCausalLM.from_pretrained(model_name)

model = model.to("cuda")

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

Loading weights:   0%|          | 0/148 [00:00<?, ?it/s]

GPT2LMHeadModel LOAD REPORT from: gpt2
Key                  | Status     |  | 
---------------------+------------+--+-
h.{0...11}.attn.bias | UNEXPECTED |  | 

Notes:
- UNEXPECTED	:can be ignored when loading from different task/architecture; not ok if you expect identical arch.


generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

# **Using DataCollator**

In [9]:
data_collator = DataCollatorForLanguageModeling(

    tokenizer=tokenizer,

    mlm=False

)

# **Training Arguments**

In [10]:
training_args = TrainingArguments(

    output_dir="./chatbot",

    num_train_epochs=5,

    per_device_train_batch_size=2,

    save_steps=500,

    logging_steps=100,

    learning_rate=5e-5,

    fp16=False,

    report_to="none"

)

# **Trainer**

In [11]:
trainer = Trainer(

    model=model,

    args=training_args,

    train_dataset=tokenized_dataset,

    data_collator=data_collator

)

# **Train Model**

In [12]:
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
100,3.263937
200,3.152123
300,3.069885
400,3.118399
500,3.096528
600,3.069724
700,3.079491
800,3.081248
900,3.048108
1000,2.952021


Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

TrainOutput(global_step=12050, training_loss=2.506697458607527, metrics={'train_runtime': 1735.8772, 'train_samples_per_second': 13.881, 'train_steps_per_second': 6.942, 'total_flos': 1573957877760000.0, 'train_loss': 2.506697458607527, 'epoch': 5.0})

# **Save Model**

In [13]:
model.save_pretrained("chatbot-model")

tokenizer.save_pretrained("chatbot-model")


print("MODEL SAVED SUCCESSFULLY")

Writing model shards:   0%|          | 0/1 [00:00<?, ?it/s]

MODEL SAVED SUCCESSFULLY
