In [1]:
!pip install trl transformers datasets accelerate bitsandbytes -q

from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import Dataset
import pandas as pd

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m8.7 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m11.0 MB/s[0m eta [36m0:00:00[0m
[?25h

In [3]:
# Load and preprocess dataset
N_SAMPLES = 10000  # upper bound on training examples, kept small for faster training
SEED = 42          # fixes which rows are drawn by the sampling step

SARC_COLUMNS = [
    "label", "comment", "author", "subreddit", "score",
    "ups", "downs", "date", "created_utc", "parent_comment",
]

# The file is read as header-less, tab-separated text.
# NOTE(review): on_bad_lines="skip" silently drops rows pandas cannot parse.
# Keep sarcastic samples (label == 1) that have both a parent comment and a reply.
sarcastic_df = (
    pd.read_csv(
        "train-balanced-sarc.csv",
        sep="\t",
        header=None,
        names=SARC_COLUMNS,
        on_bad_lines="skip",
    )
    .loc[lambda frame: frame["label"] == 1, ["parent_comment", "comment"]]
    .dropna()
)

# Sample some examples for faster training. The size is capped at the number of
# available rows so that a smaller input file does not make sample() raise.
df = sarcastic_df.sample(
    min(N_SAMPLES, len(sarcastic_df)), random_state=SEED
).reset_index(drop=True)

# Format samples: one training string per (parent, reply) pair.
df["text"] = [
    f"<PARENT> {parent} </PARENT>\n<RESPONSE> {reply} </RESPONSE>"
    for parent, reply in zip(df["parent_comment"], df["comment"])
]
dataset = Dataset.from_pandas(df[["text"]])

In [4]:
# Load GPT-2 tokenizer and model
model_name = "gpt2"

# Both the weights and the tokenizer are fetched under the same identifier.
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [5]:
# Define training arguments
training_options = {
    "output_dir": "./gpt2-sft",          # checkpoints are written under this directory
    "num_train_epochs": 3,
    "per_device_train_batch_size": 2,
    "learning_rate": 5e-5,
    "fp16": True,                        # half-precision training
    "save_strategy": "epoch",            # checkpoint at the end of every epoch...
    "save_total_limit": 1,               # ...but keep only one on disk
    "report_to": "none",                 # no external experiment trackers
}
args = TrainingArguments(**training_options)

In [6]:
# SFT trainer
# The tokenizer prepared earlier (with its pad token set) is passed explicitly so the
# trainer tokenizes with it, instead of implicitly loading a fresh one for the model.
# `dataset` carries a single "text" column holding the formatted training strings.
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=dataset,
    processing_class=tokenizer,
)

Adding EOS to train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

Token indices sequence length is longer than the specified maximum sequence length for this model (1368 > 1024). Running this sequence through the model will result in indexing errors


Truncating train dataset:   0%|          | 0/10000 [00:00<?, ? examples/s]

In [7]:
# Run fine-tuning; the returned summary is displayed as the cell's output.
train_result = trainer.train()
train_result

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
500,2.9001
1000,2.7921
1500,2.8172
2000,2.7573
2500,2.7589
3000,2.7577
3500,2.7642
4000,2.7725
4500,2.7478
5000,2.7502


TrainOutput(global_step=15000, training_loss=2.491579541015625, metrics={'train_runtime': 1631.4395, 'train_samples_per_second': 18.389, 'train_steps_per_second': 9.194, 'total_flos': 1263309209856000.0, 'train_loss': 2.491579541015625, 'epoch': 3.0})

In [27]:
# Test SFT training result
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned tokenizer and model from the SFT checkpoint.
# NOTE(review): the step suffix is hard-coded; it must match a checkpoint directory
# that actually exists under ./gpt2-sft.
ckpt_path = "./gpt2-sft/checkpoint-15000"
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
# Reuse EOS as the padding token, as was done for the base tokenizer.
tokenizer.pad_token = tokenizer.eos_token

# Load the fine-tuned causal language model and switch it to inference mode.
model = AutoModelForCausalLM.from_pretrained(ckpt_path)
model.eval()

# Prefer the GPU, but fall back to CPU so this cell also runs on CPU-only runtimes.
device = "cuda" if torch.cuda.is_available() else "cpu"
model.to(device)

def test_sft(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=80,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    print("\n------ MODEL OUTPUT ------")
    print(tokenizer.decode(output[0], skip_special_tokens=True))


# Run a sample test prompt: the parent comment is given and the response tag is
# left open for the model to complete.
prompt = (
    "<PARENT> I love spending all weekend doing homework. </PARENT>\n"
    "<RESPONSE>"
)
test_sft(prompt)



------ MODEL OUTPUT ------
<PARENT> I love spending all weekend doing homework. </PARENT>
<RESPONSE> You're right, I miss homework! </RESPONSE>
