In [2]:
!pip install trl transformers datasets accelerate bitsandbytes -q

import torch
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset

In [3]:
# Fetch the SFT train/validation splits and keep only the rows labelled 1
# (presumably the sarcastic class -- confirm against the dataset card).
raw_train = load_dataset(
    "marcbishara/sarcasm-on-reddit",
    split="sft_train",
)
raw_eval = load_dataset(
    "marcbishara/sarcasm-on-reddit",
    split="sft_validation",
)
raw_train = raw_train.filter(lambda row: row["label"] == 1)
raw_eval = raw_eval.filter(lambda row: row["label"] == 1)

# Shuffle both splits with a fixed seed.
train_dataset = raw_train.shuffle(seed=42)
eval_dataset = raw_eval.shuffle(seed=42)

print(f"Initial SFT Training samples for test: {len(train_dataset)}")
print(f"Initial SFT Validation samples for test: {len(eval_dataset)}")

# Format samples
def format_sample(example, max_chars=800):
    """Render one parent/reply pair as a single tagged training string.

    Args:
        example: Mapping read through ``.get`` for the ``parent_comment`` and
            ``comment`` entries; a missing key is treated like an empty value.
        max_chars: Upper bound on the length of the rendered text, counted in
            characters (not tokens). Pass ``None`` to disable the length
            check. Defaults to 800.

    Returns:
        ``{"text": <rendered string>}`` for a usable pair, or
        ``{"text": None}`` when either field is missing/empty or the rendered
        text is longer than ``max_chars``. The ``None`` marker allows the row
        to be dropped by a subsequent filter.
    """
    parent = example.get('parent_comment', '')
    response = example.get('comment', '')

    # Both sides of the exchange are required to build a sample.
    if not parent or not response:
        return {"text": None}

    text = f"<PARENT> {parent} </PARENT>\n<RESPONSE> {response} </RESPONSE>"

    # The limit applies to the whole rendered string, tags included.
    if max_chars is not None and len(text) > max_chars:
        return {"text": None}

    return {"text": text}

# Replace the original columns with the rendered "text" column, then drop the
# rows for which format_sample produced None.
train_dataset = train_dataset.map(
    format_sample,
    remove_columns=train_dataset.column_names,
).filter(lambda row: row["text"] is not None)

eval_dataset = eval_dataset.map(
    format_sample,
    remove_columns=eval_dataset.column_names,
).filter(lambda row: row["text"] is not None)

print(f"Final Training samples after filtering: {len(train_dataset)}")
print(f"Final Validation samples after filtering: {len(eval_dataset)}")

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


README.md: 0.00B [00:00, ?B/s]

data/holdout-00000-of-00001.parquet:   0%|          | 0.00/18.2M [00:00<?, ?B/s]

data/sft_train-00000-of-00001.parquet:   0%|          | 0.00/49.1M [00:00<?, ?B/s]

data/sft_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.44M [00:00<?, ?B/s]

data/reward_train-00000-of-00001.parquet:   0%|          | 0.00/49.3M [00:00<?, ?B/s]

data/reward_validation-00000-of-00001.pa(…):   0%|          | 0.00/5.53M [00:00<?, ?B/s]

data/ppo_train-00000-of-00001.parquet:   0%|          | 0.00/49.4M [00:00<?, ?B/s]

data/ppo_validation-00000-of-00001.parqu(…):   0%|          | 0.00/5.51M [00:00<?, ?B/s]

Generating holdout split:   0%|          | 0/101083 [00:00<?, ? examples/s]

Generating sft_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating sft_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating reward_train split:   0%|          | 0/272922 [00:00<?, ? examples/s]

Generating reward_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Generating ppo_train split:   0%|          | 0/272924 [00:00<?, ? examples/s]

Generating ppo_validation split:   0%|          | 0/30325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/272922 [00:00<?, ? examples/s]

Filter:   0%|          | 0/30325 [00:00<?, ? examples/s]

Initial SFT Training samples for test: 136640
Initial SFT Validation samples for test: 15086


Map:   0%|          | 0/136640 [00:00<?, ? examples/s]

Filter:   0%|          | 0/136640 [00:00<?, ? examples/s]

Map:   0%|          | 0/15086 [00:00<?, ? examples/s]

Filter:   0%|          | 0/15086 [00:00<?, ? examples/s]

Final Training samples after filtering: 134136
Final Validation samples after filtering: 14819


In [4]:
# Base checkpoint: tokenizer and causal-LM weights for GPT-2.
model_name = "gpt2"
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=model_name,
)
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(
    pretrained_model_name_or_path=model_name,
)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [8]:
# Training configuration for the GPT-2 SFT run, grouped by concern.
args = TrainingArguments(
    # Checkpoint location and retention
    output_dir="./gpt2-sft",
    save_strategy="epoch",
    save_total_limit=1,
    # Optimisation
    num_train_epochs=2,
    learning_rate=1e-5,
    per_device_train_batch_size=16,
    gradient_accumulation_steps=1,
    fp16=True,
    # Evaluation and best-checkpoint selection
    eval_strategy="epoch",
    per_device_eval_batch_size=16,
    metric_for_best_model="eval_loss",
    load_best_model_at_end=True,
    # Logging
    logging_strategy="steps",
    logging_steps=100,
    report_to="none",
)

In [9]:
# Fine-tune on the formatted training text, then score the validation split.
trainer = SFTTrainer(
    args=args,
    model=model,
    eval_dataset=eval_dataset,
    train_dataset=train_dataset,
)
trainer.train()
# Kept as the last expression so the metrics dict is shown as the cell output.
trainer.evaluate()

Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,2.7154,2.62699,2.639089,8144401.0,0.533697
2,2.6568,2.619008,2.607449,16288802.0,0.534744


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'eval_loss': 2.6190078258514404,
 'eval_runtime': 55.7724,
 'eval_samples_per_second': 265.705,
 'eval_steps_per_second': 16.621,
 'eval_entropy': 2.607448987076069,
 'eval_num_tokens': 16288802.0,
 'eval_mean_token_accuracy': 0.5347436554287887,
 'epoch': 2.0}

In [2]:
# Test SFT training result
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned tokenizer and model identified by ckpt_path.
ckpt_path = "Zoe3324/gpt2-sft-full-v2"
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
# Pad with the end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

# Use the GPU when one is visible; otherwise stay on the CPU so this cell
# still runs instead of raising on a machine without CUDA.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the fine-tuned causal language model in eval mode on that device.
model = AutoModelForCausalLM.from_pretrained(ckpt_path)
model.eval()
model.to(device)

def test_sft(prompt):
    inputs = tokenizer(prompt, return_tensors="pt").to("cuda")

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=80,
            temperature=0.7,
            top_p=0.9,
            do_sample=True,
            pad_token_id=tokenizer.eos_token_id,
        )

    print("\n------ MODEL OUTPUT ------")
    print(tokenizer.decode(output[0], skip_special_tokens=True))


# Run a handful of sample prompts through the fine-tuned model.
# NOTE(review): only the first prompt has a space before </PARENT>; the other
# three do not, unlike the "<PARENT> ... </PARENT>" template used when
# formatting the training text -- confirm whether that is intended.
for prompt in (
    "<PARENT> Are you implying that people with broken legs should stop asking for help and die because nobody has helped them thus far??? </PARENT>\n<RESPONSE>",
    "<PARENT> why aren't there at least 3 grenades in your face?</PARENT>\n<RESPONSE>",
    "<PARENT> And now Grandma is on another watch list</PARENT>\n<RESPONSE>",
    "<PARENT> That is awesome! What made you chose?</PARENT>\n<RESPONSE>",
):
    test_sft(prompt)




------ MODEL OUTPUT ------
<PARENT> Are you implying that people with broken legs should stop asking for help and die because nobody has helped them thus far??? </PARENT>
<RESPONSE> But, but, but, you would never do that! </RESPONSE>

------ MODEL OUTPUT ------
<PARENT> why aren't there at least 3 grenades in your face?</PARENT>
<RESPONSE> because of the 2nd grenade in the picture, it's not like they're getting their hands on any grenades! </RESPONSE>

------ MODEL OUTPUT ------
<PARENT> And now Grandma is on another watch list</PARENT>
<RESPONSE> Oh I see... </RESPONSE>

------ MODEL OUTPUT ------
<PARENT> That is awesome! What made you chose?</PARENT>
<RESPONSE> I chose to write my own poetry </RESPONSE>
