In [1]:
!pip install trl transformers datasets accelerate bitsandbytes -q

import torch
from trl import SFTTrainer
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments
from datasets import load_dataset

[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m465.5/465.5 kB[0m [31m9.6 MB/s[0m eta [36m0:00:00[0m
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m59.4/59.4 MB[0m [31m30.6 MB/s[0m eta [36m0:00:00[0m
[?25h

In [4]:
# Load and preprocess dataset
# Both SFT splits live in the same Hub repository; only the split name differs.
raw_train, raw_eval = (
    load_dataset("marcbishara/sarcasm-on-reddit", split=split_name)
    for split_name in ("sft_train", "sft_validation")
)
# Shuffle each split with the same fixed seed.
train_dataset, eval_dataset = (
    raw_split.shuffle(seed=42) for raw_split in (raw_train, raw_eval)
)
print(f"Initial SFT Training samples for test: {len(train_dataset)}")
print(f"Initial SFT Validation samples for test: {len(eval_dataset)}")

# Format samples
def format_sample(example, max_chars=800):
    """Render one parent/reply pair as a single tagged training string.

    Args:
        example: Mapping that may contain ``parent_comment`` and ``comment``.
        max_chars: Maximum length, in characters, of the formatted text that
            is still kept. Defaults to 800.

    Returns:
        ``{"text": "<PARENT> ... </PARENT>\\n<RESPONSE> ... </RESPONSE>"}`` for
        usable rows, or ``{"text": None}`` when either field is missing, empty
        or whitespace-only, or when the formatted text is longer than
        ``max_chars``. Rows marked with ``None`` are meant to be filtered out
        by the caller.
    """
    parent = example.get('parent_comment', '')
    response = example.get('comment', '')

    # Reject missing/empty fields, and also whitespace-only ones: they would
    # otherwise produce a sample with no actual content between the tags.
    if any(not value or not str(value).strip() for value in (parent, response)):
        return {"text": None}

    text = f"<PARENT> {parent} </PARENT>\n<RESPONSE> {response} </RESPONSE>"

    # The limit is on characters of the formatted text (tags included),
    # not on tokens.
    if len(text) > max_chars:
        return {"text": None}

    return {"text": text}

# format_sample marks unusable rows with text=None; this predicate keeps the rest.
keep_row = lambda row: row["text"] is not None

# Replace every original column with the single formatted "text" column,
# then drop the rejected rows.
train_dataset = train_dataset.map(
    format_sample, remove_columns=train_dataset.column_names
).filter(keep_row)
eval_dataset = eval_dataset.map(
    format_sample, remove_columns=eval_dataset.column_names
).filter(keep_row)

print(f"Final Training samples after filtering: {len(train_dataset)}")
print(f"Final Validation samples after filtering: {len(eval_dataset)}")

Initial SFT Training samples for test: 272922
Initial SFT Validation samples for test: 30325


Map:   0%|          | 0/272922 [00:00<?, ? examples/s]

Filter:   0%|          | 0/272922 [00:00<?, ? examples/s]

Map:   0%|          | 0/30325 [00:00<?, ? examples/s]

Filter:   0%|          | 0/30325 [00:00<?, ? examples/s]

Final Training samples after filtering: 267509
Final Validation samples after filtering: 29737


In [5]:
# Load the pretrained GPT-2 weights and the matching tokenizer
model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)
# Padding reuses the tokenizer's end-of-sequence token.
tokenizer.pad_token = tokenizer.eos_token

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [7]:
# Define training arguments
# fp16 mixed precision is a GPU feature: enable it only when a CUDA device is
# visible so the cell still runs (in full precision) on a CPU-only machine.
use_fp16 = torch.cuda.is_available()

args = TrainingArguments(
    output_dir="./gpt2-sft",
    num_train_epochs=3,
    per_device_train_batch_size=16,
    per_device_eval_batch_size=16,
    gradient_accumulation_steps=1,
    learning_rate=1e-5,
    fp16=use_fp16,
    # Checkpoint and evaluate on the same schedule (once per epoch); the
    # best-model reload below relies on the two strategies matching.
    save_strategy="epoch",
    save_total_limit=1,
    report_to="none",
    eval_strategy="epoch",
    logging_strategy="steps",
    logging_steps=100,
    # After training, reload the checkpoint with the lowest validation loss.
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
)

In [8]:
# SFT trainer
trainer = SFTTrainer(
    model=model,
    args=args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    # Pass the tokenizer configured earlier (pad token set) explicitly;
    # otherwise the trainer loads its own copy from the model name and the
    # configured one is never used for training or saved with checkpoints.
    processing_class=tokenizer,
)
trainer.train()
# Kept as the cell's last expression so the final validation metrics are displayed.
trainer.evaluate()

Adding EOS to train dataset:   0%|          | 0/267509 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/267509 [00:00<?, ? examples/s]

Truncating train dataset:   0%|          | 0/267509 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/29737 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/29737 [00:00<?, ? examples/s]

Truncating eval dataset:   0%|          | 0/29737 [00:00<?, ? examples/s]

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,2.6933,2.625586,2.641248,16242005.0,0.534281
2,2.6527,2.613466,2.594055,32484010.0,0.535595


Epoch,Training Loss,Validation Loss,Entropy,Num Tokens,Mean Token Accuracy
1,2.6933,2.625586,2.641248,16242005.0,0.534281
2,2.6527,2.613466,2.594055,32484010.0,0.535595
3,2.6353,2.609763,2.5801,48726015.0,0.535997


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


{'eval_loss': 2.6097633838653564,
 'eval_runtime': 52.8608,
 'eval_samples_per_second': 562.553,
 'eval_steps_per_second': 35.168,
 'eval_entropy': 2.5801002765099423,
 'eval_num_tokens': 48726015.0,
 'eval_mean_token_accuracy': 0.5359967675595081,
 'epoch': 3.0}

In [9]:
# Test SFT training result
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

# Load the fine-tuned tokenizer and model from the saved SFT checkpoint.
ckpt_path = "./gpt2-sft/checkpoint-50160"
tokenizer = AutoTokenizer.from_pretrained(ckpt_path)
tokenizer.pad_token = tokenizer.eos_token

# Use the GPU when one is available; otherwise fall back to CPU instead of
# failing on a hard-coded "cuda" device.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Load the fine-tuned causal language model in inference mode.
model = AutoModelForCausalLM.from_pretrained(ckpt_path)
model.eval()
model.to(device)

def test_sft(prompt, max_new_tokens=80, temperature=0.7, top_p=0.9):
    """Sample one continuation of ``prompt`` and print the decoded text.

    Uses the module-level ``tokenizer`` and ``model``.

    Args:
        prompt: Text the model should continue.
        max_new_tokens: Upper bound on the number of generated tokens.
        temperature: Sampling temperature passed to ``model.generate``.
        top_p: Nucleus-sampling threshold passed to ``model.generate``.

    Returns:
        None. The decoded sequence (prompt plus continuation, special tokens
        stripped) is printed under a header line.
    """
    # Put the inputs on whatever device the model currently lives on, rather
    # than assuming "cuda".
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        output = model.generate(
            **inputs,
            max_new_tokens=max_new_tokens,
            temperature=temperature,
            top_p=top_p,
            do_sample=True,
            # Pad with the end-of-sequence token id during generation.
            pad_token_id=tokenizer.eos_token_id,
        )

    print("\n------ MODEL OUTPUT ------")
    print(tokenizer.decode(output[0], skip_special_tokens=True))


# Run a sample test prompt. The prompt uses the same <PARENT>/<RESPONSE> tags
# as the training text and stops right after the opening <RESPONSE> tag, so
# the generated continuation fills in the reply.
prompt = "<PARENT> I love spending all weekend doing homework. </PARENT>\n<RESPONSE>"
test_sft(prompt)



------ MODEL OUTPUT ------
<PARENT> I love spending all weekend doing homework. </PARENT>
<RESPONSE> You're not allowed to do homework in school. </RESPONSE>
