In [None]:
# GPT-2 dialog fine-tuning — end-to-end, Parquet-backed dataset (no scripts)

!pip install -q --upgrade transformers datasets torch

import os, math
from datasets import load_dataset, DatasetDict
from transformers import (
    AutoTokenizer, AutoModelForCausalLM,
    Trainer, TrainingArguments, DataCollatorForLanguageModeling
)
import torch

# ----------------------------------------
# 1) GPT-2 tokenizer and causal-LM weights
# ----------------------------------------
model_name = "gpt2"

# Reuse the EOS token as the pad token so fixed-length padding can be applied
# when the dataset is tokenized.
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(model_name)

# Fix the RNG state (CPU, plus all CUDA devices when present) so that the
# sampled generations and the training run are repeatable.
torch.manual_seed(42)
if torch.cuda.is_available():
    torch.cuda.manual_seed_all(42)

# Quick helper to sample generations
def sample(prompt, max_new_tokens=80):
    """Print one sampled continuation of ``prompt`` from the global ``model``.

    Args:
        prompt: Text the generation is conditioned on.
        max_new_tokens: Upper bound on the number of generated tokens.

    The decoded sequence (prompt included) is printed; nothing is returned,
    so calling this as the last expression of a cell does not echo a value.
    """
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    # Trainer.train() leaves the model in training mode, which keeps dropout
    # active. Generate in eval mode, then restore whatever mode was set so the
    # helper has no lasting effect on the model.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            output = model.generate(
                **inputs,
                max_new_tokens=max_new_tokens,
                do_sample=True, top_p=0.95, temperature=0.9,
                pad_token_id=tokenizer.eos_token_id,
                repetition_penalty=1.2
            )
    finally:
        model.train(was_training)

    print(tokenizer.decode(output[0], skip_special_tokens=True))

# -----------------------------------------------------
# 2) Baseline: what the untouched checkpoint generates
# -----------------------------------------------------
print("Before fine-tuning:\n")
for _prompt in (
    "User: Hi there!\nAssistant:",
    "User: What's your favorite programming language?\nAssistant:",
):
    sample(_prompt)

# -------------------------------------------------------
# 3) Load a script-free dialog dataset from Hugging Face
#    (Parquet URIs → no 'dataset scripts are not supported')
# -------------------------------------------------------
# All three splits live under the same auto-converted Parquet branch; only the
# split directory differs.
_PARQUET_ROOT = "hf://datasets/agentlans/li2017dailydialog@refs/convert/parquet/default"
data_files = {
    _split: f"{_PARQUET_ROOT}/{_split}/0000.parquet"
    for _split in ("train", "validation", "test")
}

# Yields a DatasetDict keyed by train / validation / test.
raw_ds = load_dataset("parquet", data_files=data_files)

# ------------------------------------------------
# 4) Build training text: "User:" / "Assistant:"
#    The Parquet rows have `conversations`:
#    [ {from: system|human|gpt, value: "..."}, ... ]
# ------------------------------------------------
def to_text(example):
    """Flatten one conversation row into newline-separated speaker lines.

    Args:
        example: Mapping with a ``conversations`` entry holding a list of
            ``{"from": ..., "value": ...}`` turns (or ``None``).

    Returns:
        ``{"text": str}`` where each kept turn is rendered as
        ``"User: ..."`` (roles ``human``/``user``) or ``"Assistant: ..."``
        (any other non-system role). The text is ``""`` when no turn
        survives, so such rows can be filtered out afterwards.
    """
    lines = []
    # A null conversation is treated as empty instead of raising TypeError.
    for m in example["conversations"] or []:
        role = m.get("from", "")
        if role == "system":
            continue  # skip system prompts
        # Tolerate a missing/None value and trim stray whitespace so that no
        # bare "User: " line or trailing blanks end up in the training text.
        utterance = (m.get("value") or "").strip()
        if not utterance:
            continue
        speaker = "User" if role in ("human", "user") else "Assistant"
        lines.append(f"{speaker}: {utterance}")
    return {"text": "\n".join(lines)}

# Convert every split (train / validation / test) to a single `text` column
# and drop rows whose flattened text came out empty.
ds = DatasetDict()
for split, raw_split in raw_ds.items():
    flattened = raw_split.map(to_text, remove_columns=raw_split.column_names)
    ds[split] = flattened.filter(lambda row: row["text"] != "")

# Keep a reproducible random half of the training split: shuffle with a fixed
# seed, then take the first `half_n` rows.
train_n = ds["train"].num_rows
half_n = train_n // 2
shuffled_train = ds["train"].shuffle(seed=42)
ds["train"] = shuffled_train.select(range(half_n))
print(f"Training on {half_n} / {train_n} examples (~50%).")


# --------------------------------
# 5) Tokenize (lab-style settings)
# --------------------------------
def tokenize_dataset(example):
    """Encode the ``text`` entry of ``example`` with the global tokenizer.

    Sequences longer than 128 tokens are truncated and shorter ones are
    padded up to 128, so every encoded row has the same length.
    """
    encode_kwargs = {
        "truncation": True,
        "padding": "max_length",
        "max_length": 128,
    }
    return tokenizer(example["text"], **encode_kwargs)

# Tokenize every split in batches; the encoded columns are added next to `text`.
tokenized = ds.map(tokenize_dataset, batched=True)

# -----------------------------------------------
# 6) Trainer setup (same structure as your lab)
# -----------------------------------------------
training_args = TrainingArguments(
    output_dir="./dialog-gpt2-finetuned",
    per_device_train_batch_size=8,   # lower to 4/2 if OOM
    num_train_epochs=3,
    save_steps=250,                  # checkpoint every 250 optimizer steps
    logging_steps=50,                # training loss is reported every 50 steps
    learning_rate=5e-5,
    warmup_steps=50,
    weight_decay=0.01,
    report_to="none",                # no external experiment tracker
)

# mlm=False selects causal-LM collation (labels derived from input_ids).
# NOTE(review): the collator ignores pad positions in the loss, and pad == EOS
# here, so EOS is never a training target — confirm this is acceptable.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized["train"],
    # `tokenizer=` is deprecated on Trainer (it triggers a warning and is
    # slated for removal); `processing_class=` is the supported replacement.
    processing_class=tokenizer,
    data_collator=data_collator,
)

# -----------------------------
# 7) Run the fine-tuning loop
# -----------------------------
trainer.train()

# -------------------------------------------------
# 8) Sample again from the now fine-tuned weights
# -------------------------------------------------
print("\nAfter fine-tuning:\n")
for _prompt in (
    "User: Hello! How are you?\nAssistant:",
    "User: What should I cook tonight?\nAssistant:",
):
    sample(_prompt)


Before fine-tuning:

User: Hi there!
Assistant: Ok, so you've come in here for lunch. And to see what's out of the way it just really does suck that I haven't done an episode yet and not enough news is being relayed at all!! You guys are a buncha busy right now lol!!!!! Now we can talk about our upcoming episodes because today was my last day with this show (or rather how long ago). If your
User: What's your favorite programming language?
Assistant: Hinting at the lack of context, I think it has a really nice design and there is an amazing community about using this.


Map:   0%|          | 0/11118 [00:00<?, ? examples/s]

Filter:   0%|          | 0/11118 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Filter:   0%|          | 0/1000 [00:00<?, ? examples/s]

Training on 5559 / 11118 examples (~50%).


Map:   0%|          | 0/5559 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

Map:   0%|          | 0/1000 [00:00<?, ? examples/s]

  trainer = Trainer(
The tokenizer has new PAD/BOS/EOS tokens that differ from the model config and generation config. The model config and generation config were aligned accordingly, being updated with the tokenizer's values. Updated tokens: {'pad_token_id': 50256}.


Step,Training Loss
50,2.8995
100,2.4949
150,2.4101
200,2.4296
250,2.3348
300,2.3176
350,2.3189
400,2.333
450,2.2954
500,2.2883



After fine-tuning:

User: Hello! How are you?
Assistant: Oh, really? What’s wrong with me today? I don't sleep well.
User; Have you got any symptoms tonight?
Assassination is very painful, it hurts my eyes and sometimes even gives in to cramps. It can be dangerous for the person sleeping as well. But usually no matter what happens we just try not take it too seriously. So why do people
User: What should I cook tonight?
Assistant: If you don’t mind. It can be a bit noisy if we want to make some fried chicken.
User, it is no problem in that place. Do not eat yourself for dinner this night, just clean up after yourselves.
Objection made. This time I think it will pay off nice here.
Assistant : How shall i turn on the grill now? I never use
