In [2]:
from pathlib import Path
from datetime import datetime
import json, time, random

import torch
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TextGenerationPipeline,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
)
from datasets import load_dataset, Dataset, DatasetDict

device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Running on: {device}")

Running on: cpu


In [3]:
def build_pipeline(model_name: str, **gen_kwargs):
    """
    Returns a text-generation pipeline with sensible defaults
    (can be overridden via **gen_kwargs).
    """
    tok = AutoTokenizer.from_pretrained(model_name)
    mdl = AutoModelForCausalLM.from_pretrained(model_name).to(device)
    # If the tokenizer has no pad token (GPT-2 family), set one:
    if tok.pad_token_id is None:
        tok.pad_token = tok.eos_token
    pipe = TextGenerationPipeline(model=mdl, tokenizer=tok, device=0 if device == "cuda" else -1)
    pipe_kwargs = dict(
        do_sample=True,
        max_length=64,
        temperature=1.0,
        top_k=0,
        top_p=0.95,
        num_return_sequences=1,
        pad_token_id=tok.eos_token_id,
    )
    pipe_kwargs.update(gen_kwargs)
    return pipe, pipe_kwargs

In [4]:
def pretty_print(prompt, out, meta):
    print("=" * 80)
    print(f"PROMPT: {prompt!r}")
    print(f"PARAMS: {json.dumps(meta, indent=2)}")
    print("-" * 80)
    print(out[0]["generated_text"])
    print("=" * 80 + "\n")

In [None]:
print("\n### TASK 2.2: Diverse generations (gpt2 base) ###\n")
BASE_MODEL = "gpt2"
prompts = [
    "Once upon a midnight dusk,",
    "Explain quantum computing in simple terms:",
    "The secret ingredient of success is",
    "In a world where AI governs humans, the first rule is",
    """User: What's the best way to learn Python?\nAssistant:""",
    "He opened the door and immediately knew",
    "Paris in the year 2100 will most likely",
]

param_grid = [
    dict(max_length=60, temperature=0.7, top_k=50, top_p=0.9),
    dict(max_length=50, temperature=1.0, top_k=0, top_p=0.92),
    dict(max_length=80, temperature=1.3, top_k=30, top_p=0.9),
    dict(max_length=40, temperature=0.6, top_k=40, top_p=0.85),
    dict(max_length=70, temperature=1.1, top_k=0, top_p=0.95),
]

pipe, base_defaults = build_pipeline(BASE_MODEL)  # default kwargs here

generation_log = []  # keep a record for your report

for prompt, overrides in zip(prompts, param_grid):
    gen_kwargs = {**base_defaults, **overrides}
    t0 = time.time()
    out = pipe(prompt, **gen_kwargs)
    dt = time.time() - t0
    meta = dict(model=BASE_MODEL, latency=f"{dt:.2f}s", **overrides)
    generation_log.append(dict(prompt=prompt, output=out[0]["generated_text"], **meta))
    pretty_print(prompt, out, meta)

# Save raw generations to disk for later inspection
Path("outputs").mkdir(exist_ok=True)
Path("outputs/task2_2_generations.json").write_text(json.dumps(generation_log, indent=2))




### TASK 2.2: Diverse generations (gpt2 base) ###



Device set to use cpu
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.
Both `max_new_tokens` (=256) and `max_length`(=60) seem to have been set. `max_new_tokens` will take precedence. Please refer to the documentation for more information. (https://huggingface.co/docs/transformers/main/en/main_classes/text_generation)


In [None]:
print("\n### TASK 2.3: Model size comparison ###\n")
MODELS = ["gpt2", "gpt2-medium"]  # add "gpt2-large" if resources permit

common_params = dict(max_length=72, temperature=0.85, top_k=50, top_p=0.9)

comparison_log = []

for model_name in MODELS:
    pipe, defaults = build_pipeline(model_name, **common_params)
    for prompt in prompts[:5]:  # use first 5 prompts for speed
        t0 = time.time()
        out = pipe(prompt, **defaults)
        dt = time.time() - t0
        meta = dict(model=model_name, latency=f"{dt:.2f}s", **common_params)
        comparison_log.append(dict(prompt=prompt, output=out[0]["generated_text"], **meta))
        pretty_print(prompt, out, meta)

Path("outputs/task2_3_comparison.json").write_text(json.dumps(comparison_log, indent=2))

In [None]:
print("\n### TASK 2.4: Fine-tuning ###\n")
# ---- 1. Load / prepare dataset ---------------------------------
CSV_PATH = "freecodecamp_casual_chatroom.csv"      # <- update if different
TEXT_COL = "message"

if Path(CSV_PATH).exists():
    raw_ds = load_dataset("csv", data_files=CSV_PATH)
else:
    raise FileNotFoundError(f"{CSV_PATH} not found! Place your chat log CSV in the working dir.")

# Combine multiple small rows into ~512-token chunks ----------------
tok_ft = AutoTokenizer.from_pretrained(BASE_MODEL)
tok_ft.pad_token = tok_ft.eos_token
TOK_LEN = 512

def chunk_messages(example):
    joined = " ".join(example[TEXT_COL]).strip()
    tokens = tok_ft.encode(joined)
    chunks = [tokens[i : i + TOK_LEN] for i in range(0, len(tokens), TOK_LEN)]
    return {"input_ids": chunks}

ds_tokenized = raw_ds.map(
    chunk_messages,
    batched=True,
    remove_columns=raw_ds["train"].column_names,
).remove_columns([])  # now only input_ids

def flatten(dataset):
    # push nested lists to individual rows
    input_ids = sum(dataset["input_ids"], [])
    return Dataset.from_dict({"input_ids": input_ids})

train_ds = flatten(ds_tokenized["train"])
val_ds   = train_ds.train_test_split(test_size=0.05, seed=42)["test"]


In [None]:
data_collator = DataCollatorForLanguageModeling(tok_ft, mlm=False)


In [None]:
FT_MODEL_DIR = "gpt2-finetuned-chat"
training_args = TrainingArguments(
    output_dir=FT_MODEL_DIR,
    overwrite_output_dir=True,
    num_train_epochs=1,        # ↑ increase if you have more GPU time
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    evaluation_strategy="epoch",
    logging_steps=100,
    save_total_limit=2,
    learning_rate=5e-5,
    fp16=torch.cuda.is_available(),
    report_to="none",
)

In [None]:
ft_model = AutoModelForCausalLM.from_pretrained(BASE_MODEL).to(device)


In [None]:
trainer = Trainer(
    model=ft_model,
    args=training_args,
    train_dataset=train_ds,
    eval_dataset=val_ds,
    tokenizer=tok_ft,
    data_collator=data_collator,
)

trainer.train()
trainer.save_model(FT_MODEL_DIR)
tok_ft.save_pretrained(FT_MODEL_DIR)

In [None]:
ft_pipe, ft_defaults = build_pipeline(FT_MODEL_DIR, max_length=64, temperature=0.9, top_p=0.9)
finetune_prompts = [
    "User: How do I fix a TypeError in JavaScript?\nAssistant:",
    ">>> print('Hello world')  # explain output\n",
    "Any tips for learning algorithms quickly?",
]

ft_log = []
for prompt in finetune_prompts:
    out = ft_pipe(prompt, **ft_defaults)
    ft_log.append(dict(prompt=prompt, output=out[0]["generated_text"], model="gpt2-finetuned-chat"))
    pretty_print(prompt, out, {"model": "gpt2-finetuned-chat", **ft_defaults})

Path("outputs/task2_4_finetune_generations.json").write_text(json.dumps(ft_log, indent=2))

print("\nAll tasks completed. Outputs saved to the 'outputs/' folder.")