In [14]:
# -----------------------------
# 1. Import libraries
# -----------------------------

import numpy as np
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
from sklearn.model_selection import KFold 
import math

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/follow_up_question_dataset.csv


In [15]:
# -----------------------------
# 2. Use GPU if available
# -----------------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [16]:
# ----------------------------------
# 3. Load and preprocess the dataset
# ----------------------------------

file_path = './data/follow_up_question_dataset.csv'
dataset = load_dataset('csv', data_files=file_path)

# Combine the Statement and Follow-Up Question into a single prompt, which is the input required for the distilgpt2 model
def build_prompt(example):
    prompt = f"Statement: {example['statement']}\nFollow-Up: {example['follow_up_question']}"
    return {"text": prompt}

dataset = dataset.map(build_prompt) # apply the function to each example in the dataset

model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=256)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [34]:
# --------------------
# 4. Training one fold
# --------------------

def train_one_fold(train_ds, val_ds, fold_id):
    """
    Fine-tune DistilGPT-2 on one (train, validation) split.
    Returns the validation loss for this fold.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)

    args = TrainingArguments(
        output_dir = f"./fold_{fold_id}",   # distinct directory per fold
        num_train_epochs = 10,
        per_device_train_batch_size = 2,
        logging_steps = 10,
        save_strategy = "no",               # avoids clutter
        seed = 42,
    )

    trainer = Trainer(
        model = model,
        args = args,
        train_dataset = train_ds,
        eval_dataset = val_ds,
        data_collator = data_collator,
    )

    trainer.train()
    metrics = trainer.evaluate()
    return metrics["eval_loss"]


# ----------------------------------------
# 5. Training with K-Fold Cross-Validation
# ----------------------------------------

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

indices = np.arange(len(tokenized_dataset["train"]))

losses = []
for fold, (train_idx, val_idx) in enumerate(kf.split(indices)):
    train_ds = tokenized_dataset["train"].select(train_idx.tolist())
    val_ds   = tokenized_dataset["train"].select(val_idx.tolist())

    loss = train_one_fold(train_ds, val_ds, fold)
    losses.append(loss)
    print(f"Fold {fold+1}/{k}  •  val_loss = {loss:.4f}")

print("\nCross-validated loss:",
      np.mean(losses), "±", np.std(losses),
      "   (perplexity ≈", math.exp(np.mean(losses)), ")")

Step,Training Loss
10,2.5326
20,1.3838
30,0.8713
40,0.7345


Fold 1/5  •  val_loss = 1.8252


Step,Training Loss
10,2.5554
20,1.3317
30,0.8856
40,0.7745


Fold 2/5  •  val_loss = 1.6957


Step,Training Loss
10,2.8384
20,1.9324
30,1.4486
40,1.3014


Fold 3/5  •  val_loss = 2.0847


Step,Training Loss
10,2.5866
20,1.3933
30,0.8836
40,0.7626


Fold 4/5  •  val_loss = 2.1557


Step,Training Loss
10,2.5614
20,1.3239
30,0.8339
40,0.6989


Fold 5/5  •  val_loss = 2.0697

Cross-validated loss: 1.9661964178085327 ± 0.17535332028205544    (perplexity ≈ 7.143454035193242 )


In [35]:
# ----------------------------------------
# 6. Final training on the entire dataset
# ----------------------------------------

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define training arguments
final_args = TrainingArguments(
    output_dir = "./distilgpt2-followup",
    num_train_epochs = 10,
    per_device_train_batch_size = 2,
    logging_steps = 10,
    save_strategy = "no", 
    seed = 42,
)

trainer_final = Trainer(
    model = model,
    args = final_args,
    train_dataset = tokenized_dataset["train"],
    data_collator = data_collator,
)

trainer_final.train()

Step,Training Loss
10,2.6902
20,1.5546
30,1.004
40,0.8067
50,0.6682


TrainOutput(global_step=50, training_loss=1.3447319316864013, metrics={'train_runtime': 9.5955, 'train_samples_per_second': 10.422, 'train_steps_per_second': 5.211, 'total_flos': 6532418764800.0, 'train_loss': 1.3447319316864013, 'epoch': 10.0})

In [36]:
# -------------------------------------
# 7. Save the final model and tokenizer
# -------------------------------------

trainer_final.save_model("./distilgpt2-followup")
tokenizer.save_pretrained("./distilgpt2-followup")

('./distilgpt2-followup/tokenizer_config.json',
 './distilgpt2-followup/special_tokens_map.json',
 './distilgpt2-followup/vocab.json',
 './distilgpt2-followup/merges.txt',
 './distilgpt2-followup/added_tokens.json',
 './distilgpt2-followup/tokenizer.json')

In [38]:
# -------------------------
# 8. Prep for user examples
# -------------------------

model_dir = "./distilgpt2-followup"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = AutoModelForCausalLM.from_pretrained(model_dir)

generator = pipeline(
    "text-generation",
    model=trainer_final.model,
    tokenizer=tokenizer,
    max_length=60,
    min_length=20,
    do_sample=True,
    top_k=50,
    top_p=0.9, 
    temperature=0.7,
)

def ask_follow_up(statement: str) -> str:
    prompt = f"Statement: {statement}\nFollow-Up:"
    out = generator(prompt, num_return_sequences=1)[0]["generated_text"]
    return out.split("Follow-Up:")[-1].strip()


# -----------------------------------------
# 9. Test out the model with a few examples
# -----------------------------------------

# Get input from user
user_input = input("Enter your statement: ")
# user_input = "I just watched the latest Captain America movie last weekend"

# Generate and display a follow-up question
follow_up = ask_follow_up(user_input)
print("Suggested follow-up question:", follow_up)

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Suggested follow-up question: What do you find most rewarding about watching a movie?
