In [29]:
# -----------------------------
# 1. Import libraries
# -----------------------------

import numpy as np
import pandas as pd
import torch
from transformers import pipeline, AutoTokenizer, AutoModelForCausalLM, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import load_dataset, Dataset
from sklearn.model_selection import KFold 
import math

import os
for dirname, _, filenames in os.walk('./data'):
    for filename in filenames:
        print(os.path.join(dirname, filename))

./data/follow_up_question_dataset.csv


In [30]:
# -----------------------------
# 2. Use GPU if available
# -----------------------------

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")

Using device: cpu


In [31]:
# ----------------------------------
# 3. Load and preprocess the dataset
# ----------------------------------

file_path = './data/follow_up_question_dataset.csv'
dataset = load_dataset('csv', data_files=file_path)

# Combine the Statement and Follow-Up Question into a single prompt, which is the input required for the distilgpt2 model
def build_prompt(example):
    prompt = f"Statement: {example['statement']}\nFollow-Up: {example['follow_up_question']}"
    return {"text": prompt}

dataset = dataset.map(build_prompt) # apply the function to each example in the dataset

model_name = "distilgpt2"
tokenizer = AutoTokenizer.from_pretrained(model_name)
tokenizer.pad_token = tokenizer.eos_token


def tokenize_function(examples):
    return tokenizer(examples["text"], truncation=True, padding="max_length", max_length=128)

tokenized_dataset = dataset.map(tokenize_function, batched=True)

data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [32]:
# --------------------
# 4. Training one fold
# --------------------

def train_one_fold(train_ds, val_ds, fold_id):
    """
    Fine-tune DistilGPT-2 on one (train, validation) split.
    Returns the validation loss for this fold.
    """
    model = AutoModelForCausalLM.from_pretrained(model_name)

    args = TrainingArguments(
        output_dir = f"./fold_{fold_id}",   # distinct directory per fold
        num_train_epochs = 10,
        per_device_train_batch_size = 2,
        logging_strategy="epoch",
        eval_strategy = "epoch",
        seed = 42,
    )

    trainer = Trainer(
        model = model,
        args = args,
        train_dataset = train_ds,
        eval_dataset = val_ds,
        data_collator = data_collator,
    )

    trainer.train()
    metrics = trainer.evaluate()
    return metrics["eval_loss"]


# ----------------------------------------
# 5. Training with K-Fold Cross-Validation
# ----------------------------------------

k = 5
kf = KFold(n_splits=k, shuffle=True, random_state=42)

indices = np.arange(len(tokenized_dataset["train"]))

losses = []
for fold, (train_idx, val_idx) in enumerate(kf.split(indices)):
    train_ds = tokenized_dataset["train"].select(train_idx.tolist())
    val_ds   = tokenized_dataset["train"].select(val_idx.tolist())

    loss = train_one_fold(train_ds, val_ds, fold)
    losses.append(loss)
    print(f"Fold {fold+1}/{k}  •  val_loss = {loss:.4f}")

print("\nCross-validated loss:",
      np.mean(losses), "±", np.std(losses),
      "   (perplexity ≈", math.exp(np.mean(losses)), ")")

Epoch,Training Loss,Validation Loss
1,3.2773,2.864744
2,2.396,2.431613
3,1.9149,2.185624
4,1.537,2.022842
5,1.3584,1.951285
6,1.1212,1.910924
7,1.0131,1.88484
8,0.9458,1.872361
9,0.8651,1.868285
10,0.8354,1.86844


Fold 1/5  •  val_loss = 1.8684


Epoch,Training Loss,Validation Loss
1,3.5384,3.185403
2,3.13,2.990004
3,2.7136,2.682733
4,2.5129,2.578274
5,2.398,2.517755
6,2.1197,2.466115
7,2.1148,2.406915
8,2.0352,2.350067
9,1.9208,2.313144
10,1.8585,2.298336


Fold 2/5  •  val_loss = 2.2983


Epoch,Training Loss,Validation Loss
1,3.4526,2.984812
2,2.5494,2.50348
3,1.9287,2.193488
4,1.4972,2.017502
5,1.3187,1.935356
6,1.0473,1.887828
7,0.9405,1.886858
8,0.842,1.903714
9,0.7427,1.923998
10,0.7286,1.930981


Fold 3/5  •  val_loss = 1.9310


Epoch,Training Loss,Validation Loss
1,3.1805,2.729129
2,2.1538,2.301623
3,1.6196,2.125255
4,1.271,2.067447
5,1.1075,2.043861
6,0.901,2.059117
7,0.7861,2.113421
8,0.7361,2.132623
9,0.6839,2.145864
10,0.65,2.147092


Fold 4/5  •  val_loss = 2.1471


Epoch,Training Loss,Validation Loss
1,3.2703,2.538452
2,2.1816,2.215488
3,1.621,2.067187
4,1.2953,2.016674
5,1.1377,1.991125
6,0.8873,2.000267
7,0.8009,2.017766
8,0.7404,2.045882
9,0.6486,2.056655
10,0.6477,2.062649


Fold 5/5  •  val_loss = 2.0626

Cross-validated loss: 2.061499524116516 ± 0.153444492375097    (perplexity ≈ 7.857743856793563 )


In [33]:
# ----------------------------------------
# 6. Final training on the entire dataset
# ----------------------------------------

# Create train and validation datasets from the original dataset and tokenize them
final_dataset = dataset["train"].train_test_split(test_size=0.2, seed=42)
final_tokenized_dataset = final_dataset.map(tokenize_function, batched=True)

# Load the model
model = AutoModelForCausalLM.from_pretrained(model_name)

# Define training arguments
final_args = TrainingArguments(
    output_dir = "./distilgpt2-followup",
    num_train_epochs = 10,
    per_device_train_batch_size = 2,
    logging_steps = 10,
    logging_strategy = "epoch",
    eval_strategy = "epoch",
    save_strategy = "epoch",
    seed = 42,
    load_best_model_at_end = True,
    metric_for_best_model = "eval_loss",
    greater_is_better = False,
    learning_rate = 2e-5,
    weight_decay = 0.01,
    max_grad_norm = 1.0,
)

trainer_final = Trainer(
    model = model,
    args = final_args,
    train_dataset = final_tokenized_dataset["train"],
    eval_dataset = final_tokenized_dataset["test"],
    data_collator = data_collator,
)

trainer_final.train()

Epoch,Training Loss,Validation Loss
1,3.4739,2.981997
2,2.8604,2.651104
3,2.4282,2.434
4,2.1818,2.294168
5,2.0151,2.205747
6,1.7953,2.147324
7,1.6775,2.109092
8,1.6202,2.086211
9,1.5231,2.072872
10,1.5016,2.067855


There were missing keys in the checkpoint model loaded: ['lm_head.weight'].


TrainOutput(global_step=40, training_loss=2.1077220320701597, metrics={'train_runtime': 15.8664, 'train_samples_per_second': 5.042, 'train_steps_per_second': 2.521, 'total_flos': 2612967505920.0, 'train_loss': 2.1077220320701597, 'epoch': 10.0})

In [34]:
# -------------------------------------
# 7. Save the final model and tokenizer
# -------------------------------------

trainer_final.save_model("./distilgpt2-followup")
tokenizer.save_pretrained("./distilgpt2-followup")

('./distilgpt2-followup/tokenizer_config.json',
 './distilgpt2-followup/special_tokens_map.json',
 './distilgpt2-followup/vocab.json',
 './distilgpt2-followup/merges.txt',
 './distilgpt2-followup/added_tokens.json',
 './distilgpt2-followup/tokenizer.json')

In [35]:
# -------------------------
# 8. Prep for user examples
# -------------------------

import re

model_dir = "./distilgpt2-followup"

tokenizer = AutoTokenizer.from_pretrained(model_dir)
model     = AutoModelForCausalLM.from_pretrained(model_dir)

generator = pipeline(
    "text-generation",
    model=trainer_final.model,
    tokenizer=tokenizer,
    max_length=60,
    min_length=20,
    early_stopping=True,
    num_beams=5,            # ← search 5 beams in parallel
    length_penalty=1.2,     # ← slightly favor longer sequences
    do_sample=True,
    top_k=50,
    top_p=0.9, 
    temperature=0.7,
)

def ask_follow_up(statement: str) -> str:
    prompt = f"Statement: {statement}\nFollow-Up:"
    out = generator(prompt, num_return_sequences=1)[0]["generated_text"]
    parts = re.split(r"Follow-Up:?\s*", out)
    return parts[-1].strip()


# -----------------------------------------
# 9. Test out the model with a few examples
# -----------------------------------------
statements = [
    "I volunteered at a local animal shelter yesterday.",
    "I went hiking in the mountains last weekend.",
    "I started reading a new book about the history of art."
]

for statement in statements:
    print(f"Statement: {statement}")
    print(f"Follow-Up: {ask_follow_up(statement)}")
    print()

Device set to use mps:0
Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


Statement: I volunteered at a local animal shelter yesterday.
Follow-Up: What inspired you to volunteer at a local animal shelter?

Statement: I went hiking in the mountains last weekend.
Follow-Up: What inspired you to hike there?

Statement: I started reading a new book about the history of art.
Follow-Up: What inspired you to start learning art?



In [36]:
# -----------------------------------------------------------------
# 10. Uncomment and run the following block to test with user input
# -----------------------------------------------------------------

"""
# Get input from user
user_input = input("Enter your statement: ")

# Generate and display a follow-up question
follow_up = ask_follow_up(user_input)
print("Suggested follow-up question:", follow_up)
"""

'\n# Get input from user\nuser_input = input("Enter your statement: ")\n\n# Generate and display a follow-up question\nfollow_up = ask_follow_up(user_input)\nprint("Suggested follow-up question:", follow_up)\n'