# MSAI 495 | Text Generation | Conversation Primer

### Business Goal / Case Statement

Accelerate and improve conversation-starter generation for social/messaging apps through AI.

### Assignment Context

**Relevant Industry and/or Business Function:** Social/Messaging apps

**Description:**

### The Data

**Dataset name:** <code>[conversation-starters](https://huggingface.co/datasets/Langame/conversation-starters)</code><br>

**Data characteristics**

* 17,470 diverse prompts with topic tags

* Wide range of conversation depths (casual to profound)

* Multiple topic categories for targeted generation

* Varying prompt lengths and complexity

### Model Architecture(s)

* GPT-2 (base `gpt2` checkpoint in this notebook; `gpt2-medium` is a drop-in option): For autoregressive generation of conversation starters

### AI/ML Task(s)

Fine-tune a pre-trained transformer model (GPT-2)

## Step 1: Environment Setup and Installation

In [None]:
import torch
import pandas as pd
import json
from transformers import (
    GPT2LMHeadModel,
    GPT2Tokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import Dataset
import numpy as np

## Step 2: Load and Explore the Dataset

In [None]:
# Fetch the conversation-starter corpus from the Hugging Face Hub
from datasets import load_dataset

dataset = load_dataset("Langame/conversation-starters")

# Show the split layout, followed by a small slice of raw rows
print("Dataset structure:", dataset, sep="\n")
print("\nFirst few examples:", dataset['train'][:5], sep="\n")

# Report the schema and the number of rows in the train split
print("\nColumn names:", dataset['train'].column_names)
print("\nDataset size:", len(dataset['train']))

## Step 3: Data Preprocessing and Tokenization

Preprocess the conversation starters into a format suitable for GPT-2 training:

In [None]:
def format_conversation_data(examples):
    """
    Format a batch of rows into GPT-2 training strings.

    Args:
        examples: Batch mapping with parallel 'topics' and 'prompt' columns
            (the shape `Dataset.map(..., batched=True)` passes in). Each
            'topics' entry may be a list of tags, a single tag string, or
            empty/None.

    Returns:
        dict: {"text": [...]} with one string per row, laid out as
        "<|startoftext|>Topic: <topic>\\nConversation Starter: <prompt><|endoftext|>".
        The first topic entry is used; rows without topics get "general".
    """
    formatted_texts = []

    for topics, prompt in zip(examples['topics'], examples['prompt']):
        if isinstance(topics, str):
            # A bare string is already one topic; indexing it with [0]
            # would keep only its first character.
            topic_str = topics or "general"
        elif topics:
            first_topic = topics[0]
            # A nested list is flattened into a comma-separated label.
            if isinstance(first_topic, list):
                topic_str = ", ".join(first_topic)
            else:
                topic_str = str(first_topic)
        else:
            topic_str = "general"

        # Same layout the generation prompt uses, terminated by GPT-2's EOS text
        formatted_texts.append(
            f"<|startoftext|>Topic: {topic_str}\nConversation Starter: {prompt}<|endoftext|>"
        )

    return {"text": formatted_texts}

# Build the training "text" column, then drop the raw columns it replaces
print("Re-formatting dataset with improved structure...")
formatted_dataset = (
    dataset['train']
    .map(format_conversation_data, batched=True)
    .remove_columns(['topics', 'prompt'])
)

# Show one formatted row as a sanity check
print("Sample formatted text:", formatted_dataset[0]['text'], sep="\n")

## Step 4: Model and Tokenizer Setup

Load the GPT-2 model and tokenizer, and configure them for fine-tuning:

In [None]:
# Load GPT-2 model and tokenizer
model_name = "gpt2"  # You can also use "gpt2-medium" for better performance
tokenizer = GPT2Tokenizer.from_pretrained(model_name)
model = GPT2LMHeadModel.from_pretrained(model_name)

# GPT-2 ships without a padding token. Register a DEDICATED one instead of
# aliasing it to EOS: DataCollatorForLanguageModeling(mlm=False) sets the label
# of every pad_token_id position to -100, so with pad == eos the real
# end-of-text token would be masked too and the model would never learn to stop.
# The prompt-format markers are registered in the same call, BEFORE resizing.
special_tokens = {
    "pad_token": "<|pad|>",
    "additional_special_tokens": ["<|startoftext|>", "<|topic|>", "<|starter|>"],
}
num_added_tokens = tokenizer.add_special_tokens(special_tokens)

# Resize model embeddings to accommodate the new tokens, then point the model
# config at the new padding id
model.resize_token_embeddings(len(tokenizer))
model.config.pad_token_id = tokenizer.pad_token_id

print(f"Model loaded: {model_name}")
print(f"Added {num_added_tokens} special tokens")
print(f"New vocabulary size: {len(tokenizer)}")
print(f"Model parameters: {model.num_parameters():,}")

## Step 5: Tokenization Function

Create a function to tokenize formatted text data for training:

In [None]:
def tokenize_function(examples):
    """
    Encode a batch of formatted strings for causal-LM training.

    Uses the module-level `tokenizer`. Every sequence in examples["text"] is
    truncated/padded to exactly 512 tokens and returned as plain Python lists.
    A "labels" entry holding a per-row copy of "input_ids" is added.
    """
    encoded = tokenizer(
        examples["text"],
        truncation=True,
        padding="max_length",  # fixed-length rows, padded up to max_length
        max_length=512,  # tune to the data and available GPU memory
        return_tensors=None,
    )

    # Independent list copies so labels never alias the input_ids rows
    encoded["labels"] = list(map(list, encoded["input_ids"]))
    return encoded

# Tokenize every formatted example; the raw "text" column is dropped afterwards
print("Re-tokenizing dataset with proper padding...")
tokenized_dataset = formatted_dataset.map(
    tokenize_function, batched=True, remove_columns=["text"]
)

# 90/10 train/eval split with a fixed seed for reproducibility
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

print("Re-tokenization complete!")
print(f"Sample lengths should now be consistent: {len(tokenized_dataset[0]['input_ids'])}")

## Step 6: Data Splitting and Data Collator Setup

Split dataset into training and validation sets, and set up the data collator:

In [None]:
# Hold out 10% of the tokenized rows for validation (fixed seed)
train_test_split = tokenized_dataset.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = train_test_split['train'], train_test_split['test']

print("Training samples:", len(train_dataset))
print("Validation samples:", len(eval_dataset))

# Collator for causal language modeling: mlm=False turns off masked-LM
# objectives, and batches are returned as PyTorch tensors
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, mlm=False, return_tensors="pt"
)

## Step 7: Training Arguments Configuration

In [None]:
import os
os.environ["WANDB_DISABLED"] = "true"

# Define training arguments
training_args = TrainingArguments(
    output_dir="./gpt2-conversation-starters",
    overwrite_output_dir=True,
    num_train_epochs=1,  # Reduced from 3 (overridden by max_steps below)
    per_device_train_batch_size=2,  # Reduced from 4
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=4,  # Increased to maintain effective batch size
    warmup_steps=100,  # Reduced from 500
    learning_rate=5e-5,
    logging_steps=50,   # More frequent logging
    eval_strategy="steps",
    eval_steps=200,     # More frequent evaluation
    save_steps=500,     # More frequent saving
    save_total_limit=2,
    prediction_loss_only=True,
    remove_unused_columns=False,
    dataloader_pin_memory=False,
    # Mixed precision needs a CUDA device; fall back to fp32 on CPU-only
    # machines instead of failing at construction time.
    fp16=torch.cuda.is_available(),
    # The string "none" is the documented way to disable all reporting
    # integrations; passing None does not reliably do so.
    report_to="none",
    max_steps=1000,  # A positive value takes precedence over num_train_epochs
)

effective_batch_size = (
    training_args.per_device_train_batch_size
    * training_args.gradient_accumulation_steps
)
steps_per_epoch = len(train_dataset) // effective_batch_size
# Report the step count that will actually run: max_steps wins when positive.
if training_args.max_steps > 0:
    total_training_steps = training_args.max_steps
else:
    total_training_steps = steps_per_epoch * training_args.num_train_epochs

print("Training arguments configured!")
print(f"Effective batch size: {effective_batch_size}")
print(f"Total training steps: {total_training_steps}")

## Step 8: Initialize the Trainer

Set up the Trainer object that will handle the fine-tuning process:

In [None]:
# Wire the model, data splits, collator and tokenizer into a single Trainer
trainer = Trainer(
    args=training_args,
    model=model,
    tokenizer=tokenizer,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("Trainer initialized successfully!")
print("Training dataset size:", len(trainer.train_dataset))
print("Evaluation dataset size:", len(trainer.eval_dataset))

## Step 9: Start Fine-Tuning

In [None]:
# Report whether CUDA is usable and, if so, the name and memory of device 0
print("CUDA available:", torch.cuda.is_available())
if torch.cuda.is_available():
    print("GPU:", torch.cuda.get_device_name(0))
    print("Memory: {:.1f} GB".format(torch.cuda.get_device_properties(0).total_memory / 1e9))

Begin the actual training process:

In [None]:
# Announce the run, since it can take a while
print(
    "Starting fine-tuning...",
    "This may take 30-60 minutes depending on your GPU...",
    sep="\n",
)

# Run the fine-tuning loop and keep the returned training summary
training_result = trainer.train()

print("Training completed!")
print("Final training loss: {:.4f}".format(training_result.training_loss))

## Step 10: Save and Test Fine-Tuned Model

In [None]:
# Persist the model weights and the tokenizer side by side in one directory
print("Saving the fine-tuned model...")
for save in (trainer.save_model, tokenizer.save_pretrained):
    save("./gpt2-conversation-starters-final")
print("Model saved successfully!")

# Test your fine-tuned model
def generate_conversation_starter(topic, max_length=100):
    """
    Generate a conversation starter for a given topic.

    Uses the module-level `model` and `tokenizer`.

    Args:
        topic: Topic text inserted into the training-format prompt.
        max_length: Upper bound on the total sequence length in tokens
            (prompt + generated). At most 30 new tokens are sampled, so the
            default of 100 only takes effect for unusually long prompts.

    Returns:
        str: The generated starter, cut after its first "?" (or, when there is
        no "?", after its first "."). Returns
        "Could not generate conversation starter" when nothing usable was
        produced.
    """
    fallback = "Could not generate conversation starter"
    marker = "Conversation Starter:"

    # Use the exact training format
    prompt = f"<|startoftext|>Topic: {topic}\n{marker}"

    # Tokenize; the prompt is a single unpadded sequence, so the mask is all ones
    inputs = tokenizer.encode(prompt, return_tensors="pt").to(model.device)
    attention_mask = torch.ones_like(inputs)
    prompt_len = inputs.shape[1]

    # Honor the caller's max_length while keeping the 30-new-token budget,
    # and always leave room for at least one generated token.
    total_len = max(prompt_len + 1, min(prompt_len + 30, max_length))

    # Make sure dropout is disabled; the model may still be in training mode.
    model.eval()
    with torch.no_grad():
        outputs = model.generate(
            inputs,
            attention_mask=attention_mask,
            max_length=total_len,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id,
        )

    # Decode prompt + continuation, dropping special tokens
    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    if marker not in generated_text:
        return fallback

    # Keep the text after the last marker occurrence
    starter = generated_text.split(marker)[-1].strip()
    if not starter:
        # The decoded text always contains the prompt's marker, so an empty
        # continuation has to be caught here rather than by the check above.
        return fallback

    # Clean up - stop at first sentence, preferring a question
    if "?" in starter:
        starter = starter.split("?")[0] + "?"
    elif "." in starter:
        starter = starter.split(".")[0] + "."
    return starter

# Smoke-test generation across a handful of unrelated topics
test_topics = ["relationships", "science", "video games", "philosophy"]

print("\n=== Testing Generation of Conversation Primers ===")
for topic in test_topics:
    starter = generate_conversation_starter(topic)
    print(f"\nTopic: {topic}\nGenerated: {starter}")