# Fine-tune GPT-2 on Movie Script Data

This notebook fine-tunes a GPT-2 model on processed movie script data to generate dialogue and scene descriptions in the style of the training data.

## 1. Install and Import Required Libraries

In [None]:
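# Install required packages (uncomment the line below on first run).
# %pip installs into the environment of the running kernel, unlike !pip.
# %pip install transformers datasets torch accelerate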

In [None]:
import json
import os

import numpy as np
import torch
from datasets import Dataset
from transformers import GPT2Tokenizer, GPT2LMHeadModel, Trainer, TrainingArguments, DataCollatorForLanguageModeling

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

## 2. Load and Prepare the Processed Script Data

In [None]:
# Read the whole preprocessed script into memory as one string
script_path = 'processed_scripts/star_wars_processed.txt'
with open(script_path, mode='r', encoding='utf-8') as script_file:
    training_text = script_file.read()

# Sanity check: report the corpus size and show how the text starts
print(f"Loaded script with {len(training_text)} characters")
print(f"\nFirst 500 characters:")
print(training_text[:500])

## 3. Load GPT-2 Model and Tokenizer

In [None]:
# Choose the pretrained checkpoint to start from.
# Available sizes: "gpt2" (124M), "gpt2-medium" (355M), "gpt2-large" (774M), "gpt2-xl" (1.5B)
model_name = "gpt2-medium"  # Using larger model for better quality

model = GPT2LMHeadModel.from_pretrained(model_name)
tokenizer = GPT2Tokenizer.from_pretrained(model_name)

# GPT-2 has no padding token by default, so reuse the end-of-sequence token
# for padding on both the tokenizer and the model config.
tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = tokenizer.eos_token_id

parameter_count = model.num_parameters()
print(f"Model loaded: {model_name}")
print(f"Model parameters: {parameter_count:,}")

## 4. Tokenize and Prepare Dataset

In [None]:
# Training examples are 512 tokens long: half of GPT-2's 1024-token context
# window, chosen to keep each example cheaper to process.
max_length = 512

# Encode the full script in a single pass, with truncation disabled
tokenized = tokenizer(training_text, truncation=False, return_tensors='pt')
input_ids = tokenized['input_ids'][0]  # row 0 of the returned batch

token_total = len(input_ids)
print(f"Total tokens: {token_total}")

# Split into chunks
def create_chunks(input_ids, chunk_size, stride=None):
    """Cut a token sequence into fixed-size, overlapping windows.

    Args:
        input_ids: 1-D sequence of token ids. Anything that supports ``len()``
            and slicing works (a tensor, an array or a plain list).
        chunk_size: Number of tokens in every returned chunk; must be positive.
        stride: Distance between the starts of consecutive chunks. Defaults to
            ``chunk_size // 2`` (50% overlap), but never less than 1.

    Returns:
        A list of chunks, each a plain Python list of exactly ``chunk_size``
        token ids. Trailing tokens that do not fill a whole chunk are dropped,
        and the list is empty when the input is shorter than ``chunk_size``.

    Raises:
        ValueError: If ``chunk_size`` or ``stride`` is not positive.
    """
    if chunk_size <= 0:
        raise ValueError(f"chunk_size must be positive, got {chunk_size}")
    if stride is None:
        # chunk_size // 2 is 0 for chunk_size == 1, which range() rejects
        stride = max(1, chunk_size // 2)
    elif stride <= 0:
        raise ValueError(f"stride must be positive, got {stride}")

    # The last valid window starts at len - chunk_size, so the range must end
    # one past it; ending at len - chunk_size dropped the final full window.
    last_start = len(input_ids) - chunk_size
    chunks = []
    for start in range(0, last_start + 1, stride):
        window = input_ids[start:start + chunk_size]
        # Tensors/arrays expose tolist(); plain sequences are copied with list()
        chunks.append(window.tolist() if hasattr(window, "tolist") else list(window))
    return chunks

# Hold out the tail of the token stream for validation *before* chunking.
# Neighbouring chunks share half of their tokens, so randomly splitting the
# chunks would put near-duplicate text in both splits and make the validation
# loss (which also drives best-checkpoint selection) look better than it is.
eval_fraction = 0.1
split_index = int(len(input_ids) * (1 - eval_fraction))

train_chunks = create_chunks(input_ids[:split_index], max_length)
eval_chunks = create_chunks(input_ids[split_index:], max_length)
if not train_chunks or not eval_chunks:
    raise ValueError(
        f"Not enough tokens ({len(input_ids)}) to build both splits with "
        f"max_length={max_length}; lower max_length or add more text."
    )

chunks = train_chunks + eval_chunks
print(f"Created {len(chunks)} training chunks")

# Create one dataset per split
train_dataset = Dataset.from_dict({'input_ids': train_chunks})
eval_dataset = Dataset.from_dict({'input_ids': eval_chunks})

# Kept so code that indexes dataset['train'] / dataset['test'] still works
dataset = {'train': train_dataset, 'test': eval_dataset}

print(f"Training samples: {len(train_dataset)}")
print(f"Validation samples: {len(eval_dataset)}")

## 5. Configure Training Arguments

In [None]:
# Set up training arguments - optimized for gpt2-medium with limited memory
training_args = TrainingArguments(
    output_dir="./gpt2-movie-script",
    overwrite_output_dir=True,
    num_train_epochs=8,  # More epochs for better learning
    per_device_train_batch_size=2,  # Smaller batch for gpt2-medium memory requirements
    per_device_eval_batch_size=2,
    eval_strategy="steps",
    eval_steps=100,  # More frequent evaluation
    save_steps=100,  # Must match eval_steps when using load_best_model_at_end
    warmup_steps=100,  # Gradual warmup
    learning_rate=5e-5,  # Peak learning rate, reached after warmup
    weight_decay=0.01,
    logging_dir='./logs',
    logging_steps=25,  # More frequent logging
    save_total_limit=2,  # Keep fewer checkpoints to save space
    fp16=torch.cuda.is_available(),  # Mixed precision only when a GPU is present
    gradient_accumulation_steps=4,  # Effective batch size = 2 * 4 = 8
    report_to="none",
    load_best_model_at_end=True,  # Load best model after training
    metric_for_best_model="loss",
    lr_scheduler_type="cosine",  # Cosine learning rate schedule
)

effective_batch_size = (
    training_args.per_device_train_batch_size * training_args.gradient_accumulation_steps
)
# Rough estimate only: assumes one device and ignores the partial final batch
approx_total_steps = len(train_dataset) // effective_batch_size * training_args.num_train_epochs

print("Training configuration:")
print(f"  Epochs: {training_args.num_train_epochs}")
print(f"  Effective batch size: {effective_batch_size}")
print(f"  FP16: {training_args.fp16}")
print(f"  Learning rate: {training_args.learning_rate}")
print(f"  LR scheduler: {training_args.lr_scheduler_type}")

# The original line fused two print() calls together (a SyntaxError); the
# second one repeated the learning-rate line above and has been dropped.
print(f"  Total training steps: ~{approx_total_steps}")

## 6. Create Data Collator and Trainer

In [None]:
# Collator for causal language modeling: mlm=False turns off masked-LM
# behaviour, which GPT-2 does not use.
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

# Bundle the model, hyperparameters, both data splits and the collator
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
)

print("Trainer initialized and ready to train!")

## 7. Train the Model

In [None]:
# Start training
print("Starting training...")
trainer.train()

print("\nTraining complete!")
# trainer.train() only writes periodic checkpoints under output_dir; the final
# model is saved explicitly in the next section. Reading the path from
# training_args keeps this message in sync with the configuration.
print(f"Checkpoints written to: {training_args.output_dir}")

## 8. Save the Fine-tuned Model

In [None]:
# Write the model weights and the tokenizer files into the same directory
final_model_dir = "./gpt2-movie-script-final"
for artifact in (model, tokenizer):
    artifact.save_pretrained(final_model_dir)

print("Model and tokenizer saved to: ./gpt2-movie-script-final")

## 9. Test the Fine-tuned Model

### Verify the Fine-tuned Model

Check that we're using the fine-tuned model and not the base model.

In [None]:
# Verify we're using the fine-tuned model

# Check 1: Model directory exists
model_dir = "./gpt2-movie-script-final"
if not os.path.isdir(model_dir):
    # Fail here with a clear message: from_pretrained(model_dir) below cannot
    # succeed without the directory, so there is no base-model fallback.
    raise FileNotFoundError(
        f"Fine-tuned model directory not found: {model_dir}. "
        "Run the training and save cells first."
    )

print(f"✓ Fine-tuned model directory exists: {model_dir}")

# Check what files are in the directory
files = os.listdir(model_dir)
print(f"  Files in directory: {files}")

# Check 2: Load and compare model
# Load the fine-tuned model
finetuned_model = GPT2LMHeadModel.from_pretrained(model_dir)
finetuned_tokenizer = GPT2Tokenizer.from_pretrained(model_dir)

# Load the checkpoint the fine-tune actually started from (model_name).
# Comparing against a different GPT-2 size would always report "different"
# and prove nothing about whether training changed the weights.
base_model = GPT2LMHeadModel.from_pretrained(model_name)

# Check 3: Compare weights to confirm they're different
finetuned_attn = finetuned_model.transformer.h[0].attn.c_attn.weight.detach()
base_attn = base_model.transformer.h[0].attn.c_attn.weight.detach()
finetuned_weight = finetuned_attn[0, :5]  # small slice, for display only
base_weight = base_attn[0, :5]

print("\n✓ Model comparison:")
print(f"  Fine-tuned model first layer weights: {finetuned_weight}")
print(f"  Base GPT-2 first layer weights: {base_weight}")
# Decide on the whole matrix rather than five values
print(f"  Weights are different: {not torch.equal(finetuned_attn, base_attn)}")

# Check 4: Quick generation test to see if it uses script-like format
test_prompt = "[ACTION] A spaceship flies through space.\n[CHARACTER:HAN]\n[DIALOGUE]"
inputs = finetuned_tokenizer(test_prompt, return_tensors='pt')
output = finetuned_model.generate(
    inputs['input_ids'],
    attention_mask=inputs['attention_mask'],  # explicit mask: pad and eos share an id
    max_length=50,
    temperature=0.7,
    do_sample=True,
    pad_token_id=finetuned_tokenizer.eos_token_id
)
generated = finetuned_tokenizer.decode(output[0], skip_special_tokens=True)

print(f"\n✓ Quick generation test with fine-tuned model:")
print(f"  Input: {test_prompt}")
print(f"  Output: {generated}")

# Update the model variable to use fine-tuned model
model = finetuned_model
tokenizer = finetuned_tokenizer
print("\n✓ Model and tokenizer variables updated to use fine-tuned version")

In [None]:
# Function to generate dialogue with context - improved quality settings
def generate_dialogue(context, character_name, max_length=200, temperature=0.7, num_return_sequences=1,
                      max_new_tokens=30, min_new_tokens=8):
    """
    Generate dialogue for a character given action context.
    The prompt ends with [DIALOGUE] to force the model to generate only dialogue.

    Uses the notebook-level ``model``, ``tokenizer`` and ``device``.

    Args:
        context: Scene text placed before the character tag.
        character_name: Speaker name; upper-cased inside the [CHARACTER:...] tag.
        max_length: Kept for backward compatibility and not used. Output length
            is controlled by ``max_new_tokens`` / ``min_new_tokens``.
        temperature: Sampling temperature passed to ``model.generate``.
        num_return_sequences: Number of samples to draw. Only the first sample
            is decoded and returned.
        max_new_tokens: Upper bound on tokens generated after the prompt.
        min_new_tokens: Lower bound on tokens generated after the prompt.

    Returns:
        The text after the last [DIALOGUE] tag, cut at the next ``[`` and
        stripped. If the decoded text has no [DIALOGUE] tag, the full decoded
        text is returned.
    """
    # Build prompt with context + character + dialogue tag
    prompt = f"{context}\n[CHARACTER:{character_name.upper()}]\n[DIALOGUE]"
    inputs = tokenizer(prompt, return_tensors='pt').to(device)
    prompt_length = inputs['input_ids'].shape[1]

    outputs = model.generate(
        inputs['input_ids'],
        # Pass the mask explicitly: pad and eos share an id, so it cannot be
        # inferred reliably from the input ids.
        attention_mask=inputs['attention_mask'],
        max_length=prompt_length + max_new_tokens,
        min_length=prompt_length + min_new_tokens,
        temperature=temperature,  # previously hard-coded, ignoring the argument
        num_return_sequences=num_return_sequences,
        pad_token_id=tokenizer.eos_token_id,
        do_sample=True,
        top_k=40,  # More restrictive vocabulary
        top_p=0.9,  # More focused sampling
        repetition_penalty=1.5,  # Much stronger repetition penalty
        no_repeat_ngram_size=3,  # Prevent trigram repetition
        # length_penalty / early_stopping were removed: they are beam-search
        # options and have no effect when sampling without beams.
    )

    generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Extract just the dialogue part (after [DIALOGUE])
    if '[DIALOGUE]' in generated_text:
        dialogue = generated_text.split('[DIALOGUE]')[-1].strip()
        # Remove any tags that might have leaked through
        dialogue = dialogue.split('[')[0].strip()
        return dialogue
    return generated_text

# Move the model onto the device selected earlier (generate_dialogue sends its
# tokenized prompt to the same device)
model = model.to(device)

# In-domain prompts: Star Wars scenes, each starting with an [ACTION] tag and
# written from Luke's perspective. Each string is passed as `context` to
# generate_dialogue.
starwars_scenarios = [
    """[ACTION] Luke Skywalker sits in the co-pilot seat of the Millennium Falcon, gripping the edges of his chair as the ship lurches violently to one side. Another blast from the Imperial Star Destroyer's turbolasers strikes the rear deflector shields. Sparks fly from the control panels in front of him. The warning klaxons blare throughout the cockpit, making his ears ring. Beside him, Chewbacca roars in alarm as more proximity alerts light up the console. Han banks the ship hard to port, and Luke feels his stomach drop as they narrowly avoid another volley of green laser fire streaking past the viewport.""",
    
    """[ACTION] The detention block is chaos. Stormtroopers lie unconscious on the floor around Luke Skywalker's feet. He stands in his stolen stormtrooper armor, helmet removed, sweat dripping down his face. Han Solo stands beside him at the entrance to cell 2187. The door hisses open and Luke sees a small figure in white robes sitting calmly on the detention bench. Princess Leia Organa looks up at him with a mixture of confusion and irritation. Luke's breath catches - she's even more beautiful than her hologram. She rises slowly, her sharp eyes examining him with obvious skepticism.""",
    
    """[ACTION] The vast desert of Tatooine stretches endlessly in every direction around Luke Skywalker. The twin suns of Tatoo I and Tatoo II beat down mercilessly on him, the heat oppressive even through his moisture farmer's tunic. He stands alone on a rocky ridge, his tunic flapping in the hot wind that never seems to stop. In his hands he grips a pair of macrobinoculars, pointed at the sky where he saw what looked like a space battle earlier. His aunt and uncle's moisture farm is barely visible in the distance behind him. Luke lowers the binoculars slowly, his young eyes full of longing and dreams of adventure beyond this desert prison he's called home his entire life.""",
    
    """[ACTION] Luke Skywalker stands in the cargo hold of the Millennium Falcon, facing the ancient Jedi Master Ben Kenobi. A training remote floats nearby, circling him slowly. Luke wears a blast helmet with the visor down, blocking his vision entirely. In his hands he holds an ignited lightsaber that hums with bright blue energy. The elegant weapon that Ben gave him feels both foreign and natural in his grip - lighter than he expected but balanced perfectly. He can hear the old man's footsteps as Ben circles him slowly, his weathered voice offering quiet guidance. This is Luke's first real lesson in the ways of the Force, and his heart pounds with nervous excitement.""",
]

# One speaker per scenario above, paired by position when the scenarios are run
starwars_characters = ["LUKE", "LUKE", "LUKE", "LUKE"]

# Out-of-domain prompts: the same character placed in non-Star Wars, everyday
# settings, using the same [ACTION] prompt format
other_scenarios = [
    """[ACTION] Luke stands at the front of the line in the small coffee shop, packed with the morning rush. The espresso machine hisses and steams loudly behind the counter, making him flinch slightly. Behind him, the line of impatient customers stretches to the door. The person ahead of him just ordered an incredibly complicated drink with seven different modifications, and now the barista is looking at Luke expectantly. He realizes he has no idea what half these drinks even are. His face flushes as he fumbles with unfamiliar paper money in his pocket.""",
    
    """[ACTION] Luke sits at a cluttered desk in the dimly lit office building, the only person here at midnight. The hum of computers and the distant sound of a janitor's cart echo through the empty corridors around him. Files and photographs are spread out before him on the desk. The fluorescent lights flicker overhead, making his tired eyes ache. He's been reviewing the same evidence for hours, trying to find a crucial clue that keeps eluding him. His coffee has gone cold, untouched.""",
    
    """[ACTION] Luke grips the subway pole as heavy rain pounds against the windows of the crowded car. The train lurches to a stop between stations, lights flickering ominously. Around him, passengers groan in frustration. Luke checks his watch anxiously - he's already late for an important meeting, and now this. The intercom crackles to life with static as an announcement is about to be made. He closes his eyes and takes a breath, trying to stay calm.""",
    
    """[ACTION] Luke stands uncomfortably in the grand ballroom filled with elegantly dressed guests. Crystal chandeliers cast dancing light across the polished marble floor. A string quartet plays softly in the corner, music he doesn't recognize. He tugs at his formal wear - expensive clothes that fit perfectly but feel completely wrong on him. Across from him near the champagne fountain stands someone in equally expensive attire, their face twisted in a sneer. The tension between them is palpable. Other guests have noticed and started watching as the other person makes a cutting remark about Luke's background.""",
]

# One speaker per out-of-domain scenario, paired by position
other_characters = ["LUKE", "LUKE", "LUKE", "LUKE"]

# Both suites go through the same reporting loop; only the banner and the
# scenario/character lists differ.
scenario_suites = [
    ("=== STAR WARS SCENARIOS (In-Domain) ===\n", starwars_scenarios, starwars_characters),
    ("\n=== STAR WARS CHARACTERS IN MODERN SETTINGS (Out-of-Domain) ===\n", other_scenarios, other_characters),
]

for banner, suite_contexts, suite_characters in scenario_suites:
    print(banner)
    case_number = 0
    # zip pairs each scene with its speaker and stops at the shorter list
    for context, character in zip(suite_contexts, suite_characters):
        case_number += 1
        print(f"Test Case {case_number}")
        print(f"Context: {context[:100]}...")
        print(f"Character: {character}")
        dialogue = generate_dialogue(context, character)
        print(f"Generated Dialogue: {dialogue}\n")
        print("-" * 80 + "\n")