In [None]:
## 🤖 Full GPT-2 Fine-Tuning Program in Colab

# --- 1. SETUP AND INSTALLATION ---
# ----------------------------------
print("--- 1. Installing Libraries ---")
! pip install transformers datasets accelerate -q
# Install the latest version of accelerate and transformers for best compatibility
! pip install -U accelerate transformers -q

In [None]:
import torch
from datasets import load_dataset
from transformers import (
    AutoTokenizer, 
    GPT2LMHeadModel, 
    TrainingArguments, 
    Trainer,
    DataCollatorForLanguageModeling
)
import os

In [None]:
# Define constants
# Checkpoint identifier passed to from_pretrained() for both the tokenizer and the model.
MODEL_ID = "EhabBelllkasy01/gpt2-all-recipes"
# Dataset identifier passed to load_dataset().
DATASET_NAME = "google/Synthetic-Persona-Chat"
# Local directory that receives the fine-tuned model and tokenizer after training.
SAVE_DIR = "./gpt2-persona-chat-finetuned-from-recipes"
# Used as max_length when tokenizing: longer examples are truncated to it and,
# because padding="max_length" is requested, shorter ones are padded up to it.
BLOCK_SIZE = 128 # Max sequence length for tokenization

In [None]:
# --- 2. LOAD MODEL AND TOKENIZER ---
# ------------------------------------
print("\n--- 2. Loading Model and Tokenizer ---")
try:
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID)
    model = GPT2LMHeadModel.from_pretrained(MODEL_ID)
except Exception as e:
    # Print the short message, then re-raise so the cell fails with a full traceback.
    # exit() inside a notebook shuts the kernel down instead of reporting the error,
    # and none of the later cells can run without `tokenizer` and `model` anyway.
    print(f"Error loading model: {e}")
    raise
else:
    print(f"âœ… Loaded model: {MODEL_ID}")

# Set pad token: GPT-2 tokenizer often requires this for training
# NOTE(review): the collator built later with mlm=False is documented to ignore
# padding tokens in the labels. With pad == eos that presumably also hides the real
# EOS separators from the loss -- confirm, and consider a dedicated pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

In [None]:
# --- 3. LOAD AND PREPARE DATASET ---
# ------------------------------------
print("\n--- 3. Loading and Preparing Dataset ---")
# Loads every split of DATASET_NAME; only "train" is used in the rest of the notebook.
dataset = load_dataset(DATASET_NAME)

# Use the training split
train_dataset = dataset["train"]

def _split_entries(value):
    """Return a persona/conversation field as a list of strings.

    A single string is split into its non-empty, stripped lines; any other
    iterable is taken item by item; None yields an empty list.
    """
    if value is None:
        return []
    if isinstance(value, str):
        # Joining a bare string would iterate over its characters, so break it
        # into lines first.
        return [line.strip() for line in value.splitlines() if line.strip()]
    return [str(item) for item in value]


def format_conversation(example):
    """
    Formats the conversation and personas into a single sequence of text.
    Format: P1: [Persona 1] P2: [Persona 2] <|startofchat|> [Turn 1] <|eos|> [Turn 2] <|eos|>

    Args:
        example: Mapping with the keys "user 1 personas", "user 2 personas" and
            "Best Generated Conversation". Each value may be a list of strings or
            one newline-separated string.
            NOTE(review): the Hub dataset appears to store these fields as
            newline-separated strings -- confirm against the dataset card.

    Returns:
        A dict with a single "text" key holding the formatted sequence.
    """
    # Combine personas
    persona_1 = " ".join(_split_entries(example["user 1 personas"]))
    persona_2 = " ".join(_split_entries(example["user 2 personas"]))
    personas = f"P1: {persona_1} P2: {persona_2}"

    # Concatenate the conversation turns, using the EOS token as a separator
    turns = _split_entries(example["Best Generated Conversation"])
    conversation = tokenizer.eos_token.join(turns)

    # Combine everything. The final EOS token is crucial for training the model
    # to understand where a conversation sequence ends.
    full_text = f"{personas} <|startofchat|> {conversation} {tokenizer.eos_token}"
    return {"text": full_text}

In [None]:
# Map the formatting function to the dataset
# The map is not batched, so format_conversation receives one example at a time.
# All original columns are removed, leaving only the "text" column it returns.
processed_dataset = train_dataset.map(
    format_conversation, 
    remove_columns=train_dataset.column_names
)
print(f"âœ… Dataset examples formatted. Total examples: {len(processed_dataset)}")


def tokenize_function(examples):
    """Encode the "text" entries of `examples` with the notebook's tokenizer.

    Truncation is enabled with max_length=BLOCK_SIZE and padding="max_length",
    so every encoded sequence comes back with the same length.

    Args:
        examples: Mapping whose "text" entry is handed to the tokenizer.

    Returns:
        Whatever the tokenizer call returns for that input.
    """
    encode_options = {
        "truncation": True,
        "max_length": BLOCK_SIZE,
        # Fixed-length padding keeps every row the same size for batching.
        "padding": "max_length",
    }
    batch_texts = examples["text"]
    return tokenizer(batch_texts, **encode_options)

In [None]:
# Tokenize the processed dataset
# batched=True hands tokenize_function a batch of examples per call; the raw "text"
# column is dropped afterwards so only the tokenizer's outputs remain.
tokenized_dataset = processed_dataset.map(
    tokenize_function, 
    batched=True, 
    num_proc=os.cpu_count(), # Use all available cores for fast tokenization
    remove_columns=["text"]
)

# Data collator for Causal Language Modeling (CLM)
# With mlm=False the collator is documented to use the inputs themselves as labels,
# with padding tokens ignored in the loss.
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer, 
    mlm=False # We use CLM for GPT-2, not Masked LM
)

In [None]:
# --- 4. FINE-TUNING THE MODEL ---
# ---------------------------------
print("\n--- 4. Starting Fine-Tuning ---")

# Define Training Arguments
# Checkpoints go to ./training_output (separate from SAVE_DIR, which receives the
# final model later). No evaluation dataset is configured, so only training runs.
training_args = TrainingArguments(
    output_dir="./training_output",
    overwrite_output_dir=True,
    num_train_epochs=3, # Recommended starting point
    per_device_train_batch_size=4, # Adjust based on GPU memory (4-8 is common for Colab)
    gradient_accumulation_steps=4, # Effectively increases batch size to 16 (4*4)
    learning_rate=5e-5,
    save_strategy="epoch", # Save checkpoint at the end of each epoch
    logging_steps=500,
    report_to="none", # Disable reporting to external services
    fp16=torch.cuda.is_available(), # Use mixed precision if a GPU is available for faster training
)

# Initialize the Trainer
# Wires together the loaded model, the arguments above, the CLM collator and the
# tokenized training split.
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_dataset,
)

In [None]:
# Start training! This will take time depending on the GPU.
# Runs the training loop configured by training_args; the return value is discarded.
trainer.train()

print("\n--- Fine-Tuning Complete! ---")

In [None]:
# --- 5. SAVE AND TEST THE NEW MODEL ---
# ---------------------------------------
print("\n--- 5. Saving and Testing the New Model ---")

# Create the save directory if it doesn't exist
# exist_ok=True makes re-running this cell safe when the directory is already there.
os.makedirs(SAVE_DIR, exist_ok=True)

# Save the model and tokenizer to the new local directory
# Both are written to the same folder so it can later be passed to from_pretrained().
trainer.save_model(SAVE_DIR)
tokenizer.save_pretrained(SAVE_DIR) # Save the tokenizer too, especially its special tokens

print(f"âœ… New fine-tuned model and tokenizer saved locally to: {SAVE_DIR}")

In [None]:
# Smoke-test the fine-tuned model: reload it from SAVE_DIR and sample one reply.
from transformers import pipeline

try:
    # Reload the weights from disk; the tokenizer is the in-memory one.
    new_model = GPT2LMHeadModel.from_pretrained(SAVE_DIR)

    # Device index 0 when CUDA is available, otherwise -1 (CPU).
    if torch.cuda.is_available():
        pipeline_device = 0
    else:
        pipeline_device = -1

    generator = pipeline(
        'text-generation',
        model=new_model,
        tokenizer=tokenizer,
        device=pipeline_device,
    )

    # Prompt in the training layout: both personas, the chat marker, then a first turn.
    test_prompt = (
        "P1: I enjoy collecting antique books and reading mysteries. "
        "P2: I work as a chef and love making pasta. "
        "<|startofchat|> "
        "P1: I just finished a great book about a famous detective. "
        "What have you been up to?"
    )

    print(f"\n--- Testing with Prompt ---\nPROMPT: {test_prompt}")

    # Sampling configuration for the single generated sequence.
    sampling_options = {
        "max_length": 150,
        "num_return_sequences": 1,
        "do_sample": True,
        "temperature": 0.8,  # Adjust temperature for creativity
        "top_k": 50,
        "top_p": 0.95,
        "repetition_penalty": 1.2,
    }
    outputs = generator(test_prompt, **sampling_options)
    generated_text = outputs[0]['generated_text']

    print("\n--- GENERATED RESPONSE ---")
    # Drop the echoed prompt, then keep only the text before the first EOS token.
    continuation = generated_text[len(test_prompt):].strip()
    response_text = continuation.split(tokenizer.eos_token, 1)[0].strip()

    print(response_text)

except Exception as e:
    print(f"Error during testing: {e}")

In [None]:
# The notebook's pipeline is defined entirely by the cells above; run them top to bottom.
# A triple-quoted copy of that code used to live in this cell. It was never executed,
# was echoed back as the cell's output, and referenced dataset column keys that no
# longer matched the live format_conversation, so it has been dropped.