In [None]:
# --- STEP 1: Setup and Installs ---
# Install the necessary libraries
!pip install transformers datasets torch huggingface_hub

In [None]:
# Import required libraries
import os

import torch
from datasets import load_dataset
from google.colab import drive
from huggingface_hub import notebook_login, HfFolder # <--- NEW IMPORTS
from transformers import (
    GPT2Tokenizer, 
    GPT2LMHeadModel, 
    DataCollatorForLanguageModeling, 
    Trainer, 
    TrainingArguments
)
from transformers.trainer_utils import get_last_checkpoint

In [None]:
# --- STEP 2: Configuration and Login ---
# Hub repository that receives the checkpoints, written as "<user-or-org>/<model-name>".
# Change the namespace part if you push under a different account.
HF_REPO_ID = "EhabBelllkasy01/gpt2-all-recipes"

# Authenticate against the Hugging Face Hub; enter your User Access Token when prompted.
print("Logging into Hugging Face Hub...")
notebook_login()

print(f"Checkpoints will be pushed to: https://huggingface.co/{HF_REPO_ID}")

In [None]:
# 3. Mount Google Drive and define the local output directory used by the Trainer.
# The mount must happen before the directory is created: without it,
# /content/drive/MyDrive/... is only a plain folder on the Colab VM's own disk,
# so nothing written there would actually land in Drive.
drive.mount("/content/drive")

DRIVE_PATH = "/content/drive/MyDrive/GPT2_Recipe_Temp_Files"
os.makedirs(DRIVE_PATH, exist_ok=True)
print(f"Temporary files will be saved locally to: {DRIVE_PATH}")

In [None]:
# --- STEP 3: Load Tokenizer and Add Custom Tokens ---
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
# Reuse the end-of-sequence token as the padding token.
tokenizer.pad_token = tokenizer.eos_token

# Register the markers meant to delimit the sections of a recipe.
# NOTE(review): none of the visible cells insert these markers into the training
# text -- confirm whether the dataset's text already contains them.
special_tokens_dict = dict(
    additional_special_tokens=[
        "<|title|>",
        "<|ingredients|>",
        "<|instructions|>",
        "<|endofrecipe|>",
    ]
)
# Number of tokens that were actually new to the vocabulary (reported in STEP 4).
num_added_toks = tokenizer.add_special_tokens(special_tokens_dict)

In [None]:
# --- STEP 4: Load Model and Resize Embeddings ---
# Load the pretrained GPT-2 language-model head, then resize its token embeddings
# to the tokenizer's current length so the added special tokens get embedding rows.
model = GPT2LMHeadModel.from_pretrained("gpt2")
model.resize_token_embeddings(len(tokenizer))
print(f"Added {num_added_toks} new tokens and resized model embeddings.")

In [None]:
# --- STEP 5: Load, Format, and Tokenize Dataset ---
print("\nLoading dataset...")
# Only the first 5% of the train split is loaded.
raw_datasets = load_dataset(
    "corbt/all-recipes",
    split="train[:5%]",
)

In [None]:
# Define the tokenization function
def tokenize_and_format(examples, max_length=512):
    """Tokenize a batch of recipe texts.

    Sequences are truncated to ``max_length`` tokens but deliberately left
    unpadded: the data collator used for training pads each batch to its own
    longest sequence, which avoids storing and computing on a full
    ``max_length`` row for every short recipe.

    Args:
        examples: Batch mapping passed in by ``Dataset.map(..., batched=True)``;
            only the ``'input'`` column is read (presumably a list of strings --
            verify against the dataset schema).
        max_length: Maximum number of tokens kept per example (default 512).

    Returns:
        The tokenizer's output for the batch.
    """
    return tokenizer(
        examples['input'],
        truncation=True,
        max_length=max_length,
    )

# Tokenize in batches and drop the raw text column once it has been encoded.
tokenized_datasets = raw_datasets.map(
    tokenize_and_format,
    batched=True,
    remove_columns=["input"],
)

# Hold out 10% of the examples for evaluation; the fixed seed keeps the split reproducible.
split_datasets = tokenized_datasets.train_test_split(test_size=0.1, seed=42)
train_dataset, eval_dataset = split_datasets["train"], split_datasets["test"]

print(f"Training on {len(train_dataset)} samples, validating on {len(eval_dataset)} samples.")

In [None]:
# --- STEP 6: Configure Training and Checkpointing ---
# Collator for causal language modeling (masked-LM mode switched off).
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

# Training configuration: output locations, optimisation settings,
# logging/evaluation cadence, and Hub checkpointing.
training_args = TrainingArguments(
    # Where checkpoints and logs are written on disk.
    output_dir=DRIVE_PATH,
    logging_dir=f"{DRIVE_PATH}/logs",

    # Optimisation settings.
    num_train_epochs=5,
    learning_rate=5e-5,
    warmup_steps=500,
    weight_decay=0.01,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,

    # Log every 50 steps; evaluate every 500 steps.
    logging_steps=50,
    eval_strategy="steps",
    eval_steps=500,

    # Checkpoint every 1000 steps and keep only the newest local checkpoint.
    save_strategy="steps",
    save_steps=1000,
    save_total_limit=1,

    # Push to the Hub repository, authenticating with the token stored by the login step.
    push_to_hub=True,
    hub_model_id=HF_REPO_ID,
    hub_token=HfFolder.get_token(),
    hub_private_repo=False,  # Set to True for a private repo
)

# --- STEP 7: Initialize Trainer (training itself starts in the next cell) ---
# NOTE(review): newer transformers releases appear to prefer `processing_class`
# over `tokenizer` here -- confirm against the installed version.
trainer = Trainer(
    args=training_args,
    model=model,
    data_collator=data_collator,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    tokenizer=tokenizer,
)

In [None]:
print("\nStarting training...")
print("Checkpoints will be pushed to the Hugging Face Hub automatically.")

# Resuming works from checkpoints stored locally in the Trainer's output directory
# (not from the Hub). If a previous run left a `checkpoint-<step>` folder there,
# continue from the newest one; otherwise start a fresh run.
output_dir = training_args.output_dir
last_checkpoint = get_last_checkpoint(output_dir) if os.path.isdir(output_dir) else None
if last_checkpoint is not None:
    print(f"Resuming training from checkpoint: {last_checkpoint}")

# `resume_from_checkpoint=None` means "train from the current model weights".
trainer.train(resume_from_checkpoint=last_checkpoint)

In [None]:
# --- STEP 8: Save Final Model and Push ---
# Pushes during training happen only when a checkpoint is saved, so upload the
# final model and tokenizer explicitly before reporting success.
trainer.push_to_hub()
print(f"\nFinal fine-tuned model has been pushed to the Hugging Face Hub under: {HF_REPO_ID}")