In [None]:
# Install necessary libraries
!pip install transformers datasets accelerate huggingface_hub


In [None]:

from huggingface_hub import notebook_login

# Authenticate this notebook session against the Hugging Face Hub.
# You will be prompted for an access token; it needs 'write' permission
# because later cells push the model to the Hub.
notebook_login()

In [None]:
from datasets import load_dataset
from transformers import GPT2Tokenizer

# --- Dataset ---
# Preference dataset; later cells read its 'chosen' and 'rejected' columns.
dataset_name = "DialogueCharacter/english_preference_hh_helpful_unfiltered"
dataset = load_dataset(dataset_name)

# --- Tokenizer ---
# Loaded from the existing fine-tuned checkpoint so that the vocabulary matches
# the model weights loaded from the same checkpoint later on.
pretrained_model_name = "EhabBelllkasy01/gpt2-persona-chat-finetuned"
tokenizer = GPT2Tokenizer.from_pretrained(pretrained_model_name)

# The end-of-sequence token doubles as both the example separator and the padding token.
SEP_TOKEN = tokenizer.eos_token
tokenizer.pad_token = SEP_TOKEN


In [None]:

# --- Formatting Function (Re-using the correct function from the last successful attempt) ---

def format_and_tokenize_chosen(examples):
    """Turn a batch of preference rows into tokenized training sequences.

    Each row becomes the text ``"User: <rejected>\\nModel: <chosen><SEP_TOKEN>"``,
    which is then tokenized, truncated to 512 tokens and padded to exactly
    512 tokens.

    Args:
        examples: A batch (as passed by ``Dataset.map(..., batched=True)``)
            exposing parallel ``'rejected'`` and ``'chosen'`` columns.

    Returns:
        The tokenizer output for the whole batch.
    """
    # NOTE(review): the 'rejected' column is used as the user-side instruction.
    # If that column actually holds a dispreferred *response* (or a full
    # dialogue), this prompt format is not what was intended -- confirm against
    # the dataset card.
    prompts = [
        f"User: {rejected_text}\nModel: {chosen_text}{SEP_TOKEN}"
        for rejected_text, chosen_text in zip(examples['rejected'], examples['chosen'])
    ]

    encoded = tokenizer(
        prompts,
        padding='max_length',
        max_length=512,
        truncation=True,
    )
    return encoded

# --- Apply Mapping ---

# Tokenize the train split. Every original column is dropped (taken from the
# split itself rather than hard-coded), so only the tokenizer outputs remain.
tokenized_datasets = dataset['train'].map(
    format_and_tokenize_chosen,
    batched=True,
    remove_columns=dataset['train'].column_names,
)

# No explicit 'labels' column is materialised here. The training cell uses
# DataCollatorForLanguageModeling(mlm=False), which derives the labels from
# 'input_ids' for every batch and overwrites any stored 'labels', so copying
# them up front only cost an extra pass over the data and doubled its size.

print("Data preparation complete.")

In [None]:
import torch
from transformers import GPT2LMHeadModel, TrainingArguments, Trainer, DataCollatorForLanguageModeling

# --- Load Pre-trained Model Checkpoint (Goal 1) ---
# Training continues from the same checkpoint the tokenizer was loaded from.
model = GPT2LMHeadModel.from_pretrained(pretrained_model_name)

# --- Define Training Arguments (Goal 2 & 3) ---
# Goal 2: Save progress to checkpoints
# Goal 3: Save to new repo EhabBelllkasy01/gpt2-Dialogue-chat-finetuned

# NOTE: Replace 'EhabBelllkasy01/' with your actual Hugging Face username if needed.
hub_repo_id = "EhabBelllkasy01/gpt2-Dialogue-chat-finetuned"
output_dir = f"./{hub_repo_id.split('/')[-1]}_checkpoints" # Local directory for saving

training_args = TrainingArguments(
    output_dir=output_dir,
    overwrite_output_dir=True,
    num_train_epochs=3,
    per_device_train_batch_size=4,
    logging_steps=500,
    learning_rate=5e-5,
    # Mixed precision needs a CUDA device; requesting it unconditionally makes
    # TrainingArguments fail on a CPU-only runtime.
    fp16=torch.cuda.is_available(),
    # 4 sequences per device x 4 accumulation steps = 16 sequences per optimizer step.
    gradient_accumulation_steps=4,

    # --- Checkpoint and Hub Settings ---
    save_strategy="steps",                # Save based on steps
    save_steps=1000,                      # Save a checkpoint every 1000 steps
    save_total_limit=2,                   # Only keep the last 2 checkpoints
    push_to_hub=True,                     # Automatically push the model and checkpoints to the Hub
    hub_model_id=hub_repo_id,             # The target repository name
    hub_private_repo=False,               # Set to True if you want the repo to be private
)

# --- Define Data Collator ---
# The stock causal-LM collator pads the batch and builds `labels` from `input_ids`.
_base_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False
)


def data_collator(features):
    """Collate a batch for causal-LM training without masking the real EOS token.

    The tokenizer's pad token is the EOS token, and the stock collator ignores
    (-100) every label equal to the pad token id. That also hides the genuine
    end-of-text token appended to each example, so the model would never be
    trained to stop. Here the labels are rebuilt from the attention mask
    instead: only true padding positions (mask == 0) are ignored.

    Args:
        features: List of tokenized examples, as handed over by the Trainer.

    Returns:
        The padded batch with `labels` masked only at padding positions.
    """
    batch = _base_collator(features)
    if "attention_mask" in batch:
        labels = batch["input_ids"].clone()
        labels[batch["attention_mask"] == 0] = -100
        batch["labels"] = labels
    return batch


# --- Initialize and Train ---
trainer = Trainer(
    model=model,
    args=training_args,
    data_collator=data_collator,
    train_dataset=tokenized_datasets,
)


In [None]:

# Run the fine-tuning loop. Checkpointing and Hub uploads follow the settings
# configured on `training_args`.
start_message = f"Starting training and pushing checkpoints to Hugging Face Hub: {hub_repo_id}"
print(start_message)

trainer.train()

print("Training complete!")


In [None]:

# --- Final Save and Push ---
# Upload the final fine-tuned model weights to the Hub repository.
trainer.push_to_hub()

# The Trainer was not given the tokenizer, so it does not upload the tokenizer
# files. Push them explicitly; otherwise the Hub repo cannot be loaded with
# `from_pretrained` for tokenization.
tokenizer.push_to_hub(hub_repo_id)

print(f"Final model successfully uploaded to https://huggingface.co/{hub_repo_id}")