In [None]:
# Standard Hugging Face fine-tuning without Unsloth
import os
import torch
from transformers import (
    AutoModelForCausalLM, 
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset
import math

from huggingface_hub import login
# Authenticate with the Hugging Face Hub (the model loaded below is gated).
# The token is read from the HF_TOKEN environment variable so that no secret is
# stored in the notebook. When the variable is unset or empty, token=None makes
# login() fall back to its interactive prompt instead of sending an empty token.
login(token=os.environ.get("HF_TOKEN") or None)
    

In [None]:
import os
# CUDA-related environment configuration, applied in a single update.
# NOTE(review): CUDA_LAUNCH_BLOCKING=1 is normally a debugging aid (it makes
# kernel launches synchronous) - confirm it is still wanted for regular runs.
os.environ.update(
    {
        "PYTORCH_CUDA_ALLOC_CONF": "expandable_segments:True",
        "CUDA_LAUNCH_BLOCKING": "1",
    }
)

In [None]:
# Step 1: Load model
model_name = "meta-llama/Llama-3.2-1B-Instruct"

print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    # No dedicated pad token: reuse EOS so padded batches can be built.
    tokenizer.pad_token = tokenizer.eos_token

# Choose the dtype of the trainable weights.
# - bf16 weights can be trained directly when the GPU supports bf16.
# - Otherwise keep the weights in float32: the Trainer is configured with
#   fp16 mixed precision in that case, and its gradient scaler refuses to
#   unscale gradients of float16 parameters ("Attempting to unscale FP16
#   gradients"). float32 is also the only sensible choice on CPU.
bf16_available = torch.cuda.is_available() and torch.cuda.is_bf16_supported()
model_dtype = torch.bfloat16 if bf16_available else torch.float32

print("Loading model...")
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=model_dtype,
    low_cpu_mem_usage=True,
    device_map="auto"
)

In [None]:
# Load the first 1000 training rows of the FineTome-100k dataset
dataset = load_dataset(
    path="mlabonne/FineTome-100k",
    split="train[:1000]",
)

In [None]:

# Show one raw record so the conversation schema is visible before formatting
first_example = dataset[0]
print("Dataset example structure:", first_example)

# Now let's fix the formatting function
def format_instruction(example):
    """Turn one conversation record into instruction/response training strings.

    Args:
        example: A single dataset row (the function is mapped with
            ``batched=False``). Its ``"conversations"`` entry is expected to be
            a list of turn dicts holding the turn text under ``"value"`` and,
            optionally, the speaker role under ``"from"``.

    Returns:
        ``{"formatted_text": [...]}`` with one string per consecutive
        (instruction, response) pair. The list is empty when
        ``"conversations"`` is not a list of dicts. A trailing turn without a
        partner is dropped.

    Raises:
        KeyError: If ``"conversations"`` is missing, or a paired turn has no
            ``"value"`` key.
    """
    formatted_texts = []
    conversations = example["conversations"]

    # Anything that is not a list of turn dicts yields no training text.
    if not (isinstance(conversations, list) and all(isinstance(turn, dict) for turn in conversations)):
        return {"formatted_text": formatted_texts}

    # Turns marked as "system" are skipped before pairing. Pairing is
    # positional, so a leading system prompt would otherwise shift every pair
    # by one (system text as the instruction, the user turn as the output).
    # Turns without a "from" key are kept, which preserves the previous
    # behaviour for records that carry no role information.
    dialogue = [turn for turn in conversations if turn.get("from") != "system"]

    # Pair turn 0 with 1, 2 with 3, ...; zip() drops an unpaired final turn.
    for instruction_turn, output_turn in zip(dialogue[0::2], dialogue[1::2]):
        instruction = instruction_turn["value"]
        output = output_turn["value"]

        # NOTE(review): this is a Llama-2/Mistral style "[INST]" template while
        # the notebook loads a Llama 3.2 Instruct checkpoint - consider
        # tokenizer.apply_chat_template(); confirm before changing.
        formatted_texts.append(f"<s>[INST] {instruction} [/INST] {output}</s>")

    return {"formatted_text": formatted_texts}

print("Formatting conversations...")
# Row-by-row mapping: every record is replaced by its list of formatted strings
formatted_dataset = dataset.map(
    format_instruction,
    remove_columns=dataset.column_names,
    batched=False,
)

# Sanity check: preview the start of the very first formatted string, if any
print("Checking first formatted example:")
if len(formatted_dataset) > 0:
    first_texts = formatted_dataset[0]["formatted_text"]
    if len(first_texts) > 0:
        preview = first_texts[0][:100]  # first 100 characters only
        print(preview + "...")

# Continue with tokenization
print("Tokenizing dataset...")
def tokenize_function(examples):
    """Tokenize a batch of formatted texts and build causal-LM labels.

    Args:
        examples: A batch from ``Dataset.map(batched=True)`` whose
            ``"formatted_text"`` column holds, per row, a list of strings.

    Returns:
        The tokenizer output for all strings of the batch (flattened, so the
        number of output rows can differ from the number of input rows),
        padded/truncated to 1024 tokens, with an added ``"labels"`` entry.
        Labels equal ``input_ids`` except at padding positions, which are set
        to -100 so the loss ignores them.
    """
    # Flatten the list of texts since each example contains a list of formatted texts
    texts_to_tokenize = [text for text_list in examples["formatted_text"] for text in text_list]

    tokenized = tokenizer(
        texts_to_tokenize,
        padding="max_length",
        truncation=True,
        max_length=1024,
        return_tensors="pt"
    )

    # Labels mirror the inputs for causal language modeling. Every sequence is
    # padded to max_length, so padding positions are masked with -100 (the
    # ignore index of the cross-entropy loss). The attention mask is used
    # rather than the pad token id because the pad token may be the EOS token.
    labels = tokenized["input_ids"].clone()
    labels[tokenized["attention_mask"] == 0] = -100
    tokenized["labels"] = labels
    return tokenized

# Batched tokenization; the text column is dropped once it has been encoded
tokenized_dataset = formatted_dataset.map(
    tokenize_function,
    remove_columns=["formatted_text"],
    batched=True,
)

dataset_size = len(tokenized_dataset)
print(f"Dataset size after processing: {dataset_size}")

In [None]:
# Hold out 10% of the tokenized data for evaluation (fixed seed)
print("Splitting dataset...")
train_val_split = tokenized_dataset.train_test_split(seed=42, test_size=0.1)
train_dataset, eval_dataset = train_val_split["train"], train_val_split["test"]

In [None]:
# Batch collator for language modeling; mlm=False turns masked-LM off
data_collator = DataCollatorForLanguageModeling(mlm=False, tokenizer=tokenizer)

In [None]:
# Step 3: Configure training arguments
print("Setting up training arguments...")

# Mixed-precision mode, decided once: bf16 when the GPU supports it, fp16 on
# other GPUs, neither on CPU.
cuda_available = torch.cuda.is_available()
use_bf16 = cuda_available and torch.cuda.is_bf16_supported()
use_fp16 = cuda_available and not use_bf16

# Total number of optimizer steps. A positive max_steps takes precedence over
# num_train_epochs, so every step-based interval below has to fit inside it.
MAX_STEPS = 30

training_args = TrainingArguments(
    output_dir="llama3_full_finetune",
    overwrite_output_dir=True,
    num_train_epochs=1,
    per_device_train_batch_size=1,
    gradient_checkpointing=True,
    torch_compile=False,
    per_device_eval_batch_size=1,
    gradient_accumulation_steps=32,  # Increase for memory efficiency
    max_steps=MAX_STEPS,
    # Evaluate and checkpoint several times within the run. The previous
    # interval (100) was larger than max_steps, so no evaluation or checkpoint
    # ever happened and load_best_model_at_end had nothing to load.
    eval_steps=10,
    save_steps=10,
    # Warm up over the first 10% of the run. The previous value (50) exceeded
    # max_steps, so the learning rate never reached its configured value.
    warmup_steps=max(1, MAX_STEPS // 10),
    learning_rate=1e-5,  # Lower learning rate for stability
    weight_decay=0.01,
    logging_dir="./logs",
    logging_steps=10,
    eval_strategy="steps",
    # Use fp16 if bfloat16 not available
    fp16=use_fp16,
    bf16=use_bf16,
    # Add gradient clipping
    max_grad_norm=1.0,
    # Keep the tqdm progress bar enabled
    disable_tqdm=False,
    # Report to console only
    report_to="none",
    # Load best model at end of training
    load_best_model_at_end=True,
    # Optimizer
    optim="adamw_torch",
    # Save total limit (save disk space)
    save_total_limit=2,
)

# Step 4: Create the Trainer
print("Setting up trainer...")
# Gather the trainer inputs first, then build the Trainer from them
trainer_kwargs = {
    "model": model,
    "args": training_args,
    "train_dataset": train_dataset,
    "eval_dataset": eval_dataset,
    "data_collator": data_collator,
}
trainer = Trainer(**trainer_kwargs)

In [None]:
# Step 5: Start training
import traceback


def compute_perplexity(eval_loss):
    """Return exp(eval_loss), or infinity when the exponential overflows.

    math.exp raises OverflowError for large losses; without this guard a
    diverged evaluation loss would be reported as a training failure even
    though training itself finished.
    """
    try:
        return math.exp(eval_loss)
    except OverflowError:
        return float("inf")


print("Starting training...")
try:
    train_result = trainer.train()
    print("Training completed successfully!")

    # Step 6: Save the model
    print("Saving model...")
    trainer.save_model("llama3_finetuned_full")

    # Run final evaluation
    print("Running final evaluation...")
    eval_results = trainer.evaluate()
    print(f"Final perplexity: {compute_perplexity(eval_results['eval_loss'])}")

except Exception as e:
    print(f"Training error: {e}")

    # Print more diagnostic information
    traceback.print_exc()

    # Best effort: try to save the model even if training failed. A failure
    # here is reported with its cause instead of being silently swallowed.
    try:
        print("Attempting to save partial model...")
        trainer.save_model("llama3_finetuned_partial")
    except Exception as save_error:
        print(f"Could not save partial model: {save_error}")