In [1]:
!pip install -q unsloth transformers accelerate bitsandbytes
!pip install -q datasets


In [14]:
import os
import json
import random
import torch
import numpy as np
from datasets import Dataset
from transformers import TrainingArguments, Trainer, DataCollatorForLanguageModeling
from unsloth import FastLanguageModel
from unsloth import is_bfloat16_supported
import warnings
warnings.filterwarnings('ignore')

# 1. Set random seeds for reproducibility
def set_seed(seed=3407):
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)

set_seed()

# The file has already been uploaded as 'enriched_data_merged.json'
filename = "enriched_data_merged.json"
print(f"Using uploaded file: {filename}")

# 2. Load and prepare the dataset
def load_data(file_path):
    """Load data from the JSON file"""
    try:
        with open(file_path, 'r', encoding='utf-8') as f:
            return json.load(f)
    except json.JSONDecodeError:
        raise ValueError("The file is not a valid JSON file")

# 3. Format data for instruction tuning
def format_prompt(entry):
    """Format a single data entry into a prompt"""
    context = f"Year: {entry['year']}\nSetting: {entry['setting']}\nTopic: {entry['topic']}\nPersona: {entry['persona']}"
    instruction_text = f"{entry['instruction']}\n\n{entry['input']}"

    prompt = f"<|begin_of_text|><|system|>\nYou are Noam Chomsky.<|end_of_text|>\n<|user|>\n{context}\n\n{instruction_text}<|end_of_text|>\n<|assistant|>\n{entry['output']}<|end_of_text|>"

    return prompt

# 4. Split dataset into train and validation sets
def split_dataset(data, train_ratio=0.9):
    """Split the dataset into train and validation sets"""
    random.shuffle(data)
    split_idx = int(len(data) * train_ratio)
    return data[:split_idx], data[split_idx:]

# 5. Main function to run the training
def main():
    # Configuration - for Colab
    # output_dir = "/unsloth/outputs"
    model_name = "unsloth/Llama-3.2-1B-Instruct-bnb-4bit"
    max_seq_length = 512
    batch_size = 1
    max_steps = 350  # Only train for 10 steps

    # Create output directory
    # os.makedirs(output_dir, exist_ok=True)

    # Load and prepare data
    print("Processing data...")
    raw_data = load_data(filename)

    # If data is too large for quick testing, limit it
    if len(raw_data) > 50:
        print(f"Using only first 50 examples from {len(raw_data)} total for quick testing")
        raw_data = raw_data[:50]

    train_data, val_data = split_dataset(raw_data)
    print(f"Dataset sizes - Train: {len(train_data)}, Val: {len(val_data)}")

    # Format data
    train_texts = [format_prompt(entry) for entry in train_data]
    val_texts = [format_prompt(entry) for entry in val_data]

    # Load model and tokenizer
    print("Loading model...")
    model, tokenizer = FastLanguageModel.from_pretrained(
        model_name=model_name,
        max_seq_length=max_seq_length,
        dtype=torch.bfloat16 if is_bfloat16_supported() else torch.float16,
        load_in_4bit=True,
    )

    # Set padding token
    tokenizer.pad_token = tokenizer.eos_token

    # Prepare model for LoRA training
    model = FastLanguageModel.get_peft_model(
        model,
        r=8,
        target_modules=["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],
        lora_alpha=16,
        lora_dropout=0.1,
        bias="none",
        use_gradient_checkpointing="unsloth",
    )

    # Tokenize datasets
    print("Tokenizing datasets...")

    def tokenize_function(examples):
        return tokenizer(examples, truncation=True, max_length=max_seq_length)

    # Create datasets with simple text format
    train_dataset = Dataset.from_dict({"text": train_texts})
    val_dataset = Dataset.from_dict({"text": val_texts})

    # Map the tokenize function to the datasets
    tokenized_train = train_dataset.map(
        lambda x: tokenize_function(x["text"]),
        batched=True,
        remove_columns=["text"]
    )

    tokenized_val = val_dataset.map(
        lambda x: tokenize_function(x["text"]),
        batched=True,
        remove_columns=["text"]
    )

    # Use a data collator that will handle the labels
    data_collator = DataCollatorForLanguageModeling(
        tokenizer=tokenizer,
        mlm=False  # We want causal language modeling, not masked
    )

    # Define training arguments
    training_args = TrainingArguments(
        output_dir="/lama/instr",
        per_device_train_batch_size=batch_size,
        per_device_eval_batch_size=batch_size,
        max_steps=max_steps,
        warmup_steps=100,
        num_train_epochs=10,
        learning_rate=1e-5,
        fp16=not is_bfloat16_supported(),
        bf16=is_bfloat16_supported(),
        logging_steps=1,
        optim="adamw_8bit",
        # output_dir=output_dir,
        report_to="none",
        save_strategy="steps",
        save_steps=100,
        save_total_limit=1,
    )

    # Create trainer
    trainer = Trainer(
        model=model,
        args=training_args,
        train_dataset=tokenized_train,
        eval_dataset=tokenized_val,
        data_collator=data_collator,
    )

    # Train the model
    print(f"Starting training for {max_steps} steps...")
    trainer.train()

    # Save the fine-tuned model and tokenizer
    print("Saving model...")
    model_save_path = f"{output_dir}/fine-tuned-model-qwen"
    model.save_pretrained(model_save_path)
    tokenizer.save_pretrained(model_save_path)

    print(f"Training complete! Model saved to {model_save_path}")

    # Provide code to download the model
    print("\nTo download your trained model, run:")
    print("from google.colab import files")
    print("!zip -r /content/fine-tuned-model.zip /content/outputs/fine-tuned-model")
    print("files.download('/content/fine-tuned-model.zip')")

    return model, tokenizer

# Run the training process
if __name__ == "__main__":
    main()

Using uploaded file: enriched_data_merged.json
Processing data...
Using only first 50 examples from 8519 total for quick testing
Dataset sizes - Train: 45, Val: 5
Loading model...
==((====))==  Unsloth 2025.3.19: Fast Llama patching. Transformers: 4.51.0.
   \\   /|    NVIDIA GeForce RTX 3090 Ti. Num GPUs = 1. Max memory: 23.988 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.6.0+cu124. CUDA: 8.6. CUDA Toolkit: 12.4. Triton: 3.2.0
\        /    Bfloat16 = TRUE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!
Tokenizing datasets...


Map:   0%|          | 0/45 [00:00<?, ? examples/s]

Map:   0%|          | 0/5 [00:00<?, ? examples/s]

PermissionError: [Errno 13] Permission denied: '/lama'