In [None]:
import os
import torch
from datasets import load_dataset
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    pipeline # Added for testing
)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from functools import partial
import time

# Report the runtime environment and choose the device label for this session.
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

device = "cuda" if torch.cuda.is_available() else "cpu"
if device == "cpu":
    print("WARNING: CUDA not available. Training will be extremely slow.")
else:
    # One line per GPU fact: device count, active device index, active device name.
    print(
        f"CUDA device count: {torch.cuda.device_count()}",
        f"Current CUDA device: {torch.cuda.current_device()}",
        f"Device name: {torch.cuda.get_device_name(torch.cuda.current_device())}",
        sep="\n",
    )

# --- Configuration You Might Change ---

# Hugging Face model ID; the Python-specific CodeLlama variant suits this task.
model_id = "codellama/CodeLlama-7b-Python-hf"

# Training data in JSON Lines format. Either keep the file in the 'data'
# subdirectory next to the notebook or point this at a full path.
dataset_file = "data/train.jsonl"
if not os.path.exists(dataset_file):
    # Only a warning here; the data-loading cell raises if the file is still missing.
    print(f"WARNING: Dataset file '{dataset_file}' not found. Please create it or update the path.")

# Destination directory for the trained adapter layers.
output_dir = "./requests_codellama_final"

# --- Training Settings ---
# Number of full passes over the dataset.
num_epochs = 10
# Examples processed in parallel on the GPU (adjust based on memory).
batch_size = 4
# Batches accumulated before each weight update.
gradient_accumulation_steps = 4
# Optimizer learning rate.
learning_rate = 2e-4
# Maximum number of tokens per training sequence.
max_seq_length = 512
# Training loss is logged every N steps (1 = very frequent).
log_steps = 1

# --- Quantization & Precision ---
# 8-bit loading saves memory (recommended for 7B on many GPUs).
use_8bit_quantization = True
# bfloat16 is used only when a CUDA device is present and reports support for it.
use_bf16 = torch.cuda.is_bf16_supported() if torch.cuda.is_available() else False

# Echo the effective configuration as a single framed summary.
print(
    "-" * 30,
    f"Configuration:",
    f"  Model ID: {model_id}",
    f"  Dataset: {dataset_file}",
    f"  Output Dir: {output_dir}",
    f"  Epochs: {num_epochs}",
    f"  Batch Size: {batch_size}",
    f"  Gradient Accumulation: {gradient_accumulation_steps}",
    f"  Effective Batch Size: {batch_size * gradient_accumulation_steps}",
    f"  Learning Rate: {learning_rate}",
    f"  Max Sequence Length: {max_seq_length}",
    f"  Use 8-bit Quantization: {use_8bit_quantization}",
    f"  Use BF16 Precision: {use_bf16}",
    "-" * 30,
    sep="\n",
)

In [None]:
# --- Configure Quantization ---
# bnb_config stays None when quantization is disabled, which makes
# from_pretrained load the model without a quantization config.
if use_8bit_quantization:
    bnb_config = BitsAndBytesConfig(
        load_in_8bit=True,
    )
    print("Using 8-bit quantization.")
else:
    bnb_config = None # No quantization
    print("Not using 8-bit quantization.")


# --- Determine Data Type ---
# dtype for the non-quantized parts of the model: bfloat16 when supported, else float16.
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16
print(f"Using compute dtype: {compute_dtype}")

# --- Load Model ---
print(f"Loading base model: {model_id}...")
start_time = time.time()
model = AutoModelForCausalLM.from_pretrained(
    model_id,
    quantization_config=bnb_config,
    device_map="auto", # Handles placing model layers on devices
    torch_dtype=compute_dtype, # Load non-quantized parts in this dtype
)
end_time = time.time()
print(f"Base model loaded in {end_time - start_time:.2f} seconds.")
print(f"Model device map: {model.hf_device_map}") # Shows how model is distributed

# --- Load Tokenizer ---
print(f"Loading tokenizer for {model_id}...")
tokenizer = AutoTokenizer.from_pretrained(model_id)

if tokenizer.pad_token is None:
    # The Trainer below uses DataCollatorForLanguageModeling(mlm=False), which sets
    # every label equal to pad_token_id to -100 (ignored by the loss). If padding
    # reused EOS, the EOS appended to each training example would be ignored too and
    # the fine-tuned model would never learn where to stop. Prefer a distinct token.
    if tokenizer.unk_token is not None and tokenizer.unk_token_id != tokenizer.eos_token_id:
        tokenizer.pad_token = tokenizer.unk_token
        print("Set tokenizer pad_token to unk_token")
    else:
        # No distinct token available: keep the previous fallback, but say what it costs.
        tokenizer.pad_token = tokenizer.eos_token
        print("Set tokenizer pad_token to eos_token")
        print("WARNING: pad_token equals eos_token, so EOS labels will be masked during training.")
# Ensure padding is done on the right side
tokenizer.padding_side = "right"

print("Tokenizer loaded.")

In [None]:
print("\n--- Testing Base Model (Before Fine-tuning) ---")

# A question in the same spirit as the fine-tuning data.
prompt_text = "How do I send a POST request with JSON data using requests?"

# Wrap the question in an instruction-style template that ends inside an
# opened Python code fence, so the model continues with code.
input_text = f"""Instruction:
{prompt_text}

Python Code:
```python
"""

print(f"Test Prompt:\n{input_text}")

# Tokenize the prompt, then move the tensors to the model's device.
inputs = tokenizer(input_text, return_tensors="pt")
inputs = inputs.to(model.device)

print("\nGenerating...")
start_time = time.time()
with torch.no_grad():  # inference only: no gradients needed
    outputs = model.generate(
        # EOS id is passed explicitly as the padding id for generation.
        pad_token_id=tokenizer.eos_token_id,
        # Nucleus sampling at a fairly low temperature.
        do_sample=True,
        temperature=0.6,
        top_p=0.9,
        # Cap on the number of newly generated tokens.
        max_new_tokens=150,
        **inputs,
    )
end_time = time.time()
print(f"Generation finished in {end_time - start_time:.2f} seconds.")

# Decode the first returned sequence, dropping special tokens from the text.
generated_code = tokenizer.decode(outputs[0], skip_special_tokens=True)

# Before fine-tuning the model may simply echo the prompt or produce
# incomplete/incorrect code.
print(
    "\n--- Base Model Output ---",
    generated_code,
    "-" * 25,
    sep="\n",
)

In [None]:
# Description: Load your custom dataset and prepare it in the format needed for training.

# --- Define Formatting and Tokenizing Functions ---

def create_formatted_input(prompt, completion, tokenizer):
    """Build one training text from a prompt/completion pair.

    The result has the shape
    "Prompt: <prompt>\\nCompletion: <completion><EOS>", where <EOS> is the
    tokenizer's `eos_token` string.
    """
    end_marker = tokenizer.eos_token
    return f"Prompt: {prompt}\nCompletion: {completion}{end_marker}"

def tokenize_function(examples, tokenizer, max_length=None):
    """Tokenize a batch of prompt/completion examples.

    Args:
        examples: Batch mapping with parallel 'prompt' and 'completion' sequences.
        tokenizer: Callable tokenizer; it is also handed to
            `create_formatted_input`, which reads its `eos_token`.
        max_length: Length every sequence is padded/truncated to. When omitted,
            the notebook-level `max_seq_length` setting is used, so existing
            two-argument callers behave as before.

    Returns:
        Whatever the tokenizer returns for the batch of formatted texts.
    """
    if max_length is None:
        # Fall back to the value from the configuration cell.
        max_length = max_seq_length

    # One formatted training text per (prompt, completion) pair.
    full_texts = [
        create_formatted_input(prompt, completion, tokenizer)
        for prompt, completion in zip(examples['prompt'], examples['completion'])
    ]

    # Fixed-length output: shorter texts are padded, longer ones are truncated.
    # NOTE: truncation drops the tail of an over-long text, including its EOS marker.
    return tokenizer(
        full_texts,
        max_length=max_length,
        padding="max_length",
        truncation=True,
    )

# --- Load and Process ---
print(f"\nLoading dataset from: {dataset_file}")
if not os.path.exists(dataset_file):
    raise FileNotFoundError(f"Dataset file not found: {dataset_file}. Please ensure it's in the correct path.")

raw_dataset = load_dataset('json', data_files=dataset_file, split='train')
print(f"Dataset loaded. Number of examples: {len(raw_dataset)}")

# Validate up front so problems surface with a clear message instead of an
# IndexError on raw_dataset[0] or a KeyError from inside the map() call.
if len(raw_dataset) == 0:
    raise ValueError(f"Dataset file '{dataset_file}' contains no examples.")
missing_columns = [
    column for column in ("prompt", "completion")
    if column not in raw_dataset.column_names
]
if missing_columns:
    raise ValueError(
        f"Dataset is missing required column(s) {missing_columns}; "
        f"found columns: {raw_dataset.column_names}"
    )

print("Showing first example:", raw_dataset[0])

print("\nTokenizing dataset (this may take a moment)...")
# Bind the tokenizer so map() can call the function with just the batch.
tokenize_with_tokenizer = partial(tokenize_function, tokenizer=tokenizer)

# Apply the tokenization to the entire dataset.
# `batched=True` hands the function multiple examples at once;
# `remove_columns` drops the original columns from the result.
tokenized_dataset = raw_dataset.map(
    tokenize_with_tokenizer,
    batched=True,
    remove_columns=raw_dataset.column_names
)
print("Dataset successfully tokenized.")
print("Columns in tokenized dataset:", tokenized_dataset.column_names) # Should be ['input_ids', 'attention_mask']

In [None]:
# Description: Attach LoRA adapters so that fine-tuning only trains a small
# number of adapter parameters rather than the whole model.

print("\n--- Configuring LoRA ---")

# Step 1 - ready the quantized model for adapter training (an important step
# when combining quantization with PEFT). Gradient checkpointing is switched
# on first to save more memory during training.
model.gradient_checkpointing_enable()
model = prepare_model_for_kbit_training(model)
print("Model prepared for LoRA training.")

# Step 2 - describe the adapters.
lora_config = LoraConfig(
    # Required for causal language models.
    task_type="CAUSAL_LM",
    # Layers that receive adapters (check the model architecture if needed).
    target_modules=["q_proj", "v_proj"],
    # Rank of the trainable matrices and the alpha scaling factor.
    r=8,
    lora_alpha=16,
    # Dropout probability inside the LoRA layers.
    lora_dropout=0.05,
    # Typically 'none' for LoRA.
    bias="none",
)
print("LoRA configuration created.")

# Step 3 - wrap the model with the adapters.
model = get_peft_model(model, lora_config)
print("LoRA applied to the model.")

# Step 4 - report trainable parameters; expect only a small fraction (<1%).
model.print_trainable_parameters()

In [None]:
# Description: Build the Hugging Face Trainer, which drives the training loop,
# optimization, logging, and checkpoint saving.

print("\n--- Setting Up Trainer ---")

# --- Define Training Arguments ---
training_args = TrainingArguments(
    # Where checkpoints and the final output go.
    output_dir=output_dir,
    # Schedule and batch geometry.
    num_train_epochs=num_epochs,
    per_device_train_batch_size=batch_size,
    gradient_accumulation_steps=gradient_accumulation_steps,
    learning_rate=learning_rate,
    # Optimizer choice, kept from the original setup for quantized models.
    optim="paged_adamw_8bit",
    # Precision: bfloat16 when supported, otherwise float16.
    bf16=use_bf16,
    fp16=not use_bf16,
    # Gradient checkpointing to save memory.
    gradient_checkpointing=True,
    # Log the loss every `log_steps` steps and send metrics to TensorBoard.
    logging_strategy="steps",
    logging_steps=log_steps,
    report_to="tensorboard",
    # Checkpoint once per epoch, keeping only the most recent one.
    save_strategy="epoch",
    save_total_limit=1,
    # No evaluation is configured in this simple setup.
)
print("Training Arguments configured.")

# --- Initialize Trainer ---
# Collator configured for causal language modeling (mlm=False).
causal_lm_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)
trainer = Trainer(
    model=model,                      # the LoRA-adapted model
    args=training_args,
    train_dataset=tokenized_dataset,
    tokenizer=tokenizer,
    data_collator=causal_lm_collator,
)
print("Trainer initialized.")

In [None]:
# Description: Run the training loop. Monitor the 'loss' value in the output -
# it should generally decrease, indicating the model is learning.

print("\n--- Starting Fine-tuning Training ---")
print(f"Will train for {num_epochs} epochs.")
print(f"Loss will be logged every {log_steps} steps.")
print(f"Checkpoints (and final model) will be saved in '{output_dir}'.")

# Make sure the output directory exists
os.makedirs(output_dir, exist_ok=True)

# Estimate the number of optimizer updates. Ceiling division keeps the partial
# final batch / accumulation group, and each epoch performs at least one update,
# so a dataset smaller than one effective batch no longer reports 0 steps.
# This is only an estimate; the Trainer's own count is authoritative.
num_examples = len(tokenized_dataset)
batches_per_epoch = -(-num_examples // batch_size)
updates_per_epoch = max(1, -(-batches_per_epoch // gradient_accumulation_steps))
total_steps = updates_per_epoch * num_epochs
print(f"Estimated total training steps: {total_steps}")

# Start training!
start_time = time.time()
train_result = trainer.train() # This runs the training loop
end_time = time.time()

print("\n--- Training Finished ---")
print(f"Training took {end_time - start_time:.2f} seconds.")
# Report the step count the Trainer actually performed, next to the estimate above.
print(f"Optimizer steps actually run: {train_result.global_step}")

# train_result.metrics holds the training metrics if you want to display them.

In [None]:
# Description: Save the trained LoRA adapter layers and associated files.
# This is the result of your fine-tuning that you'll use for inference.

print("\n--- Saving Final Model Adapter ---")

# Ensure the final output directory exists
os.makedirs(output_dir, exist_ok=True)

# Save the LoRA adapter weights, config, tokenizer, etc.
# Using trainer.save_model() is convenient as it saves everything needed.
trainer.save_model(output_dir)
print(f"Final adapter and tokenizer configuration saved to: {output_dir}")

# To resume training later, the full trainer state can also be saved with
# trainer.save_state().

# Check if the adapter files were actually saved.
# The weights file name depends on the serialization format: recent PEFT
# releases write 'adapter_model.safetensors' by default, older ones (or
# safe_serialization=False) write 'adapter_model.bin'. Accept either.
adapter_weight_candidates = [
    os.path.join(output_dir, file_name)
    for file_name in ("adapter_model.safetensors", "adapter_model.bin")
]
# First candidate that exists; falls back to the legacy path so the variable
# is always a path string, as before.
adapter_model_file = next(
    (path for path in adapter_weight_candidates if os.path.exists(path)),
    adapter_weight_candidates[-1],
)
adapter_config_file = os.path.join(output_dir, "adapter_config.json")

if os.path.exists(adapter_model_file) and os.path.exists(adapter_config_file):
    print("Adapter files verified in the output directory.")
else:
    print("WARNING: Final adapter files not found in the main output directory.")
    print("Check if training completed successfully and if there were saving errors.")
    print("You might need to load from the last checkpoint directory inside:", output_dir)