In [1]:
import os
import torch
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    Trainer,
    TrainingArguments,
    DataCollatorForLanguageModeling,
    TrainerCallback,
    EarlyStoppingCallback
)
from datasets import load_dataset
import numpy as np
import matplotlib.pyplot as plt
import time
import json
from tqdm import tqdm
import argparse
import logging
from datetime import datetime


# ------------------ Custom Callback for Tracking Metrics ------------------ #
class MetricsTrackingCallback(TrainerCallback):
    """
    Callback to track and save training metrics during training.
    Includes loss, validation loss, and training time.
    """
    
    def __init__(self, log_dir="./logs"):
        self.log_dir = log_dir
        os.makedirs(log_dir, exist_ok=True)
        
        self.train_losses = []
        self.eval_losses = []
        self.train_times = []
        self.start_time = None
        
        # Setup logging
        self.log_file = os.path.join(log_dir, f"training_log_{datetime.now().strftime('%Y%m%d_%H%M%S')}.txt")
        logging.basicConfig(
            filename=self.log_file,
            level=logging.INFO,
            format='%(asctime)s - %(levelname)s - %(message)s'
        )
        self.logger = logging.getLogger(__name__)
    
    def on_train_begin(self, args, state, control, **kwargs):
        """Record the starting time when training begins."""
        self.start_time = time.time()
        self.logger.info(f"Training started at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.logger.info(f"Training arguments: {args}")
    
    def on_log(self, args, state, control, logs=None, **kwargs):
        """Record metrics at each logging step."""
        logs = logs or {}
        
        # Record training loss
        if 'loss' in logs:
            self.train_losses.append((state.global_step, logs['loss']))
            self.logger.info(f"Step {state.global_step}: Training loss = {logs['loss']}")
        
        # Record eval loss
        if 'eval_loss' in logs:
            self.eval_losses.append((state.global_step, logs['eval_loss']))
            self.logger.info(f"Step {state.global_step}: Evaluation loss = {logs['eval_loss']}")
        
        # Record elapsed time
        if self.start_time is not None:
            elapsed_time = time.time() - self.start_time
            self.train_times.append((state.global_step, elapsed_time))
            self.logger.info(f"Step {state.global_step}: Training time = {elapsed_time:.2f}s")
    
    def on_train_end(self, args, state, control, **kwargs):
        """Save all metrics at the end of training."""
        # Calculate total training time
        total_time = time.time() - self.start_time if self.start_time is not None else 0
        
        # Log final metrics
        self.logger.info(f"Training completed at {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}")
        self.logger.info(f"Total training time: {total_time:.2f} seconds")
        self.logger.info(f"Final training loss: {self.train_losses[-1][1] if self.train_losses else 'N/A'}")
        self.logger.info(f"Final evaluation loss: {self.eval_losses[-1][1] if self.eval_losses else 'N/A'}")
        
        # Save metrics to JSON file
        metrics_file = os.path.join(self.log_dir, "training_metrics.json")
        metrics = {
            "train_losses": self.train_losses,
            "eval_losses": self.eval_losses,
            "train_times": self.train_times,
            "total_time": total_time
        }
        
        with open(metrics_file, 'w') as f:
            json.dump(metrics, f, indent=2)
        
        # Create visualization of training progress
        self.visualize_training_progress()
    
    def visualize_training_progress(self):
        """Create visualization of training and validation loss over time."""
        plt.figure(figsize=(12, 8))
        
        # Plot training loss
        if self.train_losses:
            steps, losses = zip(*self.train_losses)
            plt.plot(steps, losses, label='Training Loss')
        
        # Plot validation loss
        if self.eval_losses:
            steps, losses = zip(*self.eval_losses)
            plt.plot(steps, losses, label='Validation Loss')
        
        plt.xlabel('Training Steps')
        plt.ylabel('Loss')
        plt.title('Training and Validation Loss')
        plt.legend()
        plt.grid(True, linestyle='--', alpha=0.7)
        plt.tight_layout()
        
        # Save plot
        plt.savefig(os.path.join(self.log_dir, "training_loss.png"))
        plt.close()


# ------------------ Dataset Preparation ------------------ #
class CodeAlpacaDataset(Dataset):
    """
    Custom dataset for CodeAlpaca data with Python code examples.
    """
    
    def __init__(self, tokenizer, max_length=512, data_subset="train"):
        """
        Initialize the dataset.
        
        Args:
            tokenizer: Tokenizer to use for encoding
            max_length: Maximum sequence length
            data_subset: Data subset to use ("train" or "validation")
        """
        self.tokenizer = tokenizer
        self.max_length = max_length
        
        # Load dataset
        self.alpaca_data = load_dataset("HuggingFaceH4/CodeAlpaca_20K", split="train")
        
        # Filter for Python code
        python_keywords = ['def ', 'import ', 'lambda ', 'class ']
        def is_python_code(text):
            return any(keyword in text for keyword in python_keywords)
        
        self.python_dataset = self.alpaca_data.filter(lambda example: is_python_code(example['completion']))
        print(f"Loaded {len(self.python_dataset)} Python code examples from CodeAlpaca dataset")
        
        # Split into train/validation sets (90% / 10%)
        if data_subset == "train":
            self.dataset = self.python_dataset.select(range(int(len(self.python_dataset) * 0.9)))
        else:  # validation
            self.dataset = self.python_dataset.select(range(int(len(self.python_dataset) * 0.9), len(self.python_dataset)))
        
        print(f"Using {len(self.dataset)} examples for {data_subset}")
    
    def __len__(self):
        return len(self.dataset)
    
    def __getitem__(self, idx):
        """Get a formatted and tokenized example."""
        example = self.dataset[idx]
        
        # Format the input
        input_text = self.format_example(example)
        
        # Tokenize
        encodings = self.tokenizer(
            input_text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        
        # Remove the batch dimension
        item = {key: val.squeeze(0) for key, val in encodings.items()}
        item["labels"] = item["input_ids"].clone()
        
        return item
    
    def format_example(self, example):
        """Format an example for instruction fine-tuning."""
        return (
            "Below is an instruction that describes a task. "
            "Write a response that appropriately completes the request.\n\n"
            f"### Instruction:\n{example['prompt']}\n\n### Response:\n{example['completion']}"
        )

  from .autonotebook import tqdm as notebook_tqdm
2025-04-20 18:11:51.799126: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:467] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1745152911.810684  183252 cuda_dnn.cc:8579] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1745152911.814227  183252 cuda_blas.cc:1407] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
W0000 00:00:1745152911.823348  183252 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745152911.823359  183252 computation_placer.cc:177] computation placer already registered. Please check linkage and avoid linking the same target more than once.
W0000 00:00:1745152911.823360  183252

[2025-04-20 18:11:53,299] [INFO] [real_accelerator.py:239:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


In [2]:
model_name = "gpt2-medium"

print(f"Loading tokenizer from {model_name}...")
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

Loading tokenizer from gpt2-medium...


In [3]:
# Setup model
print(f"Loading model from {model_name}...")
model = AutoModelForCausalLM.from_pretrained(model_name)


# Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Using device: {device}")
model = model.to(device)

Loading model from gpt2-medium...
Using device: cuda


In [13]:
from peft import (
    get_peft_model,
    LoraConfig,
    TaskType,
    PeftModel,
    PeftConfig
)

# Configure LoRA adapters
lora_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=16,                     # Rank of the update matrices
    lora_alpha=32,           # LoRA scaling factor
    lora_dropout=0.1,        # Dropout probability for LoRA layers
    # Target modules to apply LoRA to (specific to GPT-2 architecture)
    target_modules=["c_attn", "c_proj", "mlp.c_fc", "mlp.c_proj"],
)

In [14]:
# Create the LoRA model
model = get_peft_model(model, lora_config)



In [15]:
# Print trainable parameters information
model.print_trainable_parameters()

trainable params: 6,291,456 || all params: 361,114,624 || trainable%: 1.7422


In [16]:
max_length = 512

# Create datasets
train_dataset = CodeAlpacaDataset(tokenizer, max_length=max_length, data_subset="train")
eval_dataset = CodeAlpacaDataset(tokenizer, max_length=max_length, data_subset="validation")

Loaded 6409 Python code examples from CodeAlpaca dataset
Using 5768 examples for train
Loaded 6409 Python code examples from CodeAlpaca dataset
Using 641 examples for validation


In [17]:
# Setup data collator
data_collator = DataCollatorForLanguageModeling(
tokenizer=tokenizer,
mlm=False
)

In [18]:
output_dir = "./outputs"
batch_size = 2
num_train_epochs = 3
gradient_accumulation_steps = 4
eval_steps = 500
save_steps = 1000
logging_steps = 100
learning_rate = 5e-4  # Higher learning rate for adapters
weight_decay = 0.01
warmup_steps = 500
early_stopping_patience = 3
per_device_train_batch_size = 2
training_args = TrainingArguments(
    output_dir=os.path.join(output_dir, "checkpoints"),
    overwrite_output_dir=True,
    num_train_epochs=num_train_epochs,
    per_device_train_batch_size=per_device_train_batch_size,
    per_device_eval_batch_size=2,
    gradient_accumulation_steps=8,
    eval_steps=250,
    save_steps=250,
    logging_steps=100,
    save_total_limit=3,
    eval_strategy="steps",
    load_best_model_at_end=True,
    learning_rate=learning_rate,
    weight_decay=weight_decay,
    warmup_steps=warmup_steps,
    fp16=True,
    lr_scheduler_type="cosine",
    metric_for_best_model="eval_loss",
    greater_is_better=False,
)

In [19]:
# Setup callbacks
callbacks = [
    MetricsTrackingCallback(log_dir=os.path.join(output_dir, "logs")),
    EarlyStoppingCallback(early_stopping_patience=3)
]

In [20]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=train_dataset,
    eval_dataset=eval_dataset,
    data_collator=data_collator,
    callbacks=callbacks
)

# Train the model
print(f"Starting adapter fine-tuning for {num_train_epochs} epochs...")
trainer.train()

No label_names provided for model class `PeftModelForCausalLM`. Since `PeftModel` hides base models input arguments, if label_names is not given, label_names can't be set automatically within `Trainer`. Note that empty label_names list will be used instead.


Starting adapter fine-tuning for 3 epochs...


Step,Training Loss,Validation Loss
250,1.09,0.915187
500,0.8798,0.812423
750,0.8276,0.760824
1000,0.7646,0.739933


TrainOutput(global_step=1080, training_loss=0.9716171229327166, metrics={'train_runtime': 2364.958, 'train_samples_per_second': 7.317, 'train_steps_per_second': 0.457, 'total_flos': 1.636675620962304e+16, 'train_loss': 0.9716171229327166, 'epoch': 2.9930651872399445})

In [36]:
# Save the fine-tuned model
model_save_path = os.path.join(output_dir, "gpt2-medium_lora_eps")
trainer.save_model(model_save_path)
tokenizer.save_pretrained(model_save_path)
print(f"Model and tokenizer saved to {model_save_path}")

Model and tokenizer saved to ./outputs/gpt2-medium_lora_eps


In [38]:
# After training
peft_model_path = os.path.join(output_dir, "gpt2-medium_lora_python")
model.save_pretrained(peft_model_path)
tokenizer.save_pretrained(peft_model_path)

('./outputs/gpt2-medium_lora_python/tokenizer_config.json',
 './outputs/gpt2-medium_lora_python/special_tokens_map.json',
 './outputs/gpt2-medium_lora_python/vocab.json',
 './outputs/gpt2-medium_lora_python/merges.txt',
 './outputs/gpt2-medium_lora_python/added_tokens.json',
 './outputs/gpt2-medium_lora_python/tokenizer.json')

In [37]:
# When loading later:
from peft import PeftModel, PeftConfig

# First load the base model
base_model = AutoModelForCausalLM.from_pretrained("gpt2-medium")

# Then load the PEFT adapters
model_path = "./outputs/gpt2-medium_lora_eps"
loaded_model = PeftModel.from_pretrained(base_model, model_path)



In [45]:
# Inference 
# Path to your saved LoRA adapters
lora_model_path = "./outputs/gpt2-medium_lora_python"  # Adjust path as needed

# Step 1: Load the base model
base_model = AutoModelForCausalLM.from_pretrained("gpt2-medium")
tokenizer = AutoTokenizer.from_pretrained("gpt2-medium")
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Step 2: Load the LoRA adapters onto the base model
lora_model = PeftModel.from_pretrained(base_model, lora_model_path)

# Step 3: Move model to GPU if available
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
lora_model = lora_model.to(device)
lora_model.eval()  # Set model to evaluation mode



PeftModelForCausalLM(
  (base_model): LoraModel(
    (model): GPT2LMHeadModel(
      (transformer): GPT2Model(
        (wte): Embedding(50257, 1024)
        (wpe): Embedding(1024, 1024)
        (drop): Dropout(p=0.1, inplace=False)
        (h): ModuleList(
          (0-23): 24 x GPT2Block(
            (ln_1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
            (attn): GPT2Attention(
              (c_attn): lora.Linear(
                (base_layer): Conv1D(nf=3072, nx=1024)
                (lora_dropout): ModuleDict(
                  (default): Dropout(p=0.1, inplace=False)
                )
                (lora_A): ModuleDict(
                  (default): Linear(in_features=1024, out_features=16, bias=False)
                )
                (lora_B): ModuleDict(
                  (default): Linear(in_features=16, out_features=3072, bias=False)
                )
                (lora_embedding_A): ParameterDict()
                (lora_embedding_B): ParameterDict()
    

In [40]:
from gpt2_python_benchmark import CodeBLEU

In [41]:
# ------------------ Alpaca-style Fine-tuning ------------------ #
def prepare_alpaca_dataset():
    """
    Load and filter the CodeAlpaca dataset for Python code examples.
    
    Returns:
        Filtered dataset with Python code examples
    """
    # Load dataset
    alpaca_data = load_dataset("HuggingFaceH4/CodeAlpaca_20K", split="train")
    
    # Filter for Python code
    python_keywords = ['def ', 'import ', 'lambda ']
    def is_python_code(text):
        return any(keyword in text for keyword in python_keywords)
    
    python_dataset = alpaca_data.filter(lambda example: is_python_code(example['completion']))
    return python_dataset

In [46]:
def generate_code(prompt, max_length=500, temperature=0.5):
    """
    Generate Python code given a natural language prompt.
    
    Args:
        prompt: Natural language description of the code to generate
        max_length: Maximum length of generated text
        temperature: Sampling temperature
        
    Returns:
        Generated code as a string
    """
    # Format the prompt for code generation
    formatted_prompt = f"# Python program to {prompt}\n\ndef"
    
    # Encode the prompt
    input_ids = tokenizer.encode(formatted_prompt, return_tensors="pt").to(device)
    
    # Generate code
    output = lora_model.generate(
        input_ids,
        max_length=max_length,
        temperature=temperature,
        top_p=0.95,
        do_sample=True,
        pad_token_id=tokenizer.pad_token_id
    )
    
    # Decode the generated code
    generated_code = tokenizer.decode(output[0], skip_special_tokens=True)
    
    # Extract the code part (remove the prompt)
    code = generated_code[len(formatted_prompt):]
    
    # Format the code for better presentation
    formatted_code = f"def{code}"
    
    return formatted_code

In [47]:
from data_loader import *
data_loader = BenchmarkDataManager("benchmark_data.json")
data_loader.load_data()
data = data_loader.get_all_data()

Loaded 20 benchmark questions from benchmark_data.json
Loaded 20 benchmark questions from benchmark_data.json


In [48]:
data = data[:5]

In [None]:
import tqdm 
evaluator = CodeBLEU()
for item in data:
    question = item["question"]
    reference = item["reference"]
    generated_code = generate_code(question)
    scores = evaluator.calculate_codebleu(reference, generated_code)
    print("Question: {}".format(question))
    print("reference: {}".format(reference))
    print("generated_code: {}".format(generated_code))
    print("scores: {}".format(scores))

The attention mask is not set and cannot be inferred from input because pad token is same as eos token. As a consequence, you may observe unexpected behavior. Please pass your input's `attention_mask` to obtain reliable results.


Question: find the maximum element in a list
reference: def find_max(lst):
    if not lst:
        return None
    max_val = lst[0]
    for val in lst:
        if val > max_val:
            max_val = val
    return max_val
generated_code: def max_element ( list ):

""" Returns the maximum element in a list """

return max (list, 1 )

def find_max ( list ):

""" Returns the maximum element in a list """

return max (list, 1 )

def find_first ( list ):

""" Returns the first element in a list """

return max (list, 1 )

def find_last ( list ):

""" Returns the last element in a list """

return max (list, 1 )

def find_first_last ( list ):

""" Returns the first element in a list last last """

return max (list, 1 )

def find_last_last ( list ):

""" Returns the last element in a list last last last last """

return max (list, 1 )

def find_first_last_last ( list ):

""" Returns the first element in a list first last last last last """

return max (list, 1 )

def find_first_last_last ( l