In [3]:
import torch, transformers, peft, datasets
from importlib.metadata import version
# Report the library versions this notebook runs against.
# bitsandbytes is optional (the model-loading cell falls back to unquantized
# loading when it is unavailable), so a missing install must not abort this cell.
from importlib.metadata import PackageNotFoundError

print(f"PyTorch: {torch.__version__}")
print(f"Transformers: {transformers.__version__}")
print(f"PEFT: {peft.__version__}")
print(f"Datasets: {datasets.__version__}")
try:
    print(f"BitsAndBytes: {version('bitsandbytes')}")
except PackageNotFoundError:
    # Same wording that get_package_version() uses for missing packages
    print("BitsAndBytes: Not installed")

PyTorch: 2.0.1
Transformers: 4.34.0
PEFT: 0.5.0
Datasets: 2.14.5
BitsAndBytes: 0.41.1


In [4]:
import os
import random
import logging
import torch
import numpy as np
from tqdm import tqdm
from datasets import load_dataset
from transformers import (AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig, DataCollatorForLanguageModeling)
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training

# Logging: one logger named after this module, with INFO-level output
# requested through the root configuration.
logger = logging.getLogger(__name__)
logging.basicConfig(level=logging.INFO)

# Reproducibility: seed every random number generator the notebook uses
def set_seed(seed: int = 42):
    """Seed Python's ``random``, NumPy and PyTorch with the same value.

    Args:
        seed: Value passed to each generator's seeding function.

    When CUDA is available, all CUDA devices are seeded as well.
    """
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

set_seed()

# --- Check for GPU availability ---
def check_gpu_availability():
    """Report whether CUDA is usable and log what was found.

    Returns:
        bool: True when ``torch.cuda.is_available()`` is true (each visible
        device name is logged), otherwise False (two warnings are logged).
    """
    if not torch.cuda.is_available():
        logger.warning("❌ No NVIDIA GPU detected! Training will be extremely slow on CPU.")
        logger.warning("Consider using a GPU-enabled environment (Google Colab, Kaggle, etc.)")
        return False

    gpu_count = torch.cuda.device_count()
    gpu_name = torch.cuda.get_device_name(0)
    logger.info(f"✅ {gpu_count} NVIDIA GPU(s) detected: {gpu_name}")
    # Device 0 is named in the summary line above; list only the remaining ones
    for i in range(1, gpu_count):
        logger.info(f"  GPU {i}: {torch.cuda.get_device_name(i)}")
    return True

# Check if we have a GPU; CPU-only training needs an explicit confirmation.
has_gpu = check_gpu_availability()
if not has_gpu:
    try:
        user_response = input("Continue without GPU? (y/n): ")
    except (EOFError, NotImplementedError):
        # No usable stdin: plain scripts hit EOFError, and non-interactive
        # notebook runs raise IPython's StdinNotImplementedError (a
        # NotImplementedError subclass). Treat that as "no" instead of
        # crashing with an unrelated traceback.
        logger.warning("Cannot prompt for confirmation in a non-interactive session.")
        user_response = "n"
    # Tolerate surrounding whitespace and a spelled-out "yes"
    if user_response.strip().lower() not in ("y", "yes"):
        logger.info("Exiting as requested.")
        raise SystemExit("Exiting due to no GPU available")
    logger.warning("Continuing without GPU, but training will be extremely slow.")

# --- Check package versions ---
try:
    import importlib.metadata as importlib_metadata
except ImportError:
    import importlib_metadata

def get_package_version(package_name):
    """Look up the installed version of a distribution.

    Args:
        package_name: Distribution name to query.

    Returns:
        The version string, or the literal "Not installed" when the metadata
        lookup raises PackageNotFoundError.
    """
    try:
        installed_version = importlib_metadata.version(package_name)
    except importlib_metadata.PackageNotFoundError:
        installed_version = "Not installed"
    return installed_version

# Version report: build every line first, then emit them through the logger
version_lines = (
    f"PyTorch version: {torch.__version__}",
    f"Transformers version: {get_package_version('transformers')}",
    f"PEFT version: {get_package_version('peft')}",
    f"Datasets version: {get_package_version('datasets')}",
    f"BitsAndBytes version: {get_package_version('bitsandbytes')}",
    f"Accelerate version: {get_package_version('accelerate')}",
)
for version_line in version_lines:
    logger.info(version_line)

# --- Configuration ---
# What to fine-tune and on which data
model_id = "EleutherAI/pythia-1b-deduped"  # Example: Choose a ~1B param model
dataset_name = "imdb"  # Replace with your chosen dataset
dataset_text_field = "text"  # Field containing the text

# Where checkpoints and the final adapter are written
output_dir, peft_output_dir = "./fine_tuned_model", "./peft_adapter"

# Optimisation settings
num_epochs = 1
if torch.cuda.is_available():
    batch_size = 4
else:
    batch_size = 2
learning_rate = 2e-4
max_length = 512
gradient_accumulation_steps = 4

# Scheduler warmup and log cadence
warmup_steps = 50
logging_steps = 10

# Set to True to load the model without 4-bit quantization
skip_quantization = False

# --- Load Dataset ---
try:
    # The IMDB splits are stored grouped by label, so a leading slice such as
    # "train[:5%]" would contain reviews of one class only. Load the whole
    # split, shuffle it deterministically, then keep a 5% sample.
    full_train = load_dataset(dataset_name, split="train")
    train_subset_size = min(len(full_train), max(1, round(len(full_train) * 0.05)))
    dataset = full_train.shuffle(seed=42).select(range(train_subset_size))
    logger.info(f"Successfully loaded dataset: {dataset_name}")
    logger.info(f"Dataset size: {len(dataset)} examples")
    logger.info(f"Example data point: {dataset[0]}")

    # Map numeric labels to readable text for sentiment analysis
    label_mapping = {0: "negative", 1: "positive"}

    def format_instruction(sample):
        """Turn one dataset row into an instruction/input/response prompt.

        Args:
            sample: Row with a 'text' field and a 'label' field.

        Returns:
            dict with a single "text" key holding the formatted prompt. Integer
            labels are translated through ``label_mapping``; an integer missing
            from the mapping is rendered with ``str()``, and non-integer labels
            are inserted unchanged.
        """
        label_text = sample['label']
        if isinstance(label_text, int):
            label_text = label_mapping.get(label_text, str(label_text))
        return {
            "text": (
                f"### Instruction:\nClassify the sentiment of the following movie review.\n\n"
                f"### Input:\n{sample['text']}\n\n"
                f"### Response:\n{label_text}"
            )
        }

    # The returned "text" replaces the raw review in each row
    dataset = dataset.map(format_instruction)
    logger.info(f"Formatted dataset. Example: {dataset[0]['text'][:200]}...")

except Exception as e:
    logger.error(f"Error loading dataset {dataset_name}: {e}")
    logger.error("Ensure the dataset exists and is accessible, or prepare your data manually.")
    # Chain explicitly so the original traceback stays attached
    raise RuntimeError(f"Failed to load dataset: {e}") from e

# --- Load Model & Tokenizer ---
# Loads `model` and `tokenizer` for `model_id`. 4-bit quantized loading is tried
# first (unless skip_quantization is set or no GPU is present) and falls back
# to unquantized loading when it fails. Any failure ends in a RuntimeError.
model_loaded = False
model_load_error = None  # remembered so the final RuntimeError can name its cause

try:
    logger.info(f"Loading model: {model_id}")

    # Determine the device configuration based on available hardware
    if torch.cuda.is_available():
        device_map = "auto"  # Let the library distribute across GPUs
        torch_dtype = torch.float16  # Use half precision on GPU
    else:
        device_map = "cpu"
        torch_dtype = torch.float32  # Use full precision on CPU
        # Force skip quantization on CPU
        skip_quantization = True
        logger.warning("Running on CPU - forcing skip_quantization=True")

    # NOTE(review): trust_remote_code=True lets the model repository execute its
    # own Python code on this machine. Confirm it is really needed for the
    # chosen model_id and drop it where it is not.
    if not skip_quantization:
        try:
            import bitsandbytes as bnb
            logger.info(f"Using BitsAndBytes for 4-bit quantization, version: {bnb.__version__}")

            bnb_config = BitsAndBytesConfig(
                load_in_4bit=True,
                bnb_4bit_quant_type="nf4",
                bnb_4bit_compute_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
            )

            model = AutoModelForCausalLM.from_pretrained(
                model_id,
                quantization_config=bnb_config,
                device_map=device_map,
                trust_remote_code=True
            )
            logger.info("Successfully loaded model with 4-bit quantization")
        except Exception as bnb_error:
            # Any failure on the quantized path (import or load) triggers the fallback below
            logger.warning(f"BitsAndBytes quantization failed: {bnb_error}")
            logger.warning("Falling back to standard precision loading")
            skip_quantization = True

    if skip_quantization:
        logger.info("Loading model without quantization")
        model = AutoModelForCausalLM.from_pretrained(
            model_id,
            device_map=device_map,
            torch_dtype=torch_dtype,
            trust_remote_code=True
        )
        logger.info("Successfully loaded model without quantization")

    model.config.use_cache = False
    tokenizer = AutoTokenizer.from_pretrained(model_id, trust_remote_code=True)

    # Reuse EOS as the padding token when the tokenizer defines none
    if tokenizer.pad_token is None:
        tokenizer.pad_token = tokenizer.eos_token
    tokenizer.padding_side = "right"

    logger.info("Successfully loaded model and tokenizer")
    model_loaded = True

except Exception as e:
    logger.error(f"Error loading model {model_id}: {e}")
    logger.error("Try a different model or check your internet connection.")
    model_loaded = False
    # `e` is unbound once this clause ends, so keep a reference for chaining
    model_load_error = e

# Verify model exists and is properly loaded
if not model_loaded or 'model' not in locals():
    logger.error("Model loading failed. Cannot continue with fine-tuning.")
    raise RuntimeError("Model loading failed") from model_load_error

# --- Determine LoRA Target Modules ---
# Choose which attention projection layers receive LoRA adapters, keyed on
# substrings of the lower-cased model id.
model_id_lower = model_id.lower()
if "pythia" in model_id_lower or "neox" in model_id_lower:
    target_modules = ["query_key_value"]
elif "llama" in model_id_lower or "mistral" in model_id_lower:
    target_modules = ["q_proj", "k_proj", "v_proj", "o_proj"]
elif "bloom" in model_id_lower:
    # BLOOM uses one fused "query_key_value" projection and has no
    # q_proj/k_proj/v_proj/out_proj layers, so it must not share OPT's list.
    target_modules = ["query_key_value"]
elif "opt" in model_id_lower:
    target_modules = ["q_proj", "k_proj", "v_proj", "out_proj"]
elif "gpt-j" in model_id_lower:
    target_modules = ["q_proj", "k_proj", "v_proj"]
elif "phi" in model_id_lower:
    target_modules = ["Wqkv", "out_proj"]
else:
    target_modules = ["query_key_value"]
    logger.warning(f"Using default target modules for {model_id}. Consider inspecting model.named_modules() for available modules.")

# --- PEFT Configuration (LoRA) ---
# Adapter hyperparameters; the layers to adapt come from `target_modules`.
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=target_modules,
    r=16,
    lora_alpha=32,
    lora_dropout=0.05,
    bias="none",
)

# Wrap the base model with the adapter. A quantized model goes through
# prepare_model_for_kbit_training first; an unquantized one is wrapped directly.
try:
    if not skip_quantization:
        model = prepare_model_for_kbit_training(model)
        logger.info("Prepared model for k-bit training")

    model = get_peft_model(model, lora_config)
    logger.info("Applied LoRA configuration to model")

    logger.info("Trainable parameters:")
    model.print_trainable_parameters()
except Exception as e:
    logger.error(f"Error configuring PEFT: {e}")
    raise RuntimeError(f"Failed to configure PEFT: {e}")

# --- Prepare dataset for training ---
logger.info("Preparing dataset for training...")
try:
    def tokenize_function(examples):
        """Tokenize a batch of prompts, truncating and padding each to ``max_length``.

        Args:
            examples: Batch whose ``dataset_text_field`` column holds the prompts.

        Returns:
            The tokenizer output for the batch.
        """
        # NOTE(review): truncation cuts a prompt longer than max_length tokens;
        # with the response at the very end of the prompt, long reviews may lose
        # their label this way. Confirm and shorten the review text upstream if so.
        # No return_tensors here: tensors are built per item in
        # CausalLMDataset.__getitem__.
        return tokenizer(
            examples[dataset_text_field],
            truncation=True,
            max_length=max_length,
            padding="max_length"
        )

    # Apply tokenization to the dataset, dropping every column except the text
    tokenized_dataset = dataset.map(
        tokenize_function,
        batched=True,
        remove_columns=[col for col in dataset.column_names if col != dataset_text_field]
    )

    def add_labels(examples):
        """Attach causal-LM labels to one tokenized example.

        Labels mirror ``input_ids`` on real tokens and are -100 on padding
        positions (attention_mask == 0). -100 is the label value that
        PyTorch's cross-entropy loss ignores by default. Because the pad token
        may be the EOS token, masking uses the attention mask, not token ids.
        """
        examples["labels"] = [
            token_id if mask == 1 else -100
            for token_id, mask in zip(examples["input_ids"], examples["attention_mask"])
        ]
        return examples

    tokenized_dataset = tokenized_dataset.map(add_labels)
    logger.info(f"Dataset processed. Example features: {list(tokenized_dataset[0].keys())}")

    class CausalLMDataset(torch.utils.data.Dataset):
        """Torch Dataset view over the tokenized data, returning tensors per item."""

        def __init__(self, tokenized_dataset):
            self.tokenized_dataset = tokenized_dataset

        def __len__(self):
            return len(self.tokenized_dataset)

        def __getitem__(self, idx):
            item = self.tokenized_dataset[idx]
            # Only the three model inputs are returned; other columns are ignored
            return {
                "input_ids": torch.tensor(item["input_ids"]),
                "attention_mask": torch.tensor(item["attention_mask"]),
                "labels": torch.tensor(item["labels"])
            }

    train_dataset = CausalLMDataset(tokenized_dataset)

    # Create DataLoader
    train_dataloader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=batch_size,
        shuffle=True
    )

    logger.info(f"Created DataLoader with batch size {batch_size}")

except Exception as e:
    logger.error(f"Error preparing data: {e}")
    raise RuntimeError(f"Failed to prepare data: {e}") from e

# --- Manual Training Loop ---
# Trains the LoRA-wrapped model with gradient accumulation, a linear
# warmup/decay schedule, and one checkpoint per epoch under output_dir.
logger.info("Setting up manual training loop...")

try:
    # Adjust batch size based on GPU memory. The DataLoader was already built
    # with the configured batch size, so it is rebuilt when the size changes;
    # otherwise the adjustment would have no effect.
    if torch.cuda.is_available():
        gpu_mem = torch.cuda.get_device_properties(0).total_memory / 1024**3  # in GB
        if gpu_mem < 12:  # For GPUs with less than 12GB memory
            if batch_size != 2:
                batch_size = 2
                train_dataloader = torch.utils.data.DataLoader(
                    train_dataset,
                    batch_size=batch_size,
                    shuffle=True
                )
            logger.info(f"Detected GPU with {gpu_mem:.1f}GB memory. Using batch size {batch_size}")

    # Set up optimizer with weight decay
    from torch.optim import AdamW
    from transformers import get_linear_schedule_with_warmup

    # Only optimize parameters that require gradients
    optimizer = AdamW(
        [p for p in model.parameters() if p.requires_grad],
        lr=learning_rate,
        weight_decay=0.01
    )

    # One optimizer update per accumulation group. Ceiling division counts the
    # trailing partial group, which is also stepped (see the loop below).
    batches_per_epoch = len(train_dataloader)
    updates_per_epoch = (batches_per_epoch + gradient_accumulation_steps - 1) // gradient_accumulation_steps
    total_steps = updates_per_epoch * num_epochs
    lr_scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=warmup_steps,
        num_training_steps=total_steps
    )

    # For tracking loss (running values span all epochs)
    running_loss = 0
    step_count = 0

    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Training loop
    model.train()
    logger.info(f"Starting training for {num_epochs} epochs")

    for epoch in range(num_epochs):
        logger.info(f"Epoch {epoch+1}/{num_epochs}")
        epoch_loss = 0
        update_count = 0  # optimizer updates performed in this epoch

        # Use tqdm for progress display
        progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch+1}")

        # Reset gradients at the beginning of each epoch
        optimizer.zero_grad()

        for batch_idx, batch in enumerate(progress_bar):
            # Move batch to device
            batch = {k: v.to(model.device) for k, v in batch.items()}

            # Forward pass; scale the loss so gradients average over the group
            outputs = model(**batch)
            loss = outputs.loss / gradient_accumulation_steps
            loss.backward()

            # Track the unscaled per-batch loss
            batch_loss = loss.item() * gradient_accumulation_steps
            running_loss += batch_loss
            epoch_loss += batch_loss
            step_count += 1

            # Update progress bar description
            progress_bar.set_postfix({"loss": running_loss / step_count})

            # Step at the end of every full accumulation group, and also on the
            # last batch so a trailing partial group is not thrown away by the
            # next zero_grad(). A partial group is still scaled by
            # 1/gradient_accumulation_steps, so its update is slightly smaller.
            group_complete = (batch_idx + 1) % gradient_accumulation_steps == 0
            last_batch = (batch_idx + 1) == batches_per_epoch
            if group_complete or last_batch:
                optimizer.step()
                lr_scheduler.step()
                optimizer.zero_grad()
                update_count += 1

                # Log every logging_steps optimizer updates
                if update_count % logging_steps == 0:
                    logger.info(
                        f"Epoch {epoch+1}, Step {update_count}: "
                        f"Loss = {running_loss / step_count:.4f}, "
                        f"LR = {lr_scheduler.get_last_lr()[0]:.7f}"
                    )

        # Epoch complete (guard against an empty DataLoader)
        avg_epoch_loss = epoch_loss / max(batches_per_epoch, 1)
        logger.info(f"Epoch {epoch+1} complete. Average loss: {avg_epoch_loss:.4f}")

        # Save checkpoint at the end of each epoch
        checkpoint_path = os.path.join(output_dir, f"checkpoint-epoch-{epoch+1}")
        model.save_pretrained(checkpoint_path)
        tokenizer.save_pretrained(checkpoint_path)
        logger.info(f"Saved checkpoint to {checkpoint_path}")

    logger.info("Training completed successfully")

except Exception as e:
    logger.error(f"Error in manual training loop: {e}")
    raise RuntimeError(f"Training failed: {e}") from e

# --- Save PEFT Adapter ---
# Write the trained model and the tokenizer side by side into peft_output_dir.
logger.info(f"Saving final PEFT adapter to {peft_output_dir}")
try:
    for artifact in (model, tokenizer):
        artifact.save_pretrained(peft_output_dir)
    logger.info(f"Adapter weights and tokenizer saved to {peft_output_dir}")
except Exception as e:
    logger.error(f"Error saving adapter: {e}")
    raise RuntimeError(f"Failed to save adapter: {e}")

# --- Evaluation ---
# Best-effort check: reload the base model with the saved adapter, generate a
# response for a few test prompts, and compare with the expected label.
# Failures here are logged and do not stop the notebook.
logger.info("Running evaluation on test set...")
try:
    # The IMDB splits are stored grouped by label, so "test[:50]" would hold
    # a single class. Shuffle deterministically before taking the 50 examples.
    test_dataset = load_dataset(dataset_name, split="test").shuffle(seed=42)
    test_dataset = test_dataset.select(range(min(50, len(test_dataset))))
    test_dataset = test_dataset.map(format_instruction)

    def evaluate_model(model, tokenizer, test_dataset, num_samples=10):
        """Generate responses for the first ``num_samples`` prompts and score them.

        Args:
            model: Model used for generation (switched to eval mode).
            tokenizer: Tokenizer used to encode prompts and decode outputs.
            test_dataset: Iterable of rows whose 'text' holds a full prompt
                including the "### Response:" section.
            num_samples: Maximum number of rows to evaluate.

        Returns:
            list of dicts with keys "input", "ground_truth", "generated" and
            "correct". "correct" is a bool for the imdb dataset (the expected
            label appears in the generated response, case-insensitively) and
            None for any other dataset.
        """
        model.eval()
        results = []

        for i, sample in enumerate(test_dataset):
            if i >= num_samples:
                break
            # Prompt is everything up to the response marker; the answer follows it
            input_text = sample['text'].split("### Response:")[0] + "### Response:"
            ground_truth = sample['text'].split("### Response:")[-1].strip()
            inputs = tokenizer(input_text, return_tensors="pt").to(model.device)

            with torch.no_grad():
                # Greedy decoding, as before: the old call passed temperature
                # without do_sample, which does not enable sampling. The
                # attention mask and pad token id are now passed explicitly.
                outputs = model.generate(
                    input_ids=inputs["input_ids"],
                    attention_mask=inputs["attention_mask"],
                    max_new_tokens=50,
                    do_sample=False,
                    num_return_sequences=1,
                    pad_token_id=tokenizer.pad_token_id,
                )

            generated_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = generated_text.split("### Response:")[-1].strip()

            # For sentiment classification tasks, use a containment check.
            if dataset_name == "imdb":
                correct = ground_truth.lower() in response.lower()
            else:
                correct = None

            results.append({
                "input": input_text,
                "ground_truth": ground_truth,
                "generated": response,
                "correct": correct
            })

            if i < 3:
                logger.info(f"\nExample {i+1}:")
                logger.info(f"Input: {input_text[:100]}...")
                logger.info(f"Ground truth: {ground_truth}")
                logger.info(f"Generated: {response}")
                logger.info(f"Correct: {correct}")

        # `results` may be empty (num_samples=0 or an empty dataset); skip the
        # accuracy line then instead of dividing by zero.
        if results and all(r["correct"] is not None for r in results):
            accuracy = sum(r["correct"] for r in results) / len(results)
            logger.info(f"\nSample accuracy: {accuracy:.2f}")

        return results

    from peft import PeftModel

    # Load a fresh base model and attach the saved adapter for evaluation
    eval_device_map = "auto" if torch.cuda.is_available() else "cpu"
    eval_model = AutoModelForCausalLM.from_pretrained(
        model_id,
        device_map=eval_device_map,
        torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32
    )
    eval_model = PeftModel.from_pretrained(eval_model, peft_output_dir)

    results = evaluate_model(eval_model, tokenizer, test_dataset, num_samples=10)
    logger.info("Evaluation complete.")

except Exception as e:
    # Deliberately non-fatal; keep the traceback in the log for diagnosis
    logger.error(f"Error during evaluation: {e}", exc_info=True)
    logger.error("You'll need to implement proper evaluation based on your task.")

# Closing summary: emit each line as its own log record
closing_messages = (
    "\nFine-tuning process complete!",
    "Next steps:",
    "1. Analyze your results and model performance.",
    "2. Experiment with different hyperparameters.",
    "3. Consider pushing your model adapter to the Hugging Face Hub.",
    "4. Prepare your report documenting your process and findings.",
)
for closing_message in closing_messages:
    logger.info(closing_message)

INFO:__main__:Exiting as requested.


SystemExit: Exiting due to no GPU available

  warn("To exit: use 'exit', 'quit', or Ctrl-D.", stacklevel=1)
