In [None]:
# Install required packages
# Run this cell to install all necessary dependencies

!pip install transformers==4.36.0
!pip install peft==0.7.1
!pip install bitsandbytes==0.41.3
!pip install accelerate==0.25.0
!pip install datasets==2.16.0
!pip install safetensors==0.4.1
!pip install rouge-score==0.1.2
!pip install nltk==3.8.1
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu118


In [None]:
# Import necessary libraries
import torch
import pandas as pd
import numpy as np
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoTokenizer,
    AutoModelForCausalLM,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling,
    BitsAndBytesConfig
)
from peft import (
    LoraConfig,
    get_peft_model,
    TaskType,
    prepare_model_for_kbit_training
)
from datasets import Dataset as HFDataset
from sklearn.model_selection import train_test_split
from rouge_score import rouge_scorer
import nltk
from tqdm import tqdm
import warnings
import os
import gc

# Keep cell output readable by silencing library warnings
warnings.filterwarnings('ignore')

# Fetch the NLTK 'punkt' data (quietly) for evaluation
nltk.download('punkt', quiet=True)

# Pick the compute device once and report what was found
cuda_available = torch.cuda.is_available()
device = torch.device("cuda") if cuda_available else torch.device("cpu")
print(f"Using device: {device}")
if cuda_available:
    gpu_properties = torch.cuda.get_device_properties(0)
    gpu_memory_gb = gpu_properties.total_memory / 1024**3  # bytes -> GiB
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    print(f"GPU Memory: {gpu_memory_gb:.2f} GB")


In [None]:
# Configuration parameters
class Config:
    """Central collection of every tunable setting used by this notebook.

    All values are class attributes; the module-level `config` instance created
    below is what the other cells read from.
    """

    # Model configuration
    MODEL_NAME = "stanford-crfm/BioMedLM"  # 2.7B parameter model
    MAX_LENGTH = 512  # Maximum sequence length (tokens) used for truncation

    # LoRA configuration
    LORA_R = 16  # LoRA rank
    LORA_ALPHA = 32  # LoRA alpha
    LORA_DROPOUT = 0.1  # LoRA dropout
    # BioMedLM is a GPT-2-architecture model. Its projection layers are named
    # c_attn (fused query/key/value), c_proj (attention output and MLP output)
    # and c_fc (MLP input). LLaMA-style names such as q_proj/gate_proj do not
    # exist in this model, and PEFT raises an error when no target module matches.
    LORA_TARGET_MODULES = ["c_attn", "c_proj", "c_fc"]

    # Training configuration
    BATCH_SIZE = 4  # Batch size per device
    GRADIENT_ACCUMULATION_STEPS = 4  # Effective batch size = 16
    LEARNING_RATE = 2e-5
    NUM_EPOCHS = 3
    WARMUP_STEPS = 100
    WEIGHT_DECAY = 0.01

    # Data configuration
    VALIDATION_SPLIT = 0.2  # Fraction of rows held out for validation
    SEED = 42

    # Output paths
    OUTPUT_DIR = "./biomedlm-lora-medquad"
    # NOTE(review): CHECKPOINT_DIR is not passed to the Trainer anywhere in this
    # notebook; the training cell uses OUTPUT_DIR as its output directory.
    CHECKPOINT_DIR = "./checkpoints"

    # Quantization configuration for memory efficiency.
    # When both flags are True, the model-loading cell gives 4-bit precedence.
    LOAD_IN_8BIT = False  # Set to True if you have memory constraints
    LOAD_IN_4BIT = True   # More aggressive quantization

config = Config()


In [None]:
# Read the MedQuAD CSV and report its basic structure
print("Loading MedQuAD dataset...")
df = pd.read_csv('medquad.csv')
print(f"Dataset shape: {df.shape}")
print(f"Columns: {df.columns.tolist()}")
print("\nFirst few examples:")
print(df.head())

# Count nulls per column before any filtering happens
missing_per_column = df.isnull().sum()
print("\nMissing values per column:")
print(missing_per_column)

# Keep only rows that have both a question and an answer
required_columns = ['question', 'answer']
df = df.dropna(subset=required_columns)
print(f"\nDataset shape after removing missing values: {df.shape}")

# Prompt template shared by training-data formatting and inference
def create_prompt(question, answer=None):
    """Build the instruction-style prompt for a medical question.

    Args:
        question: Question text inserted after "Question: ".
        answer: Optional answer text. When truthy it is appended after
            "Answer:" separated by a single space; when None or empty the
            prompt ends right after "Answer:".

    Returns:
        The formatted prompt string.
    """
    instruction = "Below is a medical question. Provide a detailed, evidence-based answer."
    base_prompt = f"{instruction}\n\nQuestion: {question}\n\nAnswer:"
    if not answer:
        return base_prompt
    return f"{base_prompt} {answer}"

# Build the full training text (prompt + answer) for every row
df['text'] = df.apply(
    lambda row: create_prompt(row['question'], row['answer']),
    axis=1,
)

# Show the beginning of the first formatted example
first_formatted = df['text'].iloc[0]
print("\nSample formatted text:")
print(first_formatted[:500] + "...")

# Hold out a validation split; stratify on 'source' only when that column exists
stratify_labels = df['source'] if 'source' in df.columns else None
train_df, val_df = train_test_split(
    df,
    test_size=config.VALIDATION_SPLIT,
    random_state=config.SEED,
    stratify=stratify_labels,
)

print(f"\nTrain set size: {len(train_df)}")
print(f"Validation set size: {len(val_df)}")

# Only the formatted 'text' column is carried into the Hugging Face datasets
text_only = ['text']
train_dataset = HFDataset.from_pandas(train_df[text_only])
val_dataset = HFDataset.from_pandas(val_df[text_only])


In [None]:
# Load tokenizer
print("Loading tokenizer...")
tokenizer = AutoTokenizer.from_pretrained(config.MODEL_NAME)
# The EOS token doubles as the padding token; padding is applied on the right
tokenizer.pad_token = tokenizer.eos_token
tokenizer.padding_side = "right"

# Configure quantization for memory efficiency.
# 4-bit takes precedence when both flags are set; bnb_config stays None when
# neither flag is enabled.
if config.LOAD_IN_4BIT:
    bnb_config = BitsAndBytesConfig(
        load_in_4bit=True,
        bnb_4bit_quant_type="nf4",
        bnb_4bit_compute_dtype=torch.float16,
        bnb_4bit_use_double_quant=True,
    )
elif config.LOAD_IN_8BIT:
    # BitsAndBytesConfig has no `bnb_8bit_compute_dtype` option (compute-dtype
    # settings exist only for 4-bit); the unknown keyword was silently ignored,
    # so it is dropped here to avoid implying it has an effect.
    bnb_config = BitsAndBytesConfig(load_in_8bit=True)
else:
    bnb_config = None

# Load the model
# bnb_config comes from the quantization branch above and is None when both
# quantization flags are off.
# NOTE(review): trust_remote_code=True permits code shipped in the model
# repository to run during loading — confirm it is actually required for this
# checkpoint.
print(f"Loading BioMedLM model...")
model = AutoModelForCausalLM.from_pretrained(
    config.MODEL_NAME,
    quantization_config=bnb_config,
    device_map="auto",
    trust_remote_code=True,
    torch_dtype=torch.float16,
)

# Prepare model for k-bit training
# Only applied when one of the quantization flags is enabled; this must run
# before the LoRA adapters are attached below.
if config.LOAD_IN_4BIT or config.LOAD_IN_8BIT:
    model = prepare_model_for_kbit_training(model)

# Configure LoRA
# Rank, alpha, dropout and the list of target module names all come from Config.
print("Configuring LoRA...")
lora_config = LoraConfig(
    r=config.LORA_R,
    lora_alpha=config.LORA_ALPHA,
    target_modules=config.LORA_TARGET_MODULES,
    lora_dropout=config.LORA_DROPOUT,
    bias="none",
    task_type=TaskType.CAUSAL_LM,
)

# Apply LoRA to the model
# `model` is rebound to the PEFT-wrapped model; later cells use this wrapper.
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

# Enable gradient checkpointing for memory efficiency
# Runs regardless of the quantization flags.
model.enable_input_require_grads()
model.gradient_checkpointing_enable()


In [None]:
# Tokenization function
def tokenize_function(examples):
    """Tokenize a batch of formatted examples for causal-LM training.

    Sequences are truncated to ``config.MAX_LENGTH`` but deliberately NOT padded
    here. Padding every example to the maximum length makes all samples the same
    size, which defeats ``group_by_length=True`` in the training arguments and
    spends compute on pad tokens. The ``DataCollatorForLanguageModeling`` used by
    the Trainer pads each batch dynamically instead.

    No ``labels`` field is produced: with ``mlm=False`` the collator builds the
    labels from ``input_ids`` itself (masking padded positions), so a copy made
    here would be overwritten, and ragged label lists would prevent the collator
    from padding the batch.

    Args:
        examples: Batch dict as passed by ``Dataset.map(batched=True)``; only the
            ``"text"`` entry is read.

    Returns:
        The tokenizer output for the batch, with one unpadded, truncated
        sequence per input text.
    """
    return tokenizer(
        examples["text"],
        max_length=config.MAX_LENGTH,
        truncation=True,
        padding=False,  # dynamic per-batch padding is done by the data collator
        return_tensors=None,
    )

# Tokenize both splits with the same mapping settings
def _tokenize_split(split, progress_label):
    """Apply tokenize_function to a split, dropping all of its original columns."""
    return split.map(
        tokenize_function,
        batched=True,
        remove_columns=split.column_names,
        desc=progress_label,
    )

print("Tokenizing training dataset...")
tokenized_train = _tokenize_split(train_dataset, "Tokenizing train dataset")

print("Tokenizing validation dataset...")
tokenized_val = _tokenize_split(val_dataset, "Tokenizing validation dataset")

print(f"Tokenized train samples: {len(tokenized_train)}")
print(f"Tokenized validation samples: {len(tokenized_val)}")

# Collator for causal language modeling (mlm=False disables masked-LM behavior);
# batches are padded to a multiple of 8 for tensor core efficiency
data_collator = DataCollatorForLanguageModeling(
    tokenizer=tokenizer,
    mlm=False,
    pad_to_multiple_of=8,
)


In [None]:
# Training arguments, grouped by concern
training_args = TrainingArguments(
    # Where checkpoints and logs are written
    output_dir=config.OUTPUT_DIR,
    logging_dir=f"{config.OUTPUT_DIR}/logs",
    report_to=["tensorboard"],
    push_to_hub=False,
    # Schedule and optimization
    num_train_epochs=config.NUM_EPOCHS,
    learning_rate=config.LEARNING_RATE,
    warmup_steps=config.WARMUP_STEPS,
    weight_decay=config.WEIGHT_DECAY,
    optim="paged_adamw_8bit",
    # Batch sizing
    per_device_train_batch_size=config.BATCH_SIZE,
    per_device_eval_batch_size=config.BATCH_SIZE,
    gradient_accumulation_steps=config.GRADIENT_ACCUMULATION_STEPS,
    group_by_length=True,
    # Logging, evaluation and checkpoint cadence
    logging_steps=50,
    evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=3,
    # Best-checkpoint selection: lowest eval_loss wins
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    # Precision and memory
    fp16=True,
    gradient_checkpointing=True,
    ddp_find_unused_parameters=False,
    # Reproducibility
    seed=config.SEED,
)

# Initialize trainer
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_train,
    eval_dataset=tokenized_val,
    data_collator=data_collator,
    tokenizer=tokenizer,
)

# Print training configuration summary
# total_steps is an estimate: full effective batches per epoch times epochs
effective_batch_size = config.BATCH_SIZE * config.GRADIENT_ACCUMULATION_STEPS
total_steps = len(tokenized_train) // effective_batch_size * config.NUM_EPOCHS
summary_lines = [
    f"- Total training samples: {len(tokenized_train)}",
    f"- Total validation samples: {len(tokenized_val)}",
    f"- Effective batch size: {effective_batch_size}",
    f"- Total training steps: {total_steps}",
    f"- Warmup steps: {config.WARMUP_STEPS}",
    f"- Learning rate: {config.LEARNING_RATE}",
    f"- LoRA rank: {config.LORA_R}",
    f"- LoRA alpha: {config.LORA_ALPHA}",
]
print("\nTraining Configuration Summary:")
print("\n".join(summary_lines))


In [None]:
# Run fine-tuning and report the headline metrics
print("Starting fine-tuning...")
print("=" * 50)

train_result = trainer.train()
train_metrics = train_result.metrics

# Persist the training metrics via the trainer
trainer.save_metrics("train", train_metrics)

# Summarize runtime, throughput and loss
print("\nTraining completed!")
print(f"Total training time: {train_metrics['train_runtime']:.2f} seconds")
print(f"Training samples per second: {train_metrics['train_samples_per_second']:.2f}")
print(f"Final training loss: {train_metrics['train_loss']:.4f}")

# Write the final model through the trainer (no explicit path is passed)
trainer.save_model()
print(f"\nModel saved to: {config.OUTPUT_DIR}")


In [None]:
# Evaluation function
def evaluate_model(model, tokenizer, eval_dataset, num_samples=100):
    """Generate answers for validation prompts and score them against references.

    Each row's ``"text"`` is split at the FIRST ``"Answer:"`` marker into the
    prompt (everything up to and including the marker) and the reference answer
    (everything after it). The model continues the prompt and the continuation
    is compared with the reference using ROUGE-1/2/L F1 and a case-insensitive
    exact match.

    Args:
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode prompts and decode generations.
        eval_dataset: Dataset supporting ``len()`` and ``select()`` whose rows
            have a ``"text"`` field containing an ``"Answer:"`` marker.
        num_samples: Maximum number of leading rows to evaluate.

    Returns:
        Tuple ``(avg_results, results)``:
            avg_results: dict with the mean of 'rouge1', 'rouge2', 'rougeL' and
                'exact_match' as floats (NaN when no row could be scored).
            results: dict with the per-sample score lists plus 'predictions',
                a list of dicts with 'question', 'reference' and 'prediction'
                (the latter two clipped to 200 characters).
    """
    metric_names = ('rouge1', 'rouge2', 'rougeL')

    def _clip(text, limit=200):
        # Shorten long strings for display/storage
        return text[:limit] + "..." if len(text) > limit else text

    model.eval()

    # Initialize ROUGE scorer
    scorer = rouge_scorer.RougeScorer(list(metric_names), use_stemmer=True)

    results = {
        'rouge1': [],
        'rouge2': [],
        'rougeL': [],
        'exact_match': [],
        'predictions': []
    }

    # Evaluate the first `num_samples` rows (or all rows if there are fewer)
    eval_samples = eval_dataset.select(range(min(num_samples, len(eval_dataset))))

    print(f"Evaluating on {len(eval_samples)} samples...")

    with torch.no_grad():
        for idx, sample in enumerate(tqdm(eval_samples, desc="Evaluating")):
            # Split on the first marker only: a reference answer that itself
            # contains "Answer:" must stay intact instead of being skipped.
            prompt_body, marker, reference_answer = sample['text'].partition("Answer:")
            if not marker:
                # Row does not follow the prompt template; nothing to score
                continue
            question_part = prompt_body + marker
            reference_answer = reference_answer.strip()

            # Tokenize input
            inputs = tokenizer(
                question_part,
                return_tensors="pt",
                max_length=config.MAX_LENGTH,
                truncation=True
            ).to(model.device)

            # Generate prediction
            with torch.cuda.amp.autocast():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=256,
                    temperature=0.7,
                    do_sample=True,
                    top_p=0.9,
                    pad_token_id=tokenizer.eos_token_id
                )

            # Decode only the newly generated tokens. The output sequence starts
            # with the prompt tokens, so slicing them off is exact, whereas
            # splitting the decoded text on "Answer:" would cut a generation
            # that happens to contain that marker.
            prompt_token_count = inputs["input_ids"].shape[1]
            prediction = tokenizer.decode(
                outputs[0][prompt_token_count:],
                skip_special_tokens=True
            ).strip()

            # Calculate ROUGE scores
            scores = scorer.score(reference_answer, prediction)
            for metric in metric_names:
                results[metric].append(scores[metric].fmeasure)

            # Check exact match (normalized)
            exact_match = reference_answer.lower().strip() == prediction.lower().strip()
            results['exact_match'].append(exact_match)

            # Store prediction for analysis
            record = {
                'question': prompt_body.split("Question:", 1)[-1].strip(),
                'reference': _clip(reference_answer),
                'prediction': _clip(prediction)
            }
            results['predictions'].append(record)

            # Print sample predictions
            if idx < 3:
                print(f"\nExample {idx + 1}:")
                print(f"Question: {record['question']}")
                print(f"Reference: {record['reference']}")
                print(f"Prediction: {record['prediction']}")

    # Calculate average scores; report NaN explicitly when nothing was scored
    scored_count = len(results['predictions'])
    if scored_count == 0:
        print("Warning: no samples could be scored; average metrics are NaN.")
    avg_results = {
        key: float(np.mean(results[key])) if scored_count else float('nan')
        for key in metric_names + ('exact_match',)
    }

    return avg_results, results

# Score the fine-tuned model on the first 100 validation examples
print("\nRunning evaluation on validation set...")
avg_scores, detailed_results = evaluate_model(model, tokenizer, val_dataset, num_samples=100)

# Report the averaged metrics
banner = "=" * 50
print("\n" + banner)
print("EVALUATION RESULTS")
print(banner)
print(f"ROUGE-1 F1 Score: {avg_scores['rouge1']:.4f}")
print(f"ROUGE-2 F1 Score: {avg_scores['rouge2']:.4f}")
print(f"ROUGE-L F1 Score: {avg_scores['rougeL']:.4f}")
print(f"Exact Match Accuracy: {avg_scores['exact_match']:.4f}")

# Persist the averages together with the first ten sample predictions
import json
results_path = f"{config.OUTPUT_DIR}/evaluation_results.json"
evaluation_payload = {
    'average_scores': avg_scores,
    'sample_predictions': detailed_results['predictions'][:10]
}
with open(results_path, "w") as f:
    json.dump(evaluation_payload, f, indent=2)

print(f"\nEvaluation results saved to: {results_path}")


In [None]:
# Save the model in SafeTensors format
from safetensors.torch import save_file
from peft import PeftModel, get_peft_model_state_dict
import os

print("Saving model in SafeTensors format...")

# Create directory for SafeTensors
safetensors_dir = f"{config.OUTPUT_DIR}/safetensors"
os.makedirs(safetensors_dir, exist_ok=True)

# Get the LoRA weights.
# PeftModel has no `get_peft_state_dict()` method; the adapter-only state dict
# is obtained with PEFT's module-level helper.
lora_state_dict = get_peft_model_state_dict(model)

# Save LoRA weights in SafeTensors format
save_file(lora_state_dict, f"{safetensors_dir}/adapter_model.safetensors")

# Save the adapter configuration next to the weights
model.peft_config['default'].save_pretrained(safetensors_dir)

# Save tokenizer
tokenizer.save_pretrained(safetensors_dir)

print(f"Model saved in SafeTensors format at: {safetensors_dir}")

# Also save the merged model (optional, requires more disk space)
print("\nMerging LoRA weights with base model...")
if config.LOAD_IN_4BIT or config.LOAD_IN_8BIT:
    # Merging into a bitsandbytes-quantized base bakes quantization rounding
    # error into the weights, and saving a 4-bit model is not supported by the
    # transformers version pinned in this notebook. Reload the base model
    # unquantized in fp16, attach the adapter saved above, and merge into that.
    # This also leaves the in-memory training `model` untouched.
    # Note: this temporarily holds a second copy of the base model in memory.
    fp16_base_model = AutoModelForCausalLM.from_pretrained(
        config.MODEL_NAME,
        device_map="auto",
        trust_remote_code=True,
        torch_dtype=torch.float16,
    )
    merged_model = PeftModel.from_pretrained(fp16_base_model, safetensors_dir).merge_and_unload()
else:
    merged_model = model.merge_and_unload()

# Save merged model
merged_dir = f"{config.OUTPUT_DIR}/merged_model"
os.makedirs(merged_dir, exist_ok=True)
merged_model.save_pretrained(merged_dir, safe_serialization=True)
tokenizer.save_pretrained(merged_dir)

print(f"Merged model saved at: {merged_dir}")

# Calculate model sizes
import os
def get_dir_size(path):
    """Return the combined size, in GiB, of all files found beneath `path`.

    The directory tree is walked recursively with os.walk; a directory that
    contains no files yields 0.0.
    """
    byte_count = sum(
        os.path.getsize(os.path.join(folder, filename))
        for folder, _subdirs, filenames in os.walk(path)
        for filename in filenames
    )
    return byte_count / (1024**3)  # bytes -> GiB

# Report the on-disk footprint of the adapter export
lora_size = get_dir_size(safetensors_dir)
print(f"\nLoRA adapter size: {lora_size:.2f} GB")

# Report the merged export only if its directory is present
merged_export_present = os.path.exists(merged_dir)
if merged_export_present:
    merged_size = get_dir_size(merged_dir)
    print(f"Merged model size: {merged_size:.2f} GB")


In [None]:
# Function to generate answers for test questions
def generate_medical_answer(question, model, tokenizer, max_length=512):
    """Generate an answer for a single medical question.

    The question is wrapped with ``create_prompt``, the model continues the
    prompt with sampling (temperature 0.7, top-p 0.9, up to 256 new tokens), and
    only the continuation is returned.

    Args:
        question: Question text.
        model: Causal LM exposing ``eval()``, ``device`` and ``generate()``.
        tokenizer: Tokenizer used to encode the prompt and decode the output.
        max_length: Maximum number of prompt tokens; longer prompts are truncated.

    Returns:
        The generated answer text, stripped of surrounding whitespace.
    """
    # Format the question
    prompt = create_prompt(question)

    # Tokenize
    inputs = tokenizer(
        prompt,
        return_tensors="pt",
        max_length=max_length,
        truncation=True
    ).to(model.device)

    # Make sure dropout is disabled even if the model was left in training mode
    model.eval()

    # Generate
    with torch.no_grad(), torch.cuda.amp.autocast():
        outputs = model.generate(
            **inputs,
            max_new_tokens=256,
            temperature=0.7,
            do_sample=True,
            top_p=0.9,
            pad_token_id=tokenizer.eos_token_id,
            eos_token_id=tokenizer.eos_token_id
        )

    # Decode only the newly generated tokens. The output begins with the prompt
    # tokens, so slicing them off is exact; splitting the decoded text on
    # "Answer:" would discard part of an answer that contains that marker.
    prompt_token_count = inputs["input_ids"].shape[1]
    answer = tokenizer.decode(
        outputs[0][prompt_token_count:],
        skip_special_tokens=True
    ).strip()

    return answer

# A handful of sample questions to sanity-check the fine-tuned model
test_questions = [
    "What are the main symptoms of diabetes?",
    "How is hypertension diagnosed?",
    "What are the treatment options for migraine headaches?",
    "What causes glaucoma and how can it be prevented?",
    "What are the side effects of chemotherapy?"
]

print("Testing fine-tuned model with sample questions:")
print("=" * 50)

for i, question in enumerate(test_questions):
    print(f"\nQuestion {i+1}: {question}")
    answer = generate_medical_answer(question, model, tokenizer)
    # Long answers are clipped to 500 characters for display
    if len(answer) > 500:
        print(f"Answer: {answer[:500]}...")
    else:
        print(f"Answer: {answer}")
    print("-" * 50)


In [None]:
# Clean up GPU memory
# Collect Python garbage first so tensors that are no longer referenced are
# released before the CUDA caching allocator is asked to free its unused blocks.
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Fine-tuning notebook completed successfully!")
print("Models saved at:")
print(f"  - LoRA adapter: {config.OUTPUT_DIR}/safetensors")
print(f"  - Merged model: {config.OUTPUT_DIR}/merged_model")
# The Trainer was created with output_dir=config.OUTPUT_DIR, so that is where
# its checkpoints live; config.CHECKPOINT_DIR is never written to.
print(f"  - Checkpoints: {config.OUTPUT_DIR}")
