#### 00. Install dependancies

In [7]:
%%capture

# Core training libraries
!pip install -q \
    transformers==4.44.2 \
    datasets==2.20.0 \
    tokenizers==0.19.1 \
    accelerate==0.34.2 \
    peft==0.12.0 \
    trl==0.9.6 \
    bitsandbytes==0.43.1 \
    evaluate==0.4.2

# Utilities
!pip install -q \
    numpy \
    pandas \
    scikit-learn \
    rich \
    pyyaml \
    python-dotenv \
    tqdm

# Evaluation (requires pydantic v2)
!pip install -q --upgrade pydantic
!pip install -q google-genai rouge-score

print(" Installation complete!")
print(" All dependencies compatible (pydantic v2 + google-genai)")


## 1. Setting Up Environment Variables (Secrets)

In [10]:

# Create .env file with API key
import os
from google.colab import userdata
# Write .env file
# with open('.env', 'w') as f:
#     # Add the secrets if needed
#     f.write('GOOGLE_API_KEY=<api_key_here>\n')
#     f.write('HF_TOKEN=<api_key_here>\n')

# print(" .env file created")

with open('.env', 'w') as f:
    # Add the secrets if needed
    f.write(f'GOOGLE_API_KEY={userdata.get('GOOGLE_API_KEY')}\n')
    f.write(f'HF_TOKEN={userdata.get('HF_TOKEN')}\n')

print(" .env file created")



 .env file created


In [11]:
# Verify it's loaded
from dotenv import load_dotenv
load_dotenv()
# Show only key names for security
try:
    with open('.env', 'r') as f:
        print(" Keys in .env file:")
        print("="*60)
        for line in f:
            line = line.strip()
            if line and not line.startswith('#'):
                key = line.split('=')[0]
                value_preview = line.split('=')[1][:10] + "..." if '=' in line else ""
                print(f"  {key} = {value_preview}")
        print("="*60)
except FileNotFoundError:
    print(" .env file not found")

 Keys in .env file:
  GOOGLE_API_KEY = AIzaSyCkD_...
  HF_TOKEN = hf_SyHkPTh...


## 2. Environment & GPU Check

In [12]:
import sys
import torch

print("="*60)
print("ENVIRONMENT CHECK")
print("="*60)
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")

if torch.cuda.is_available():
    print(f"CUDA version: {torch.version.cuda}")
    print(f"Device name: {torch.cuda.get_device_name(0)}")
    print(f"Device capability: {torch.cuda.get_device_capability(0)}")
    print(f"Total VRAM: {torch.cuda.get_device_properties(0).total_memory / 1024**3:.2f} GB")
else:
    print(" WARNING: CUDA not available. Training will be VERY slow on CPU.")

print("="*60)

ENVIRONMENT CHECK
Python version: 3.12.12 (main, Oct 10 2025, 08:52:57) [GCC 11.4.0]
PyTorch version: 2.9.0+cpu
CUDA available: False


## 3. Seeds & Determinism

Setting up random seeds for reproducibility. .

In [13]:
import os
import random
import numpy as np
import torch

SEED = 42

# Set environment variable for Python hash seed
os.environ['PYTHONHASHSEED'] = str(SEED)

# Set seeds
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)

if torch.cuda.is_available():
    torch.cuda.manual_seed(SEED)
    torch.cuda.manual_seed_all(SEED)
    # Note: These settings may impact performance
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

print(f" Seeds set to {SEED} for reproducibility")

 Seeds set to 42 for reproducibility


## 4. Hugging Face Login

If you want to push your finetuned adapter to the Hugging Face Hub, uncomment and run the login line below.

Hugging Face token with write permissions. Get one at: https://huggingface.co/settings/tokens

In [14]:

os.environ["HF_TOKEN"] = os.getenv("HF_TOKEN")
!hf auth login --token $HF_TOKEN

print("ℹ Hugging Face login skipped. Uncomment login() to push models to Hub.")

The token has not been saved to the git credentials helper. Pass `add_to_git_credential=True` in this function directly or `--add-to-git-credential` if using via `hf`CLI if you want to set the git credential as well.
Token is valid (permission: write).
The token `Sahas AI` has been saved to /root/.cache/huggingface/stored_tokens
Your token has been saved to /root/.cache/huggingface/token
Login successful.
Note: Environment variable`HF_TOKEN` is set and is the current active token independently from the token you've just configured.
ℹ Hugging Face login skipped. Uncomment login() to push models to Hub.


## 5. Configuration (Single Source of Truth)

All hyperparameters and settings in one place. **Edit here** to customize your training.

In [17]:
import torch
from pprint import pprint

# Auto-detect compute dtype (BF16 requires compute capability >= 8.0)
use_bf16 = torch.cuda.is_available() and torch.cuda.get_device_capability(0)[0] >= 8
compute_dtype = torch.bfloat16 if use_bf16 else torch.float16

CONFIG = {
    # Model
    "base_model": "Qwen/Qwen2.5-1.5B-Instruct",
    # Alternative for tighter VRAM: "TinyLlama/TinyLlama-1.1B-Chat-v1.0"
    # For GGUF export, prefer: "meta-llama/Llama-3.2-3B-Instruct" or Mistral models

    # Dataset
    "dataset_name": "lavita/AlpaCare-MedInstruct-52k",
    "dataset_split": "train",
    "dataset_subsample": 500,  # Colab-safe: 500 | Local: 1500
    "train_val_split": 0.9,  # 90% train, 10% validation

    # Tokenization
    "max_length": 512,  # Colab: 512 | Local: 1024

    # Training
    "num_train_epochs": 1,
    "max_steps": 250,  # Colab: 250 | Local: 600
    "per_device_train_batch_size": 1,  # Colab: 1 | Local: 2
    "gradient_accumulation_steps": 64,  # Colab: 64 | Local: 32
    "learning_rate": 2e-5,
    "warmup_ratio": 0.03,
    "logging_steps": 10,
    "save_steps": 200,
    "eval_steps": 100,
    "save_total_limit": 2,

    # LoRA
    "lora_r": 16,
    "lora_alpha": 32,
    "lora_dropout": 0.05,
    "lora_target_modules": ["q_proj", "k_proj", "v_proj", "o_proj", "gate_proj", "up_proj", "down_proj"],

    # Quantization
    "load_in_4bit": True,
    "bnb_4bit_compute_dtype": compute_dtype,
    "bnb_4bit_quant_type": "nf4",
    "bnb_4bit_use_double_quant": True,

    # Output
    "output_dir": "outputs/adapter",
    "push_to_hub": False,

    # Generation
    "max_new_tokens": 128,
    "temperature": 0.0,  # Deterministic
    "do_sample": True,

    # HF credentials
    'hf_username': 'p-sahas',
    'hub_model_name': 'sahas-medical-assistant',
}

# Effective batch size
effective_batch_size = CONFIG["per_device_train_batch_size"] * CONFIG["gradient_accumulation_steps"]

print("="*60)
print("CONFIGURATION (COLAB FREE TIER)")
print("="*60)
pprint(CONFIG)
print("="*60)
print(f"Compute dtype: {compute_dtype}")
print(f"Using BF16: {use_bf16}")
print(f"Effective batch size: {effective_batch_size}")
print("="*60)

CONFIGURATION (COLAB FREE TIER)
{'base_model': 'Qwen/Qwen2.5-1.5B-Instruct',
 'bnb_4bit_compute_dtype': torch.float16,
 'bnb_4bit_quant_type': 'nf4',
 'bnb_4bit_use_double_quant': True,
 'dataset_name': 'lavita/AlpaCare-MedInstruct-52k',
 'dataset_split': 'train',
 'dataset_subsample': 500,
 'do_sample': True,
 'eval_steps': 100,
 'gradient_accumulation_steps': 64,
 'hf_username': 'p-sahas',
 'hub_model_name': 'sahas-medical-assistant',
 'learning_rate': 2e-05,
 'load_in_4bit': True,
 'logging_steps': 10,
 'lora_alpha': 32,
 'lora_dropout': 0.05,
 'lora_r': 16,
 'lora_target_modules': ['q_proj',
                         'k_proj',
                         'v_proj',
                         'o_proj',
                         'gate_proj',
                         'up_proj',
                         'down_proj'],
 'max_length': 512,
 'max_new_tokens': 128,
 'max_steps': 250,
 'num_train_epochs': 1,
 'output_dir': 'outputs/adapter',
 'per_device_train_batch_size': 1,
 'push_to_hub': False,


#### FP16 vs BF16

- BF -> Brain Float
- Usually FP16 prioratize precision
    - 5 exponent bits
    - 10 mantissa bits
- But BF prioratize dynamic range
    - 8 exponent bits
    - 7 mantiss bits


## 6. Dataset Loader (+ Fallback)

Load the medical instruction dataset, map fields robustly, and create train/validation splits.

In [19]:
from datasets import load_dataset, Dataset
import json

def load_medical_dataset(dataset_name, split, subsample, seed=42):
    """Load dataset with robust field mapping and fallback."""

    try:
        # Try loading from Hugging Face
        print(f" Loading dataset: {dataset_name}...")
        dataset = load_dataset(dataset_name, split=split)
        dataset = dataset.shuffle(seed=seed).select(range(min(subsample, len(dataset))))
        print(f" Loaded {len(dataset)} examples from Hugging Face")

    except Exception as e:
        print(f" Failed to load from Hugging Face: {e}")
        print(" Creating synthetic fallback dataset...")

        # Create synthetic medical instruction data
        synthetic_data = []
        templates = [
            {
                "instruction": "Explain the following medical term in simple language.",
                "input": "Hypertension",
                "output": "Hypertension, commonly known as high blood pressure, is a condition where the force of blood against artery walls is consistently too high. This can lead to serious health complications if left untreated."
            },
            {
                "instruction": "What are the common symptoms of the following condition?",
                "input": "Type 2 Diabetes",
                "output": "Common symptoms of Type 2 Diabetes include increased thirst, frequent urination, increased hunger, fatigue, blurred vision, slow-healing sores, and frequent infections."
            },
            {
                "instruction": "Provide general advice for managing the following health issue.",
                "input": "Chronic back pain",
                "output": "Managing chronic back pain typically involves: maintaining good posture, regular low-impact exercise like swimming or walking, maintaining a healthy weight, using proper lifting techniques, and consulting with healthcare providers for appropriate treatment options."
            },
        ]

        # Duplicate to reach ~120 examples
        for i in range(40):
            for template in templates:
                synthetic_data.append(template)

        # Save to temporary JSONL
        with open("/tmp/synthetic_medical.jsonl", "w") as f:
            for item in synthetic_data[:subsample]:
                f.write(json.dumps(item) + "\n")

        dataset = load_dataset("json", data_files="/tmp/synthetic_medical.jsonl", split="train")
        print(f" Created synthetic dataset with {len(dataset)} examples")

    return dataset


def map_dataset_fields(example):
    """Robustly map dataset fields to instruction/input/output schema."""

    # Try to find instruction
    instruction = None
    for key in ["instruction", "question", "prompt", "task"]:
        if key in example and example[key]:
            instruction = str(example[key]).strip()
            break

    # Try to find input (optional)
    input_text = ""
    for key in ["input", "context", "passage", "history"]:
        if key in example and example[key]:
            input_text = str(example[key]).strip()
            break

    # Try to find output/target
    output = None
    for key in ["output", "response", "answer", "target", "completion"]:
        if key in example and example[key]:
            output = str(example[key]).strip()
            break

    return {
        "instruction": instruction,
        "input": input_text,
        "output": output
    }


# Load dataset
dataset = load_medical_dataset(
    CONFIG["dataset_name"],
    CONFIG["dataset_split"],
    CONFIG["dataset_subsample"],
    seed=SEED
)

print(f"\n Dataset before cleaning: {len(dataset)} examples")

# Map fields
dataset = dataset.map(map_dataset_fields)

# Drop rows with missing instruction or output
dataset = dataset.filter(lambda x: x["instruction"] is not None and x["output"] is not None)

print(f" Dataset after cleaning: {len(dataset)} examples")
print(f" Dropped {CONFIG['dataset_subsample'] - len(dataset)} examples with missing data\n")

# Split into train/validation
split_dataset = dataset.train_test_split(
    train_size=CONFIG["train_val_split"],
    seed=SEED
)
train_dataset = split_dataset["train"]
val_dataset = split_dataset["test"]

print(f" Train: {len(train_dataset)} | Validation: {len(val_dataset)}")
print("\n Sample example:")
print(train_dataset[0])

 Loading dataset: lavita/AlpaCare-MedInstruct-52k...
 Loaded 500 examples from Hugging Face

 Dataset before cleaning: 500 examples


Map:   0%|          | 0/500 [00:00<?, ? examples/s]

Filter:   0%|          | 0/500 [00:00<?, ? examples/s]

 Dataset after cleaning: 500 examples
 Dropped 0 examples with missing data

 Train: 450 | Validation: 50

 Sample example:
{'input': '<noinput>',
 'instruction': 'Ask about the possible genetic risks your child might face '
                "related to Down Syndrome, given that you're a 40years old "
                'pregnant woman.',
 'output': 'As a 40-year-old pregnant woman, your age does increase the risk '
           "of having a baby with Down syndrome. However, it's important to "
           'note that the majority of babies born to women in their 40s are '
           'still healthy and do not have Down syndrome. \n'
           '\n'
           'The risk of having a baby with Down syndrome at the age of 40 is '
           'approximately 1 in 100. This means that out of 100 pregnancies at '
           'this age, around 1 will be affected by Down syndrome. \n'
           '\n'
           'To get more accurate information about your individual risk, you '
           'may consider un

In [None]:
import pandas as pd

# Convert first 50 samples to dataframe
df_preview = pd.DataFrame(train_dataset[:50])

# Display with formatting
pd.set_option('display.max_colwidth', 100)  # Limit column width for readability
print(f" Displaying first 50 samples out of {len(dataset)} total examples\n")
df_preview