In [1]:
# # # Install Hugging Face libraries
# %pip install  --upgrade \
#   "evaluate" \
#   "tensorboard" \
#   "flash-attn" \
#   "liger-kernel" \
#   "setuptools" \
#   "deepspeed" \
#   "lm-eval[api]" \
#   "torch"\
#   "torchvision" \
#   "transformers" \
#   "datasets" \
#   "accelerate" \
#   "bitsandbytes" \
#   "trl" \
#   "peft" \
#   "lighteval" \
#   "hf-transfer"

### Import libraries and frameworks

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, BitsAndBytesConfig
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import is_liger_kernel_available
from datasets import load_dataset
from trl import SFTTrainer, TrlParser, ModelConfig, SFTConfig, get_peft_config
from peft import AutoPeftModelForCausalLM

import pandas as pd
from datasets import Dataset, DatasetDict

### Check device

In [3]:
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
print(f"Device: {device}")

Device: cuda


### Load data

In [4]:
df = pd.read_csv('mle_screening_dataset.csv')

In [5]:
df.shape

(16406, 2)

In [6]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
sample = df.sample()
print(f"Question: {sample['question'].values[0]}\n")
print(f"Answer: {sample['answer'].values[0]}")

Question: What are the genetic changes related to Tourette syndrome ?

Answer: A variety of genetic and environmental factors likely play a role in causing Tourette syndrome. Most of these factors are unknown, and researchers are studying risk factors before and after birth that may contribute to this complex disorder. Scientists believe that tics may result from changes in brain chemicals (neurotransmitters) that are responsible for producing and controlling voluntary movements.  Mutations involving the SLITRK1 gene have been identified in a small number of people with Tourette syndrome. This gene provides instructions for making a protein that is active in the brain. The SLITRK1 protein probably plays a role in the development of nerve cells, including the growth of specialized extensions (axons and dendrites) that allow each nerve cell to communicate with nearby cells. It is unclear how mutations in the SLITRK1 gene can lead to this disorder.  Most people with Tourette syndrome do n

### Load model

In [None]:
# Use BitsAndBytesConfig for quantization that helps to reduce model size
from transformers import BitsAndBytesConfig

In [None]:
# 1. APPLY QUANTIZATION (This was missing!)
nf4_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16  # Use bfloat16 for better memory efficiency
)

# %%
# Load model WITH quantization
model_name = "microsoft/MediPhi-Instruct"
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=nf4_config,  
    torch_dtype=torch.bfloat16,      # Use bfloat16 for memory efficiency
    device_map="auto",               # Automatically distribute across GPUs
    trust_remote_code=True
)

tokenizer = AutoTokenizer.from_pretrained(
    model_name,
    trust_remote_code=True,
    padding_side="right"  # Ensure consistent padding
)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

In [9]:
model_name = "microsoft/MediPhi-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Process dataset

In [10]:
# Convert data from pandas
dataset = Dataset.from_pandas(df)

# First split: 75% train, 25% temp (val + test)
train_temp_split = dataset.train_test_split(test_size=0.25, seed=42)
train_dataset = train_temp_split["train"]  # 80% of data
temp_dataset = train_temp_split["test"]    # 20% of data

# Second split: Split temp into 12.5% validation, 12.5% test
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset = val_test_split["train"]      # 12.5% of original data
test_dataset = val_test_split["test"]      # 12.5% of original data

# Step 3: Create a DatasetDict to store all splits
dataset = DatasetDict({
    "train": train_dataset,
    "val": val_dataset,
    "test": test_dataset
})

In [11]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 12304
    })
    val: Dataset({
        features: ['question', 'answer'],
        num_rows: 2051
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 2051
    })
})

In [12]:
 # Create system prompt
system_message = """
You are a smart medical assiatnt to help user question about their queries

To answer question, follow the following instructions:
1. **Understand the question**: Clearly identify the question and any important given values.
3. **Answer Step-by-Step**: Iteratively progress your answer
4. **Double Check**: If applicable, double check the question for accuracy and sense.
"""
 
# Remove the existing "text" column if it exists to avoid conflicts
def processes_data(sample):
    question = str(sample["question"] or "").strip()
    answer = str(sample["answer"] or "").strip()
    
    if not question or not answer:
        return {"text": ""}  # Always return string
    
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer}
    ]
    
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    
    return {"text": text}  # Always return string

# Remove existing text column and apply preprocessing
dataset = dataset.remove_columns(["text"] if "text" in dataset['train'].column_names else [])
dataset = dataset.map(processes_data, batched=False)

Map:   0%|          | 0/12304 [00:00<?, ? examples/s]

Map:   0%|          | 0/2051 [00:00<?, ? examples/s]

Map:   0%|          | 0/2051 [00:00<?, ? examples/s]

In [13]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 12304
    })
    val: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 2051
    })
    test: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 2051
    })
})

In [14]:
dataset['test'][0]

{'question': 'How many people are affected by Denys-Drash syndrome ?',
 'answer': 'The prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.',
 'text': '<|system|>\n\nYou are a smart medical assiatnt to help user question about their queries\n\nTo answer question, follow the following instructions:\n1. **Understand the question**: Clearly identify the question and any important given values.\n3. **Answer Step-by-Step**: Iteratively progress your answer\n4. **Double Check**: If applicable, double check the question for accuracy and sense.\n<|end|>\n<|user|>\nHow many people are affected by Denys-Drash syndrome ?<|end|>\n<|assistant|>\nThe prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.<|end|>\n<|endoftext|>'}

In [15]:
dataset.save_to_disk("./dataset")

Saving the dataset (0/1 shards):   0%|          | 0/12304 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2051 [00:00<?, ? examples/s]

Saving the dataset (0/1 shards):   0%|          | 0/2051 [00:00<?, ? examples/s]

### Understand model architechture

In [16]:
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (norm): Phi3RMSNorm((3072,), eps=1e-05)
    (rotary_emb): Phi3RotaryEmbedding()
  )
  (lm_head): Linear(in_features=3072, out_features=32064, 

In [17]:
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
print(
    f"trainable params: {trainable_params} || "
    f"all params: {all_param} || "
    f"trainable%: {100 * trainable_params / all_param:.2f}%"
)

trainable params: 3821079552 || all params: 3821079552 || trainable%: 100.00%


In [18]:
tokenizer

LlamaTokenizerFast(name_or_path='microsoft/MediPhi-Instruct', vocab_size=32000, model_max_length=131072, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),

### Before training test the model

In [19]:
from transformers import pipeline, StoppingCriteria

In [20]:
question = dataset['test'][0]['answer']
answer = dataset['test'][0]['answer']

print(f"Question: {question}\n")
print(f"Answer: {answer}")

Question: The prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.

Answer: The prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.


In [21]:
# Check what token ID 32007 represents
print(f"Token 32007: '{tokenizer.decode([32007])}'")

Token 32007: '<|end|>'


In [22]:
# https://huggingface.co/microsoft/MediPhi-Instruct
prompt = "Operative Report:\nPerformed: Cholecystectomy\nOperative Findings: The gallbladder contained multiple stones and had thickening of its wall. Mild peritoneal fluid was noted."

# Hugging Face pipeline for text generation does apply apply_chat_template under the hood. 
# So we do not need to process for the text generation
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": question},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

#  stops generation when the model generates token ID 32007
class EosListStoppingCriteria(StoppingCriteria):
  def __init__(self, eos_sequence = [32007]):
      self.eos_sequence = eos_sequence

  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
      last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
      return self.eos_sequence in last_ids

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
    "stopping_criteria": [EosListStoppingCriteria()]

}
output = pipe(messages, **generation_args)
print(f"AI: {output[0]['generated_text']}")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AI:  Denys-Drash syndrome is a rare genetic disorder, and its exact prevalence is not well-documented. However, at least 150 cases have been reported in scientific literature, indicating that it is a rare condition.


In [23]:
print(f"Answer: {answer}")

Answer: The prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.


In [24]:
# From the above testing, it is clear that Medphi is generating more or less similar text generation.
# WIth fine tiuning the model might learn more numances of the dataset provided. 

### Model training

In [25]:
from transformers import BitsAndBytesConfig
import torch
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
from peft.optimizers import create_lorafa_optimizer

In [26]:
lora_config = LoraConfig(
    r=8,
    lora_alpha=32,
    bias="none",
    target_modules = ['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
    task_type="CAUSAL_LM"
)

In [27]:
peft_model = get_peft_model(model, lora_config)

In [28]:
peft_model.print_trainable_parameters()

trainable params: 12,582,912 || all params: 3,833,662,464 || trainable%: 0.3282


In [29]:
import evaluate
import numpy as np
from sklearn.metrics import accuracy_score

def compute_metrics(eval_pred):
    predictions, labels = eval_pred
    
    # Decode predictions and labels
    decoded_preds = tokenizer.batch_decode(predictions, skip_special_tokens=True)
    decoded_labels = tokenizer.batch_decode(labels, skip_special_tokens=True)
    
    # Custom metrics for medical/chat evaluation
    metrics = {}
    
    # 1. BLEU Score (text similarity)
    bleu = evaluate.load("bleu")
    bleu_score = bleu.compute(predictions=decoded_preds, references=decoded_labels)
    metrics.update(bleu_score)
    
    return metrics

In [None]:
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig
import os 

# Set environment variables to fix tokenizer warnings and memory issues
# os.environ["TOKENIZERS_PARALLELISM"] = "false"
os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

sft_config = SFTConfig(
    # Basic training parameters
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=8,
    
    # Optimization
    learning_rate=2e-4,
    weight_decay=0.001,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    optim="adamw_torch",
    
    # Evaluation and saving
    eval_strategy="steps",
    eval_steps=20,
    save_strategy="steps",
    save_steps=20,
    save_total_limit=2,
    greater_is_better=False,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    
    # Logging
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=20,

    # Memory and performance
    dataloader_drop_last=True,
    dataloader_num_workers=4,
    remove_unused_columns=False,
    
    # Mixed precision training
    bf16=True if torch.cuda.is_bf16_supported() else False,
    
    # SFT-specific parameters
    max_length=1024,
    packing=True,  # Pack multiple short sequences into one
    dataset_text_field="text",
    
    # Gradient settings
    max_grad_norm=0.3,
    gradient_checkpointing=True,  # Save memory at cost of speed
)

In [31]:
# Create Trainer object
trainer = SFTTrainer(
    model=peft_model,
    args=sft_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    peft_config=lora_config,
    # compute_metrics=compute_metrics
)



Adding EOS to train dataset:   0%|          | 0/12304 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/12304 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/12304 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2051 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2051 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/2051 [00:00<?, ? examples/s]

[2025-08-17 21:15:18,836] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2025-08-17 21:15:20,461] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


In [None]:
train_result = trainer.train()

In [None]:
# | Step | Training Loss | Validation Loss |
# |------|---------------|-----------------|
# | 20   | 1.179900      | 0.830046        |
# | 40   | 0.783400      | 0.716754        |
# | 60   | 0.718500      | 0.689175        |
# | 80   | 0.683800      | 0.670964        |
# | 100  | 0.664100      | 0.659222        |
# | 120  | 0.663800      | 0.651528        |
# | 140  | 0.644100      | 0.644649        |

### Save teh adapter

In [None]:
def save_lora_adapter(trainer, save_path="./lora_adapter"):
    
    # Save the adapter
    trainer.model.save_pretrained(save_path)
    trainer.tokenizer.save_pretrained(save_path)
    
    print(f"LoRA adapter saved to: {save_path}")
    print(f"Adapter size: {get_directory_size(save_path):.2f} MB")
    
    return save_path

In [None]:
def get_directory_size(path):
    total = 0
    for dirpath, dirnames, filenames in os.walk(path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            total += os.path.getsize(fp)
    return total / (1024 * 1024)

In [None]:
adapter_path = save_lora_adapter(trainer)

## Push to Huggingface

In [None]:
from huggingface_hub import HfApi, create_repo

In [None]:
token = 'removed it'

In [None]:
api = HfApi(token=token)

In [None]:
repo_id="sabber/medphi-medical-qa-adapter"

In [None]:
create_repo(repo_id=repo_id, token=token, exist_ok=True)

In [None]:
files_to_upload = [
    "adapter_config.json",
    "adapter_model.safetensors",  # or adapter_model.bin
    "tokenizer.json",
    "tokenizer_config.json",
    "special_tokens_map.json"
]

for file in files_to_upload:
    file_path = os.path.join(adapter_path, file)
    if os.path.exists(file_path):
        api.upload_file(
            path_or_fileobj=file_path,
            path_in_repo=file,
            repo_id=repo_id,
            token=token
        )
        print(f"Uploaded: {file}")

### Test model with trained adapter

In [None]:
os.listdir("./")

In [None]:
os.listdir("./lora_adapter")

In [None]:
import json

In [None]:
# 1. First, let's check the current adapter config
with open("./lora_adapter/adapter_config.json", "r") as f:
    adapter_config = json.load(f)

print("Current adapter config:")
print(adapter_config)

# 2. Add the missing base model path if it's not there
if "base_model_name_or_path" not in adapter_config or adapter_config["base_model_name_or_path"] is None:
    adapter_config["base_model_name_or_path"] = "microsoft/MediPhi-Instruct"
    
    # Save the fixed config
    with open("./lora_adapter/adapter_config.json", "w") as f:
        json.dump(adapter_config, f, indent=2)
    
    print("✅ Fixed adapter_config.json with base model path")
else:
    print("✅ Base model path already exists")

In [None]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

ft_model = AutoPeftModelForCausalLM.from_pretrained(
    "./lora_adapter",
    torch_dtype="auto",
    device_map="auto"
)
ft_tokenizer = AutoTokenizer.from_pretrained("./lora_adapter")