In [1]:
# Install Hugging Face libraries
# %pip install  --upgrade \
#   "tensorboard" \
#   "flash-attn" \
#   "liger-kernel" \
#   "setuptools" \
#   "deepspeed" \
#   "lm-eval[api]" \
#   "torch"\
#   "torchvision" \
#   "transformers" \
#   "datasets" \
#   "accelerate" \
#   "bitsandbytes" \
#   "trl" \
#   "peft" \
#   "lighteval" \
#   "hf-transfer"

### IMport libraries and framewoerks

In [2]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, set_seed, BitsAndBytesConfig
from transformers.trainer_utils import get_last_checkpoint
from transformers.utils import is_liger_kernel_available
from datasets import load_dataset
from trl import SFTTrainer, TrlParser, ModelConfig, SFTConfig, get_peft_config
from peft import AutoPeftModelForCausalLM

In [3]:
device = torch.accelerator.current_accelerator().type if hasattr(torch, "accelerator") else "cuda"
print(f"Device: {device}")

Device: cuda


### Load data

In [4]:
import pandas as pd
from datasets import Dataset, DatasetDict

In [5]:
df = pd.read_csv('mle_screening_dataset.csv')

In [6]:
df.shape

(16406, 2)

In [7]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)
sample = df.sample()
print(f"Question: {sample['question'].values[0]}\n")
print(f"Answer: {sample['answer'].values[0]}")

Question: What is (are) Guillain-Barre syndrome ?

Answer: Guillain-Barr syndrome is a rare disorder in which the body's immune system attacks part of the peripheral nervous system. Symptoms include muscle weakness, numbness, and tingling sensations, which can increase in intensity until the muscles cannot be used at all. Usually Guillain-Barr syndrome occurs a few days or weeks after symptoms of a viral infection. Occasionally, surgery or vaccinations will trigger the syndrome. It remains unclear why only some people develop Guillain-Barr syndrome but there may be a genetic predisposition in some cases. Diagnosed patients should be admitted to a hospital for early treatment. There is no cure for Guillain-Barr syndrome, but treatments such as plasma exchange (plasmapheresis) and high dose immunoglobulins may reduce the severity and duration of symptoms. Recovery can take as little as a few days to as long as a few years. About 30% of those with Guillain-Barr syndrome have residual weak

### Load model

In [8]:
model_name = "microsoft/MediPhi-Instruct"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]

### Process dataset

In [13]:
# Convert data from pandas
dataset = Dataset.from_pandas(df)

# First split: 80% train, 20% temp (val + test)
train_temp_split = dataset.train_test_split(test_size=0.25, seed=42)
train_dataset = train_temp_split["train"]  # 80% of data
temp_dataset = train_temp_split["test"]    # 20% of data

# Second split: Split temp into 12.5% validation, 12.5% test
val_test_split = temp_dataset.train_test_split(test_size=0.5, seed=42)
val_dataset = val_test_split["train"]      # 12.5% of original data
test_dataset = val_test_split["test"]      # 12.5% of original data

# Step 3: Create a DatasetDict to store all splits
dataset = DatasetDict({
    "train": train_dataset,
    "val": val_dataset,
    "test": test_dataset
})

In [14]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer'],
        num_rows: 12304
    })
    val: Dataset({
        features: ['question', 'answer'],
        num_rows: 2051
    })
    test: Dataset({
        features: ['question', 'answer'],
        num_rows: 2051
    })
})

In [15]:
 # Create system prompt
system_message = """
You are a smart medical assiatnt to help user question about their queries

To answer question, follow the following instructions:
1. **Understand the question**: Clearly identify the question and any important given values.
3. **Answer Step-by-Step**: Iteratively progress your answer
4. **Double Check**: If applicable, double check the question for accuracy and sense.
"""
 
# Remove the existing "text" column if it exists to avoid conflicts
def processes_data(sample):
    question = str(sample["question"] or "").strip()
    answer = str(sample["answer"] or "").strip()
    
    if not question or not answer:
        return {"text": ""}  # Always return string
    
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": question},
        {"role": "assistant", "content": answer}
    ]
    
    text = tokenizer.apply_chat_template(
        messages, tokenize=False, add_generation_prompt=False
    )
    
    return {"text": text}  # Always return string

# Remove existing text column and apply preprocessing
dataset = dataset.remove_columns(["text"] if "text" in dataset['train'].column_names else [])
dataset = dataset.map(processes_data, batched=False)

Map:   0%|          | 0/12304 [00:00<?, ? examples/s]

Map:   0%|          | 0/2051 [00:00<?, ? examples/s]

Map:   0%|          | 0/2051 [00:00<?, ? examples/s]

In [16]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 12304
    })
    val: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 2051
    })
    test: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 2051
    })
})

In [17]:
dataset['test'][0]

{'question': 'How many people are affected by Denys-Drash syndrome ?',
 'answer': 'The prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.',
 'text': '<|system|>\n\nYou are a smart medical assiatnt to help user question about their queries\n\nTo answer question, follow the following instructions:\n1. **Understand the question**: Clearly identify the question and any important given values.\n3. **Answer Step-by-Step**: Iteratively progress your answer\n4. **Double Check**: If applicable, double check the question for accuracy and sense.\n<|end|>\n<|user|>\nHow many people are affected by Denys-Drash syndrome ?<|end|>\n<|assistant|>\nThe prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.<|end|>\n<|endoftext|>'}

### Understand model architechture

In [18]:
model

Phi3ForCausalLM(
  (model): Phi3Model(
    (embed_tokens): Embedding(32064, 3072, padding_idx=32000)
    (layers): ModuleList(
      (0-31): 32 x Phi3DecoderLayer(
        (self_attn): Phi3Attention(
          (o_proj): Linear(in_features=3072, out_features=3072, bias=False)
          (qkv_proj): Linear(in_features=3072, out_features=9216, bias=False)
        )
        (mlp): Phi3MLP(
          (gate_up_proj): Linear(in_features=3072, out_features=16384, bias=False)
          (down_proj): Linear(in_features=8192, out_features=3072, bias=False)
          (activation_fn): SiLU()
        )
        (input_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (post_attention_layernorm): Phi3RMSNorm((3072,), eps=1e-05)
        (resid_attn_dropout): Dropout(p=0.0, inplace=False)
        (resid_mlp_dropout): Dropout(p=0.0, inplace=False)
      )
    )
    (norm): Phi3RMSNorm((3072,), eps=1e-05)
    (rotary_emb): Phi3RotaryEmbedding()
  )
  (lm_head): Linear(in_features=3072, out_features=32064, 

In [19]:
trainable_params = 0
all_param = 0
for _, param in model.named_parameters():
    all_param += param.numel()
    if param.requires_grad:
        trainable_params += param.numel()
print(
    f"trainable params: {trainable_params} || "
    f"all params: {all_param} || "
    f"trainable%: {100 * trainable_params / all_param:.2f}%"
)

trainable params: 3821079552 || all params: 3821079552 || trainable%: 100.00%


In [20]:
tokenizer

LlamaTokenizerFast(name_or_path='microsoft/MediPhi-Instruct', vocab_size=32000, model_max_length=131072, is_fast=True, padding_side='left', truncation_side='right', special_tokens={'bos_token': '<s>', 'eos_token': '<|endoftext|>', 'unk_token': '<unk>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=False, added_tokens_decoder={
	0: AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	1: AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	2: AddedToken("</s>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=False),
	32000: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	32001: AddedToken("<|assistant|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),
	32002: AddedToken("<|placeholder1|>", rstrip=True, lstrip=False, single_word=False, normalized=False, special=True),

### Before training test the model

In [21]:
from transformers import pipeline, StoppingCriteria

In [22]:
question = dataset['test'][0]['answer']
answer = dataset['test'][0]['answer']

print(f"Question: {question}\n")
print(f"Answer: {answer}")

Question: The prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.

Answer: The prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.


In [23]:
# Check what token ID 32007 represents
print(f"Token 32007: '{tokenizer.decode([32007])}'")

Token 32007: '<|end|>'


In [24]:
# https://huggingface.co/microsoft/MediPhi-Instruct
prompt = "Operative Report:\nPerformed: Cholecystectomy\nOperative Findings: The gallbladder contained multiple stones and had thickening of its wall. Mild peritoneal fluid was noted."

# Hugging Face pipeline for text generation does apply apply_chat_template under the hood. 
# So we do not need to process for the text generation
messages = [
    {"role": "system", "content": system_message},
    {"role": "user", "content": question},
]

pipe = pipeline(
    "text-generation",
    model=model,
    tokenizer=tokenizer,
)

#  stops generation when the model generates token ID 32007
class EosListStoppingCriteria(StoppingCriteria):
  def __init__(self, eos_sequence = [32007]):
      self.eos_sequence = eos_sequence

  def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
      last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
      return self.eos_sequence in last_ids

generation_args = {
    "max_new_tokens": 500,
    "return_full_text": False,
    "temperature": 0.0,
    "do_sample": False,
    "stopping_criteria": [EosListStoppingCriteria()]

}
output = pipe(messages, **generation_args)
print(f"AI: {output[0]['generated_text']}")

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


AI:  Denys-Drash syndrome is a rare genetic disorder, and its exact prevalence is not well-documented. However, at least 150 cases have been reported in scientific literature, indicating that it is a rare condition.


In [25]:
print(f"Answer: {answer}")

Answer: The prevalence of Denys-Drash syndrome is unknown; at least 150 affected individuals have been reported in the scientific literature.


In [26]:
# From the above testing, it is clear that Medphi is generating more or less similar text generation.
# WIth fine tiuning the model might learn more numances of the dataset provided. 

### Model training

In [27]:
from transformers import BitsAndBytesConfig
import torch
from trl import SFTTrainer, SFTConfig
from peft import LoraConfig, get_peft_model
from peft.optimizers import create_lorafa_optimizer

In [28]:
torch.bfloat16

torch.bfloat16

In [29]:
lora_rank = 16
lora_alpha = 32
max_seq_length = 2048
load_in_4bit = True

In [30]:
lora_config = LoraConfig(
    r=lora_rank,
    lora_alpha=lora_alpha,
    bias="none",
    target_modules = ['o_proj', 'qkv_proj', 'gate_up_proj', 'down_proj'],
    task_type="CAUSAL_LM"
)

In [31]:
peft_model = get_peft_model(model, lora_config)
optimizer = create_lorafa_optimizer(
    model=peft_model,
    r=lora_rank,
    lora_alpha=lora_alpha,
    lr=7e-5,
)

In [32]:
peft_model.print_trainable_parameters()

trainable params: 16,252,928 || all params: 3,846,245,376 || trainable%: 0.4226


In [33]:
from transformers import TrainingArguments
from trl import SFTTrainer, SFTConfig

# path where the Trainer will save its checkpoints and logs
output_dir = 'data/medphi-chat-v0'

sft_config = SFTConfig(
    # Basic training parameters
    output_dir="./results",
    num_train_epochs=3,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=4,
    
    # Optimization
    learning_rate=2e-4,
    weight_decay=0.001,
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    optim="paged_adamw_32bit",  # Memory efficient optimizer
    
    # Evaluation and saving
    # evaluation_strategy="steps",
    eval_steps=500,
    save_strategy="steps",
    save_steps=500,
    save_total_limit=2,
    # load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    
    # Logging
    logging_dir="./logs",
    logging_strategy="steps",
    logging_steps=100,
    report_to="tensorboard",  # or "wandb" if you use weights & biases
    
    # Memory and performance
    dataloader_drop_last=True,
    dataloader_num_workers=4,
    remove_unused_columns=False,
    
    # Mixed precision training
    fp16=False,  # Set to True if your GPU supports it
    bf16=True if torch.cuda.is_bf16_supported() else False,
    
    # SFT-specific parameters
    max_length=2048,
    packing=True,  # Pack multiple short sequences into one
    dataset_text_field="text",
    
    # Gradient settings
    max_grad_norm=0.3,
    gradient_checkpointing=True,  # Save memory at cost of speed
)

In [34]:
# Create Trainer object
trainer = SFTTrainer(
    model=peft_model,
    args=sft_config,
    train_dataset=dataset['train'],
    eval_dataset=dataset['val'],
    peft_config=lora_config,
)



Adding EOS to train dataset:   0%|          | 0/12304 [00:00<?, ? examples/s]

Tokenizing train dataset:   0%|          | 0/12304 [00:00<?, ? examples/s]

Packing train dataset:   0%|          | 0/12304 [00:00<?, ? examples/s]

Adding EOS to eval dataset:   0%|          | 0/2051 [00:00<?, ? examples/s]

Tokenizing eval dataset:   0%|          | 0/2051 [00:00<?, ? examples/s]

Packing eval dataset:   0%|          | 0/2051 [00:00<?, ? examples/s]

[2025-08-17 17:27:47,568] [INFO] [real_accelerator.py:254:get_accelerator] Setting ds_accelerator to cuda (auto detect)


df: /root/.triton/autotune: No such file or directory
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status
/usr/bin/ld: cannot find -laio: No such file or directory
collect2: error: ld returned 1 exit status


[2025-08-17 17:27:48,989] [INFO] [logging.py:107:log_dist] [Rank -1] [TorchCheckpointEngine] Initialized with serialization = False


In [None]:
train_result = trainer.train()

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

Step,Training Loss
