In [20]:
import subprocess
import json
import torch
import numpy as np
from datasets import Dataset, DatasetDict
from datasets import load_from_disk
from transformers import pipeline, StoppingCriteria

In [23]:
from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

ft_model = AutoPeftModelForCausalLM.from_pretrained(
    "./lora_adapter",
    torch_dtype="auto",
    device_map="auto"
)
ft_tokenizer = AutoTokenizer.from_pretrained("./lora_adapter")

Loading checkpoint shards:   0%|          | 0/2 [00:00<?, ?it/s]



In [24]:
dataset = load_from_disk("./dataset")

In [25]:
dataset

DatasetDict({
    train: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 12304
    })
    val: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 2051
    })
    test: Dataset({
        features: ['question', 'answer', 'text'],
        num_rows: 2051
    })
})

In [26]:
def generate_answer(question):
    # https://huggingface.co/microsoft/MediPhi-Instruct
    system_message = """
    You are a smart medical assiatnt to help user question about their queries
    
    To answer question, follow the following instructions:
    1. **Understand the question**: Clearly identify the question and any important given values.
    3. **Answer Step-by-Step**: Iteratively progress your answer
    4. **Double Check**: If applicable, double check the question for accuracy and sense.
    """
    
    # Hugging Face pipeline for text generation does apply apply_chat_template under the hood. 
    # So we do not need to process for the text generation
    messages = [
        {"role": "system", "content": system_message},
        {"role": "user", "content": question},
    ]
    
    pipe = pipeline(
        "text-generation",
        model=ft_model,
        tokenizer=ft_tokenizer,
    )
    
    #  stops generation when the model generates token ID 32007
    class EosListStoppingCriteria(StoppingCriteria):
      def __init__(self, eos_sequence = [32007]):
          self.eos_sequence = eos_sequence
    
      def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
          last_ids = input_ids[:,-len(self.eos_sequence):].tolist()
          return self.eos_sequence in last_ids
    
    generation_args = {
        "max_new_tokens": 500,
        "return_full_text": False,
        "temperature": 0.0,
        "do_sample": False,
        "stopping_criteria": [EosListStoppingCriteria()]
    
    }
    output = pipe(messages, **generation_args)
    return output

In [33]:
import numpy as np

def print_test_result(iterations=10):
    for i in range(iterations):
        index = np.random.randint(0, len(dataset['test']))
        question = dataset['test'][index]['question']  # Fixed: was 'answer'
        answer = dataset['test'][index]['answer']
        
        output = generate_answer(question)
        
        # Header with test number
        print("=" * 80)
        print(f"🧪 TEST {i+1}")
        print("=" * 80)
        
        # Question section
        print("\n🤔 QUESTION:")
        print("-" * 40)
        print(f"{question}")
        
        # Original answer section
        print("\n✅ ORIGINAL ANSWER:")
        print("-" * 40)
        print(f"{answer}")
        
        # AI response section
        print("\n🤖 FINE-TUNED AI RESPONSE:")
        print("-" * 40)
        print(f"{output[0]['generated_text']}")
        
        # Separator
        print("\n" + "~" * 80 + "\n")

In [32]:
print_test_result(iterations = 10)

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.
Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 1

🤔 QUESTION:
----------------------------------------
How many people are affected by Lenz microphthalmia syndrome ?

✅ ORIGINAL ANSWER:
----------------------------------------
Lenz microphthalmia syndrome is a very rare condition; its incidence is unknown. It has been identified in only a few families worldwide.

🤖 FINE-TUNED AI RESPONSE:
----------------------------------------
 To answer this question, I would need to conduct research as the number of people affected by Lenz microphthalmia syndrome is not readily available in the current database.

1. **Research**: I would start by searching for recent studies or databases that track genetic disorders.
2. **Data Collection**: I would look for information on the prevalence of Lenz microphthalmia syndrome.
3. **Analysis**: I would analyze the data to provide an estimate of the number of people affected.
4. **Answer**: Based on the research, I would provide an answer, such as "Lenz microphthalmia syndrome is a rare genetic di

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 2

🤔 QUESTION:
----------------------------------------
What are the symptoms of Horizontal gaze palsy with progressive scoliosis ?

✅ ORIGINAL ANSWER:
----------------------------------------
What are the signs and symptoms of Horizontal gaze palsy with progressive scoliosis? The Human Phenotype Ontology provides the following list of signs and symptoms for Horizontal gaze palsy with progressive scoliosis. If the information is available, the table below includes how often the symptom is seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these medical terms. Signs and Symptoms Approximate number of patients (when available) Kyphosis 90% Cognitive impairment 50% Nystagmus 50% Short neck 50% Seizures 7.5% Sensorineural hearing impairment 7.5% Autosomal recessive inheritance - Congenital onset - Horizontal supranuclear gaze palsy - Progressive ophthalmoplegia - Thoracolumbar scoliosis - The Human Phenotype Ontology (HP

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 3

🤔 QUESTION:
----------------------------------------
What is (are) Apraxia ?

✅ ORIGINAL ANSWER:
----------------------------------------
Apraxia (called "dyspraxia" if mild) is a neurological disorder characterized by loss of the ability to execute or carry out skilled movements and gestures, despite having the desire and the physical ability to perform them. Apraxia results from dysfunction of the cerebral hemispheres of the brain, especially the parietal lobe, and can arise from many diseases or damage to the brain.  There are several kinds of apraxia, which may occur alone or together. The most common is buccofacial or orofacial apraxia, which causes the inability to carry out facial movements on command such as licking lips, whistling, coughing, or winking. Other types of apraxia include limb-kinetic apraxia (the inability to make fine, precise movements with an arm or leg), ideomotor apraxia (the inability to make the proper movement in response to a verbal command), id

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 4

🤔 QUESTION:
----------------------------------------
How to diagnose Urinary Incontinence in Men ?

✅ ORIGINAL ANSWER:
----------------------------------------
Men should tell a health care professional, such as a family practice physician, a nurse, an internist, or a urologista doctor who specializes in urinary problemsthey have UI, even if they feel embarrassed. To diagnose UI, the health care professional will
                
- take a medical history  - conduct a physical exam  - order diagnostic tests
                
Medical History
                
Taking a medical history can help a health care professional diagnose UI. He or she will ask the patient or caretaker to provide a medical history, a review of symptoms, a description of eating habits, and a list of prescription and over-the-counter medications the patient is taking. The health care professional will ask about current and past medical conditions.
                
The health care professional also will ask ab

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 5

🤔 QUESTION:
----------------------------------------
What is (are) Genetic Disorders ?

✅ ORIGINAL ANSWER:
----------------------------------------
Genes are the building blocks of heredity. They are passed from parent to child. They hold DNA, the instructions for making proteins. Proteins do most of the work in cells. They move molecules from one place to another, build structures, break down toxins, and do many other maintenance jobs.    Sometimes there is a mutation, a change in a gene or genes. The mutation changes the gene's instructions for making a protein, so the protein does not work properly or is missing entirely. This can cause a medical condition called a genetic disorder.    You can inherit a gene mutation from one or both parents. A mutation can also happen during your lifetime.    There are three types of genetic disorders:       - Single-gene disorders, where a mutation affects one gene. Sickle cell anemia is an example.    - Chromosomal disorders, where chro

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 6

🤔 QUESTION:
----------------------------------------
How many people are affected by 46,XX testicular disorder of sex development ?

✅ ORIGINAL ANSWER:
----------------------------------------
Approximately 1 in 20,000 individuals with a male appearance have 46,XX testicular disorder.

🤖 FINE-TUNED AI RESPONSE:
----------------------------------------
 To answer this question, I will need to conduct research to find the most recent and reliable data on the prevalence of 46,XX testicular disorder of sex development.

Step 1: Research
I will search for scientific studies, medical databases, and health organization reports to gather information on the prevalence of 46,XX testicular disorder of sex development.

Step 2: Data Analysis
After collecting the data, I will analyze it to determine the number of people affected by this condition.

Step 3: Answer Formulation
Based on the data analysis, I will formulate a clear and concise answer to the question.

Step 4: Verification
I wi

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 7

🤔 QUESTION:
----------------------------------------
What is (are) deafness-dystonia-optic neuronopathy syndrome ?

✅ ORIGINAL ANSWER:
----------------------------------------
Deafness-dystonia-optic neuronopathy (DDON) syndrome, also known as Mohr-Tranebjrg syndrome, is characterized by hearing loss that begins early in life, problems with movement, impaired vision, and behavior problems. This condition occurs almost exclusively in males.  The first symptom of DDON syndrome is hearing loss caused by nerve damage in the inner ear (sensorineural hearing loss), which begins in early childhood. The hearing impairment worsens over time, and most affected individuals have profound hearing loss by age 10.  People with DDON syndrome typically begin to develop problems with movement during their teens, although the onset of these symptoms varies among affected individuals. Some people experience involuntary tensing of the muscles (dystonia), while others have difficulty coordinating 

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 8

🤔 QUESTION:
----------------------------------------
What is (are) High Blood Cholesterol ?

✅ ORIGINAL ANSWER:
----------------------------------------
If TLC (Therapeutic Lifestyle Changes) cannot lower your LDL cholesterol level enough by itself, your doctor may prescribe cholesterol-lowering medicines. The following medicines are used together with TLC to help lower your LDL (bad) cholesterol level. - statins  - ezetimibe  - bile acid sequestrants  - nicotinic acid  - fibrates. statins ezetimibe bile acid sequestrants nicotinic acid fibrates. Statins  - are very effective in lowering LDL (bad) cholesterol levels   - are safe for most people  - have side effects that are infrequent, but potentially serious such as liver and muscle problems. are very effective in lowering LDL (bad) cholesterol levels are safe for most people have side effects that are infrequent, but potentially serious such as liver and muscle problems. Ezetimibe - lowers LDL (bad) cholesterol   - may be u

Device set to use cuda:0
The following generation flags are not valid and may be ignored: ['temperature']. Set `TRANSFORMERS_VERBOSITY=info` for more details.


🧪 TEST 9

🤔 QUESTION:
----------------------------------------
What are the symptoms of Multiple endocrine neoplasia type 2A ?

✅ ORIGINAL ANSWER:
----------------------------------------
What are the signs and symptoms of Multiple endocrine neoplasia type 2A? The Human Phenotype Ontology provides the following list of signs and symptoms for Multiple endocrine neoplasia type 2A. If the information is available, the table below includes how often the symptom is seen in people with this condition. You can use the MedlinePlus Medical Dictionary to look up the definitions for these medical terms. Signs and Symptoms Approximate number of patients (when available) Abnormality of the integument - Aganglionic megacolon - Autosomal dominant inheritance - Elevated calcitonin - Elevated urinary epinephrine - Hypercortisolism - Hyperparathyroidism - Hypertension - Medullary thyroid carcinoma - Parathyroid adenoma - Pheochromocytoma - The Human Phenotype Ontology (HPO) has collected information on 

In [41]:
import os

In [42]:
def run_medical_evaluation(model_path, adapter_path="./lora_adapter", tasks=None):
   
    results = {}
    
    # Create results directory
    os.makedirs("./evaluations", exist_ok=True)
    
    for task in medical_tasks:
        print(f"Evaluating {task}...")
        
        # Build model args with adapter
        model_args = f"pretrained={model_path},peft={adapter_path}"
        
        cmd = [
            "lm_eval",
            "--model", "hf",
            "--model_args", model_args,
            "--tasks", task,
            "--batch_size", "4",  # Reduced for stability
            "--output_path", f"./evaluations/{task}"
        ]
        
        try:
            result = subprocess.run(cmd, capture_output=True, text=True)
            
            if result.returncode == 0:
                # Try to load results
                result_file = f"./evaluations/{task}/evaluations.json"
                if os.path.exists(result_file):
                    with open(result_file) as f:
                        task_results = json.load(f)
                    results[task] = task_results
                    print(f"✅ {task} completed")
                else:
                    print(f"❌ {task} - results file not found")
            else:
                print(f"❌ {task} failed: {result.stderr}")
                
        except Exception as e:
            print(f"❌ Error evaluating {task}: {e}")
    
    return results

# Usage


In [43]:
# Usage
model_path = "./lora_adapter"
results = run_medical_evaluation(model_path)
medical_tasks = [
    "medqa",
    "medmcqa", 
    "pubmedqa",
    "mmlu_anatomy",
    "mmlu_clinical_knowledge",
    "mmlu_college_medicine",
    "mmlu_medical_genetics",
    "mmlu_professional_medicine"
]
medical_results = run_medical_evaluation(model_path, medical_tasks)

Evaluating medqa...
❌ medqa failed: 2025-08-17:21:00:33 ERROR    [__main__:419] Tasks were not found: medqa
                                               Try `lm-eval --tasks list` for list of available tasks
Traceback (most recent call last):
  File "/usr/local/bin/lm_eval", line 8, in <module>
    sys.exit(cli_evaluate())
             ^^^^^^^^^^^^^^
  File "/usr/local/lib/python3.11/dist-packages/lm_eval/__main__.py", line 423, in cli_evaluate
    raise ValueError(
ValueError: Tasks not found: medqa. Try `lm-eval --tasks {list_groups,list_subtasks,list_tags,list}` to list out all available names for task groupings; only (sub)tasks; tags; or all of the above, or pass '--verbosity DEBUG' to troubleshoot task registration issues.

Evaluating medmcqa...
❌ medmcqa failed: 2025-08-17:21:00:44 INFO     [__main__:446] Selected Tasks: ['medmcqa']
2025-08-17:21:00:44 INFO     [evaluator:202] Setting random seed to 0 | Setting numpy seed to 1234 | Setting torch manual seed to 1234 | Setting fe