Install libraries

In [None]:
!pip install -q kaggle
!pip install -q transformers accelerate bitsandbytes peft trl datasets scikit-learn matplotlib

Dataset loading

In [None]:
import os,zipfile,json
os.makedirs("/root/.kaggle", exist_ok=True)
!mv "kaggle (9).json" /root/.kaggle/
os.chmod("/root/.kaggle/kaggle (9).json", 0o600)
print("Kaggle API set successfully!")

In [None]:
# Unpack the uploaded archive into the data directory, then list the
# directory contents as a quick sanity check.
zip_path = "/content/archive (19).zip"
extract_path = "/content/data"

with zipfile.ZipFile(zip_path, mode="r") as archive:
    archive.extractall(path=extract_path)

extracted_names = os.listdir(extract_path)
print("extracted files :", extracted_names)

JSONL CREATION

In [None]:
import pandas as pd
from sklearn.model_selection import train_test_split
import json
import os
# Read the disease/symptom table and collect every column whose name starts
# with "symptom" (case-insensitive), preserving the column order.
csv_path = "/content/data/DiseaseAndSymptoms.csv"
df = pd.read_csv(csv_path)

symptom_cols = []
for column_name in df.columns:
    if column_name.lower().startswith("symptom"):
        symptom_cols.append(column_name)

print("Symptom columns:", symptom_cols[:10], "...")
def row_to_example(row):
    """Convert one row of the symptom table into an instruction-tuning example.

    Args:
        row: A table row (e.g. a ``pandas.Series``) that can be indexed by the
            module-level ``symptom_cols`` list and by the ``"Disease"`` key.

    Returns:
        dict: ``instruction`` (fixed task text), ``input`` (comma-separated
        symptoms), ``output`` (disease line, templated explanation and a
        disclaimer) and ``label`` (the stripped disease name).
    """
    # Skip missing cells AND cells that are blank after stripping, so the
    # joined symptom string never contains empty entries such as "a, , b".
    symptoms = []
    for value in row[symptom_cols]:
        if pd.isnull(value):
            continue
        cleaned = str(value).strip()
        if cleaned:
            symptoms.append(cleaned)
    symptom_str = ", ".join(symptoms)

    disease = str(row["Disease"]).strip()

    example = {
        "instruction": "Identify the most likely disease pattern from the dataset based on these symptoms.",
        "input": symptom_str,
        "output": (
            f"Disease: {disease}\n"
            f"Explanation: These symptoms frequently appear together for {disease} "
            f"in the training dataset examples.\n"
            "Note: This is NOT medical or diagnostic advice. "
            "For any real health concerns, please consult a licensed doctor or emergency services."
        ),

        # Kept separately so the split can be stratified and predictions scored.
        "label": disease
    }
    return example

# Turn every table row into an instruction example.
examples = []
for _idx, record in df.iterrows():
    examples.append(row_to_example(record))
print("Total examples:", len(examples))

# 80/20 split with a fixed seed, stratified on the disease label.
stratify_labels = [example["label"] for example in examples]
train_data, test_data = train_test_split(
    examples,
    test_size=0.2,
    random_state=42,
    stratify=stratify_labels,
)

os.makedirs("data_llm", exist_ok=True)

def write_jsonl(path, data):
    """Write ``data`` to ``path`` as JSON Lines: one JSON document per line, UTF-8.

    Non-ASCII characters are written as-is rather than escaped.
    """
    serialized_lines = (json.dumps(record, ensure_ascii=False) + "\n" for record in data)
    with open(path, "w", encoding="utf-8") as handle:
        handle.writelines(serialized_lines)

# Persist both splits, then show their sizes as the cell output.
for split_path, split_rows in (
    ("data_llm/train.jsonl", train_data),
    ("data_llm/test.jsonl", test_data),
):
    write_jsonl(split_path, split_rows)

len(train_data), len(test_data)

LLM fine-tuning

In [None]:
from datasets import load_dataset
# Load the two JSONL files as named splits; the result is indexed by
# "train" and "test" in the cells that follow.
data_files = {
    "train": "data_llm/train.jsonl",
    "test": "data_llm/test.jsonl",
}
split_names = {"train": "train", "test": "test"}
raw_datasets = load_dataset("json", data_files=data_files, split=split_names)
def format_example(example):
    """Render one record as a single training string under the ``"text"`` key.

    The string has the instruction, the symptoms, a blank line, and then the
    expected response introduced by ``Response:``.
    """
    prompt_block = "Instruction: {}\nSymptoms: {}\n\n".format(
        example["instruction"], example["input"]
    )
    response_block = "Response:\n{}".format(example["output"])
    return {"text": prompt_block + response_block}
# Add the rendered "text" field to both splits and preview the first training row.
train_split = raw_datasets["train"]
test_split = raw_datasets["test"]
datset = train_split.map(format_example)
datset_test = test_split.map(format_example)
datset[0]

In [None]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, BitsAndBytesConfig
from peft import LoraConfig
from trl import SFTTrainer
from transformers import TrainingArguments

base_model_name = "TinyLlama/TinyLlama-1.1B-Chat-v1.0"

# QLoRA-style quantization: 4-bit NF4 weights with double quantization,
# using bfloat16 as the compute dtype.
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_use_double_quant=True,
    bnb_4bit_compute_dtype=torch.bfloat16,
)

# LoRA adapter settings, applied to the attention and MLP projection layers.
lora_target_modules = [
    "q_proj",
    "k_proj",
    "v_proj",
    "o_proj",
    "gate_proj",
    "up_proj",
    "down_proj",
]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=16,
    lora_alpha=32,
    lora_dropout=0.1,
    bias="none",
    target_modules=lora_target_modules,
)

tokenizer = AutoTokenizer.from_pretrained(base_model_name, use_fast=True)
# Reuse EOS for padding when the tokenizer defines no pad token.
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Load the base model with the quantization settings above.
model = AutoModelForCausalLM.from_pretrained(
    base_model_name,
    device_map="auto",
    quantization_config=bnb_config,
)


In [None]:
output_dir = "tinyllama_disease_qlora"

# bf16 mixed precision needs a GPU that actually supports bfloat16. Having CUDA
# available is not sufficient, so check the capability explicitly instead of
# requesting bf16 on every CUDA device. Without support, training runs in the
# default precision.
use_bf16 = torch.cuda.is_available() and torch.cuda.is_bf16_supported()

training_args = TrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=1,
    gradient_accumulation_steps=1,
    num_train_epochs=1,
    learning_rate=2e-4,
    logging_steps=50,
    save_strategy="epoch",
    eval_strategy="epoch",
    warmup_ratio=0.03,
    lr_scheduler_type="cosine",
    weight_decay=0.01,
    bf16=use_bf16,
    report_to="none"
)

# Train and evaluate on small subsets to keep the run short. The sizes are
# clamped so the cell also works when a split has fewer rows than requested.
dataset_small = datset.select(range(min(200, len(datset))))
dataset_test_small = datset_test.select(range(min(100, len(datset_test))))

trainer = SFTTrainer(
    model=model,
    train_dataset=dataset_small,
    eval_dataset=dataset_test_small,
    peft_config=lora_config,
    args=training_args,
)

trainer.train()

In [None]:
# Save the trained model and the tokenizer into the same directory so it can
# be reloaded on its own later.
save_dir = "tinyllama_disease_qlora_adapter"
for artifact in (trainer.model, tokenizer):
    artifact.save_pretrained(save_dir)
print("Saved adapter to:", save_dir)

In [None]:
from peft import AutoPeftModelForCausalLM
# Reload the saved directory for inference and switch the model to eval mode.
inference_model = AutoPeftModelForCausalLM.from_pretrained(save_dir, device_map="auto")
inference_model.eval()


In [None]:
import re
from tqdm import tqdm

def build_prompt(example):
    """Build the inference prompt for one record.

    The prompt contains the instruction, the symptoms, a blank line, and a
    trailing ``Response:`` header with nothing after it.
    """
    prompt_template = "Instruction: {instruction}\nSymptoms: {symptoms}\n\nResponse:\n"
    return prompt_template.format(
        instruction=example["instruction"],
        symptoms=example["input"],
    )

def extract_disease_from_output(text):
    """
    Pull the predicted disease name out of generated text.

    Two labelled lines are recognised, case-insensitively and in this order
    of priority:
    'Disease: dengue'
    'Possible condition: Dengue-like pattern (from dataset)'

    Returns the remainder of the first matching line with surrounding
    whitespace stripped, or "Unknown" when neither label is found.
    """
    label_patterns = (
        r"Disease:\s*([^\n\r]+)",
        r"Possible\s+condition:\s*([^\n\r]+)",
    )
    for pattern in label_patterns:
        found = re.search(pattern, text, flags=re.IGNORECASE)
        if found is not None:
            return found.group(1).strip()

    return "Unknown"

# Run greedy generation over the test split, collecting the true label and
# the disease name parsed from each generated response.
y_true = []
y_pred = []

for ex in tqdm(raw_datasets["test"]):
    prompt = build_prompt(ex)
    inputs = tokenizer(prompt, return_tensors="pt").to(inference_model.device)
    # Number of prompt tokens, used to slice the prompt off the generated sequence.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding. `temperature` is deliberately not passed: it only
        # applies when sampling, so combining temperature=0.0 with
        # do_sample=False is contradictory and just triggers a warning.
        output_ids = inference_model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
        )

    # For a causal LM the returned sequence is prompt + continuation. Decode only
    # the newly generated tokens instead of splitting the text on "Response:",
    # which would cut at the wrong place if the symptoms contained that marker.
    generated_part = tokenizer.decode(output_ids[0][prompt_length:], skip_special_tokens=True)

    pred_disease = extract_disease_from_output(generated_part)
    y_pred.append(pred_disease)
    y_true.append(ex["label"])


EVALUATION MATRIX

In [None]:
from collections import Counter
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import numpy as np
# Restrict the confusion matrix to the 20 most frequent true diseases so the
# plot stays readable. Predictions outside that set are bucketed together.
counter = Counter(y_true)
top_20_diseases = [d for d, _ in counter.most_common(20)]
top_20_lookup = set(top_20_diseases)  # constant-time membership checks in the loop

# Filter to those
filtered_true = []
filtered_pred = []
for t, p in zip(y_true, y_pred):
    if t in top_20_lookup:
        filtered_true.append(t)
        filtered_pred.append(p if p in top_20_lookup else "Other/Unknown")

labels = top_20_diseases + ["Other/Unknown"]

# Rows are actual diseases and columns are predictions, both in `labels` order.
cm = confusion_matrix(filtered_true, filtered_pred, labels=labels)

fig, ax = plt.subplots(figsize=(12, 10))
im = ax.imshow(cm, interpolation="nearest")
# The title previously contained a mis-encoded en dash ("â€“"); it is written as
# an escape so it survives any further encoding round-trips.
ax.set_title("Confusion Matrix \u2013 Disease (Top 20) vs Predicted Disease")
plt.colorbar(im, ax=ax)

tick_marks = np.arange(len(labels))
ax.set_xticks(tick_marks)
ax.set_xticklabels(labels, rotation=90)
ax.set_yticks(tick_marks)
ax.set_yticklabels(labels)

ax.set_ylabel("Actual Disease")
ax.set_xlabel("Predicted Disease")

plt.tight_layout()
plt.savefig("confusion_matrix_top20.png", dpi=200)
plt.show()


DEMO QUERY

In [None]:
def query_model(symptoms_text: str):
    """Generate the fine-tuned model's response for a symptom description.

    Args:
        symptoms_text: Symptom text inserted into the ``Symptoms:`` line of
            the prompt.

    Returns:
        str: The text generated after the prompt, with surrounding whitespace
        stripped.
    """
    instruction = "Identify the most likely disease pattern from the dataset based on these symptoms."
    prompt = (
        f"Instruction: {instruction}\n"
        f"Symptoms: {symptoms_text}\n\n"
        "Response:\n"
    )

    inputs = tokenizer(prompt, return_tensors="pt").to(inference_model.device)
    # Number of prompt tokens, used to slice the prompt off the generated sequence.
    prompt_length = inputs["input_ids"].shape[1]

    with torch.no_grad():
        # Greedy decoding. `temperature` only applies when sampling, so passing
        # temperature=0.0 together with do_sample=False is contradictory.
        output_ids = inference_model.generate(
            **inputs,
            max_new_tokens=128,
            do_sample=False,
        )

    # Decode only the continuation rather than splitting the full text on
    # "Response:", which would cut at the wrong place if the user's symptom text
    # contained that marker.
    new_tokens = output_ids[0][prompt_length:]
    return tokenizer.decode(new_tokens, skip_special_tokens=True).strip()
# Three hand-written symptom lists used as a smoke test of the model.
test_cases = [
    "Fever, headache, body pain",          # test case 1
    "Cough, sore throat, runny nose",      # test case 2
    "Abdominal pain, vomiting, diarrhea",  # test case 3
]

print("MODEL RESPONSES FOR 3 TEST CASES ARE:")
i = 0
for symptoms in test_cases:
    i += 1  # 1-based case number for the banner
    banner = f"------ Test Case {i} ------"
    print(banner)
    print("Symptoms:", symptoms)
    print("\nModel Output:\n")
    model_reply = query_model(symptoms)
    print(model_reply)
