In [13]:
!pip install --upgrade datasets

Collecting datasets
  Downloading datasets-3.6.0-py3-none-any.whl.metadata (19 kB)
Collecting fsspec<=2025.3.0,>=2023.1.0 (from fsspec[http]<=2025.3.0,>=2023.1.0->datasets)
  Downloading fsspec-2025.3.0-py3-none-any.whl.metadata (11 kB)
Downloading datasets-3.6.0-py3-none-any.whl (491 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m491.5/491.5 kB[0m [31m13.6 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading fsspec-2025.3.0-py3-none-any.whl (193 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m193.6/193.6 kB[0m [31m18.8 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: fsspec, datasets
  Attempting uninstall: fsspec
    Found existing installation: fsspec 2025.3.2
    Uninstalling fsspec-2025.3.2:
      Successfully uninstalled fsspec-2025.3.2
  Attempting uninstall: datasets
    Found existing installation: datasets 2.14.4
    Uninstalling datasets-2.14.4:
      Successfully uninstalled datasets-2.14.4
[31mERROR: pip's dependency r

In [1]:
import os
import torch
from torch.optim import AdamW
from torch.utils.data import Dataset, DataLoader
from transformers import (
    AutoModelForSeq2SeqLM,
    AutoTokenizer,
    get_linear_schedule_with_warmup
)
from datasets import load_dataset
from tqdm.auto import tqdm

# Suppress warnings
import warnings
warnings.filterwarnings("ignore")

print("\n  Installation complete. PLEASE RESTART YOUR RUNTIME NOW before proceeding.")


✅ Installation complete. PLEASE RESTART YOUR RUNTIME NOW before proceeding.


In [2]:
# --- Layer 1: Input Guardrail ---
SENSITIVE_KEYWORDS = [
    "suicide", "kill myself", "want to die", "chest pain", "can't breathe",
    "stroke", "unconscious", "severe bleeding", "heart attack", "poison"
]
EMERGENCY_RESPONSE = "If you believe you are experiencing a medical emergency, please dial your local emergency number (e.g., 911 in the US) immediately or go to the nearest emergency room. This service is for informational purposes only and cannot handle medical emergencies."

def is_query_sensitive(query: str) -> bool:
    """Checks if a user's query contains any sensitive or emergency-related keywords."""
    return any(keyword in query.lower() for keyword in SENSITIVE_KEYWORDS)

In [9]:
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k")

README.md:   0%|          | 0.00/542 [00:00<?, ?B/s]

(…)-00000-of-00001-5e7cb295b9cff0bf.parquet:   0%|          | 0.00/70.5M [00:00<?, ?B/s]

Generating train split:   0%|          | 0/112165 [00:00<?, ? examples/s]

In [14]:
dataset[0]

{'instruction': "If you are a doctor, please answer the medical questions based on the patient's description.",
 'input': 'I woke up this morning feeling the whole room is spinning when i was sitting down. I went to the bathroom walking unsteadily, as i tried to focus i feel nauseous. I try to vomit but it wont come out.. After taking panadol and sleep for few hours, i still feel the same.. By the way, if i lay down or sit down, my head do not spin, only when i want to move around then i feel the whole world is spinning.. And it is normal stomach discomfort at the same time? Earlier after i relieved myself, the spinning lessen so i am not sure whether its connected or coincidences.. Thank you doc!',
 'output': 'Hi, Thank you for posting your query. The most likely cause for your symptoms is benign paroxysmal positional vertigo (BPPV), a type of peripheral vertigo. In this condition, the most common symptom is dizziness or giddiness, which is made worse with movements. Accompanying naus

In [10]:
# --- Load and Format the Specified Dataset ---
# We are now using the user-provided ChatDoctor dataset.
dataset = load_dataset("lavita/ChatDoctor-HealthCareMagic-100k", split='train')

# Filter for high-quality data and create a smaller subset for the demo
dataset = dataset.filter(lambda x: x['input'] and x['output'] and len(x['input']) > 10 and len(x['output']) > 10)
subset_dataset = dataset.shuffle(seed=42).select(range(2000)) # Use 2000 examples for a quick demo run

# This function formats the data into our instruction prompt.
# It now maps 'input' to the Question and 'output' to the Answer.
def format_instruction(patient_query, doctor_response):
    disclaimer = "Disclaimer: This is for informational purposes only and does not constitute medical advice. Please consult a healthcare professional for any health concerns."
    return f"""### Instruction:
Answer the following medical question accurately and safely. You are a helpful AI assistant, not a doctor. Always include a disclaimer to consult a healthcare professional.

### Question:
{patient_query}

### Answer:
{doctor_response}

{disclaimer}"""

# --- Tokenizer and Custom Dataset Class ---
model_name = "google/flan-t5-small"
tokenizer = AutoTokenizer.from_pretrained(model_name)

class ChatDoctorDataset(Dataset):
    def __init__(self, data, tokenizer, max_length=512):
        self.tokenizer = tokenizer
        self.max_length = max_length
        self.texts = []
        # The loop is now updated to use the 'input' and 'output' columns.
        for example in data:
            formatted_text = format_instruction(example['input'], example['output'])
            self.texts.append(formatted_text)

    def __len__(self):
        return len(self.texts)

    def __getitem__(self, idx):
        text = self.texts[idx]
        tokenized = self.tokenizer(
            text,
            max_length=self.max_length,
            padding="max_length",
            truncation=True,
            return_tensors="pt"
        )
        return {
            "input_ids": tokenized["input_ids"].squeeze(),
            "attention_mask": tokenized["attention_mask"].squeeze()
        }

# Create the dataset and dataloader
train_dataset = ChatDoctorDataset(subset_dataset, tokenizer)
train_dataloader = DataLoader(train_dataset, batch_size=8, shuffle=True)

print(f"✅ Custom Dataset for ChatDoctor created. Number of batches: {len(train_dataloader)}")

Filter:   0%|          | 0/112165 [00:00<?, ? examples/s]

✅ Custom Dataset for ChatDoctor created. Number of batches: 250


In [11]:
# --- Model Initialization ---
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = AutoModelForSeq2SeqLM.from_pretrained(model_name).to(device)
print(f"Model loaded on: {device}")

# --- Optimizer and Scheduler Setup ---
optimizer = AdamW(model.parameters(), lr=5e-5)
num_epochs = 1 # For a real task, 2-3 epochs are common
num_training_steps = num_epochs * len(train_dataloader)
lr_scheduler = get_linear_schedule_with_warmup(
    optimizer,
    num_warmup_steps=0,
    num_training_steps=num_training_steps
)

# --- The Custom Training Loop ---
print("\n--- Starting Full Fine-Tuning on ChatDoctor data... ---")
model.train()

for epoch in range(num_epochs):
    progress_bar = tqdm(train_dataloader, desc=f"Epoch {epoch + 1}/{num_epochs}")
    total_loss = 0

    for batch in progress_bar:
        input_ids = batch["input_ids"].to(device)
        attention_mask = batch["attention_mask"].to(device)
        labels = input_ids.clone() # For T5, labels are the same as input_ids

        outputs = model(
            input_ids=input_ids,
            attention_mask=attention_mask,
            labels=labels
        )

        loss = outputs.loss
        total_loss += loss.item()
        loss.backward()

        optimizer.step()
        lr_scheduler.step()
        optimizer.zero_grad()

        progress_bar.set_postfix({'loss': loss.item()})

    avg_loss = total_loss / len(train_dataloader)
    print(f"--- Epoch {epoch + 1} Finished --- Average Loss: {avg_loss:.4f} ---")

print("\n✅ Fine-tuning complete.")

Model loaded on: cuda

--- Starting Full Fine-Tuning on ChatDoctor data... ---


Epoch 1/1:   0%|          | 0/250 [00:00<?, ?it/s]

--- Epoch 1 Finished --- Average Loss: 2.9264 ---

✅ Fine-tuning complete.


In [16]:
# --- Save the Model and Tokenizer ---
output_dir = "./fully-finetuned-chatdoctor-assistant"
print(f"\nSaving model to {output_dir}...")
os.makedirs(output_dir, exist_ok=True)
model.save_pretrained(output_dir)
tokenizer.save_pretrained(output_dir)
print("✅ Model and tokenizer saved successfully.")

# --- Custom Inference with the Saved Model ---
print("\n--- Loading saved model for custom inference ---")
loaded_model = AutoModelForSeq2SeqLM.from_pretrained(output_dir).to(device)
loaded_tokenizer = AutoTokenizer.from_pretrained(output_dir)
loaded_model.eval()

def generate_safe_response(query: str, model, tokenizer) -> str:
    if is_query_sensitive(query):
        return EMERGENCY_RESPONSE

    # Use the same prompt structure the model was trained on
    prompt = f"""### Instruction:
Answer the following medical question accurately and safely. You are a helpful AI assistant, not a doctor. Always include a disclaimer to consult a healthcare professional.

### Question:
{query}

### Answer:"""

    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
    with torch.no_grad():
        outputs = model.generate(**inputs, max_new_tokens=1024, do_sample=True, temperature=0.7)
    response_text = tokenizer.decode(outputs[0], skip_special_tokens=True)
    print("Response text is", response)

    if "disclaimer" not in response_text.lower():
        response_text += "\n\nDisclaimer: This is for informational purposes only and does not constitute medical advice. Please consult a healthcare professional for any health concerns."

    return response_text

# --- Test the Inference ---
test_query = "I have a lump under my left nipple and stomach pain."
response = generate_safe_response(test_query, loaded_model, loaded_tokenizer)
print(f"\nQuery: {test_query}\nResponse:\n{response}")


Saving model to ./fully-finetuned-chatdoctor-assistant...
✅ Model and tokenizer saved successfully.

--- Loading saved model for custom inference ---
Response text is Instruction: Answer the following medical question accurately and safely. You are a helpful AI assistant, not a doctor. Always include a disclaimer to consult a healthcare professional. ### Question: I have a lump under my left nipple and stomach pain. ### Answer: I have a lump under my left nipple and stomach pain. ### Answer: I have a lump under my left nipple and stomach pain. ### Answer: I have a lump under my left nipple and stomach pain.

Query: I have a lump under my left nipple and stomach pain.
Response:
Instruction: Answer the following medical question accurately and safely. You are a helpful AI assistant, not a doctor. Always include a disclaimer to consult a healthcare professional. ### Question: I have a lump under my left nipple and stomach pain. ###
