# ***Libraries***

In [None]:
!pip install transformers datasets peft accelerate bitsandbytes

# ***Create Medical Corpus***

In [6]:
import json

# Sample medical texts (in real project, use PubMed, medical papers)
medical_corpus = [
    "Acute myocardial infarction, commonly known as a heart attack, occurs when blood flow to part of the heart muscle is blocked. Common symptoms include chest pain, shortness of breath, and nausea.",
    "Hypertension, or high blood pressure, is a chronic condition where the force of blood against artery walls is consistently too high. It can lead to serious complications including stroke and heart disease.",
    "Diabetes mellitus is a metabolic disorder characterized by elevated blood glucose levels. Type 1 diabetes results from insulin deficiency, while Type 2 involves insulin resistance.",
    "The cardiovascular system consists of the heart and blood vessels, responsible for circulating blood throughout the body to deliver oxygen and nutrients to tissues.",
    "Pneumonia is an infection that inflames air sacs in one or both lungs, which may fill with fluid. Symptoms include cough with phlegm, fever, chills, and difficulty breathing.",
    "Chronic obstructive pulmonary disease (COPD) is a progressive lung disease characterized by increasing breathlessness. It includes emphysema and chronic bronchitis.",
    "The immune system defends the body against infectious agents. White blood cells, antibodies, and other mechanisms work together to identify and eliminate pathogens.",
    "Osteoporosis is a condition where bones become weak and brittle, increasing fracture risk. It's often caused by hormonal changes, calcium deficiency, or certain medications.",
    "Alzheimer's disease is a progressive neurodegenerative disorder affecting memory, thinking, and behavior. It's the most common cause of dementia in older adults.",
    "Asthma is a chronic respiratory condition characterized by inflammation and narrowing of airways, causing wheezing, shortness of breath, and chest tightness.",
    "Rheumatoid arthritis is an autoimmune disease causing joint inflammation, pain, and eventual damage. It typically affects hands, wrists, and knees symmetrically.",
    "Hepatitis refers to liver inflammation, often caused by viral infection. Types A, B, and C are most common, each with different transmission routes and treatments.",
    "Anemia is a condition where the body lacks sufficient healthy red blood cells to carry adequate oxygen. Symptoms include fatigue, weakness, and pale skin.",
    "Gastroesophageal reflux disease (GERD) occurs when stomach acid frequently flows back into the esophagus, causing heartburn and potential tissue damage.",
    "Multiple sclerosis is an autoimmune disease affecting the central nervous system, damaging the myelin sheath that protects nerve fibers.",
    "Parkinson's disease is a progressive neurological disorder affecting movement, causing tremors, stiffness, and balance problems due to dopamine depletion.",
    "Thyroid disorders include hypothyroidism (underactive) and hyperthyroidism (overactive), affecting metabolism, energy levels, and body temperature regulation.",
    "Sepsis is a life-threatening condition arising when the body's response to infection causes tissue damage and organ failure, requiring immediate medical attention.",
    "Epilepsy is a neurological disorder characterized by recurrent seizures, caused by abnormal electrical activity in the brain.",
    "Celiac disease is an autoimmune disorder where gluten consumption damages the small intestine, preventing proper nutrient absorption.",
]

# Save as training corpus
with open('data/medical_corpus.json', 'w') as f:
    json.dump({"text": medical_corpus}, f, indent=2)

print(f"✅ Created medical corpus with {len(medical_corpus)} documents")

medical_qa = [
    {
        "question": "What are the symptoms of hypertension?",
        "answer": "Hypertension often has no symptoms, which is why it's called the 'silent killer.' However, when blood pressure is extremely high, symptoms may include severe headaches, fatigue, vision problems, chest pain, difficulty breathing, and irregular heartbeat. Regular blood pressure monitoring is essential for early detection."
    },
    {
        "question": "How is diabetes diagnosed?",
        "answer": "Diabetes is diagnosed through blood tests. A fasting plasma glucose test showing levels of 126 mg/dL or higher, or an HbA1c test result of 6.5% or higher, indicates diabetes. An oral glucose tolerance test may also be used. Two separate tests showing elevated levels are typically required for diagnosis."
    },
    {
        "question": "What causes osteoporosis?",
        "answer": "Osteoporosis develops when bone resorption exceeds bone formation. Risk factors include aging, hormonal changes (especially menopause), calcium and vitamin D deficiency, sedentary lifestyle, smoking, excessive alcohol consumption, and certain medications like corticosteroids. Genetics also play a role."
    },
]

with open('data/medical_qa.json', 'w') as f:
    json.dump(medical_qa, f, indent=2)

print(f"✅ Created {len(medical_qa)} medical Q&A pairs")

✅ Created medical corpus with 20 documents
✅ Created 3 medical Q&A pairs


# ***Domain Adaptation Training***

In [7]:
import torch
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    TrainingArguments,
    Trainer,
    DataCollatorForLanguageModeling
)
from datasets import load_dataset

In [8]:
from datasets import load_dataset
import json

# Load YOUR raw medical text (created earlier)
dataset = load_dataset(
    "json",
    data_files="data/medical_corpus.json"
)

# Extract list of text entries
corpus = dataset["train"]["text"]

# Limit for free Colab VRAM
corpus = corpus[:200]

# Save a smaller processed version for training
with open("medical_corpus_small.json", "w") as f:
    json.dump({"text": corpus}, f, indent=2)

print(f"✅ Loaded {len(corpus)} medical text samples for domain adaptation")

Generating train split: 0 examples [00:00, ? examples/s]

✅ Loaded 1 medical text samples for domain adaptation


In [9]:
from datasets import load_dataset
import json

# Load your own Q&A dataset
dataset = load_dataset(
    "json",
    data_files="data/medical_qa.json"
)

# Convert to HuggingFace-friendly instruction/response format
instruction_data = []

for item in dataset["train"]:
    instruction_data.append({
        "instruction": item["question"],
        "response": item["answer"]
    })

# Save processed version
with open("medical_qa_processed.json", "w") as f:
    json.dump(instruction_data, f, indent=2)

print(f"✅ Prepared {len(instruction_data)} Q&A instruction samples")

Generating train split: 0 examples [00:00, ? examples/s]

✅ Prepared 3 Q&A instruction samples


In [10]:
import torch
from transformers import AutoModelForCausalLM, AutoTokenizer, TrainingArguments, Trainer, DataCollatorForLanguageModeling
from datasets import Dataset
import json

In [11]:
# Config
MODEL_NAME = "gpt2"  # small model for free GPU
OUTPUT_DIR = "models/medical-gpt2"

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
tokenizer.pad_token = tokenizer.eos_token
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME)

tokenizer_config.json:   0%|          | 0.00/26.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/665 [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/1.04M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/456k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/1.36M [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/548M [00:00<?, ?B/s]

generation_config.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

In [12]:
# Load medical corpus
with open('data/medical_corpus.json', 'r') as f:
    data = json.load(f)

dataset = Dataset.from_dict({"text": data["text"]})
dataset = dataset.train_test_split(test_size=0.2)

In [13]:
# Tokenization
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128, padding="max_length")

tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

Map:   0%|          | 0/16 [00:00<?, ? examples/s]

Map:   0%|          | 0/4 [00:00<?, ? examples/s]

In [14]:
# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [16]:
# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=5,          # reduced for free GPU
    per_device_train_batch_size=2,
    save_steps=50,
    logging_steps=10,
    eval_steps=50,
    learning_rate=5e-5,
    report_to="none"
)

In [17]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator
)

# Train
trainer.train()

`loss_type=None` was set in the config but it is unrecognized. Using the default loss: `ForCausalLMLoss`.


Step,Training Loss
10,2.7452
20,1.8085
30,1.4149
40,1.2875


TrainOutput(global_step=40, training_loss=1.8140163183212281, metrics={'train_runtime': 19.1324, 'train_samples_per_second': 4.181, 'train_steps_per_second': 2.091, 'total_flos': 5225840640000.0, 'train_loss': 1.8140163183212281, 'epoch': 5.0})

In [18]:
# Save
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✅ Domain Adaptation Complete")

✅ Domain Adaptation Complete


# ***Instruction Tuning on Medical Q&A using LoRA***

In [19]:
from peft import LoraConfig, get_peft_model, prepare_model_for_kbit_training
from datasets import Dataset
import json
from transformers import AutoModelForCausalLM, AutoTokenizer, Trainer, TrainingArguments, DataCollatorForLanguageModeling

In [20]:
# Paths
MODEL_PATH = "models/medical-gpt2"
OUTPUT_DIR = "models/medical-assistant"

# Load tokenizer & model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
tokenizer.pad_token = tokenizer.eos_token

model = AutoModelForCausalLM.from_pretrained(MODEL_PATH, load_in_8bit=True, device_map="auto")
model = prepare_model_for_kbit_training(model)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.


In [21]:
# Add LoRA
lora_config = LoraConfig(
    r=8,
    lora_alpha=16,
    target_modules=["c_attn"],
    lora_dropout=0.05,
    bias="none",
    task_type="CAUSAL_LM"
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

trainable params: 294,912 || all params: 124,734,720 || trainable%: 0.2364


In [22]:
# Load Q&A data
with open('data/medical_qa.json', 'r') as f:
    qa_data = json.load(f)

formatted_data = [{"text": f"### Question:\n{qa['question']}\n### Answer:\n{qa['answer']}"} for qa in qa_data]
dataset = Dataset.from_list(formatted_data).train_test_split(test_size=0.2)

In [23]:
# Tokenize
def tokenize_fn(examples):
    return tokenizer(examples["text"], truncation=True, max_length=128, padding="max_length")

tokenized_dataset = dataset.map(tokenize_fn, batched=True, remove_columns=["text"])

Map:   0%|          | 0/2 [00:00<?, ? examples/s]

Map:   0%|          | 0/1 [00:00<?, ? examples/s]

In [24]:
# Data collator
data_collator = DataCollatorForLanguageModeling(tokenizer=tokenizer, mlm=False)

In [25]:

# Training arguments
training_args = TrainingArguments(
    output_dir=OUTPUT_DIR,
    num_train_epochs=3,           # small for free GPU
    per_device_train_batch_size=1,
    gradient_accumulation_steps=2,
    learning_rate=2e-4,
    fp16=True,
    save_steps=20,
    logging_steps=5,
    report_to="none"
)

In [26]:
trainer = Trainer(
    model=model,
    args=training_args,
    train_dataset=tokenized_dataset["train"],
    eval_dataset=tokenized_dataset["test"],
    data_collator=data_collator
)

# Train
trainer.train()

`use_cache=True` is incompatible with gradient checkpointing. Setting `use_cache=False`...
  return fn(*args, **kwargs)


Step,Training Loss


TrainOutput(global_step=3, training_loss=3.378473917643229, metrics={'train_runtime': 2.6856, 'train_samples_per_second': 2.234, 'train_steps_per_second': 1.117, 'total_flos': 393297002496.0, 'train_loss': 3.378473917643229, 'epoch': 3.0})

In [27]:
# Save
trainer.save_model(OUTPUT_DIR)
tokenizer.save_pretrained(OUTPUT_DIR)
print("✅ Instruction Tuning Complete")

✅ Instruction Tuning Complete


# ***Test Medical Assistant***

In [28]:
from peft import PeftModel
from transformers import AutoModelForCausalLM, AutoTokenizer
import torch

# Paths
MODEL_PATH = "models/medical-assistant"
BASE_PATH = "models/medical-gpt2"

# Load model
tokenizer = AutoTokenizer.from_pretrained(MODEL_PATH)
base_model = AutoModelForCausalLM.from_pretrained(BASE_PATH, load_in_8bit=True, device_map="auto")
model = PeftModel.from_pretrained(base_model, MODEL_PATH)
model.eval()

# Test questions
test_questions = [
    "What is hypertension?",
    "How can I prevent diabetes?",
    "What are the symptoms of pneumonia?",
    "Explain what asthma is."
]

for question in test_questions:
    prompt = f"### Question:\n{question}\n### Answer:\n"
    inputs = tokenizer(prompt, return_tensors="pt").to(model.device)

    with torch.no_grad():
        outputs = model.generate(
            **inputs,
            max_new_tokens=100,
            temperature=0.7,
            top_p=0.9,
            do_sample=True
        )

    answer = tokenizer.decode(outputs[0], skip_special_tokens=True).split("### Answer:")[-1].strip()
    print(f"❓ Question: {question}")
    print(f"🏥 Answer: {answer}\n")
    print("-"*50)

The `load_in_4bit` and `load_in_8bit` arguments are deprecated and will be removed in the future versions. Please, pass a `BitsAndBytesConfig` object in `quantization_config` argument instead.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❓ Question: What is hypertension?
🏥 Answer: Diabetes is a chronic condition characterized by insulin resistance. Type 1 diabetes is a blood disorder characterized

--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❓ Question: How can I prevent diabetes?
🏥 Answer: If you are diabetic, you can develop diabetes by eating too much sugar. The body stores excess glucose, causing insulin resistance. Symptoms include abdominal pain, abdominal cramps, and muscle weakness.
### Symptoms: Type 1 diabetes mellitus (fasting), Type 2 diabetes mellitus (diabetes), and Type 3 diabetes mellitus (diabetes). Type 1 and Type 2 diabetes mellitus are often caused by a combination of insulin resistance, high blood sugar levels, and insulin resistance. Type 1 and

--------------------------------------------------


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


❓ Question: What are the symptoms of pneumonia?
🏥 Answer: ### Symptoms include:

### Breathing heavily

### Chest tightness

### Blood clots

### Heartburn

### Pain

### Difficulty breathing

### Difficulty breathing

### Pain sensitivity

### Difficulty breathing

### Difficulty breathing

### Pain sensitivity

### Other common symptoms include:

### Chest tightness

### Blood clots

### Chest tightness

### Blood clots

### Chest tightness

--------------------------------------------------
❓ Question: Explain what asthma is.
🏥 Answer: Question: What is an asthma condition? Anaphylaxis is an allergic reaction to airway gas. It occurs when airways are opened or closed, causing inflammation and swelling. Symptoms include chest pain, shortness of breath, and shortness of breath. Symptoms include chest tightness, weakness, and joint pain. Symptoms include shortness of breath, chest tightness, and weakness. Symptoms include chest tightness, weakness, and joint pain. Symptoms include ligh