## Install relevant packages

In [None]:
%%capture
# Setup cell. The %%capture magic above silences everything this cell prints,
# including the pip output.
import torch

# Query the CUDA compute capability of the current GPU.
# NOTE(review): major_version / minor_version are not referenced by any later
# cell visible in this notebook -- confirm whether they are still needed.
major_version, minor_version = torch.cuda.get_device_capability()

# 1. Install Unsloth straight from its GitHub repository (no version pin)
!pip install git+https://github.com/unslothai/unsloth.git

# 2. Install TRL / PEFT / accelerate / bitsandbytes within the given version
#    bounds; --no-deps stops pip from installing their own dependencies.
!pip install --no-deps "trl<0.9.0" "peft<0.12.0" "accelerate>=0.31.0" "bitsandbytes>=0.43.1"


## Import all relevant packages throughout this walkthrough

In [None]:
"""
# -------------------------------
# Modules for Fine-Tuning
# -------------------------------
def import_fine_tuning_modules():
    global FastLanguageModel, torch, SFTTrainer, is_bfloat16_supported
    from unsloth import FastLanguageModel, is_bfloat16_supported
    import torch
    from trl import SFTTrainer

# -------------------------------
# Hugging Face Modules
# -------------------------------
def import_huggingface_modules():
    global login, TrainingArguments, load_dataset
    from huggingface_hub import login
    from transformers import TrainingArguments
    from datasets import load_dataset

# -------------------------------
# Weights & Biases (WnB)
# -------------------------------
def import_wandb():
    global wandb
    import wandb

# -------------------------------
# Kaggle Secrets
# -------------------------------
def import_kaggle_secrets():
    global UserSecretsClient
    from kaggle_secrets import UserSecretsClient

# -------------------------------
# Call the functions to import all
# -------------------------------
import_fine_tuning_modules()
import_huggingface_modules()
import_wandb()
import_kaggle_secrets()
"""


'\n# -------------------------------\n# Modules for Fine-Tuning\n# -------------------------------\ndef import_fine_tuning_modules():\n    global FastLanguageModel, torch, SFTTrainer, is_bfloat16_supported\n    from unsloth import FastLanguageModel, is_bfloat16_supported\n    import torch\n    from trl import SFTTrainer\n\n# -------------------------------\n# Hugging Face Modules\n# -------------------------------\ndef import_huggingface_modules():\n    global login, TrainingArguments, load_dataset\n    from huggingface_hub import login\n    from transformers import TrainingArguments\n    from datasets import load_dataset\n\n# -------------------------------\n# Weights & Biases (WnB)\n# -------------------------------\ndef import_wandb():\n    global wandb\n    import wandb\n\n# -------------------------------\n# Kaggle Secrets\n# -------------------------------\ndef import_kaggle_secrets():\n    global UserSecretsClient\n    from kaggle_secrets import UserSecretsClient\n\n# ----------

In [None]:
# ------------------------------------------------------------
# Step 1: read the API tokens from the uploaded colab_secrets.json
# ------------------------------------------------------------
import json

with open("/content/colab_secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

# The key names must match the entries of the uploaded JSON file exactly.
HF_TOKEN = secrets["HF_TOKEN"]
WANDB_API_KEY = secrets["Wnb"]

# ------------------------------------------------------------
# Step 2: authenticate with Hugging Face, then Weights & Biases
# ------------------------------------------------------------
from huggingface_hub import login

login(HF_TOKEN)

import wandb

wandb.login(key=WANDB_API_KEY)

# ------------------------------------------------------------
# Step 3: bring the fine-tuning toolkit into the notebook namespace
# ------------------------------------------------------------
from unsloth import FastLanguageModel, is_bfloat16_supported  # model loader + bf16 capability check
import torch                                # PyTorch
from trl import SFTTrainer                  # supervised fine-tuning trainer
from transformers import TrainingArguments  # training hyperparameters
from datasets import load_dataset           # dataset loader

print("üî• All modules imported successfully and Secrets loaded!")


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc
[34m[1mwandb[0m: Currently logged in as: [33mahmedfahim20004[0m ([33mahmedfahim20004-wama[0m) to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


ü¶• Unsloth: Will patch your computer to enable 2x faster free finetuning.
ü¶• Unsloth Zoo will now patch everything to make training faster!
üî• All modules imported successfully and Secrets loaded!


## Create API keys and login to Hugging Face and Weights and Biases

In [None]:
# -------------------------------
# Initialize Hugging Face & WnB tokens
# -------------------------------
# user_secrets = UserSecretsClient()  # from kaggle_secrets import UserSecretsClient
# hugging_face_token = user_secrets.get_secret("HF_Token")
# wnb_token = user_secrets.get_secret("wnb")

# -------------------------------
# Login to Hugging Face
# -------------------------------
# login(hugging_face_token)  # from huggingface_hub import login

# -------------------------------
# Login to WnB
# -------------------------------
# wandb.login(key=wnb_token)  # import wandb
# run = wandb.init(
#     project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset_YouTube Walkthrough',
#     job_type="training",
#     anonymous="allow"
# )


In [None]:
# ---------- Step 1: read the tokens from the uploaded secrets file ----------
import json

with open("/content/colab_secrets.json") as secrets_file:
    secrets = json.load(secrets_file)

hugging_face_token = secrets["HF_TOKEN"]  # Hugging Face access token
wnb_token = secrets["Wnb"]                # Weights & Biases API key

# ---------- Step 2: authenticate, then open a W&B run ----------
from huggingface_hub import login

login(hugging_face_token)

import wandb

wandb.login(key=wnb_token)

# `run` is the handle of the W&B run created for this training session
run = wandb.init(
    anonymous="allow",
    job_type="training",
    project='Fine-tune-DeepSeek-R1-Distill-Llama-8B on Medical COT Dataset_WAMA_code',
)

# ---------- Step 3: import everything the fine-tuning cells rely on ----------
from unsloth import FastLanguageModel, is_bfloat16_supported  # model loader + bf16 capability check
import torch                                # PyTorch
from trl import SFTTrainer                  # supervised fine-tuning trainer
from transformers import TrainingArguments  # training hyperparameters
from datasets import load_dataset           # dataset loader

print("üî• All modules imported successfully and Secrets loaded!")


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


[34m[1mwandb[0m: Detected [openai] in use.
[34m[1mwandb[0m: Use W&B Weave for improved LLM call tracing. Install Weave with `pip install weave` then add `import weave` to the top of your script.
[34m[1mwandb[0m: For more information, check out the docs at: https://weave-docs.wandb.ai/


üî• All modules imported successfully and Secrets loaded!


## Loading DeepSeek R1 and the Tokenizer

**What are we doing in this step?**

In this step, we **load the DeepSeek R1 model and its tokenizer** using `FastLanguageModel.from_pretrained()`. We also **configure key parameters** for efficient inference and fine-tuning. We will be using a distilled 8B version of R1 for faster computation.  

**Key parameters explained**
```py
max_seq_length = 2048  # Define the maximum sequence length a model can handle (i.e., number of tokens per input)
dtype = None  # Default data type (usually auto-detected)
load_in_4bit = True  # Enables 4-bit quantization – a memory-saving optimization
```

**Intuition behind 4-bit quantization**

Imagine compressing a **high-resolution image** to a smaller size—**it takes up less space but still looks good enough**. Similarly, **4-bit quantization reduces the precision of model weights**, making the model **smaller and faster while keeping most of its accuracy**. Instead of storing precise **32-bit or 16-bit numbers**, we compress them into **4-bit values**. This allows **large language models to run efficiently on consumer GPUs** without needing massive amounts of memory.

In [None]:
# Loading configuration
max_seq_length = 2048  # maximum number of tokens per sequence handed to the loader
dtype = None           # None: do not force a dtype, keep the loader's default
load_in_4bit = True    # request 4-bit quantized weights to reduce memory use

# Gather the loader arguments once, then unpack them into from_pretrained
# (FastLanguageModel is provided by `from unsloth import FastLanguageModel`).
load_kwargs = dict(
    model_name="unsloth/DeepSeek-R1-Distill-Llama-8B",  # distilled 8B checkpoint
    max_seq_length=max_seq_length,
    dtype=dtype,
    load_in_4bit=load_in_4bit,
    token=hugging_face_token,  # Hugging Face token read from the secrets file
)
model, tokenizer = FastLanguageModel.from_pretrained(**load_kwargs)

==((====))==  Unsloth 2025.11.4: Fast Llama patching. Transformers: 4.57.1.
   \\   /|    Tesla T4. Num GPUs = 1. Max memory: 14.741 GB. Platform: Linux.
O^O/ \_/ \    Torch: 2.9.0+cu126. CUDA: 7.5. CUDA Toolkit: 12.6. Triton: 3.5.0
\        /    Bfloat16 = FALSE. FA [Xformers = None. FA2 = False]
 "-____-"     Free license: http://github.com/unslothai/unsloth
Unsloth: Fast downloading is enabled - ignore downloading bars which are red colored!


## Testing DeepSeek R1 on a medical use-case before fine-tuning


### Defining a system prompt
To create a prompt style for the model, we will define a system prompt and include placeholders for the question and response generation. The prompt will guide the model to think step-by-step and provide a logical, accurate response.

In [None]:
# Inference prompt template. It has two `{}` placeholders filled via str.format:
#   1st -> the medical question
#   2nd -> text placed right after the opening <think> tag (the inference cells pass "")
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>{}"""

### Running inference on the model

In this step, we **test the DeepSeek R1 model** by providing a **medical question** and generating a response.  
The process involves the following steps:

1. **Define a test question** related to a medical case.
2. **Format the question using the structured prompt (`prompt_style`)** to ensure the model follows a logical reasoning process.
3. **Tokenize the input and move it to the GPU (`cuda`)** for faster inference.
4. **Generate a response using the model**, specifying key parameters like `max_new_tokens=1200` (limits response length).
5. **Decode the output tokens back into text** to obtain the final readable answer.

In [None]:
# Medical test question used to probe the model before any fine-tuning
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or
              sneezing but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""

# Switch the Unsloth model into its optimized inference mode
FastLanguageModel.for_inference(model)

# Fill the prompt template (second placeholder left empty), then tokenize it
# as a one-element batch of PyTorch tensors placed on the GPU
prompt_text = prompt_style.format(question, "")
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

# Generation settings: at most 1200 new tokens, caching enabled
generation_kwargs = dict(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
outputs = model.generate(**generation_kwargs)

# Convert the generated token ids back to text (one string per batch entry)
response = tokenizer.batch_decode(outputs)

# Show only the part of the first entry that follows the "### Response:" marker
decoded_text = response[0]
print(decoded_text.split("### Response:")[1])


<think>
Okay, so I'm trying to figure out what cystometry would show for this 61-year-old woman. Let me start by breaking down the information given.

She has a history of involuntary urine loss, especially when she coughs or sneezes. That makes me think of stress urinary incontinence, which is common in women. But it's specifically during activities that put pressure on the bladder, so maybe it's due to urethral issues rather than bladder capacity. Also, she doesn't leak at night, which suggests that her bladder doesn't have problems holding urine when lying down, so it's less likely to be related to diurnal incontinence.

She undergoes a gynecological exam and Q-tip test. I'm not super familiar with the Q-tip test, but I think it's used to check for urethral obstruction or uretal calculus. So maybe the Q-tip test was normal, indicating that there's no obstruction or stone causing the issue. Alternatively, if it was abnormal, that could point towards something else, like a narrowing 

>**Before starting fine-tuning ‚Äî why are we fine-tuning in the first place?**
>
> Even without fine-tuning, our model successfully generated a chain of thought and provided reasoning before delivering the final answer. The reasoning process is encapsulated within the `<think>` `</think>` tags. So, why do we still need fine-tuning? The reasoning process, while detailed, was long-winded and not concise. Additionally, we want the final answer to be consistent in a certain style.



## Fine-tuning step by step

### Step 1 — Update the system prompt
We will slightly change the prompt style used to process the dataset: the chain of thought from the `Complex_CoT` column is wrapped in `<think>` … `</think>` tags, and a third placeholder is added for the final response.

In [None]:
# Training prompt template. It has three `{}` placeholders filled via str.format:
#   1st -> question
#   2nd -> chain of thought, wrapped in <think> ... </think>
#   3rd -> final response
train_prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.
Before answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.

### Instruction:
You are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning.
Please answer the following medical question.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""


### Step 2 — Download the fine-tuning dataset and format it for fine-tuning

We will use the Medical O1 Reasoning SFT dataset, found on [Hugging Face](https://huggingface.co/datasets/FreedomIntelligence/medical-o1-reasoning-SFT). From the authors: This dataset is used to fine-tune HuatuoGPT-o1, a medical LLM designed for advanced medical reasoning. This dataset is constructed using GPT-4o, which searches for solutions to verifiable medical problems and validates them through a medical verifier.

In [None]:
# Download the first 500 training rows of the English ("en") configuration from
# the Hugging Face Hub (load_dataset comes from `from datasets import load_dataset`).
# `trust_remote_code` is deliberately not passed: the installed `datasets`
# release reports it as "not supported anymore" and logs an error when it is set.
dataset = load_dataset(
    "FreedomIntelligence/medical-o1-reasoning-SFT",
    "en",
    split="train[0:500]",  # keep only the first 500 rows
)
dataset

`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'FreedomIntelligence/medical-o1-reasoning-SFT' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.
ERROR:datasets.load:`trust_remote_code` is not supported anymore.
Please check that the Hugging Face dataset 'FreedomIntelligence/medical-o1-reasoning-SFT' isn't based on a loading script and remove `trust_remote_code`.
If the dataset is based on a loading script, please ask the dataset author to remove it and convert it to a standard format like Parquet.


Dataset({
    features: ['Question', 'Complex_CoT', 'Response'],
    num_rows: 500
})

In [None]:
# Show an entry from the dataset
dataset[1]

{'Question': 'A 33-year-old woman is brought to the emergency department 15 minutes after being stabbed in the chest with a screwdriver. Given her vital signs of pulse 110/min, respirations 22/min, and blood pressure 90/65 mm Hg, along with the presence of a 5-cm deep stab wound at the upper border of the 8th rib in the left midaxillary line, which anatomical structure in her chest is most likely to be injured?',
 'Complex_CoT': "Okay, let's figure out what's going on here. A woman comes in with a stab wound from a screwdriver. It's in her chest, upper border of the 8th rib, left side, kind of around the midaxillary line. First thought, that's pretty close to where the lung sits, right?\n\nLet's talk about location first. This spot is along the left side of her body. Above the 8th rib, like that, is where a lot of important stuff lives, like the bottom part of the left lung, possibly the diaphragm too, especially considering how deep the screwdriver went.\n\nThe wound is 5 cm deep. Tha

>**Next step is to structure the fine-tuning dataset according to train prompt style‚Äîwhy?**
>
> - Each question is paired with chain-of-thought reasoning and the final response.
> - Ensures every training example follows a consistent pattern.
> - Prevents the model from continuing beyond the expected response lengt by adding the EOS token.

In [None]:
# Fetch the tokenizer's end-of-sequence token. It is appended to every formatted
# training example so that the model learns where a response should stop.
EOS_TOKEN = tokenizer.eos_token  # EOS token as exposed by the tokenizer
EOS_TOKEN

'<ÔΩúend‚ñÅof‚ñÅsentenceÔΩú>'

In [None]:
# Batch formatter meant to be used with dataset.map(..., batched=True)
def formatting_prompts_func(examples):
    """Render every example of a batch into a single training string.

    Args:
        examples: batch mapping with "Question", "Complex_CoT" and "Response"
            entries holding parallel sequences.

    Returns:
        A dict with one key, "text": the list of prompts produced by filling
        train_prompt_style with (question, chain of thought, response) and
        appending EOS_TOKEN to each.
    """
    rows = zip(examples["Question"], examples["Complex_CoT"], examples["Response"])
    return {
        "text": [
            train_prompt_style.format(query, reasoning, answer) + EOS_TOKEN
            for query, reasoning, answer in rows
        ],
    }

In [None]:
# Run the batch formatter over the dataset; its returned "text" list becomes a new column
dataset_finetune = dataset.map(formatting_prompts_func, batched=True)

# Preview the first formatted training example
dataset_finetune[0]["text"]

"Below is an instruction that describes a task, paired with an input that provides further context. \nWrite a response that appropriately completes the request. \nBefore answering, think carefully about the question and create a step-by-step chain of thoughts to ensure a logical and accurate response.\n\n### Instruction:\nYou are a medical expert with advanced knowledge in clinical reasoning, diagnostics, and treatment planning. \nPlease answer the following medical question. \n\n### Question:\nGiven the symptoms of sudden weakness in the left arm and leg, recent long-distance travel, and the presence of swollen and tender right lower leg, what specific cardiac abnormality is most likely to be found upon further evaluation that could explain these findings?\n\n### Response:\n<think>\nOkay, let's see what's going on here. We've got sudden weakness in the person's left arm and leg - and that screams something neuro-related, maybe a stroke?\n\nBut wait, there's more. The right lower leg i

### Step 3 — Setting up the model using LoRA

**An intuitive explanation of LoRA**

Large language models (LLMs) have **millions or even billions of weights** that determine how they process and generate text. When fine-tuning a model, we usually update all these weights, which **requires massive computational resources and memory**.

LoRA (**Low-Rank Adaptation**) makes fine-tuning efficient:

- Instead of modifying all weights, **LoRA adds small, trainable adapters** to specific layers.  
- These adapters **capture task-specific knowledge** while leaving the original model unchanged.  
- This reduces the number of trainable parameters **by more than 90%**, making fine-tuning **faster and more memory-efficient**.  

Think of an LLM as a **complex factory**. Instead of rebuilding the entire factory to produce a new product, LoRA **adds small, specialized tools** to existing machines. This allows the factory to adapt quickly **without disrupting its core structure**.

For a more technical explanation, check out this tutorial by [Sebastian Raschka](https://www.youtube.com/watch?v=rgmJep4Sb4&t).

Below, we will use the `get_peft_model()` function (PEFT stands for Parameter-Efficient Fine-Tuning) — this function wraps the base model (`model`) with LoRA modifications, ensuring that only specific parameters are trained.

In [None]:
# Projection layers that receive LoRA adapters:
# attention (q/k/v/o) followed by the feed-forward block (gate/up/down).
lora_target_modules = [
    "q_proj", "k_proj", "v_proj", "o_proj",
    "gate_proj", "up_proj", "down_proj",
]

# Wrap the loaded model with LoRA adapters through Unsloth's PEFT helper
model_lora = FastLanguageModel.get_peft_model(
    model,
    target_modules=lora_target_modules,
    r=16,                                  # LoRA rank (size of the trainable adapters)
    lora_alpha=16,                         # scaling factor applied to the LoRA update
    lora_dropout=0,                        # no dropout on the adapter layers
    bias="none",                           # do not train bias terms
    use_gradient_checkpointing="unsloth",  # Unsloth's gradient-checkpointing mode
    use_rslora=False,                      # rank-stabilized LoRA turned off
    loftq_config=None,                     # no LoftQ configuration
    random_state=3407,                     # fixed seed for reproducibility
)

Unsloth 2025.11.4 patched 32 layers with 32 QKV layers, 32 O layers and 32 MLP layers.


Now, we initialize `SFTTrainer`, a supervised fine-tuning trainer from `trl` (Transformer Reinforcement Learning), to fine-tune our model efficiently on a dataset.

In [None]:
# # Initialize the fine-tuning trainer ‚Äî Imported using from trl import SFTTrainer
# trainer = SFTTrainer(
#     model=model_lora,  # The model to be fine-tuned
#     tokenizer=tokenizer,  # Tokenizer to process text inputs
#     train_dataset=dataset_finetune,  # Dataset used for training
#     dataset_text_field="text",  # Specifies which field in the dataset contains training text
#     max_seq_length=max_seq_length,  # Defines the maximum sequence length for inputs
#     dataset_num_proc=2,  # Uses 2 CPU threads to speed up data preprocessing
#
#     # Define training arguments
#     args=TrainingArguments(
#         per_device_train_batch_size=2,  # Number of examples processed per device (GPU) at a time
#         gradient_accumulation_steps=4,  # Accumulate gradients over 4 steps before updating weights
#         num_train_epochs=1, # Full fine-tuning run
#         warmup_steps=5,  # Gradually increases learning rate for the first 5 steps
#         max_steps=60,  # Limits training to 60 steps (useful for debugging; increase for full fine-tuning)
#         learning_rate=2e-4,  # Learning rate for weight updates (tuned for LoRA fine-tuning)
#         fp16=not is_bfloat16_supported(),  # Use FP16 (if BF16 is not supported) to speed up training
#         bf16=is_bfloat16_supported(),  # Use BF16 if supported (better numerical stability on newer GPUs)
#         logging_steps=10,  # Logs training progress every 10 steps
#         optim="adamw_8bit",  # Uses memory-efficient AdamW optimizer in 8-bit mode
#         weight_decay=0.01,  # Regularization to prevent overfitting
#         lr_scheduler_type="linear",  # Uses a linear learning rate schedule
#         seed=3407,  # Sets a fixed seed for reproducibility
#         output_dir="outputs",  # Directory where fine-tuned model checkpoints will be saved
#     ),
# )

In [None]:
from trl import SFTTrainer
from transformers import TrainingArguments
from unsloth import is_bfloat16_supported

# Pick the mixed-precision mode once: bf16 when the GPU supports it, fp16 otherwise
use_bf16 = is_bfloat16_supported()

# Hyperparameters for the run, grouped by concern
training_args = TrainingArguments(
    output_dir="outputs",
    seed=3407,
    # Batching: 2 examples per device, gradients accumulated over 4 steps
    per_device_train_batch_size=2,
    gradient_accumulation_steps=4,
    # Schedule: 60 optimizer steps in total, the first 5 used for warm-up
    max_steps=60,
    warmup_steps=5,
    learning_rate=2e-4,
    lr_scheduler_type="linear",
    # Optimizer
    optim="adamw_8bit",
    weight_decay=0.01,
    # Exactly one of the two precision flags is True
    bf16=use_bf16,
    fp16=not use_bf16,
    # Log every step and report the metrics to Weights & Biases
    logging_steps=1,
    report_to="wandb",
)

# Supervised fine-tuning trainer over the formatted "text" column
trainer = SFTTrainer(
    model=model_lora,
    tokenizer=tokenizer,
    train_dataset=dataset_finetune,
    dataset_text_field="text",
    max_seq_length=max_seq_length,
    dataset_num_proc=2,
    args=training_args,
)



Map (num_proc=2):   0%|          | 0/500 [00:00<?, ? examples/s]

## Step 4 — Model training!

This should take around 30 to 40 minutes — we can then check out our training results on Weights and Biases

In [None]:
# Start the fine-tuning run; whatever trainer.train() returns is kept in trainer_stats
trainer_stats = trainer.train()

==((====))==  Unsloth - 2x faster free finetuning | Num GPUs used = 1
   \\   /|    Num examples = 500 | Num Epochs = 1 | Total steps = 60
O^O/ \_/ \    Batch size per device = 2 | Gradient accumulation steps = 4
\        /    Data Parallel GPUs = 1 | Total batch size (2 x 4 x 1) = 8
 "-____-"     Trainable parameters = 41,943,040 of 8,072,204,288 (0.52% trained)


Unsloth: Will smartly offload gradients to save VRAM!


Step,Training Loss
1,2.1486
2,2.1517
3,2.206
4,2.0449
5,2.2914
6,2.1796
7,2.1411
8,2.217
9,1.9949
10,2.0075


In [None]:
# Finish (close) the active Weights & Biases run.
# NOTE: this call does not save the model -- saving happens in the "save the model" cell further down.
wandb.finish()

0,1
train/epoch,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà
train/global_step,‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñÜ‚ñá‚ñá‚ñá‚ñà‚ñà‚ñà‚ñà
train/grad_norm,‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÉ‚ñÖ‚ñÜ‚ñá‚ñá‚ñà‚ñà‚ñà‚ñá‚ñá‚ñá‚ñÖ‚ñÜ‚ñÖ‚ñÖ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÅ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÇ
train/learning_rate,‚ñÅ‚ñÇ‚ñÖ‚ñá‚ñà‚ñà‚ñá‚ñá‚ñá‚ñá‚ñá‚ñá‚ñÜ‚ñÜ‚ñÜ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÑ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÅ
train/loss,‚ñá‚ñá‚ñà‚ñà‚ñá‚ñÜ‚ñÜ‚ñá‚ñÖ‚ñÑ‚ñÖ‚ñÖ‚ñÑ‚ñÑ‚ñÉ‚ñÖ‚ñÉ‚ñÑ‚ñÇ‚ñÉ‚ñÇ‚ñÑ‚ñÉ‚ñÉ‚ñÅ‚ñÉ‚ñÇ‚ñÉ‚ñÇ‚ñÇ‚ñÇ‚ñÅ‚ñÑ‚ñÇ‚ñÅ‚ñÇ‚ñÅ‚ñÇ‚ñÇ‚ñÇ

0,1
total_flos,1.68242321903616e+16
train/epoch,0.96
train/global_step,60.0
train/grad_norm,1e-05
train/learning_rate,0.0
train/loss,1.6593
train_loss,1.83068
train_runtime,1217.2259
train_samples_per_second,0.394
train_steps_per_second,0.049


## Step 5 — Run model inference after fine-tuning

In [None]:
# Same clinical scenario as in the pre-fine-tuning test, now sent to the LoRA model
question = """A 61-year-old woman with a long history of involuntary urine loss during activities like coughing or sneezing
              but no leakage at night undergoes a gynecological exam and Q-tip test. Based on these findings,
              what would cystometry most likely reveal about her residual volume and detrusor contractions?"""

# Switch the fine-tuned model into Unsloth's optimized inference mode
FastLanguageModel.for_inference(model_lora)

# Fill the prompt template (second placeholder left empty), then tokenize it
# as a one-element batch of PyTorch tensors placed on the GPU
prompt_text = prompt_style.format(question, "")
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

# Generation settings: at most 1200 new tokens, caching enabled
generation_kwargs = dict(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
outputs = model_lora.generate(**generation_kwargs)

# Convert the generated token ids back to text (one string per batch entry)
response = tokenizer.batch_decode(outputs)

# Show only the part of the first entry that follows the "### Response:" marker
decoded_text = response[0]
print(decoded_text.split("### Response:")[1])


<think>
Okay, so I'm trying to figure out what cystometry would show for this woman. She's 61 and has been dealing with involuntary urine loss whenever she coughs or sneezes, but she doesn't leak at night. She had a gynecological exam and a Q-tip test done. 

First, I know that involuntary urine loss during activities like coughing or sneezing is often related to an overactive bladder or hyperreflexia. That usually means the detrusor muscle is contracting more strongly and more frequently than it should. So, when they did the Q-tip test, they probably checked for bladder contractions.

The Q-tip test involves inserting a catheter into the urethra, and then they measure how much the bladder contracts. If the contractions are too strong or too frequent, it might indicate hyperreflexia. 

Now, about the gynecological exam‚Äîthis might have involved looking at the pelvic area to check for signs of prolapse or other structural issues. But since there's no mention of leakage at night, it's 

In [None]:
# Second clinical scenario for the fine-tuned model
question = """A 59-year-old man presents with a fever, chills, night sweats, and generalized fatigue,
              and is found to have a 12 mm vegetation on the aortic valve. Blood cultures indicate gram-positive, catalase-negative,
              gamma-hemolytic cocci in chains that do not grow in a 6.5% NaCl medium.
              What is the most likely predisposing factor for this patient's condition?"""

# Fill the prompt template (second placeholder left empty), then tokenize it
# as a one-element batch of PyTorch tensors placed on the GPU
prompt_text = prompt_style.format(question, "")
inputs = tokenizer([prompt_text], return_tensors="pt").to("cuda")

# Generation settings: at most 1200 new tokens, caching enabled
generation_kwargs = dict(
    input_ids=inputs.input_ids,
    attention_mask=inputs.attention_mask,
    max_new_tokens=1200,
    use_cache=True,
)
outputs = model_lora.generate(**generation_kwargs)

# Convert the generated token ids back to text (one string per batch entry)
response = tokenizer.batch_decode(outputs)

# Show only the part of the first entry that follows the "### Response:" marker
decoded_text = response[0]
print(decoded_text.split("### Response:")[1])


<think>
Alright, let's break this down step by step. 

First, the patient is a 59-year-old man presenting with symptoms like fever, chills, night sweats, and generalized fatigue. These symptoms are classic indicators of an active infection, likely a systemic one given the widespread nature of his fatigue.

Next, the physical examination reveals a vegetation on the aortic valve measuring 12 mm. Vegetations on heart valves are most commonly associated with infections caused by bacteria, known as bacterial endocarditis. The presence of these vegetations suggests that the patient's condition is not due to a simple fever or infection elsewhere, but rather a more severe and systemic infection.

Now, looking at the blood culture results: gram-positive, catalase-negative, gamma-hemolytic cocci in chains that do not grow in a 6.5% NaCl medium. Let's dissect this information.

Gram-positive bacteria are a large group that includes many pathogenic bacteria. However, catalase-negative bacteria ar

## Save the model

In [None]:
# 1. Save the model and tokenizer locally (Save Locally)
# save_pretrained() writes the weights of `model` as they are; it does not perform a
# merged 16-bit export. Unsloth offers a separate merged-save API
# (save_pretrained_merged(..., save_method="merged_16bit")) for that purpose —
# verify against the installed Unsloth version before switching.
# NOTE(review): this cell saves `model`, while the chat cell further down runs
# inference on `model_lora` — confirm which object holds the fine-tuned weights.
import shutil

print("💾 Saving model locally...")
model.save_pretrained("medical_chatbot_model")
tokenizer.save_pretrained("medical_chatbot_model")

# 2. Compress the model directory into model_backup.zip
# shutil.make_archive rebuilds the archive from scratch on every run; `zip -r` would
# update an existing archive in place and could keep stale entries from earlier runs.
# The entries keep the "medical_chatbot_model/" prefix inside the archive.
print("📦 Zipping the model...")
shutil.make_archive("model_backup", "zip", root_dir=".", base_dir="medical_chatbot_model")

# 3. Download the file to your computer immediately (Colab-only helper)
print("⬇️ Downloading to your computer...")
from google.colab import files
files.download('model_backup.zip')


üíæ Saving model locally...
üì¶ Zipping the model...
  adding: medical_chatbot_model/ (stored 0%)
  adding: medical_chatbot_model/chat_template.jinja (deflated 75%)
  adding: medical_chatbot_model/model.safetensors.index.json (deflated 97%)
  adding: medical_chatbot_model/tokenizer_config.json (deflated 96%)
  adding: medical_chatbot_model/config.json (deflated 56%)
  adding: medical_chatbot_model/generation_config.json (deflated 36%)
  adding: medical_chatbot_model/tokenizer.json (deflated 85%)
  adding: medical_chatbot_model/special_tokens_map.json (deflated 69%)
  adding: medical_chatbot_model/model-00002-of-00002.safetensors (deflated 21%)
  adding: medical_chatbot_model/model-00001-of-00002.safetensors (deflated 9%)
‚¨áÔ∏è Downloading to your computer...


<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
# Standalone download cell: offers model_backup.zip to the browser again without
# re-saving or re-zipping the model. Requires the archive to exist already and only
# works inside Google Colab (google.colab is not available elsewhere).
from google.colab import files
files.download('model_backup.zip')

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>

In [None]:
import gradio as gr
import torch
from transformers import TextIteratorStreamer
from threading import Thread
import os
import random

# 1. Enable inference mode
# Best effort: the UI can still run if this call fails, so the error is reported and
# execution continues. Only Exception is caught so that KeyboardInterrupt /
# SystemExit are no longer swallowed.
try:
    FastLanguageModel.for_inference(model_lora)
except Exception as exc:
    print(f"Could not enable inference mode ({exc!r}); continuing with the model as-is.")

# --- Image setup ---
# Prefer a locally uploaded avatar picture; otherwise use the hosted default icon.
MY_AVATAR_PATH = "/content/doctor.jpg"
bot_avatar = (
    MY_AVATAR_PATH
    if os.path.exists(MY_AVATAR_PATH)
    else "https://cdn-icons-png.flaticon.com/512/1698/1698535.png"
)

# --- Core variables ---
# Prompt template for the chat UI. It has three positional `{}` slots, filled via
# str.format in this order:
#   1. the user's question (placed under "### Question:"),
#   2. the text placed between <think> and </think>,
#   3. the text that follows </think>.
# medical_chat_stream fills slots 2 and 3 with empty strings, so the prompt sent to
# the model ends with an empty, already-closed <think> block.
prompt_style = """Below is an instruction that describes a task, paired with an input that provides further context.
Write a response that appropriately completes the request.

### Instruction:
You are FEMOZ AI. Provide the diagnosis and treatment plan immediately. Be extremely concise.

### Question:
{}

### Response:
<think>
{}
</think>
{}"""

# --- Function to generate a short clinical case ---
def _first_sentences(text, limit=2):
    """Return at most `limit` leading sentences of `text`, joined by single spaces.

    Sentences are split at whitespace that follows '.', '!' or '?'. Splitting on
    whitespace (instead of on every '.') keeps decimal values such as "38.5" intact
    and never appends an extra period to a single-sentence text.
    """
    import re  # local import: `re` is not among this cell's imports

    sentences = re.split(r"(?<=[.!?])\s+", text.strip())
    return " ".join(sentences[:limit])


def generate_ai_case():
    """Ask the model for a short (two-sentence) clinical vignette.

    A random patient descriptor is chosen, the model is sampled for up to 60 new
    tokens, and everything up to and including the closing </think> tag is discarded.
    The remainder is trimmed to its first two sentences.

    Returns:
        str: The generated vignette, or a fixed fallback case when generation fails
        or produces no text (so the input box never ends up empty).
    """
    fallback_case = "Male 50yo, chest pain and sweating. BP 100/60."
    starters = ["Male 50yo", "Female 30yo", "Child 6yo", "Elderly 75yo"]
    chosen_start = random.choice(starters)

    creation_prompt = f"""### Instruction:
Write a 2-sentence clinical vignette starting with "{chosen_start}".
Include only: Chief complaint and Vitals. No fluff.

### Response:
<think>
Generating short vignette...
</think>
"""
    try:
        inputs = tokenizer([creation_prompt], return_tensors="pt").to("cuda")
        outputs = model_lora.generate(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            max_new_tokens=60,
            temperature=0.8,
            do_sample=True,
            repetition_penalty=1.2
        )
        text = tokenizer.batch_decode(outputs, skip_special_tokens=True)[0]
        # The decoded text still contains the prompt: keep only what comes after the
        # last "### Response:" marker and after the closing </think> tag.
        case = text.split("### Response:")[-1].replace("<think>", "").split("</think>")[-1].strip()

        return _first_sentences(case) or fallback_case
    except Exception as exc:
        # Best effort: the UI should still get a usable case, but the reason for the
        # failure is printed instead of being silently discarded.
        print(f"generate_ai_case failed ({exc!r}); using the fallback case.")
        return fallback_case

# --- Chat logic ---
def medical_chat_stream(message, history):
    """Stream the model's answer to `message` into the Gradio chat history.

    Generator used as a Gradio event handler. Every yield is `("", history)`: an
    empty string that clears the input textbox, and the updated chat history.

    Generation runs in a background thread and its text is consumed through a
    TextIteratorStreamer. Text between <think> and </think> is hidden; text after
    </think> is wrapped in the "RAPID DIAGNOSIS" panel. The panel therefore only
    appears when the generated text itself contains </think>.

    Args:
        message: Text entered by the user. Blank input is ignored.
        history: List of `[user_text, bot_html]` pairs, or None for a new chat.
            The list is extended and updated in place.

    Yields:
        tuple: `("", history)` after each streamed chunk, or once with an error
        message in the last bot slot if generation fails or times out.
    """
    import queue  # local import: only needed to recognise a streamer timeout

    # Longest wait for the next streamed chunk. Without a timeout, an exception in
    # the generation thread would leave the streamer loop blocked forever.
    stream_timeout_s = 120.0

    if history is None: history = []

    # Nothing to diagnose: do not spend a generation on blank input.
    if not message or not message.strip():
        yield "", history
        return

    history.append([message, ""])

    instruction = """
    (Format:
     1. Diagnosis: [Name Only]
     2. Plan: [3 Bullet Points Max]
     Do not explain. Be direct.)
    """

    try:
        prompt = prompt_style.format(message + instruction, "", "")
        inputs = tokenizer([prompt], return_tensors="pt").to("cuda")
        streamer = TextIteratorStreamer(
            tokenizer,
            skip_prompt=True,
            skip_special_tokens=True,
            timeout=stream_timeout_s,
        )

        # early_stopping is intentionally not passed: it only applies to beam search,
        # which is not used here (sampling with a single beam).
        generation_kwargs = dict(
            input_ids=inputs.input_ids,
            attention_mask=inputs.attention_mask,
            streamer=streamer,
            max_new_tokens=300,
            do_sample=True,
            temperature=0.4,
            top_p=0.9,
            repetition_penalty=1.2,
            no_repeat_ngram_size=3,
        )

        thread = Thread(target=model_lora.generate, kwargs=generation_kwargs)
        thread.start()

        partial_text = ""
        for new_text in streamer:
            partial_text += new_text

            # Hide the "thinking" block and show only the final result
            formatted_text = partial_text.replace(
                "<think>",
                "<span style='display:none'>"
            ).replace(
                "</think>",
                "</span><div class='cyber-panel result'><div class='panel-header'>⚡ RAPID DIAGNOSIS</div><div class='panel-content'>"
            )

            # Every </think> opens two <div>s that stay open (panel + panel-content);
            # close both so the emitted HTML is balanced at each streaming step.
            formatted_text += "</div></div>" * partial_text.count("</think>")

            history[-1][1] = formatted_text
            yield "", history

    except queue.Empty:
        # Raised by the streamer when no chunk arrived in time, e.g. because the
        # generation thread failed.
        history[-1][1] = (
            f"⚠️ Error: no output from the model within {stream_timeout_s:.0f}s (timeout)."
        )
        yield "", history
    except Exception as e:
        history[-1][1] = f"⚠️ Error: {str(e)}"
        yield "", history

# --- 🎨 UI Design (Dark & Compact) ---
# Stylesheet passed to gr.Blocks(css=...). Selector groups:
#   .gradio-container             page background, font and maximum width
#   .header-cyber / .title-cyber / .subtitle-cyber
#                                 the HTML header rendered at the top of the page
#   .chatbot                      the chat window (matches elem_classes="chatbot")
#   .message-wrap .message.*      user and bot message bubbles
#   .cyber-panel.result ...       the result panel HTML emitted by medical_chat_stream
#   button.primary                the primary action button
custom_css = """
.gradio-container {
    background-color: #0f172a !important;
    font-family: 'Segoe UI', sans-serif !important;
    max-width: 900px !important;
    padding: 15px !important;
    margin: 0 auto !important;
}

.header-cyber {
    background: linear-gradient(to right, #1e293b, #0f172a);
    border-bottom: 2px solid #3b82f6;
    padding: 20px;
    text-align: center;
    margin-bottom: 15px;
    border-radius: 8px;
}
.title-cyber {
    color: #ffffff;
    font-size: 2.2em;
    font-weight: bold;
    letter-spacing: 2px;
    margin: 0;
}
.subtitle-cyber {
    color: #94a3b8;
    font-size: 0.9em;
    margin-top: 5px;
    letter-spacing: 1px;
    text-transform: uppercase;
}

.chatbot {
    height: 500px !important;
    background-color: #1e293b !important;
    border: 1px solid #334155 !important;
    border-radius: 8px;
}

/* User message */
.message-wrap .message.user {
    background-color: #2563eb !important;
    color: #ffffff !important;
    border-radius: 6px;
    padding: 10px 15px !important;
    font-size: 14px;
}

/* Bot message container */
.message-wrap .message.bot {
    background-color: transparent !important;
    border: none !important;
    padding: 0 !important;
}

/* Diagnosis result panel */
.cyber-panel.result {
    border-left: 4px solid #10b981;
    background: rgba(6, 78, 59, 0.3);
    border-radius: 4px;
    margin-top: 5px;
}
.cyber-panel.result .panel-header {
    background: rgba(16, 185, 129, 0.1);
    color: #34d399;
    padding: 5px 10px;
    font-weight: bold;
    font-size: 0.85em;
}
.cyber-panel.result .panel-content {
    padding: 10px;
    color: #e2e8f0;
    font-size: 1em;
    line-height: 1.4;
}

/* Buttons */
button.primary {
    background-color: #3b82f6 !important;
    color: white !important;
    font-weight: bold;
}
"""

theme = gr.themes.Base(primary_hue="blue", neutral_hue="slate")

# Page layout: branded header, chat window, then one row holding the input box and
# the three action buttons. `custom_css` supplies the dark styling.
with gr.Blocks(theme=theme, css=custom_css) as demo:

    # New header with the required title
    gr.HTML("""
    <div class="header-cyber">
        <div class="title-cyber">FEMOZ AI</div>
        <div class="subtitle-cyber">CLINICAL DECISION SUPPORT SYSTEM</div>
    </div>
    """)

    with gr.Row():
        with gr.Column(scale=12):
            # `bubble_full_width` is no longer passed: Gradio documents it as
            # deprecated, and it only triggered a warning when this cell ran.
            # NOTE(review): the chat history is still exchanged as [user, bot] pairs;
            # presumably the other warning in the cell output is about that format
            # being deprecated in favour of type="messages". Migrating requires
            # changing medical_chat_stream as well — confirm before switching.
            chatbot = gr.Chatbot(
                elem_classes="chatbot",
                show_label=False,
                avatar_images=(None, bot_avatar),
                render_markdown=True,
            )

            with gr.Row():
                msg = gr.Textbox(
                    placeholder="> Input data...",
                    show_label=False,
                    container=False,
                    scale=6,
                    lines=1
                )

                with gr.Column(scale=3):
                    with gr.Row():
                        submit_btn = gr.Button("⚡ SCAN", variant="primary", scale=2)
                        # Button with the new name
                        rand_btn = gr.Button("🎲 GENERATE CASE", variant="secondary", scale=1)
                        clear_btn = gr.Button("✖️", variant="stop", scale=1)

    # Button connections
    # GENERATE CASE fills the input box with a model-written vignette.
    rand_btn.click(generate_ai_case, outputs=[msg])
    # Pressing Enter and clicking SCAN both stream an answer into the chat and clear
    # the input box.
    msg.submit(medical_chat_stream, [msg, chatbot], [msg, chatbot])
    submit_btn.click(medical_chat_stream, [msg, chatbot], [msg, chatbot])
    # The clear button resets the chat history and the input box.
    clear_btn.click(lambda: ([], ""), outputs=[chatbot, msg])

print("🚀 Launching FEMOZ AI (Final CDSS Version)...")
# share=True publishes a temporary public URL; debug=True keeps the cell running so
# that errors and logs stay visible in the notebook.
demo.launch(share=True, debug=True)


  chatbot = gr.Chatbot(
  chatbot = gr.Chatbot(


üöÄ Launching FEMOZ AI (Final CDSS Version)...
Colab notebook detected. This cell will run indefinitely so that you can see errors and logs. To turn off, set debug=False in launch().
* Running on public URL: https://5fc01ffbbf41a28c78.gradio.live

This share link expires in 1 week. For free permanent hosting and GPU upgrades, run `gradio deploy` from the terminal in the working directory to deploy to Hugging Face Spaces (https://huggingface.co/spaces)


Keyboard interruption in main thread... closing server.
Killing tunnel 127.0.0.1:7860 <> https://5fc01ffbbf41a28c78.gradio.live


