In [1]:
# Connect to Google Drive
from google.colab import drive
drive.mount('/content/drive', force_remount=True)

Mounted at /content/drive


In [2]:
%cd "drive/MyDrive/Colab Notebooks/medical_assist_academicway"

/content/drive/MyDrive/Colab Notebooks/medical_assist_academicway


In [3]:
!pwd

/content/drive/MyDrive/Colab Notebooks/medical_assist_academicway


In [4]:
# This will show detailed GPU information if available
!nvidia-smi

Sat May 10 13:13:57 2025       
+-----------------------------------------------------------------------------------------+
| NVIDIA-SMI 550.54.15              Driver Version: 550.54.15      CUDA Version: 12.4     |
|-----------------------------------------+------------------------+----------------------+
| GPU  Name                 Persistence-M | Bus-Id          Disp.A | Volatile Uncorr. ECC |
| Fan  Temp   Perf          Pwr:Usage/Cap |           Memory-Usage | GPU-Util  Compute M. |
|                                         |                        |               MIG M. |
|   0  Tesla T4                       Off |   00000000:00:04.0 Off |                    0 |
| N/A   41C    P8              9W /   70W |       0MiB /  15360MiB |      0%      Default |
|                                         |                        |                  N/A |
+-----------------------------------------+------------------------+----------------------+
                                                

## Setup

**NOTE: refer https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf for phi-3 mini setup once**

#### Essential Libraries for Task

The cu121 (CUDA 12.1) version works with your CUDA 12.4 because of backward compatibility within major CUDA versions. CUDA 12.1 libraries are generally compatible with CUDA 12.4, as NVIDIA maintains compatibility within major versions.

Regarding which approach is better:

Using `CMAKE_ARGS="-DLLAMA_CUBLAS=on"` might be slightly more beneficial because:

1. It compiles specifically against your exact CUDA version (12.4)
2. You can add additional optimization flags if needed
3. It ensures CUBLAS is explicitly enabled with your specific GPU drivers

The difference in performance is likely to be minor, but if you want the most optimized build for your specific environment, using the CMAKE_ARGS approach would be preferable.

If you're concerned about getting the best performance, you could cancel the current installation and use:

```language=python
!CMAKE_ARGS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_F16=on" pip install llama-cpp-python
```

This explicitly enables CUBLAS and FP16 support, which can improve performance on T4 GPUs.


In [5]:
# As our cuda version is 12.4 so using whl > cu124 (inorder not to get version mismatch things by using cu118)
# ref: https://pypi.org/project/llama-cpp-python/

!pip install llama-cpp-python --upgrade --force-reinstall --extra-index-url https://abetlen.github.io/llama-cpp-python/whl/cu124

# Alternatively use
#!CMAKE_ARGS="-DLLAMA_CUBLAS=on" pip install llama-cpp-python


# Use this F16 support one for faster inference
#!CMAKE_ARGS="-DLLAMA_CUBLAS=on -DLLAMA_CUDA_F16=on" pip install llama-cpp-python

#!CMAKE_ARGS="-DLLAMA_CUBLAS=ON" pip install llama-cpp-python --upgrade --force-reinstall

# !CMAKE_ARGS="-DLLAMA_CUBLAS=on" FORCE_CMAKE=1 pip install llama-cpp-python==0.1.85 --force-reinstall --no-cache-dir -q

Looking in indexes: https://pypi.org/simple, https://abetlen.github.io/llama-cpp-python/whl/cu124
Collecting llama-cpp-python
  Downloading llama_cpp_python-0.3.9.tar.gz (67.9 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m67.9/67.9 MB[0m [31m14.9 MB/s[0m eta [36m0:00:00[0m
[?25h  Installing build dependencies ... [?25l[?25hdone
  Getting requirements to build wheel ... [?25l[?25hdone
  Installing backend dependencies ... [?25l[?25hdone
  Preparing metadata (pyproject.toml) ... [?25l[?25hdone
Collecting typing-extensions>=4.5.0 (from llama-cpp-python)
  Downloading typing_extensions-4.13.2-py3-none-any.whl.metadata (3.0 kB)
Collecting numpy>=1.20.0 (from llama-cpp-python)
  Downloading numpy-2.2.5-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (62 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m62.0/62.0 kB[0m [31m218.8 kB/s[0m eta [36m0:00:00[0m
[?25hCollecting diskcache>=5.6.1 (from llama-cpp-python)
  D

In [6]:
!pip install huggingface_hub



In [68]:
!pip install langchain langchain_text_splitters



In [75]:
!pip install pymupdf

Collecting pymupdf
  Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl.metadata (3.4 kB)
Downloading pymupdf-1.25.5-cp39-abi3-manylinux2014_x86_64.manylinux_2_17_x86_64.whl (20.0 MB)
[?25l   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/20.0 MB[0m [31m?[0m eta [36m-:--:--[0m[2K   [91m━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m2.0/20.0 MB[0m [31m58.0 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━[0m[90m╺[0m[90m━━━━━━━━━━━━━━━━━━━━━━[0m [32m8.6/20.0 MB[0m [31m122.9 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m[90m━━━━━━━━━[0m [32m15.3/20.0 MB[0m [31m189.5 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m20.0/20.0 MB[0m [31m209.7 MB/s[0m eta [36m0:00:01[0m[2K   [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[91m╸[0m [32m20.0/20.0 MB[0m [31m209.7 MB/s[0m eta [36m0:00:01[0m[

In [77]:
!pip install langchain-community

Collecting langchain-community
  Downloading langchain_community-0.3.23-py3-none-any.whl.metadata (2.5 kB)
Collecting dataclasses-json<0.7,>=0.5.7 (from langchain-community)
  Downloading dataclasses_json-0.6.7-py3-none-any.whl.metadata (25 kB)
Collecting pydantic-settings<3.0.0,>=2.4.0 (from langchain-community)
  Downloading pydantic_settings-2.9.1-py3-none-any.whl.metadata (3.8 kB)
Collecting httpx-sse<1.0.0,>=0.4.0 (from langchain-community)
  Downloading httpx_sse-0.4.0-py3-none-any.whl.metadata (9.0 kB)
Collecting marshmallow<4.0.0,>=3.18.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading marshmallow-3.26.1-py3-none-any.whl.metadata (7.3 kB)
Collecting typing-inspect<1,>=0.4.0 (from dataclasses-json<0.7,>=0.5.7->langchain-community)
  Downloading typing_inspect-0.9.0-py3-none-any.whl.metadata (1.5 kB)
Collecting python-dotenv>=0.21.0 (from pydantic-settings<3.0.0,>=2.4.0->langchain-community)
  Downloading python_dotenv-1.1.0-py3-none-any.whl.metadata (24 kB

#### Imports

In [7]:
import os
from huggingface_hub import hf_hub_download, list_repo_files
from llama_cpp import Llama

In [78]:
from langchain_community.document_loaders import PyMuPDFLoader
import re

In [69]:
# Import libraries for text chunking
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_text_splitters import MarkdownHeaderTextSplitter

In [8]:
# List available files to get the correct filename
files = list_repo_files("microsoft/Phi-3-mini-4k-instruct-GGUF")
gguf_files = [f for f in files if f.endswith('.gguf')]
print("Available GGUF files:")
for f in gguf_files:
    print(f)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Available GGUF files:
Phi-3-mini-4k-instruct-fp16.gguf
Phi-3-mini-4k-instruct-q4.gguf


We will use q4 one
> Phi-3-mini-4k-instruct-q4.gguf


✅ **`phi-3-mini-4k-instruct-q4.gguf` is much faster**, especially on:

* CPUs (with `llama.cpp`)
* GPUs with limited memory (e.g., T4, RTX 3050, etc.)
* Google Colab Free (RAM or VRAM limited)

---

### ✅ Recommendation

For **RAG and general inference**, use:

> **`phi-3-mini-4k-instruct-q4.gguf`**

It's **fast**, **efficient**, and retains **most of the accuracy** — ideal for local or Colab usage.


**Lets create directory for storing the model (above one)**

In [None]:
# Create directory for models
!mkdir -p /content/drive/MyDrive/llm_models

In [9]:
llm_models_dir_path = '/content/drive/MyDrive/llm_models'

In [10]:
phi3_mini_local_path = f"{llm_models_dir_path}/Phi-3-mini-4k-instruct-q4.gguf"

In [None]:

# Download pre-quantized model if it doesn't exist
if not os.path.exists(phi3_mini_local_path):
    print("Downloading pre-quantized model...")
    hf_hub_download(
        repo_id="microsoft/Phi-3-mini-4k-instruct-GGUF",
        filename="Phi-3-mini-4k-instruct-q4.gguf",
        local_dir=llm_models_dir_path,

        # Colab doesn't always support symlinks well esp. when
        # - Writing to mounted paths (e.g. Google Drive)
        # Using False ensures the model files are copied instead of symlinked
        local_dir_use_symlinks=False
    )

Downloading pre-quantized model...


For more details, check out https://huggingface.co/docs/huggingface_hub/main/en/guides/download#download-files-to-local-folder.


Phi-3-mini-4k-instruct-q4.gguf:   0%|          | 0.00/2.39G [00:00<?, ?B/s]

## Question Answering using LLM

REMEMBER:

the structure `response["choices"][0]["text"]` is consistent across models when using the llama-cpp-python library. This is because llama.cpp follows the OpenAI API response format.

When you call the model with a prompt, it returns a dictionary with this structure:

```
{
  "id": "...",
  "object": "text_completion",
  "created": timestamp,
  "model": "...",
  "choices": [
    {
      "text": "The generated text response",
      "index": 0,
      "logprobs": null,
      "finish_reason": "length" or "stop"
    }
  ],
  "usage": {
    "prompt_tokens": number,
    "completion_tokens": number,
    "total_tokens": number
  }
}
```

So `response["choices"][0]["text"]` will consistently give you the generated text output regardless of which model you're using with llama-cpp-python.


**Phi 3 Mini 4K Info**

Phi-3-mini-4k-instruct has a **total context window** of 4096 tokens. This means the combined length of:
- Your input prompt
- The model's generated response

cannot exceed 4096 tokens.

For example:
- If your prompt is 1000 tokens
- You could generate up to 3096 tokens in response

The `max_tokens` parameter (set to 1024 in our code) limits how many tokens the model will generate in its response, regardless of how much room is left in the context window.

So while the model *could* theoretically generate up to ~4000 tokens (if your prompt is very short), setting `max_tokens=1024` is a practical limit that:
1. Keeps response times reasonable
2. Provides sufficient detail for medical answers
3. Prevents excessively long outputs

You can adjust this value based on your needs, but 1024 is a good starting point.


#### Loading the model

In [11]:
# Function to load the model
def load_model(model_path):
    """
    Load the llm model (quantized) to be used eith llama.cpp
    """
    print(f'checking model at {model_path} ...')
    if not os.path.exists(model_path):
        print(f"Model not found at {model_path} Please check !.")
        return None

    # Load the model with appropriate parameters
    llm = Llama(
        model_path=model_path,
        n_ctx=4096,  # Context window size - use the full 4K that the model supports
        n_gpu_layers=-1,  # Use all GPU layers
        verbose=False
    )

    return llm

In [12]:
# max tokens - 1024 is good starting point // should be sufficient for comprehensive answers
# temperature - A temperature of 0.3-0.5 would be more appropriate for medical question answering
# - for medical applications, leaning toward more deterministic outputs is generally preferred

# Function to define model parameters
def get_model_parameters(temperature=0.3, top_p=0.9, max_tokens=1024):
    """
    Define parameters for model inference
    """
    return {
        "temperature": temperature,
        "top_p": top_p,
        "max_tokens": max_tokens,
        # todo: decide
        # "top_k": 40,
    }

In [13]:
# model path
print('model path: ', phi3_mini_local_path)

model path:  /content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf


In [14]:
!ls -l "/content/drive/MyDrive/llm_models"

total 2337140
-rw------- 1 root root 2393231072 May 10 12:57 Phi-3-mini-4k-instruct-q4.gguf


In [15]:
print(os.path.exists(phi3_mini_local_path))

True


In [16]:
# Test the model loading
model = load_model(phi3_mini_local_path)

checking model at /content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf ...


**Helper python function**

In [59]:
def response(query, llm, max_tokens=128,temperature=0,top_p=0.95,top_k=50):
    # * ! this method is provided by academic
    model_output = llm(
      prompt=query,
      max_tokens=max_tokens,
      temperature=temperature,
      top_p=top_p,
      top_k=top_k
    )

    return model_output['choices'][0]['text']

def get_response(query, model, params=dict()):
    """
    Get response from the model for a given question
    """
    try:

      prompt = 'N/A'

      pt = params['prompt_template']
      if pt:
        prompt = pt(query)
      else:
        # Create the properly formatted prompt
        prompt = create_prompt(query)

      # Get response from model
      response = model(
          prompt,
          # because 512 or 1024 was consuming considerable amount of time so going with what academic takes as default (ie 128)
          max_tokens=params.get("max_tokens", 128),
          temperature=params.get("temperature", 0.2),
          top_p=params.get("top_p", 0.9),
          top_k=params.get("top_k", 50)
          #top_k=params.get("top_k")
      )
      # TODO: comment below print statement (if not needed to trace !...)
      print('got respoonse', response)
      return response['choices'][0]['text']
    except Exception as e:
      print('Error whilst getting the response', e)
      return 'Error'

def create_prompt(query):
    """
    Creates a properly formatted prompt for the Phi-3-mini-4k-instruct model

    Args:
        query: The user's question or query

    Returns:
        A formatted prompt string that follows the model's expected format
    """
    # Format following Phi-3 chat template
    formatted_prompt = f"""<|user|>
{query}
<|assistant|>"""

    # ! NOTE: not using `pipeline()` or `pipe()`
    # We're using llama.cpp via the llama-cpp-python binding rather than the Hugging Face Transformers library
    # The Transformers pipeline would be useful if we were using the full HF implementation
    # but for our quantized model with llama.cpp, the direct approach we're using is more appropriate

    return formatted_prompt

In [40]:
def display_response(question, answer, verbose=False):
    """
    Display the question and model response in a clean, formatted way

    Args:
        question: The question asked to the model
        response: The full response object from the model
        verbose: Whether to show additional details like token counts
    """
    # Print with clear formatting
    print("\n" + "="*80)
    print("📋 QUESTION:")
    print("-"*80)
    print(question)
    print("\n" + "🩺 ANSWER:")
    print("-"*80)
    print(answer)
    print("="*80)

    # Optional verbose output with token information
    if verbose:
        usage = response.get("usage", {})
        prompt_tokens = usage.get("prompt_tokens", "N/A")
        completion_tokens = usage.get("completion_tokens", "N/A")
        total_tokens = usage.get("total_tokens", "N/A")

        print("\n📊 STATS:")
        print(f"  • Prompt tokens: {prompt_tokens}")
        print(f"  • Completion tokens: {completion_tokens}")
        print(f"  • Total tokens: {total_tokens}")

In [23]:
if model:
    print("Model loaded successfully!")

Model loaded successfully!


In [24]:
# Test questions from the problem statement
questions = [
    "What is the protocol for managing sepsis in a critical care unit?",
    "What are the common symptoms of appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?",
    "What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?",
    "What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?",
    "What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?"
]

In [25]:
llm = model

### Query 1: What is the protocol for managing sepsis in a critical care unit?

In [46]:
response0 = get_response(questions[0], llm)
display_response(questions[0], response0)

got respoonse {'id': 'cmpl-6ea31136-c85e-4ee5-91e1-1634efd91b43', 'object': 'text_completion', 'created': 1746886655, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': " The management of sepsis in a critical care unit follows the Surviving Sepsis Campaign (SSC) guidelines, which are periodically updated. The protocol generally includes the following steps:\n\n1. Early recognition and assessment: Identify patients with suspected sepsis, septic shock, or severe sepsis based on clinical signs, symptoms, and laboratory findings.\n\n2. Immediate resuscitation: Initiate aggressive fluid resuscitation with crystalloids, aiming for a 30 mL/kg bolus within the first 3 hours.\n\n3. Antibiotic therapy: Administer broad-spectrum antibiotics within one hour of recognition, and then de-escalate based on culture results and clinical response.\n\n4. Source control: Identify and treat the source of infection, such as draining abscesses, removing infected

### Query 2: What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?

In [51]:
i = 1
response1 = get_response(questions[i], llm)
display_response(questions[i], response1)

got respoonse {'id': 'cmpl-439199b1-d299-4347-9701-6533f3d6a890', 'object': 'text_completion', 'created': 1746887080, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': ' Appendicitis is an inflammation of the appendix, a small pouch-like organ located in the lower right abdomen. The common symptoms of appendicitis include:\n\n1. Abdominal pain: The pain usually starts around the navel and then moves to the lower right abdomen. The pain tends to worsen over time and may become severe.\n2. Loss of appetite\n3. Nausea and vomiting\n4. Fever\n5. Abdominal bloating\n6. Constipation or diarrhea\n\nAppendicitis', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 38, 'completion_tokens': 128, 'total_tokens': 166}}

📋 QUESTION:
--------------------------------------------------------------------------------
What are the common symptoms of appendicitis, and can it be cured via medicine? If not, what surgical proc

### Query 3: What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?

In [52]:
i = 2
response2 = get_response(questions[i], llm)
display_response(questions[i], response2)

got respoonse {'id': 'cmpl-6288221f-0323-458e-8d79-9a07e6aeca28', 'object': 'text_completion', 'created': 1746887208, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': ' Sudden patchy hair loss, also known as alopecia areata, can be caused by various factors, including genetics, autoimmune disorders, and stress. Here are some effective treatments and solutions for addressing this condition:\n\n1. Medications:\n   a. Corticosteroids: Injectable or topical corticosteroids can help reduce inflammation and promote hair regrowth.\n   b. Minoxidil: This is a topical solution that can help stimulate hair growth.\n   c. Immunomodulatory agents:', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 44, 'completion_tokens': 128, 'total_tokens': 172}}

📋 QUESTION:
--------------------------------------------------------------------------------
What are the effective treatments or solutions for addressing sudden patc

### Query 4:  What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?

In [53]:
i = 3
response3 = get_response(questions[i], llm)
display_response(questions[i], response3)

got respoonse {'id': 'cmpl-f7b91398-3fc6-45ec-9c1d-fbe3f66fbd1d', 'object': 'text_completion', 'created': 1746887284, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': " I am not able to diagnose or provide specific treatment recommendations. It is crucial to consult with a qualified healthcare professional for an accurate diagnosis and appropriate treatment plan. However, I can provide you with some general information about potential treatments for brain injuries.\n\nTreatment for brain injuries depends on the severity and type of injury, as well as the individual's overall health. Some common approaches to treating brain injuries include:\n\n1. Medical management: This involves monitoring the patient's vital signs, managing pain, and addressing any immediate medical issues related to the injury", 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 36, 'completion_tokens': 128, 'total_tokens': 164}}

📋 

### Query 5: What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?

In [54]:
i = 4
response4 = get_response(questions[i], llm)
display_response(questions[i], response4)

got respoonse {'id': 'cmpl-284cbf05-074c-4dd5-bc93-5ee1e876e325', 'object': 'text_completion', 'created': 1746887421, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': ' If a person has fractured their leg during a hiking trip, it is crucial to take the following precautions and treatment steps:\n\n1. Safety first:\n   - Ensure the injured person is in a safe location, away from any potential hazards.\n   - If possible, help the person to a stable, flat surface.\n\n2. Call for help:\n   - Contact emergency services or a local rescue team to provide professional medical assistance.\n   - If cell phone service is available, call for help immediately.\n\n3. Immobilize the leg', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 42, 'completion_tokens': 128, 'total_tokens': 170}}

📋 QUESTION:
--------------------------------------------------------------------------------
What are the necessary precautions a

## Question Answering using LLM with Prompt Engineering

In [61]:
combinations = [
    # Combination 1: Highly deterministic (factual focus)
    {
        "name": "Highly Deterministic",
        "temperature": 0.1,
        "top_p": 0.5,
        "max_tokens": 150,
        "prompt_template": lambda q: f"<|user|>\nAnswer this medical question with precise, factual information: {q}\n<|assistant|>"
    },

    # Combination 2: Balanced approach
    {
        "name": "Balanced Approach",
        "temperature": 0.4,
        "top_p": 0.8,
        "max_tokens": 160,
        "prompt_template": lambda q: f"<|user|>\nProvide a comprehensive medical answer to this question: {q}\n<|assistant|>"
    },

    # Combination 3: Step-by-step reasoning
    {
        "name": "Step-by-Step Reasoning",
        "temperature": 0.3,
        "top_p": 0.7,
        "max_tokens": 150,
        "prompt_template": lambda q: f"<|user|>\nAnswer this medical question step-by-step with clear reasoning: {q}\n<|assistant|>"
    },

    # Combination 4: Concise summary
    {
        "name": "Concise Summary",
        "temperature": 0.2,
        "top_p": 0.9,
        "max_tokens": 132,
        "prompt_template": lambda q: f"<|user|>\nProvide a brief, concise answer to this medical question: {q}\n<|assistant|>"
    },

    # Combination 5: Medical expert persona
    {
        "name": "Medical Expert Persona",
        "temperature": 0.3,
        "top_p": 0.85,
        "max_tokens": 135,
        "prompt_template": lambda q: f"<|user|>\nAs an experienced medical specialist, answer this question with your expert knowledge: {q}\n<|assistant|>"
    }
]

In [62]:
def test_combinations(question_index):
  query = questions[question_index]
  for e in combinations:
    ans = get_response(query, llm, params=e)
    display_response(query, ans)
    print('---\n')

### Query 1: What is the protocol for managing sepsis in a critical care unit?

In [63]:
test_combinations(0)

got respoonse {'id': 'cmpl-44cfbe41-f337-4ff0-be9f-603d6e1f902b', 'object': 'text_completion', 'created': 1746888719, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': ' The protocol for managing sepsis in a critical care unit is based on the Surviving Sepsis Campaign (SSC) guidelines, which emphasize early recognition, prompt administration of antibiotics, and aggressive fluid resuscitation. The following steps are typically followed:\n\n1. Early recognition: Identify patients with suspected sepsis by assessing for signs and symptoms, such as fever, elevated heart rate, altered mental status, and hypotension.\n\n2. Immediate interventions:\n   a. Administer broad-spectrum antibiotics within one hour of recognition.\n   b. Initiate fluid resuscitation with', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 31, 'completion_tokens': 150, 'total_tokens': 181}}

📋 QUESTION:
--------------------------------

### Query 2: What are the common symptoms for appendicitis, and can it be cured via medicine? If not, what surgical procedure should be followed to treat it?

In [64]:
test_combinations(1)

got respoonse {'id': 'cmpl-5029667d-19d0-42b1-9168-99689acf86d1', 'object': 'text_completion', 'created': 1746889097, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': ' Appendicitis is an inflammation of the appendix, a small, finger-like pouch that projects from the large intestine. The common symptoms of appendicitis include:\n\n1. Abdominal pain: The pain usually starts around the navel and then moves to the lower right side of the abdomen. The pain typically worsens over time and becomes more severe.\n\n2. Loss of appetite\n\n3. Nausea and vomiting\n\n4. Low-grade fever\n\n5. Constipation or diarrhea\n\n6. Abdominal bloating\n\n7. Inability to pass gas\n\n8. Abdominal', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 49, 'completion_tokens': 150, 'total_tokens': 199}}

📋 QUESTION:
--------------------------------------------------------------------------------
What are the common symptoms of appe

### Query 3: What are the effective treatments or solutions for addressing sudden patchy hair loss, commonly seen as localized bald spots on the scalp, and what could be the possible causes behind it?

In [65]:
test_combinations(2)

got respoonse {'id': 'cmpl-ddf847b5-d0b0-458c-aea1-cc3e3692feeb', 'object': 'text_completion', 'created': 1746889448, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': " Sudden patchy hair loss, also known as alopecia areata, is an autoimmune condition where the body's immune system mistakenly attacks hair follicles, leading to localized bald spots on the scalp. The exact cause of alopecia areata is unknown, but it is believed to involve a combination of genetic and environmental factors.\n\nPossible causes of alopecia areata include:\n\n1. Genetic predisposition: A family history of alopecia areata or other autoimmune diseases may increase the risk of developing the condition.\n2. Immune system dysfunction: An overactive immune system may attack hair follicles, causing", 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 55, 'completion_tokens': 150, 'total_tokens': 205}}

📋 QUESTION:
------------------

### Query 4:  What treatments are recommended for a person who has sustained a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function?

In [66]:
test_combinations(3)

got respoonse {'id': 'cmpl-656cabf2-698f-4380-a037-2bcf4436b795', 'object': 'text_completion', 'created': 1746889776, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': " Treatment for a physical injury to brain tissue, resulting in temporary or permanent impairment of brain function, depends on the severity and location of the injury, as well as the specific symptoms and complications experienced by the individual. Here are some general treatment options:\n\n1. Immediate medical attention: In the case of a severe head injury, immediate medical attention is crucial. This may involve stabilizing the patient's vital signs, performing a thorough neurological examination, and obtaining imaging studies such as a CT scan or MRI to assess the extent of the injury.\n\n2. Medications: Various medications may be prescribed to manage symptoms and complications associated with", 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'pro

### Query 5: What are the necessary precautions and treatment steps for a person who has fractured their leg during a hiking trip, and what should be considered for their care and recovery?

In [67]:
test_combinations(4)

got respoonse {'id': 'cmpl-89eecbf0-a8dc-49d0-861e-892e738e0ec0', 'object': 'text_completion', 'created': 1746890563, 'model': '/content/drive/MyDrive/llm_models/Phi-3-mini-4k-instruct-q4.gguf', 'choices': [{'text': ' Precautions and treatment steps for a person who has fractured their leg during a hiking trip include:\n\n1. Immobilization: Immobilize the injured leg using a splint or a makeshift support to prevent further injury.\n\n2. Pain management: Administer over-the-counter pain medications, such as acetaminophen or ibuprofen, to alleviate pain and reduce inflammation.\n\n3. Elevation: Elevate the injured leg above heart level to reduce swelling and improve blood circulation.\n\n4. Ice application: Apply ice packs wrapped in a cloth to the injured area for 15-', 'index': 0, 'logprobs': None, 'finish_reason': 'length'}], 'usage': {'prompt_tokens': 53, 'completion_tokens': 150, 'total_tokens': 203}}

📋 QUESTION:
---------------------------------------------------------------------

### 👀 **Summary**



We utilized the Phi-3 Mini 4K model in a GPU-accelerated Colab setup to generate medical responses for a predefined set of five clinically relevant questions.

itial tests showed that a max_tokens setting of 512 to 1024 consumes too much time So we resort back to defualt provided by academia (ie 128)

1. **Response Time vs. Token Length**: Generating 512 tokens took significantly longer than shorter outputs. There appears to be a near-linear relationship between max_tokens and generation time.

2. **Temperature Impact**: Lower temperature settings (0.1-0.2) produced more consistent, factual responses appropriate for medical information, while slightly higher values (0.3-0.4) introduced minor variations in phrasing without compromising accuracy.

3. **Prompt Engineering Effects**: Directive prompts (e.g., "Answer step-by-step") noticeably influenced the structure of responses, with the model generally following the requested format.

4. **Default Value Optimization**: We settled on max_tokens=128 as the default for a balance between response quality and generation speed. This value provided sufficient detail for most medical questions while maintaining reasonable response times.

5. **Optimal Balance**: The "Balanced Approach" (temp=0.3, top_p=0.8) provided a good compromise between factual accuracy and natural language flow for medical questions.

6. **Conciseness vs. Completeness**: While the "Concise Summary" setting generated faster responses, some medical questions benefited from the additional context provided by longer outputs.

7. **Persona Framing**: The "Medical Expert Persona" prompt appeared to elicit slightly more technical terminology and structured explanations compared to neutral prompts.

8. **Token Efficiency**: Lower temperatures generally resulted in more information-dense responses, requiring fewer tokens to convey key medical information.

9. **Model Limitations**: For complex medical protocols (like sepsis management), even the longest responses (384 tokens) sometimes felt truncated before completing the full explanation.

***The responses generated by Phi-3 Mini were surprisingly coherent and informative, even without retrieval augmentation — suggesting that smaller models can still be leveraged for meaningful domain-specific reasoning when guided with the right prompt and sampling parameters.***

## Data Preparation for RAG

### Loading the Data

In [72]:
!pwd

/content/drive/MyDrive/Colab Notebooks/medical_assist_academicway


In [70]:
!ls -la

total 19851
-rw------- 1 root root    46430 May 10 08:48 medical_assistant_learner_notebook.ipynb
-rw------- 1 root root 20150490 May  9 11:14 medical_diagnosis_manual.pdf
-rw------- 1 root root   125661 May 10 15:39 notebook1.ipynb
drwx------ 2 root root     4096 May 10 11:56 phi3_models


In [71]:
pdf_path = 'medical_diagnosis_manual.pdf'

In [84]:
# helper Functions

def load_pdf_with_langchain(pdf_path):
    """
    Load a PDF file using LangChain's PyMuPDFLoader

    Args:
        pdf_path: Path to the PDF file

    Returns:
        List of document objects with page content and metadata
    """
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"PDF file not found at {pdf_path}")

    # Load the PDF using PyMuPDFLoader
    loader = PyMuPDFLoader(pdf_path)
    documents = loader.load()

    print(f"PDF loaded successfully: {os.path.basename(pdf_path)}")
    print(f"Total pages: {len(documents)}")

    return documents

def preview_documents(documents, num_pages=2):
    """
    Preview the first few pages of loaded documents
    """
    total_pages = len(documents)

    print(f"Previewing first {min(num_pages, total_pages)} pages:")
    print("="*80)

    for i in range(min(num_pages, total_pages)):
        doc = documents[i]
        text = doc.page_content

        print(f"\n--- Page {i+1} ---\n")
        print(text[:1000] + "..." if len(text) > 1000 else text)

    print("\n" + "="*80)

### Loading the Data

In [82]:
try:
  documents = load_pdf_with_langchain(pdf_path)
except:
  print(f"Error loading pdf: {e}")

PDF loaded successfully: medical_diagnosis_manual.pdf
Total pages: 4114


#### Checking the first 5 pages

In [85]:
# preview the PDF
preview_documents(documents, 5)

Previewing first 5 pages:

--- Page 1 ---

nipunshah6776@gmail.com
0W3XG8QC4A
nt for personal use by nipunshah6776@
shing the contents in part or full is liable

--- Page 2 ---

nipunshah6776@gmail.com
0W3XG8QC4A
This file is meant for personal use by nipunshah6776@gmail.com only.
Sharing or publishing the contents in part or full is liable for legal action.

--- Page 3 ---

Table of Contents
1
Front    ................................................................................................................................................................................................................
1
Cover    .......................................................................................................................................................................................................
2
Front Matter    ...........................................................................................................................................................

#### Checking the number of pages

In [87]:
print('Total number of pages: ', len(documents))

Total number of pages:  4114


### Data Overview

In [86]:
 # Combine all text for statistics
full_text = "\n\n".join([doc.page_content for doc in documents])

# Display some statistics about the content
lines = full_text.split('\n')
words = re.findall(r'\w+', full_text)

print(f"\nPDF loaded successfully with {len(documents)} pages and {len(full_text)} characters")
print(f"Approximate number of lines: {len(lines)}")
print(f"Approximate number of words: {len(words)}")


PDF loaded successfully with 4114 pages and 13703595 characters
Approximate number of lines: 207112
Approximate number of words: 2027501


In [None]:
def create_chunks_by_size(documents, chunk_size=1000, chunk_overlap=200):
    """
    Split documents into chunks based on character count

    Args:
        documents: List of LangChain document objects
        chunk_size: Maximum size of each chunk in characters
        chunk_overlap: Number of characters to overlap between chunks

    Returns:
        List of chunked document objects
    """
    # Initialize the text splitter
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
        length_function=len,
        is_separator_regex=False,
    )

    # Split the documents into chunks
    chunked_documents = text_splitter.split_documents(documents)

    print(f"Created {len(chunked_documents)} chunks from {len(documents)} documents")
    print(f"Average chunk size: {sum(len(doc.page_content) for doc in chunked_documents) / len(chunked_documents):.2f} characters")

    return chunked_documents