## 1. Install and Import Dependencies


In [1]:
# Install required packages
%pip install datasets faiss-cpu sentence-transformers pandas numpy


Collecting datasets
  Downloading datasets-4.2.0-py3-none-any.whl (506 kB)
     ------------------------------------ 506.3/506.3 KB 705.7 kB/s eta 0:00:00
Collecting faiss-cpu
  Downloading faiss_cpu-1.12.0-cp310-cp310-win_amd64.whl (18.2 MB)
     -------------------------------------- 18.2/18.2 MB 486.2 kB/s eta 0:00:00
Collecting sentence-transformers
  Downloading sentence_transformers-5.1.1-py3-none-any.whl (486 kB)
     ------------------------------------ 486.6/486.6 KB 423.4 kB/s eta 0:00:00
Collecting dill<0.4.1,>=0.3.0
  Downloading dill-0.4.0-py3-none-any.whl (119 kB)
     ------------------------------------ 119.7/119.7 KB 701.5 kB/s eta 0:00:00
Collecting filelock
  Downloading filelock-3.20.0-py3-none-any.whl (16 kB)
Collecting fsspec[http]<=2025.9.0,>=2023.1.0
  Downloading fsspec-2025.9.0-py3-none-any.whl (199 kB)
     ------------------------------------ 199.3/199.3 KB 327.1 kB/s eta 0:00:00
Collecting pyyaml>=5.1
  Downloading pyyaml-6.0.3-cp310-cp310-win_amd64.whl (15

You should consider upgrading via the 'c:\Users\marcf\AppData\Local\Programs\Python\Python310\python.exe -m pip install --upgrade pip' command.


In [2]:
import numpy as np
import pandas as pd
import faiss
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from typing import List, Tuple
import time


  from .autonotebook import tqdm as notebook_tqdm


## 2. Load the Dataset


In [3]:
# Load the symptom-to-diagnosis dataset and report which splits it provides
print("Loading dataset...")
ds = load_dataset("gretelai/symptom_to_diagnosis")
print("Dataset loaded successfully!")
print("Available splits:", list(ds.keys()))


Loading dataset...


Generating train split: 100%|██████████| 853/853 [00:00<00:00, 71046.14 examples/s]
Generating test split: 100%|██████████| 212/212 [00:00<00:00, 8287.05 examples/s]


Dataset loaded successfully!
Available splits: ['train', 'test']


In [4]:
# Peek at the training split: its size and the first symptom/diagnosis pair
train_data = ds['train']
print("Number of examples:", len(train_data))
print()
print("First example:")
print("Input (symptoms):", train_data[0]['input_text'])
print("Output (diagnosis):", train_data[0]['output_text'])


Number of examples: 853

First example:
Input (symptoms): I've been having a lot of pain in my neck and back. I've also been having trouble with my balance and coordination. I've been coughing a lot and my limbs feel weak.
Output (diagnosis): cervical spondylosis


In [5]:
# Wrap the training split in a DataFrame so rows can be handled with pandas
df = pd.DataFrame(train_data)
print("Dataset shape:", df.shape)
# Last expression of the cell: rendered as a table of the first five rows
df.head()


Dataset shape: (853, 2)


Unnamed: 0,output_text,input_text
0,cervical spondylosis,I've been having a lot of pain in my neck and ...
1,impetigo,I have a rash on my face that is getting worse...
2,urinary tract infection,I have been urinating blood. I sometimes feel ...
3,arthritis,I have been having trouble with my muscles and...
4,dengue,I have been feeling really sick. My body hurts...


## 3. Initialize Embedding Model

In [6]:
# Set up the sentence-embedding model used for semantic search
model_name = 'all-MiniLM-L6-v2'  # Fast and effective model
print("Loading embedding model:", model_name)
embedding_model = SentenceTransformer(model_name)
print("Model loaded successfully!")
# The embedding dimension is reported so it can be compared with the index dimension later
print("Embedding dimension:", embedding_model.get_sentence_embedding_dimension())


Loading embedding model: all-MiniLM-L6-v2


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


Model loaded successfully!
Embedding dimension: 384


## 4. Generate Embeddings for the Dataset


In [7]:
# Extract symptom texts (one string per row of df, in row order)
symptom_texts = df['input_text'].tolist()
print(f"Generating embeddings for {len(symptom_texts)} symptom descriptions...")

# Generate embeddings with progress tracking.
# perf_counter() is used for the elapsed time: it is a monotonic clock meant
# for measuring intervals, whereas time.time() follows the wall clock and can
# jump if the system time is adjusted.
start_time = time.perf_counter()
embeddings = embedding_model.encode(
    symptom_texts,
    show_progress_bar=True,
    batch_size=32,
    convert_to_numpy=True
)
end_time = time.perf_counter()

# Later cells look rows up in df by embedding position, so there must be
# exactly one embedding per input text.
assert embeddings.shape[0] == len(symptom_texts), "expected one embedding per symptom text"

print(f"\nEmbeddings generated in {end_time - start_time:.2f} seconds")
print(f"Embeddings shape: {embeddings.shape}")


Generating embeddings for 853 symptom descriptions...


Batches: 100%|██████████| 27/27 [00:06<00:00,  4.00it/s]


Embeddings generated in 6.81 seconds
Embeddings shape: (853, 384)





## 5. Build FAISS Index

In [8]:
# Work on a float32, C-contiguous copy of the embeddings. normalize_L2 rewrites
# its argument in place, so copying leaves the raw `embeddings` from the
# previous cell untouched and makes this cell safe to re-run on its own.
embeddings_normalized = np.array(embeddings, dtype=np.float32, order="C", copy=True)

# Normalize embeddings for cosine similarity
faiss.normalize_L2(embeddings_normalized)

# Get embedding dimension
dimension = embeddings_normalized.shape[1]

# Create FAISS index (using IndexFlatIP for inner product, equivalent to cosine similarity with normalized vectors)
index = faiss.IndexFlatIP(dimension)

# Add embeddings to the index
print("Building FAISS index...")
index.add(embeddings_normalized)
print("Index built successfully!")
print(f"Total vectors in index: {index.ntotal}")

# Search results are positions in the index and are used as row positions in
# df, so the two must contain the same number of entries.
assert index.ntotal == len(df), "FAISS index and df are out of sync"


Building FAISS index...
Index built successfully!
Total vectors in index: 853


## 6. Create Retrieval Function


In [9]:
def retrieve_top_k(query: str, k: int = 5) -> List[Tuple[float, str, str]]:
    """
    Retrieve top-k most similar symptom-diagnosis pairs for a given query.

    Args:
        query: The symptom description query
        k: Number of results to return (default: 5). Values larger than the
            number of vectors in the index are capped at that number.

    Returns:
        List of tuples containing (similarity_score, symptoms, diagnosis),
        in the order returned by the index search. Scores are plain Python
        floats. The list is empty when k is 0 or the index holds no vectors.

    Raises:
        ValueError: If k is negative.
    """
    if k < 0:
        raise ValueError(f"k must be non-negative, got {k}")

    # Never request more neighbours than the index holds: the search pads
    # missing slots with id -1, and df.iloc[-1] would silently return the
    # last row as if it were a genuine match.
    k = min(k, index.ntotal)
    if k == 0:
        return []

    # Generate embedding for the query
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)

    # Normalize for cosine similarity (in place, same as the indexed vectors)
    faiss.normalize_L2(query_embedding)

    # Search the index; both outputs have one row per query, and we sent one query
    distances, indices = index.search(query_embedding, k)

    # Retrieve and format results
    results = []
    for score, idx in zip(distances[0], indices[0]):
        if idx < 0:
            # Defensive: a negative id marks an unfilled result slot
            continue
        row = df.iloc[int(idx)]
        results.append((float(score), row['input_text'], row['output_text']))

    return results


def display_results(query: str, results: List[Tuple[float, str, str]]):
    """
    Print a query and its retrieved matches as a plain-text report.

    Args:
        query: The query text shown in the report header
        results: (similarity_score, symptoms, diagnosis) tuples; they are
            printed in the given order and numbered starting from 1
    """
    heavy_rule = "=" * 80
    light_rule = "-" * 80

    # Header: the query framed by two heavy rules, followed by a blank line
    report = [heavy_rule, f"QUERY: {query}", heavy_rule, ""]

    # One numbered section per match, each closed by a blank line
    for rank, (similarity, symptom_text, diagnosis_text) in enumerate(results, start=1):
        report.extend([
            f"Result {rank} (Similarity Score: {similarity:.4f})",
            light_rule,
            f"Symptoms: {symptom_text}",
            f"Diagnosis: {diagnosis_text}",
            "",
        ])

    # Footer rule, then emit the whole report at once
    report.append(heavy_rule)
    print("\n".join(report))


## 7. Example Query: Retrieve Top 5 Results


In [10]:
# A sample query combining headache, fever and muscle-ache symptoms
example_query = (
    "I have been experiencing severe headaches, high fever, "
    "and muscle aches for the past 3 days."
)

# Look up the five closest symptom descriptions in the index
print("Searching for top 5 matches...\n")
results = retrieve_top_k(query=example_query, k=5)

# Print the matches as a report
display_results(query=example_query, results=results)


Searching for top 5 matches...

QUERY: I have been experiencing severe headaches, high fever, and muscle aches for the past 3 days.

Result 1 (Similarity Score: 0.8209)
--------------------------------------------------------------------------------
Symptoms: I have a high fever, chills, nausea, and a headache. I also have muscle aches and a lot of sweating.
Diagnosis: malaria

Result 2 (Similarity Score: 0.8097)
--------------------------------------------------------------------------------
Symptoms: I'm experiencing a high fever, chills, nausea, and severe itching. I also have a headache and have been sweating a lot. I've also been experiencing muscle aches.
Diagnosis: malaria

Result 3 (Similarity Score: 0.7993)
--------------------------------------------------------------------------------
Symptoms: I've been having a really high fever, chills, and nausea. I've also been sweating a lot and my muscles hurt. I feel really queasy and have a headache.
Diagnosis: malaria

Result 4 (Si

## 8. Additional Examples

In [None]:
# Example 2: a query describing respiratory symptoms
query_2 = "I have a persistent cough, shortness of breath, and chest pain."
# Fetch the five closest matches, then print them as a report
results_2 = retrieve_top_k(query=query_2, k=5)
display_results(query=query_2, results=results_2)


In [None]:
# Example 3: a query describing digestive symptoms
query_3 = "I'm experiencing stomach pain, nausea, and diarrhea."
# Fetch the five closest matches, then print them as a report
results_3 = retrieve_top_k(query=query_3, k=5)
display_results(query=query_3, results=results_3)


## 9. Performance Statistics


In [None]:
# Measure query latency: run the same query repeatedly and average the time.
# Each call covers the full retrieval path (query embedding + index search).
test_query = "I feel dizzy and have blurred vision."
num_trials = 100

print(f"Running {num_trials} queries to measure average latency...")
# perf_counter() is a monotonic clock meant for measuring intervals;
# time.time() follows the wall clock and can jump if the system time changes.
start_time = time.perf_counter()
for _ in range(num_trials):
    retrieve_top_k(test_query, k=5)
end_time = time.perf_counter()

avg_latency = (end_time - start_time) / num_trials
# Guard the division in case the measured interval rounds to zero
queries_per_second = 1 / avg_latency if avg_latency > 0 else float("inf")
print(f"\nAverage query latency: {avg_latency*1000:.2f} ms")
print(f"Queries per second: {queries_per_second:.2f}")
