## 1. Install and Import Dependencies


In [None]:
# Install required packages
%pip install datasets faiss-cpu sentence-transformers pandas numpy


In [2]:
import numpy as np
import pandas as pd
import faiss
from datasets import load_dataset, Dataset
from sentence_transformers import SentenceTransformer
from typing import List, Tuple
import time


## 2. Load the Dataset


In [None]:
print("Loading (streaming) …")
stream = load_dataset("MedRAG/wikipedia", split="train", streaming=True)

# Only iterate the first 1000 examples
first_1k_iter = stream.take(1000) # change this line to load more data

ds = Dataset.from_list(list(first_1k_iter))
print("Dataset loaded successfully!")
print(f"Number of entries: {len(ds)}")
print(ds[0])



Loading (streaming) …
Dataset loaded successfully!
Number of entries: 1000
{'id': 'wiki20220301en000_0', 'title': 'Anarchism', 'content': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.', 'contents': 'Anarchism. Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing m

In [6]:
# Explore the dataset structure
corpus = ds
print(f"Number of documents: {len(corpus)}")
print(f"\nDataset features: {corpus.features}")
print(f"\nFirst example:")
print(corpus[0])


Number of documents: 1000

Dataset features: {'id': Value('string'), 'title': Value('string'), 'content': Value('string'), 'contents': Value('string'), 'wiki_id': Value('string')}

First example:
{'id': 'wiki20220301en000_0', 'title': 'Anarchism', 'content': 'Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the state, which it holds to be unnecessary, undesirable, and harmful. As a historically left-wing movement, placed on the farthest left of the political spectrum, it is usually described alongside communalism and libertarian Marxism as the libertarian wing (libertarian socialism) of the socialist movement, and has a strong historical association with anti-capitalism and socialism.', 'contents': 'Anarchism. Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls 

In [7]:
# Convert to pandas for easier manipulation
df = pd.DataFrame(corpus)
print(f"Dataset shape: {df.shape}")
print(f"\nColumn names: {df.columns.tolist()}")
df.head()


Dataset shape: (1000, 5)

Column names: ['id', 'title', 'content', 'contents', 'wiki_id']


Unnamed: 0,id,title,content,contents,wiki_id
0,wiki20220301en000_0,Anarchism,Anarchism is a political philosophy and moveme...,Anarchism. Anarchism is a political philosophy...,12
1,wiki20220301en000_1,Anarchism,Humans lived in societies without formal hiera...,Anarchism. Humans lived in societies without f...,12
2,wiki20220301en000_2,Anarchism,Anarchism employs a diversity of tactics in or...,Anarchism. Anarchism employs a diversity of ta...,12
3,wiki20220301en000_3,Anarchism,"Etymology, terminology, and definition The ety...","Anarchism. Etymology, terminology, and definit...",12
4,wiki20220301en000_4,Anarchism,The first political philosopher to call himsel...,Anarchism. The first political philosopher to ...,12


## 3. Initialize Embedding Model

In [8]:
# Initialize the embedding model
# Using a model optimized for semantic search
model_name = 'all-MiniLM-L6-v2'  # Fast and effective model
print(f"Loading embedding model: {model_name}")
embedding_model = SentenceTransformer(model_name)
print(f"Model loaded successfully!")
print(f"Embedding dimension: {embedding_model.get_sentence_embedding_dimension()}")


Loading embedding model: all-MiniLM-L6-v2
Model loaded successfully!
Embedding dimension: 384


## 4. Prepare Text for Embedding

In [9]:
# Determine which fields to use for embedding
# Typically Wikipedia datasets have 'title' and 'text' or 'contents' fields
if 'title' in df.columns and 'contents' in df.columns:
    # Combine title and contents for better retrieval
    documents = [f"{title}: {content}" for title, content in zip(df['title'], df['contents'])]
    print("Using combined 'title' and 'contents' fields")
elif 'title' in df.columns and 'text' in df.columns:
    documents = [f"{title}: {text}" for title, text in zip(df['title'], df['text'])]
    print("Using combined 'title' and 'text' fields")
elif 'contents' in df.columns:
    documents = df['contents'].tolist()
    print("Using 'contents' field only")
elif 'text' in df.columns:
    documents = df['text'].tolist()
    print("Using 'text' field only")
else:
    # Use the first text-like column found
    text_column = [col for col in df.columns if df[col].dtype == 'object'][0]
    documents = df[text_column].tolist()
    print(f"Using '{text_column}' field")

print(f"\nTotal documents to embed: {len(documents)}")
print(f"First document preview: {documents[0][:200]}...")


Using combined 'title' and 'contents' fields

Total documents to embed: 1000
First document preview: Anarchism: Anarchism. Anarchism is a political philosophy and movement that is sceptical of authority and rejects all involuntary, coercive forms of hierarchy. Anarchism calls for the abolition of the...


## 5. Generate Embeddings for the Dataset

In [10]:
# Generate embeddings with progress tracking
print(f"Generating embeddings for {len(documents)} documents...")
print("Note: This may take several minutes depending on dataset size...")

start_time = time.time()
embeddings = embedding_model.encode(
    documents,
    show_progress_bar=True,
    batch_size=32,
    convert_to_numpy=True
)
end_time = time.time()

print(f"\nEmbeddings generated in {end_time - start_time:.2f} seconds")
print(f"Embeddings shape: {embeddings.shape}")


Generating embeddings for 1000 documents...
Note: This may take several minutes depending on dataset size...


Batches: 100%|██████████| 32/32 [00:31<00:00,  1.01it/s]


Embeddings generated in 31.68 seconds
Embeddings shape: (1000, 384)





## 6. Build FAISS Index

In [11]:
# Normalize embeddings for cosine similarity
faiss.normalize_L2(embeddings)

# Get embedding dimension
dimension = embeddings.shape[1]

# Create FAISS index (using IndexFlatIP for inner product, equivalent to cosine similarity with normalized vectors)
index = faiss.IndexFlatIP(dimension)

# Add embeddings to the index
print(f"Building FAISS index...")
index.add(embeddings)
print(f"Index built successfully!")
print(f"Total vectors in index: {index.ntotal}")


Building FAISS index...
Index built successfully!
Total vectors in index: 1000


## 7. Create Retrieval Function


In [12]:
def retrieve_top_k(query: str, k: int = 5) -> List[Tuple[float, dict]]:
    """
    Retrieve top-k most similar documents for a given query.
    
    Args:
        query: The search query
        k: Number of results to return (default: 5)
    
    Returns:
        List of tuples containing (similarity_score, document_dict)
    """
    # Generate embedding for the query
    query_embedding = embedding_model.encode([query], convert_to_numpy=True)
    
    # Normalize for cosine similarity
    faiss.normalize_L2(query_embedding)
    
    # Search the index
    distances, indices = index.search(query_embedding, k)
    
    # Retrieve and format results
    results = []
    for i, (distance, idx) in enumerate(zip(distances[0], indices[0])):
        doc_info = df.iloc[idx].to_dict()
        results.append((distance, doc_info))
    
    return results


def display_results(query: str, results: List[Tuple[float, dict]]):
    """
    Display retrieval results in a readable format.
    """
    print("="*80)
    print(f"QUERY: {query}")
    print("="*80)
    print()
    
    for i, (score, doc) in enumerate(results, 1):
        print(f"Result {i} (Similarity Score: {score:.4f})")
        print("-" * 80)
        
        # Display title if available
        if 'title' in doc:
            print(f"Title: {doc['title']}")
        
        # Display content preview (first 300 characters)
        if 'contents' in doc:
            content = doc['contents']
            print(f"Content: {content[:300]}{'...' if len(content) > 300 else ''}")
        elif 'text' in doc:
            content = doc['text']
            print(f"Content: {content[:300]}{'...' if len(content) > 300 else ''}")
        
        # Display any other relevant fields
        for key, value in doc.items():
            if key not in ['title', 'contents', 'text'] and isinstance(value, (str, int, float)):
                print(f"{key.capitalize()}: {value}")
        
        print()
    print("="*80)


In [13]:
# Example query about diabetes
example_query = "What are the symptoms and treatment options for type 2 diabetes?"

# Retrieve top 5 matches
print("Searching for top 5 matches...\n")
results = retrieve_top_k(example_query, k=5)

# Display results
display_results(example_query, results)


Searching for top 5 matches...

QUERY: What are the symptoms and treatment options for type 2 diabetes?

Result 1 (Similarity Score: 0.2290)
--------------------------------------------------------------------------------
Title: Autism
Content: Autism. have some positive evidence, suggesting that some form of treatment is preferable to no treatment, the methodological quality of systematic reviews of these studies has generally been poor, their clinical results are mostly tentative, and there is little evidence for the relative effectivene...
Id: wiki20220301en000_111
Content: have some positive evidence, suggesting that some form of treatment is preferable to no treatment, the methodological quality of systematic reviews of these studies has generally been poor, their clinical results are mostly tentative, and there is little evidence for the relative effectiveness of treatment options. Intensive, sustained special education programs and behavior therapy early in life can help childre

## 9. Additional Examples

In [14]:
# Example 2: Cardiovascular query
query_2 = "What causes high blood pressure and how is it treated?"
results_2 = retrieve_top_k(query_2, k=5)
display_results(query_2, results_2)


QUERY: What causes high blood pressure and how is it treated?

Result 1 (Similarity Score: 0.2153)
--------------------------------------------------------------------------------
Title: Aristotle
Content: Aristotle. On Early Modern scientists In the Early Modern period, scientists such as William Harvey in England and Galileo Galilei in Italy reacted against the theories of Aristotle and other classical era thinkers like Galen, establishing new theories based to some degree on observation and experim...
Id: wiki20220301en000_578
Content: On Early Modern scientists In the Early Modern period, scientists such as William Harvey in England and Galileo Galilei in Italy reacted against the theories of Aristotle and other classical era thinkers like Galen, establishing new theories based to some degree on observation and experiment. Harvey demonstrated the circulation of the blood, establishing that the heart functioned as a pump rather than being the seat of the soul and the controller of t

In [None]:
# Example 3: Infectious disease query
query_3 = "How does the immune system fight bacterial infections?"
results_3 = retrieve_top_k(query_3, k=5)
display_results(query_3, results_3)
