In [1]:
pip install sentence-transformers faiss-cpu numpy


Defaulting to user installation because normal site-packages is not writeable
Note: you may need to restart the kernel to use updated packages.


In [1]:
import json

# Read the metadata entries from disk. The encoding is given explicitly because
# the platform default is locale-dependent (e.g. not UTF-8 on many Windows setups),
# which can corrupt or fail on non-ASCII characters in the JSON.
with open("metadata.json", "r", encoding="utf-8") as f:
    metadata = json.load(f)

# One title per metadata entry, in the same order as `metadata`, so a position
# in `titles` can be used to look the full entry back up.
titles = [entry["title"] for entry in metadata]


In [2]:
from sentence_transformers import SentenceTransformer

# Load the "msmarco-distilbert-base-v4" checkpoint by name. The resulting `model`
# is the object later cells call `.encode(...)` on, for both titles and queries.
model = SentenceTransformer("msmarco-distilbert-base-v4")


Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


model.safetensors:  75%|#######5  | 199M/265M [00:00<?, ?B/s]

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development


tokenizer_config.json:   0%|          | 0.00/319 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/232k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/466k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/112 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

In [4]:
import numpy as np
import faiss

# Encode every title into a vector; the result is a NumPy array with one row per title.
embeddings = model.encode(titles, show_progress_bar=True, convert_to_numpy=True)
# Keep the raw vectors on disk so they can be reloaded without re-encoding.
np.save("title_embeddings.npy", embeddings)

# Flat L2 index whose width matches the embedding width.
dimension = embeddings.shape[1]
index = faiss.IndexFlatL2(dimension)
index.add(embeddings)
# Persist the populated index next to the embeddings.
faiss.write_index(index, "faiss_index_ms_marco.index")

Batches:   0%|          | 0/4 [00:00<?, ?it/s]

In [5]:
# Restore the artifacts written by the previous cell instead of recomputing them.
index = faiss.read_index("faiss_index_ms_marco.index")
embeddings = np.load("title_embeddings.npy")
# Still to do at this point: reload model & metadata, then define search()…


In [6]:
def search(query, top_k=3):
    """Return up to ``top_k`` titles nearest to ``query`` in the global Faiss ``index``.

    Uses the notebook globals ``model`` (to embed the query), ``index`` (to
    retrieve neighbours) and ``metadata`` (to map returned positions back to
    entries with "title" and "chapter" keys).

    Args:
        query: Text to search for.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts with keys "title", "chapter" and "score", in the order
        the index returned them. "score" is the raw value reported by
        ``index.search``; for an ``IndexFlatL2`` that is a squared L2 distance,
        so lower means closer. Fewer than ``top_k`` items are returned when the
        index cannot supply that many neighbours.
    """
    # 1. Embed the query (a batch of one, so row 0 of every result is ours).
    q_emb = model.encode([query], convert_to_numpy=True)
    # 2. Retrieve from Faiss.
    distances, indices = index.search(q_emb, top_k)
    # 3. Format results. Faiss pads missing neighbours with label -1; without
    #    the filter, metadata[-1] would silently report the last entry as a hit.
    return [
        {
            "title":    metadata[i]["title"],
            "chapter":  metadata[i]["chapter"],
            "score":    float(dist)
        }
        for dist, i in zip(distances[0], indices[0])
        if i >= 0
    ]
# Quick check: fetch the three closest titles for a sample query and print one hit per line.
results = search("Acids and Bases in Laboratory", top_k=3)
for hit in results:
    print(hit)


{'title': '2.1.1 Acids and Bases in the Laboratory', 'chapter': '2 CHAPTER', 'score': 64.59967041015625}
{'title': '2.1 UNDERSTANDING THE CHEICAL PROPERTIES OF ACIDS AND BASES', 'chapter': '2 CHAPTER', 'score': 114.44064331054688}
{'title': '2.1.4 How do Acids and Bases React with each other?', 'chapter': '2 CHAPTER', 'score': 148.95782470703125}


## Normalize embeddings and search by cosine similarity


In [10]:
import faiss
import numpy as np

# — Cosine-similarity index, built from the `embeddings` computed earlier —

# 1. Size an inner-product index (instead of L2) to the embedding width;
#    on unit-length vectors the inner product equals the cosine similarity.
dim   = embeddings.shape[1]
index = faiss.IndexFlatIP(dim)

# 2. Scale every row to unit length, then add the rows to the index.
#    NOTE: normalize_L2 modifies `embeddings` in place, and the assignment above
#    rebinds the global `index`, so search() also queries this index from now on.
faiss.normalize_L2(embeddings)
index.add(embeddings)

# 3. When you search, normalize the query too:
def search_cosine(query, top_k=3):
    """Return up to ``top_k`` titles most similar to ``query`` by cosine similarity.

    Uses the notebook globals ``model`` (to embed the query), ``index`` (expected
    to be the inner-product index over unit-length title vectors) and
    ``metadata`` (to map returned positions back to entries with "title" and
    "chapter" keys).

    Args:
        query: Text to search for.
        top_k: Maximum number of hits to return.

    Returns:
        A list of dicts with keys "title", "chapter" and "cosine_sim", in the
        order the index returned them (highest similarity first for an
        inner-product index). Fewer than ``top_k`` items are returned when the
        index cannot supply that many neighbours.
    """
    # a) embed (a batch of one, so row 0 of every result is ours)
    q_emb = model.encode([query], convert_to_numpy=True)
    # b) normalize in place so the inner product with unit-length rows is a cosine
    faiss.normalize_L2(q_emb)
    # c) search inner product (returns highest similarity first)
    distances, indices = index.search(q_emb, top_k)
    # Faiss pads missing neighbours with label -1; without the filter,
    # metadata[-1] would silently report the last entry as a hit.
    return [
        {
            "title":   metadata[i]["title"],
            "chapter": metadata[i]["chapter"],
            # sim is the cosine similarity, which lies in [-1, 1] (not just [0, 1])
            "cosine_sim": float(sim)
        }
        for sim, i in zip(distances[0], indices[0])
        if i >= 0
    ]

# 4. Example: print the three most similar titles for a short query, one hit per line.
for hit in search_cosine("electric ", top_k=3):
    print(hit)


{'title': '13.4 ELECTRIC MOTOR', 'chapter': '13 CHAPTER', 'cosine_sim': 0.5492956638336182}
{'title': '12.8 ELECTRIC POWER', 'chapter': '12 CHAPTER', 'cosine_sim': 0.5454273223876953}
{'title': '13.6 ELECTRIC GENERATOR', 'chapter': '13 CHAPTER', 'cosine_sim': 0.4917614459991455}


## Cosine similarity is used here
### Cosine similarity: the cosine of the angle between two vectors (computed as the inner product after normalizing both to unit length). Higher = more similar (smaller angle).
### By normalizing the vectors, cosine similarity ignores magnitude and focuses purely on “direction” (semantic content).