
Semantic Search Pipeline with Hugging Face Datasets

Steps:
1. Login to Hugging Face Hub
2. Load dataset (change DATASET_NAME / DATASET_CONFIG to experiment)
3. Clean text and split into sentence-level chunks
4. Embed all chunks into 128-dimensional vectors
5. Embed a prompt into the same vector space
6. Compute Euclidean distance and show closest matches


In [1]:
import numpy as np
from huggingface_hub import login
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
import nltk
from nltk.tokenize import sent_tokenize

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
"""import nltk  --- for sentence
try:
    from nltk.tokenize import sent_tokenize
    nltk.data.find("tokenizers/punkt")
except LookupError:
    nltk.download("punkt")
    from nltk.tokenize import sent_tokenize"""


'import nltk  --- for sentence\ntry:\n    from nltk.tokenize import sent_tokenize\n    nltk.data.find("tokenizers/punkt")\nexcept LookupError:\n    nltk.download("punkt")\n    from nltk.tokenize import sent_tokenize'

In [3]:
# Config: HF token, dataset, model, search

HF_TOKEN        = "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"

DATASET_NAME    = "sentence-transformers/gooaq"   # FIXED
DATASET_CONFIG  = None                    # no config for this dataset
MAX_ROWS        = 1000

EMBED_MODEL_NAME = "sentence-transformers/all-MiniLM-L6-v2"
TOP_K            = 10
CHUNK_SIZE       = 60

PROMPT           = "What is the capital of Finland?"


In [4]:
login(HF_TOKEN)
print("Logged in successfully!")

Logged in successfully!


In [5]:
print(f"\nLoading dataset: {DATASET_NAME}"
      + (f"/{DATASET_CONFIG}" if DATASET_CONFIG is not None else ""))

if DATASET_CONFIG is not None:
    dataset = load_dataset(DATASET_NAME, DATASET_CONFIG, split="train")
else:
    dataset = load_dataset(DATASET_NAME, split="train")

if MAX_ROWS is not None:
    dataset = dataset.select(range(min(MAX_ROWS, len(dataset))))

print("Dataset loaded.")
print("Number of rows:", len(dataset))
print("Columns:", dataset.column_names)


Loading dataset: sentence-transformers/gooaq
Dataset loaded.
Number of rows: 1000
Columns: ['question', 'answer']


In [6]:
# Extract non-empty text lines
raw_texts = []

for ans in dataset["answer"]:
    if ans:
        content = ans.strip()
        if content:
            raw_texts.append(content)

print("Non-empty answers:", len(raw_texts))


Non-empty answers: 1000


In [7]:
"""
    Split text into chunks of up to `max_len` characters,
    but try not to break words in the middle.
    """

def chunk_text(text: str, max_len: int = CHUNK_SIZE):
    chunks = []
    i = 0
    n = len(text)

    while i < n:
        end = min(i + max_len, n)
        window = text[i:end]

        if end < n:
            last_space = window.rfind(" ")
            if last_space != -1 and last_space != 0:
                end = i + last_space
                window = text[i:end]

        chunk = window.strip()
        if chunk:
            chunks.append(chunk)

        i = end

        while i < n and text[i] == " ":
            i += 1

    return chunks

In [8]:
print(f"Chunking text with CHUNK_SIZE = {CHUNK_SIZE}")
chunked_data = []
for line in raw_texts:
    chunked_data.extend(chunk_text(line, CHUNK_SIZE))

print("Total chunks created:", len(chunked_data))

Chunking text with CHUNK_SIZE = 60
Total chunks created: 4899


In [9]:
# 4. Load n-dim embedding model
print(f"\nLoading embedding model: {EMBED_MODEL_NAME}")
embed_model = SentenceTransformer(EMBED_MODEL_NAME)
print("Embedding model loaded.")
test_vec = embed_model.encode("test", convert_to_numpy=True)
print("Embedding dimension:", test_vec.shape[0])


Loading embedding model: sentence-transformers/all-MiniLM-L6-v2
Embedding model loaded.
Embedding dimension: 384


In [10]:
# 5. Embed all chunks into vector DB
print("\nEmbedding chunks (take time)")
vector_db = embed_model.encode(
    chunked_data,
    convert_to_numpy=True,
    show_progress_bar=True
)
print("Embedding completed.")
print("Vector DB shape:", vector_db.shape)   # (num_chunks, 384)


Embedding chunks (take time)


Batches: 100%|██████████| 154/154 [00:03<00:00, 39.01it/s]

Embedding completed.
Vector DB shape: (4899, 384)





In [11]:
# 6. Embed the prompt
print("\nPrompt:", PROMPT)
prompt_vec = embed_model.encode(PROMPT, convert_to_numpy=True)

print("\nPrompt vector (384-dim):")
print(prompt_vec)
print("Vector shape:", prompt_vec.shape)


Prompt: What is the capital of Finland?

Prompt vector (384-dim):
[ 4.45507951e-02  3.97122838e-02 -3.90180461e-02  5.03091998e-02
  5.06749563e-03 -4.70998064e-02  3.18860374e-02 -3.83177400e-03
  3.20874574e-03  3.08345333e-02 -9.64995846e-03 -8.91747177e-02
 -1.05075665e-01  2.42689606e-02 -6.87031262e-03 -1.66187901e-02
 -7.03075808e-03  1.70511901e-02  2.71559265e-02  4.10183556e-02
 -4.28796466e-03 -4.42730822e-02  1.17016668e-02 -2.24054847e-02
  1.36240542e-01  6.99829906e-02  1.43763712e-02  5.11771394e-03
  6.42202282e-03  3.30357105e-02 -5.42362221e-02 -7.78892264e-02
  1.73910148e-03  1.68413576e-02 -4.17772128e-04  6.69443887e-03
  4.59495001e-03 -5.04224077e-02  4.32742313e-02 -2.18512081e-02
 -5.43169677e-03  9.25389002e-04 -7.62411766e-03 -5.73805952e-03
  6.59404248e-02  5.14704734e-02 -4.73667048e-02  2.34478414e-02
  3.28630246e-02 -2.68838722e-02  4.50134650e-02 -8.59741941e-02
 -4.01185527e-02 -1.28678521e-02  3.30344774e-02  3.54821086e-02
 -1.89352110e-02  1.168

In [12]:
# 7. Compute Euclidean distances
print("\nCalculating Euclidean distances")
distances = np.linalg.norm(vector_db - prompt_vec, axis=1)

closest_idx = int(np.argmin(distances))
closest_chunk = chunked_data[closest_idx]

print("CLOSEST MATCH (EUCLIDEAN)")
print("__________________________")
print("Index:", closest_idx)
print("Distance:", float(distances[closest_idx]))
print("Matching text chunk:")
print(repr(closest_chunk))


Calculating Euclidean distances
CLOSEST MATCH (EUCLIDEAN)
__________________________
Index: 3687
Distance: 1.1241536140441895
Matching text chunk:
"Belgium's."


In [13]:
# 8. Show top-K closest matches
sorted_indices = np.argsort(distances)

print(f"\nTop {TOP_K} closest chunks to the prompt (Euclidean distance):")
for rank in range(TOP_K):
    idx = int(sorted_indices[rank])
    dist = float(distances[idx])
    text = chunked_data[idx]

    print("\n------------Euclidean distance → lower = better-----------------")
    print(f"Rank: {rank + 1}")
    print(f"Index: {idx}")
    print(f"Distance: {dist:.6f}")
    print("Chunk:")
    print(repr(text))


Top 10 closest chunks to the prompt (Euclidean distance):

------------Euclidean distance → lower = better-----------------
Rank: 1
Index: 3687
Distance: 1.124154
Chunk:
"Belgium's."

------------Euclidean distance → lower = better-----------------
Rank: 2
Index: 2953
Distance: 1.133087
Chunk:
'the Netherlands, New Zealand, Niger, Norway, Oman,'

------------Euclidean distance → lower = better-----------------
Rank: 3
Index: 826
Distance: 1.147256
Chunk:
'in Scandinavian countries.'

------------Euclidean distance → lower = better-----------------
Rank: 4
Index: 4726
Distance: 1.149410
Chunk:
'"northerner", and referred to people from Norway, and'

------------Euclidean distance → lower = better-----------------
Rank: 5
Index: 947
Distance: 1.151121
Chunk:
'province of Carinthia.'

------------Euclidean distance → lower = better-----------------
Rank: 6
Index: 2951
Distance: 1.157584
Chunk:
'Belgium, Canada, Czechoslovakia, Denmark, Egypt, France,'

------------Euclidean distance → lo

In [14]:
# 7B. Cosine Similarity Search

print("\nCalculating Cosine similarities...")

cosine_scores = (vector_db @ prompt_vec) / (
    np.linalg.norm(vector_db, axis=1) * np.linalg.norm(prompt_vec)
)

closest_idx_cosine = int(np.argmax(cosine_scores))
closest_chunk_cosine = chunked_data[closest_idx_cosine]

print("CLOSEST MATCH (COSINE)")
print("__________________________")
print("Index:", closest_idx_cosine)
print("Cosine Score:", float(cosine_scores[closest_idx_cosine]))
print("Matching text chunk:")
print(repr(closest_chunk_cosine))



Calculating Cosine similarities...
CLOSEST MATCH (COSINE)
__________________________
Index: 3687
Cosine Score: 0.3681393265724182
Matching text chunk:
"Belgium's."


In [15]:
print(f"\nTop {TOP_K} closest chunks to the prompt (Cosine similarity):")
sorted_cosine_indices = np.argsort(-cosine_scores)  # negative for descending order

for rank in range(TOP_K):
    idx = int(sorted_cosine_indices[rank])
    score = float(cosine_scores[idx])
    text = chunked_data[idx]

    print("\n---------Cosine similarity → higher = better--------------------")
    print(f"Rank: {rank + 1}")
    print(f"Index: {idx}")
    print(f"Cosine Score: {score:.6f}")
    print("Chunk:")
    print(repr(text))



Top 10 closest chunks to the prompt (Cosine similarity):

---------Cosine similarity → higher = better--------------------
Rank: 1
Index: 3687
Cosine Score: 0.368139
Chunk:
"Belgium's."

---------Cosine similarity → higher = better--------------------
Rank: 2
Index: 2953
Cosine Score: 0.358057
Chunk:
'the Netherlands, New Zealand, Niger, Norway, Oman,'

---------Cosine similarity → higher = better--------------------
Rank: 3
Index: 826
Cosine Score: 0.341902
Chunk:
'in Scandinavian countries.'

---------Cosine similarity → higher = better--------------------
Rank: 4
Index: 4726
Cosine Score: 0.339428
Chunk:
'"northerner", and referred to people from Norway, and'

---------Cosine similarity → higher = better--------------------
Rank: 5
Index: 947
Cosine Score: 0.337460
Chunk:
'province of Carinthia.'

---------Cosine similarity → higher = better--------------------
Rank: 6
Index: 2951
Cosine Score: 0.329999
Chunk:
'Belgium, Canada, Czechoslovakia, Denmark, Egypt, France,'

---------Cos

In [16]:
import os
import json
import numpy as np   # you already imported, but just in case

OUTPUT_DIR = "semantic_gooaq_minilm"
os.makedirs(OUTPUT_DIR, exist_ok=True)

# 1) Save model locally (so Streamlit can load without downloading)
model_path = os.path.join(OUTPUT_DIR, "model")
embed_model.save(model_path)
print("Model saved to:", model_path)

# 2) Save embeddings
emb_path = os.path.join(OUTPUT_DIR, "embeddings.npy")
np.save(emb_path, vector_db)
print("Embeddings saved to:", emb_path)

# 3) Save text chunks
chunks_path = os.path.join(OUTPUT_DIR, "chunks.json")
with open(chunks_path, "w", encoding="utf-8") as f:
    json.dump(chunked_data, f, ensure_ascii=False)
print("Chunks saved to:", chunks_path)


Model saved to: semantic_gooaq_minilm/model
Embeddings saved to: semantic_gooaq_minilm/embeddings.npy
Chunks saved to: semantic_gooaq_minilm/chunks.json
