## Vector database and embeddings

### Install libraries

In [1]:
!pip install faiss-cpu sentence_transformers



### Generate embeddings

In [2]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

model.max_seq_length = 256

sentences = [
    "dinosaurs live in africa but in different time dimension", 
    "this is sentence about little cat that liked this eat fast food",
    "this is the another sample sentence which is here just this not be matched while other one is"
]

embeddings = model.encode(sentences, normalize_embeddings=True)

In [3]:
embeddings

array([[-0.04987683,  0.03634831,  0.01747592, ..., -0.05154557,
         0.01327896, -0.05160031],
       [ 0.07967837,  0.06043367,  0.0579344 , ...,  0.1495044 ,
         0.06014546,  0.02409914],
       [-0.00895106,  0.05474589,  0.08349688, ...,  0.0095391 ,
         0.09756648,  0.00701756]], shape=(3, 384), dtype=float32)

In [4]:
print(len(embeddings[1]))

384


### Create vector database and load documents

In [5]:
import faiss

d = 384  # number of dimensions

index = faiss.IndexFlatL2(d)  #  build the index
index.add(embeddings)  #  add vectors to the index

### Search the vector database

In [6]:
queryText = "french fries"
embeddingSearch = model.encode([queryText], normalize_embeddings=True)
embeddingFound, idx = index.search(embeddingSearch, 1)  #  actual search
print(queryText + " matches:\\n" + sentences[idx[0][0]])

queryText = "not similar text"
embeddingSearch = model.encode([queryText], normalize_embeddings=True)
embeddingFound, idx = index.search(embeddingSearch, 1)  #  actual search
print(queryText + " matches:\\n" + sentences[idx[0][0]])

french fries matches:\nthis is sentence about little cat that liked this eat fast food
not similar text matches:\nthis is the another sample sentence which is here just this not be matched while other one is


### Vector database on disk

In [7]:
!pip install sentence-transformers sqlite-vss

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [8]:
import sqlite3
import json
import numpy as np
from sentence_transformers import SentenceTransformer
import sqlite_vss

DB_PATH = "vectors.db"
DIM = 384
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_norm(texts):
    v = model.encode(texts).astype("float32")
    v /= (np.linalg.norm(v, axis=1, keepdims=True) + 1e-12)  #  kosinus → IP
    return v

con = sqlite3.connect(DB_PATH)
con.enable_load_extension(True)
sqlite_vss.load(con)
cur = con.cursor()

#  1) Tabel
cur.executescript(f"""
CREATE TABLE IF NOT EXISTS docs(
  id INTEGER PRIMARY KEY,
  text TEXT NOT NULL
);
CREATE VIRTUAL TABLE IF NOT EXISTS doc_index USING vss0(emb({DIM}));
""")
con.commit()

#  2) Data + index
docs = [
    (1, "The Fibonacci sequence starts with 0 and 1."),
    (2, "Dijkstra's algorithm finds the shortest paths in a graph."),
    (3, "Recursion is a function calling itself."),
]
cur.executemany("INSERT OR IGNORE INTO docs(id, text) VALUES (?,?)", docs)

embs = embed_norm([t for _, t in docs]).tolist()

#  (a) if exist — remove old vectors by rowid
cur.executemany("DELETE FROM doc_index WHERE rowid = ?", [(d[0],) for d in docs])
#  (b) insert as raw JSON string
cur.executemany(
    "INSERT INTO doc_index(rowid, emb) VALUES (?, ?)",
    [(docs[i][0], json.dumps(embs[i])) for i in range(len(docs))]
)
con.commit()

query = "How does the Fibonacci sequence begin?"
q = embed_norm([query])[0].tolist()

rows = cur.execute("""
WITH knn AS (
  SELECT rowid, distance
  FROM doc_index
  WHERE vss_search(emb, ?)
  ORDER BY distance DESC
  LIMIT 5
)
SELECT d.id, d.text, knn.distance
FROM knn
JOIN docs AS d ON d.id = knn.rowid
ORDER BY knn.distance DESC;
""", (json.dumps(q),)).fetchall()

for rid, text, score in rows:
    print(f"id={rid} score={score:.4f}  text={text}")

id=2 score=1.6242  text=Dijkstra's algorithm finds the shortest paths in a graph.
id=3 score=1.1552  text=Recursion is a function calling itself.
id=1 score=0.3085  text=The Fibonacci sequence starts with 0 and 1.
