## Baza wektorowa

### Instalacja bibliotek

In [1]:
!pip install faiss-cpu sentence_transformers

Collecting sentence_transformers
  Downloading sentence_transformers-5.1.0-py3-none-any.whl.metadata (16 kB)
Downloading sentence_transformers-5.1.0-py3-none-any.whl (483 kB)
Installing collected packages: sentence_transformers
Successfully installed sentence_transformers-5.1.0


### Wygeneruj embeddingi

In [10]:
from sentence_transformers import SentenceTransformer

model = SentenceTransformer("all-MiniLM-L6-v2")

model.max_seq_length = 256

sentences = [
    "dinosaurs live in africa but in different time dimension", 
    "this is sentence about little cat that liked to eat fast food",
    "this is the another sample sentence which is here just to not be matched while other one is"
]

embeddings = model.encode(sentences, normalize_embeddings=True)

In [11]:
embeddings

array([[-0.04987683,  0.03634831,  0.01747592, ..., -0.05154557,
         0.01327896, -0.05160031],
       [ 0.09842476,  0.07230502,  0.04732889, ...,  0.15476963,
         0.04178708,  0.02207421],
       [-0.01920933,  0.06346308,  0.07642584, ...,  0.01450102,
         0.08586987, -0.00456648]], shape=(3, 384), dtype=float32)

In [13]:
print(len(embeddings[1]))

384


### Utwórz bazę wektorową i załaduj dokumenty

In [14]:
import faiss

d = 384  # dimension

# Build index
index = faiss.IndexFlatL2(d)  # build the index
index.add(embeddings)  # add vectors to the index

### Przeszukaj bazę wektorową

In [15]:
queryText = "french fries"
embeddingSearch = model.encode([queryText], normalize_embeddings=True)
embeddingFound, idx = index.search(embeddingSearch, 1)  # actual search
print(queryText + " matches:\n" + sentences[idx[0][0]])

queryText = "not similar text"
embeddingSearch = model.encode([queryText], normalize_embeddings=True)
embeddingFound, idx = index.search(embeddingSearch, 1)  # actual search
print(queryText + " matches:\n" + sentences[idx[0][0]])

french fries matches:
this is sentence about little cat that liked to eat fast food
not similar text matches:
this is the another sample sentence which is here just to not be matched while other one is


### Baza wektorowa na dysku

In [16]:
!pip install sentence-transformers sqlite-vss

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)




In [17]:
import sqlite3, json, numpy as np
from sentence_transformers import SentenceTransformer
import sqlite_vss

DB_PATH = "vectors.db"
DIM = 384
model = SentenceTransformer("all-MiniLM-L6-v2")

def embed_norm(texts):
    v = model.encode(texts).astype("float32")
    v /= (np.linalg.norm(v, axis=1, keepdims=True) + 1e-12)  # kosinus → IP
    return v

con = sqlite3.connect(DB_PATH)
con.enable_load_extension(True)
sqlite_vss.load(con)
cur = con.cursor()

# 1) Tabela
cur.executescript(f"""
CREATE TABLE IF NOT EXISTS docs(
  id INTEGER PRIMARY KEY,
  text TEXT NOT NULL
);
CREATE VIRTUAL TABLE IF NOT EXISTS doc_index USING vss0(emb({DIM}));
""")
con.commit()

# 2) Dane + indeks
docs = [
    (1, "Ciąg Fibonacciego zaczyna się od 0 i 1."),
    (2, "Algorytm Dijkstry znajduje najkrótsze ścieżki w grafie."),
    (3, "Rekurencja to wywołanie funkcji przez samą siebie."),
]
cur.executemany("INSERT OR IGNORE INTO docs(id, text) VALUES (?,?)", docs)

embs = embed_norm([t for _, t in docs]).tolist()

# (a) jeśli istnieją — usuń stare wektory po rowid
cur.executemany("DELETE FROM doc_index WHERE rowid = ?", [(d[0],) for d in docs])
# (b) wstaw jako surowy JSON string (BEZ json(?))
cur.executemany(
    "INSERT INTO doc_index(rowid, emb) VALUES (?, ?)",
    [(docs[i][0], json.dumps(embs[i])) for i in range(len(docs))]
)
con.commit()

query = "Jak zaczyna się ciąg Fibonacciego?"
q = embed_norm([query])[0].tolist()

rows = cur.execute("""
WITH knn AS (
  SELECT rowid, distance
  FROM doc_index
  WHERE vss_search(emb, ?)
  ORDER BY distance DESC
  LIMIT 5
)
SELECT d.id, d.text, knn.distance
FROM knn
JOIN docs AS d ON d.id = knn.rowid
ORDER BY knn.distance DESC;
""", (json.dumps(q),)).fetchall()

for rid, text, score in rows:
    print(f"id={rid} score={score:.4f}  text={text}")


id=3 score=1.1374  text=Rekurencja to wywołanie funkcji przez samą siebie.
id=2 score=0.9426  text=Algorytm Dijkstry znajduje najkrótsze ścieżki w grafie.
id=1 score=0.3450  text=Ciąg Fibonacciego zaczyna się od 0 i 1.
