# An intro to Matryoshka Embeddings

In [1]:
import duckdb

In [2]:
# Connect to the DuckDB database file that the rest of the notebook queries.
con = duckdb.connect("olympics.duckdb")

In [3]:
# Inspect the columns of the olympics table before adding embeddings to it.
con.sql("DESCRIBE olympics")

In [4]:
# Preview ten rows of the table.
con.sql("SELECT * FROM olympics LIMIT 10")

## Create Embeddings

In [None]:
# Add a fixed-size array column that will hold the full 1024-value embedding.
# IF NOT EXISTS keeps this cell re-runnable: a plain ADD COLUMN raises on the
# second run because the column is already present.
con.sql("""
ALTER TABLE olympics 
ADD COLUMN IF NOT EXISTS embeddings_1024 FLOAT[1024];
""")

In [6]:
import llama_cpp

In [7]:
# Load the local GGUF model with embedding=True; `llm.create_embedding` is
# what the later cells call on it.
llm = llama_cpp.Llama(
  model_path="./models/mxbai-embed-large-v1-f16.gguf", 
  embedding=True, 
  verbose=False
)

In [8]:
# Fetch every (index, text) pair; `index` is the key used later to write each
# embedding back to its row.
rows = con.sql("SELECT index, text FROM olympics").fetchall()

In [9]:
# Embed all texts in a single call. Each row is (index, text), so row[1] is
# the text; only the 'data' entry of the response is kept.
embeddings = llm.create_embedding([row[1] for row in rows])['data']

In [10]:
# Write each embedding back to the row it was computed from. `rows` and
# `embeddings` are walked in lockstep, so they are assumed to share an order.
for row, item in zip(rows, embeddings):
  row_index = row[0]
  vector = item['embedding']
  con.execute(
    "UPDATE olympics SET embeddings_1024 = ? WHERE index = ?",
    [vector, row_index]
  )

In [11]:
# Preview ten rows again, now with the embeddings_1024 column populated.
con.sql("SELECT * FROM olympics LIMIT 10")

In [12]:
import numpy as np

## Create truncated embeddings

In [None]:
def normalize(vec: list[float]) -> list[float]:
    """Scale ``vec`` to unit L2 length and return it as a plain list.

    Used to re-normalize an embedding after it has been truncated to its
    first N values.

    An empty or all-zero vector has no direction, so it is returned
    unchanged (as floats) instead of being divided by a zero norm, which
    would yield NaNs and a NumPy runtime warning.
    """
    arr = np.asarray(vec, dtype=float)
    norm = np.linalg.norm(arr)
    if norm == 0:
        return arr.tolist()
    return (arr / norm).tolist()

In [14]:
# Register the Python `normalize` function on the connection so the SQL in the
# following cells can call normalize(...) directly.
con.create_function(name="normalize", function=normalize)

In [15]:
# Try the truncation on a few rows first: slice the 1024-value embedding down
# to 512 values, re-normalize it, and cast it to a fixed-size array.
con.sql("""
SELECT index, text, 
       embeddings_1024, 
       normalize(embeddings_1024[:512])::float[512] AS embeddings_512
FROM olympics 
LIMIT 5
""")

In [16]:
# For each smaller size, add a fixed-size array column and fill it with the
# truncated, re-normalized version of the 1024-value embedding.
dimensions = [16, 32, 64, 128, 256, 512]
for dimension in dimensions:
  # IF NOT EXISTS keeps this cell re-runnable: a plain ADD COLUMN raises once
  # the column exists, which would abort the loop on a second run.
  con.sql(f"""
  ALTER TABLE olympics 
  ADD COLUMN IF NOT EXISTS embeddings_{dimension} FLOAT[{dimension}];
  """)

  con.sql(f"""
  UPDATE olympics 
  SET embeddings_{dimension} = normalize(embeddings_1024[:{dimension}])
  """)

In [17]:
def vector_search(query, dimension=1024):
  """Return the three rows of ``olympics`` most similar to ``query``.

  The query is embedded with ``llm``. When ``dimension`` is smaller than the
  embedding's length, the embedding is cut to its first ``dimension`` values
  and re-normalized; otherwise it is used as returned. Rows are ranked by
  cosine similarity against the matching ``embeddings_<dimension>`` column.

  Returns the result of ``con.sql`` with columns ``index``, ``text`` and
  ``score``, ordered by descending score.
  """
  full_vector = llm.create_embedding([query])['data'][0]['embedding']
  search_vector = (
    normalize(full_vector[:dimension])
    if dimension < len(full_vector)
    else full_vector
  )

  # The dimension selects both the column and the cast target, so it is
  # formatted into the SQL text; the vector itself is passed as a parameter.
  sql = f"""
  SELECT index, text,
          array_cosine_similarity(
            "embeddings_{dimension}", $searchVector::FLOAT[{dimension}]
          ) AS score
  FROM olympics
  ORDER BY score DESC
  LIMIT 3
  """
  return con.sql(sql, params={"searchVector": search_vector})

## Query embeddings

In [18]:
# Example question, searched with the default (full 1024-value) embeddings.
# `query` is reused by the paged comparison further down.
query = 'Where did the opening ceremony take place?'
vector_search(query)

In [19]:
from rich.console import Console
# Rich console used for the paged output in later cells.
c = Console()

In [20]:
# Page through the top results for the same query at every embedding size,
# largest first. `dimensions` is redefined here to include 1024 and is reused
# when the retrieval functions are built later.
with c.pager(styles=True):
  dimensions = [16, 32, 64, 128, 256, 512, 1024]
  for dimension in reversed(dimensions):
    c.print(dimension)
    results = vector_search(query, dimension)
    c.print(results)

In [21]:
from ranx import Qrels, Run, compare
from functools import partial

## Evaluate embeddings

In [22]:
# Load the relevance judgments; the keys of qrels.to_dict() are the questions
# that create_run feeds to each retrieval function.
qrels = Qrels.from_file("data/questions.json")

In [23]:
# One (search function, dimension) pair per embedding size. `partial` pins the
# dimension so each function only needs the question; the dimension itself is
# later passed to create_run as the run's name.
functions = [
  (partial(vector_search, dimension=size), size)
  for size in dimensions
]

In [24]:
def create_run(qrels, retrieval_fn, name):
  """Build a ``Run`` by calling ``retrieval_fn`` on every question in ``qrels``.

  For each key of ``qrels.to_dict()`` the retrieval function is called, its
  ``index`` and ``score`` columns are fetched, and the rows are stored as
  ``{str(index): score}`` under that question. The collected mapping is
  wrapped in a ``Run`` labelled with ``name``.
  """
  run_dict = {}
  for question in qrels.to_dict():
    hits = retrieval_fn(question).select("index, score").fetchall()
    # Document ids are stored as strings.
    run_dict[question] = {str(index): score for index, score in hits}
  return Run(run_dict, name=name)

In [25]:
%%time
# Build one run per embedding size by executing every question through the
# matching search function.
runs = [
  create_run(qrels, retrieval_fn, run_name)
  for retrieval_fn, run_name in functions
]

In [26]:
# Score every run against the relevance judgments using the hit_rate metric.
comparison = compare(
    qrels,
    runs=runs,
    metrics=["hit_rate"],
)

In [27]:
# Display the comparison report as the cell output.
comparison

In [28]:
# Print the comparison's win_tie_loss attribute inside a pager.
with c.pager(styles=True):
  c.print(comparison.win_tie_loss)