In [1]:
import json
import os
from pathlib import Path
from typing import Dict, List

import numpy as np
from loguru import logger
from tqdm.auto import tqdm

## Calculate centroids for all collections

In [15]:
# Directory that holds the ``.jsonl`` embeddings files.
embeddings_dir = Path("../database/embeddings")

# Validate the directory *before* listing it: iterating a missing directory
# raises FileNotFoundError on its own, so a check placed after the listing
# could never report anything.
if not embeddings_dir.exists():
    logger.error(f"Embeddings dir not found: {embeddings_dir}.")
    raise FileNotFoundError(f"Embeddings dir not found: {embeddings_dir}.")

# Only JSON Lines files are treated as embeddings files.
embeddings_paths = [
    path for path in embeddings_dir.iterdir() if path.suffix == ".jsonl"
]

In [21]:
def load_embeddings(path: Path) -> List:
    """Load every embedding vector stored in a JSON Lines file.

    Each non-blank line is parsed as JSON and the vector is read from
    ``record[1]["data"][0]["embedding"]``.

    Args:
        path: Location of the ``.jsonl`` file to read.

    Returns:
        A list with one embedding per non-blank line, in file order.

    Raises:
        FileNotFoundError: If ``path`` does not exist.
    """
    if not path.exists():
        logger.error(f"File does not exist: {path}.")
        # Stop here with an explicit message instead of falling through to
        # ``open``, which would raise the same exception type anyway.
        raise FileNotFoundError(f"File does not exist: {path}.")

    with open(path, "r", encoding="utf-8") as file:
        # Blank lines (e.g. a trailing newline at end of file) are not valid
        # JSON documents, so they are skipped rather than parsed.
        return [
            json.loads(line)[1]["data"][0]["embedding"]
            for line in file
            if line.strip()
        ]

In [34]:
# Map each collection name (the embeddings file stem) to its centroid, i.e.
# the element-wise mean of all embedding vectors loaded from that file.
collection_centroids = {}
for file_path in tqdm(embeddings_paths, desc="Calculating centroids"):
    embeddings = load_embeddings(file_path)
    if not embeddings:
        # The mean of an empty array is NaN; storing it would produce an
        # unusable centroid and a non-standard ``NaN`` token in the saved JSON.
        logger.warning(f"No embeddings found in {file_path}; skipping.")
        continue
    centroid = np.array(embeddings).mean(axis=0)
    collection_centroids[file_path.stem] = centroid.tolist()

Calculating centroids:   0%|          | 0/5 [00:00<?, ?it/s]

In [36]:
# File the centroids are written to; the path is relative to the current
# working directory.
centroid_path = Path("./collection_centroids.json")

Save centroids

In [38]:
# Persist the centroid mapping as indented JSON, streaming it straight into
# the open file handle.
with open(centroid_path, "w", encoding="utf-8") as file:
    json.dump(collection_centroids, file, indent=4)

### Query router dev

In [46]:
def rout_query(centroids: Dict, query_embedding: List) -> str:
    """Return the name of the collection whose centroid best matches a query.

    The match is decided by cosine similarity between the query embedding and
    every centroid; the key of the highest-scoring centroid is returned.

    Args:
        centroids: Mapping of collection name to centroid vector. All vectors
            must have the same length as ``query_embedding``.
        query_embedding: Embedding vector of the query.

    Returns:
        The key of ``centroids`` with the highest cosine similarity. Ties go
        to the first such key in the mapping's iteration order.

    Raises:
        ValueError: If ``centroids`` is empty or ``query_embedding`` has zero
            norm (cosine similarity is undefined in both cases).
    """
    if not centroids:
        raise ValueError("centroids must contain at least one collection.")

    names = list(centroids.keys())
    centroids_np = np.asarray(list(centroids.values()), dtype=float)
    query_np = np.asarray(query_embedding, dtype=float)

    norm_query = np.linalg.norm(query_np)
    if norm_query == 0:
        raise ValueError("query_embedding must not be a zero vector.")
    norm_centroids = np.linalg.norm(centroids_np, axis=1)

    dot_products = np.dot(centroids_np, query_np)
    denominators = norm_centroids * norm_query

    # A zero-norm centroid would give 0/0 = NaN, and ``argmax`` treats NaN as
    # the maximum. Such centroids are scored -inf so they can never win over
    # a centroid with a defined similarity.
    cosine_similarities = np.full(len(names), -np.inf)
    np.divide(
        dot_products,
        denominators,
        out=cosine_similarities,
        where=denominators > 0,
    )

    return names[int(np.argmax(cosine_similarities))]

In [47]:
# Sanity check: querying with a collection's own centroid should route back
# to that same collection.
rout_query(collection_centroids, collection_centroids["porodicni_zakon"])

'porodicni_zakon'

### Query Router with Qdrant

Load centroids

In [2]:
# Read the centroid mapping back from its JSON file, parsing directly from
# the open file handle.
centroid_path = Path("./collection_centroids.json")
with open(centroid_path, "r", encoding="utf-8") as file:
    centroids = json.load(file)

Change the working directory to the project root so that the Qdrant utilities can be imported from the `database` package

In [None]:
# Move one level up so that ``database`` is importable from the working
# directory. The guard makes the cell idempotent: once ``database`` is
# visible from the current directory, re-running the cell no longer climbs
# another level.
logger.info(f"Current working directory: {os.getcwd()}")
if not Path("database").is_dir():
    os.chdir("../")
    logger.info(f"Changed working directory: {os.getcwd()}")

In [4]:
from qdrant_client.http.models import Distance, PointStruct
from tqdm.auto import tqdm

from database.utils import (
    create_collection,
    delete_collection,
    embed_text,
    get_collection_info,
    get_count,
    search,
    upsert,
)

Create data points for Qdrant

In [5]:
# One point per collection: the centroid is the vector, the position in the
# mapping is the point id, and the collection name travels in the payload.
# A comprehension keeps the loop variables out of the notebook's global
# namespace, so no builtin such as ``id`` gets shadowed for later cells.
points = [
    PointStruct(
        id=point_id,
        vector=centroid_vector,
        payload={"law_title": law_title},
    )
    for point_id, (law_title, centroid_vector) in enumerate(centroids.items())
]

We use the EUCLID distance because it is a true distance metric, which makes it more intuitive to interpret and makes it easier to set a threshold between vectors. <br>
The COSINE metric is not a true distance: it measures the cosine of the angle between vectors, which makes it harder to interpret and to choose a threshold value for.

In [6]:
# Create the router collection with Euclidean distance and store the
# centroid points in it.
collection_name = "router"
create_collection(name=collection_name, distance=Distance.EUCLID)
upsert(collection=collection_name, points=points)

# Every centroid should now be stored; log an error if the counts disagree.
if len(centroids) != get_count(collection=collection_name):
    logger.error("Router is missing centroids.")

[32m2024-04-26 22:40:22.450[0m | [1mINFO    [0m | [36mdatabase.utils[0m:[36mcreate_collection[0m:[36m31[0m - [1mCreating collection: router with vector size: 1536.[0m


In [7]:
# Display the router collection's info as the cell output.
get_collection_info(collection_name)

{'status': <CollectionStatus.GREEN: 'green'>,
 'optimizer_status': <OptimizersStatusOneOf.OK: 'ok'>,
 'vectors_count': 5,
 'indexed_vectors_count': 0,
 'points_count': 5,
 'segments_count': 2,
 'config': {'params': {'vectors': {'size': 1536,
    'distance': <Distance.EUCLID: 'Euclid'>,
    'hnsw_config': None,
    'quantization_config': None,
    'on_disk': None},
   'shard_number': 1,
   'sharding_method': None,
   'replication_factor': 1,
   'write_consistency_factor': 1,
   'read_fan_out_factor': None,
   'on_disk_payload': True,
   'sparse_vectors': None},
  'hnsw_config': {'m': 16,
   'ef_construct': 100,
   'full_scan_threshold': 10000,
   'max_indexing_threads': 0,
   'on_disk': False,
   'payload_m': None},
  'optimizer_config': {'deleted_threshold': 0.2,
   'vacuum_min_vector_number': 1000,
   'default_segment_number': 0,
   'max_segment_size': None,
   'memmap_threshold': None,
   'indexing_threshold': 20000,
   'flush_interval_sec': 5,
   'max_optimization_threads': None},
 

Search router

In [8]:
def query_router_qdrant(
    query_embedding: List, collection_name: str = "router", threshold: float = 0.2
) -> List:
    """Search a router collection with a query embedding.

    Args:
        query_embedding: Embedding vector of the query.
        collection_name: Name of the collection to search.
        threshold: Currently unused; kept so existing callers keep working.
            NOTE(review): presumably intended as a score cut-off for the
            search results -- TODO apply it or remove it.

    Returns:
        The response of ``search`` for the given collection and query vector,
        unmodified. NOTE(review): the annotation assumes ``search`` returns a
        list of scored points -- confirm against ``database.utils.search``.
    """
    return search(collection=collection_name, query_vector=query_embedding)

In [11]:
# Example queries are in Serbian. Rough translations:
#   alternative -- "may a website send me a newsletter if I have not signed up?"
#   active      -- "how many vacation days am I entitled to by law?"
# query = "da li sajt sme da mi salje newsletter ako se nisam prijavila?"
query = "koliko godisnjih dana imam po zakonu?"

# Embed the query, then pull the vector out of the embedding response.
embedding_response = embed_text(query, model="text-embedding-3-small")
query_embedding = embedding_response.data[0].embedding

query_router_qdrant(query_embedding)

[ScoredPoint(id=1, version=0, score=0.8214849, payload={'law_title': 'zakon_o_radu'}, vector=None, shard_key=None),
 ScoredPoint(id=0, version=0, score=0.84445184, payload={'law_title': 'porodicni_zakon'}, vector=None, shard_key=None),
 ScoredPoint(id=4, version=0, score=0.85869735, payload={'law_title': 'zakon-o-porezu-na-dohodak-gradjana'}, vector=None, shard_key=None),
 ScoredPoint(id=3, version=0, score=0.90343463, payload={'law_title': 'zakon_o_zastiti_potrosaca'}, vector=None, shard_key=None),
 ScoredPoint(id=2, version=0, score=0.9431141, payload={'law_title': 'zakon_o_zastiti_podataka_o_licnosti'}, vector=None, shard_key=None)]

### Semantic router using GPT

In [None]:
from openai import OpenAI