In [1]:
import json
import subprocess
from pathlib import Path
from pprint import pprint
from typing import Dict, List

import tiktoken
import yaml
from config import EMBEDDING_MODEL
from dotenv import find_dotenv, load_dotenv
from loguru import logger
from qdrant_client.http.models import PointStruct
from tqdm.auto import tqdm
from utils import (
    create_collection,
    embed_text,
    get_collection_info,
    get_count,
    search,
    upsert,
)

load_dotenv(find_dotenv())

True

In [8]:
# Load the project settings from YAML; later cells read the OpenAI embedding
# model name from it.
config_path = Path("../config.yaml")

# Explicit UTF-8 keeps this read consistent with the other file reads in the
# notebook and independent of the platform's default encoding.
with config_path.open("r", encoding="utf-8") as file:
    config = yaml.safe_load(file)

In [19]:
# Location of the scraped labor-law JSON file that the following cells load.
raw_data_path = Path("../scraper/srb_labor_law_data.json")

In [20]:
# Parse the scraped articles; later cells read each entry's "title", "texts"
# and "link" keys.
with raw_data_path.open(mode="r", encoding="utf-8") as file:
    raw_data = json.load(file)

## Embed data

Create JSONL for parallel embedding

In [21]:
filename = Path("./requests_to_parallel_process.jsonl")

# One embedding request per scraped article. The input is the title, then ". ",
# then all of the article's paragraphs joined by single spaces.
jobs = []
for sample in raw_data:
    jobs.append(
        {
            "model": config["openai"]["embedding_model"]["name"],
            "input": ". ".join([sample["title"], " ".join(sample["texts"])]),
        }
    )

# JSONL output: one JSON document per line.
with open(filename, "w") as f:
    f.writelines(json.dumps(job) + "\n" for job in jobs)

In [None]:
# Run the helper script over the requests JSONL, sending each request to the
# OpenAI embeddings endpoint and saving the responses to the results JSONL.
# NOTE(review): logging level 20 presumably corresponds to INFO - confirm in the script.
! python api_request_parallel_processor.py \
  --requests_filepath requests_to_parallel_process.jsonl \
  --save_filepath requests_to_parallel_process_results.jsonl \
  --request_url https://api.openai.com/v1/embeddings \
  --max_requests_per_minute 2500 \
  --max_tokens_per_minute 900000 \
  --token_encoding_name cl100k_base \
  --max_attempts 5 \
  --logging_level 20

Create `PointStruct` objects for the Qdrant database

In [11]:
embeddings_path = Path("./requests_to_parallel_process_results.jsonl")

# Every line of the results file is parsed as an independent JSON document.
with embeddings_path.open("r", encoding="utf-8") as file:
    embeddings = [json.loads(line) for line in file]

In [12]:
# Map article title -> {"embedding", "text"}.
# Element 0 of each result holds the request (with its "input" text), element 1
# the API response. The title is everything before the first ". " of the input.
embeddings_lookup = {}
for item in embeddings:
    text = item[0]["input"]
    article_name = text.partition(". ")[0]
    embeddings_lookup[article_name] = {
        "embedding": item[1]["data"][0]["embedding"],
        "text": text,
    }

In [16]:
# Build one point per scraped article that has a matching embedding.
points = []

# The loop variable is `point_id`, not `id`: at notebook top level, `id` would
# shadow the builtin `id()` for the rest of the kernel session.
# The article's position in raw_data doubles as its point id.
for point_id, dictionary in enumerate(raw_data):
    title = dictionary["title"]
    link = dictionary["link"]
    if title in embeddings_lookup:
        embedding, text = (
            embeddings_lookup[title]["embedding"],
            embeddings_lookup[title]["text"],
        )
        points.append(
            PointStruct(
                id=point_id,
                vector=embedding,
                payload={"title": title, "text": text, "link": link},
            )
        )
    else:
        # Articles without an embedding are reported and left out of the collection.
        print(
            f"Warning: No embedding found for title '{title}'. This item will be skipped."
        )

# Create Vector database

In [12]:
# Name of the collection used for the single-law (labor law) data, created via
# the project's create_collection helper.
collection_name = "labor_law"
create_collection(name=collection_name)

[32m2024-04-28 23:31:47.156[0m | [1mINFO    [0m | [36mutils[0m:[36mcreate_collection[0m:[36m31[0m - [1mCreating collection: labor_law with vector size: 1536.[0m


True

In [18]:
# Write the prepared points into the collection.
upsert(collection=collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [19]:
# Display the collection's status and configuration.
get_collection_info(collection=collection_name)

{'status': <CollectionStatus.GREEN: 'green'>,
 'optimizer_status': <OptimizersStatusOneOf.OK: 'ok'>,
 'vectors_count': 313,
 'indexed_vectors_count': 0,
 'points_count': 313,
 'segments_count': 2,
 'config': {'params': {'vectors': {'size': 1536,
    'distance': <Distance.COSINE: 'Cosine'>,
    'hnsw_config': None,
    'quantization_config': None,
    'on_disk': None},
   'shard_number': 1,
   'sharding_method': None,
   'replication_factor': 1,
   'write_consistency_factor': 1,
   'read_fan_out_factor': None,
   'on_disk_payload': True,
   'sparse_vectors': None},
  'hnsw_config': {'m': 16,
   'ef_construct': 100,
   'full_scan_threshold': 10000,
   'max_indexing_threads': 0,
   'on_disk': False,
   'payload_m': None},
  'optimizer_config': {'deleted_threshold': 0.2,
   'vacuum_min_vector_number': 1000,
   'default_segment_number': 0,
   'max_segment_size': None,
   'memmap_threshold': None,
   'indexing_threshold': 20000,
   'flush_interval_sec': 5,
   'max_optimization_threads': None

In [20]:
# Display the count reported for the collection.
get_count(collection=collection_name)

313

# Search the Vector database 

In [5]:
# Test queries, grouped under keys such as "hard".
path_to_tests = Path("./test_queries.json")
with path_to_tests.open(mode="r", encoding="utf-8") as file:
    test_samples = json.load(file)

In [None]:
# Display the test samples stored under the "hard" key.
test_samples["hard"]

Get embeddings for tests

In [9]:
# Attach an embedding to every test query in every group.
# The samples are updated in place, so test_samples itself gains the "embedding" keys.
for level, samples in test_samples.items():
    for sample in samples:
        response = embed_text(
            text=sample["query"],
            model=config["openai"]["embedding_model"]["name"],
        )
        embedding = response.data[0].embedding
        sample["embedding"] = embedding

Save tests with embeddings

In [10]:
# Overwrite the test file so it now also contains the embeddings.
with path_to_tests.open(mode="w", encoding="utf-8") as file:
    json.dump(test_samples, file, indent=4)

In [11]:
# Take the sample at index 3 of the "hard" group: its query text and the
# embedding stored with it. The bare `query` displays the text.
query = test_samples["hard"][3]["query"]
embedding = test_samples["hard"][3]["embedding"]
query

'Koliko traje porodiljsko odsustvo?'

In [12]:
# Collection to search.
# NOTE(review): differs from the "labor_law" name used earlier; presumably it
# refers to a per-law collection created in the multi-law section - confirm.
collection_name = "zakon_o_radu"

In [13]:
# Query the collection with the selected test embedding.
# NOTE(review): with_vectors=True presumably asks for stored vectors in the result - confirm in utils.search.
response = search(collection=collection_name, query_vector=embedding, with_vectors=True)

## Updated for multiple laws

### Getting Embeddings

Initial settings

In [67]:
# Input: directory with one scraped JSON file per law.
laws_dir = Path("../scraper/laws")

# Output: the embedding run saves one results JSONL per law here.
embeddings_dir = Path("./embeddings")
embeddings_dir.mkdir(exist_ok=True)

# Staging: one JSONL file of embedding requests per law is written here.
to_process_dir = Path("./to_process")
to_process_dir.mkdir(exist_ok=True)

# Upper bound on tokens per embedded chunk, used by the token-length check.
max_num_tokens_per_chunk = 8191

# Check for the directory before listing it: iterdir() on a missing directory
# raises FileNotFoundError, which previously pre-empted the logged error.
if laws_dir.exists():
    law_paths = list(laws_dir.iterdir())
    if not law_paths:
        logger.error("No laws found in directory.")
else:
    # Keep law_paths defined so later cells iterate over nothing instead of failing.
    law_paths = []
    logger.error("No laws directory found.")

In [87]:
def load_json(path: Path) -> List[Dict]:
    """Read a UTF-8 encoded JSON file and return its parsed content.

    Args:
        path: Location of the JSON file.

    Returns:
        The deserialized JSON document.

    A missing file is reported through the logger first; opening it afterwards
    still raises ``FileNotFoundError``.
    """
    if not path.exists():
        logger.error(f"File: {path} does not exist.")
    with path.open(mode="r", encoding="utf-8") as handle:
        return json.load(handle)


def prepare_for_embedding(
    output_path: Path, scraped_data: List[Dict], embedding_model: str = EMBEDDING_MODEL
) -> None:
    """Write one embedding request per scraped article to a JSONL file.

    Args:
        output_path: Destination file; it is opened in write mode, so existing
            content is replaced.
        scraped_data: Articles, each providing a "title" string and a "texts"
            list of strings.
        embedding_model: Model name placed in every request.

    Every request's input has the form ``[<title>]: <texts joined by spaces>``.
    """
    # Build all requests before touching the file, so a malformed article
    # fails before the output is opened.
    requests = []
    for sample in scraped_data:
        title_prefix = "[" + sample["title"] + "]: "
        requests.append(
            {
                "model": embedding_model,
                "input": title_prefix + " ".join(sample["texts"]),
            }
        )

    with open(output_path, "w", encoding="utf-8") as file:
        file.writelines(json.dumps(request) + "\n" for request in requests)


def get_token_num(text: str, model_name: str = EMBEDDING_MODEL) -> int:
    """Count the tokens `text` encodes to under the tokenizer of `model_name`.

    Args:
        text: Text to tokenize.
        model_name: Model whose tiktoken encoding is used; defaults to the
            project's embedding model.

    Returns:
        Number of tokens in the encoded text.
    """
    # Honour the caller-supplied model; the argument was previously ignored in
    # favour of the module-level default.
    enc = tiktoken.encoding_for_model(model_name)
    return len(enc.encode(text))


def run_api_request_processor(
    requests_filepath: Path,
    save_path: Path,
    max_requests_per_minute: int = 2500,
    max_tokens_per_minute: int = 900000,
    token_encoding_name: str = "cl100k_base",
    max_attempts: int = 5,
    logging_level: int = 20,
) -> None:
    """Run ``api_request_parallel_processor.py`` against the OpenAI embeddings endpoint.

    Args:
        requests_filepath: JSONL file with one request per line; must exist.
        save_path: Destination for the results; must have a ``.jsonl`` suffix.
        max_requests_per_minute: Forwarded as ``--max_requests_per_minute``.
        max_tokens_per_minute: Forwarded as ``--max_tokens_per_minute``.
        token_encoding_name: Forwarded as ``--token_encoding_name``.
        max_attempts: Forwarded as ``--max_attempts``.
        logging_level: Forwarded as ``--logging_level``.

    The outcome is reported through the logger only; nothing is returned or
    raised for a failed run. If either path check fails, the error is logged
    and the subprocess is not started.
    """
    if not requests_filepath.exists():
        logger.error(f"File {requests_filepath} does not exist.")
        return
    if save_path.suffix != ".jsonl":
        logger.error(f"Save path {save_path} must be JSONL.")
        return

    command = [
        "python",
        "api_request_parallel_processor.py",
        "--requests_filepath",
        # Plain strings, like every other argv entry.
        str(requests_filepath),
        "--save_filepath",
        str(save_path),
        "--request_url",
        "https://api.openai.com/v1/embeddings",
        "--max_requests_per_minute",
        str(max_requests_per_minute),
        "--max_tokens_per_minute",
        str(max_tokens_per_minute),
        "--token_encoding_name",
        token_encoding_name,
        "--max_attempts",
        str(max_attempts),
        "--logging_level",
        str(logging_level),
    ]
    # Capture output so stderr can be logged on failure.
    result = subprocess.run(command, text=True, capture_output=True)

    if result.returncode == 0:
        logger.info("Embedding executed successfully.")
        logger.info(f"Embeddings saved to: {save_path}")
    else:
        logger.error("Error in Embedding execution!")
        # loguru formats positional arguments into "{}" placeholders; without
        # a placeholder the stderr text was dropped from the log.
        logger.error("Error: {}", result.stderr)

Check number of tokens per chunk. <br>
⚠️ Integrate this into processing.

In [70]:
# Report every article whose embedding input exceeds the per-chunk token limit.
for file_path in tqdm(law_paths, desc="Checking tokens length", total=len(law_paths)):
    scraped_data = load_json(path=file_path)

    for i, element in enumerate(scraped_data):
        # Measure the same string prepare_for_embedding() sends to the API,
        # "[<title>]: <texts>". Counting only the joined texts under-reports
        # the length by the title prefix.
        full_text = "[" + element["title"] + "]: " + " ".join(element["texts"])
        num_tokens = get_token_num(text=full_text)
        if num_tokens > max_num_tokens_per_chunk:
            print(i, element)

Checking tokens length:   0%|          | 0/5 [00:00<?, ?it/s]

In [88]:
# For every law: stage its embedding requests as JSONL, then run the request
# processor, which saves the results under the same file name in embeddings_dir.
for file_path in tqdm(law_paths, desc="Embedding scraped laws", total=len(law_paths)):
    requests_filepath = to_process_dir / f"{file_path.stem}.jsonl"
    processed_filepath = embeddings_dir / requests_filepath.name

    scraped_data = load_json(path=file_path)
    prepare_for_embedding(output_path=requests_filepath, scraped_data=scraped_data)

    run_api_request_processor(
        requests_filepath=requests_filepath,
        save_path=processed_filepath,
    )

Embedding scraped laws:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2024-04-21 22:23:46.169[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_api_request_processor[0m:[36m67[0m - [1mEmbedding executed successfully.[0m
[32m2024-04-21 22:23:46.171[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_api_request_processor[0m:[36m68[0m - [1mEmbeddings saved to: embeddings/zakon-o-porezu-na-dohodak-gradjana.jsonl[0m
[32m2024-04-21 22:23:47.885[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_api_request_processor[0m:[36m67[0m - [1mEmbedding executed successfully.[0m
[32m2024-04-21 22:23:47.886[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_api_request_processor[0m:[36m68[0m - [1mEmbeddings saved to: embeddings/zakon_o_zastiti_potrosaca.jsonl[0m
[32m2024-04-21 22:23:49.781[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_api_request_processor[0m:[36m67[0m - [1mEmbedding executed successfully.[0m
[32m2024-04-21 22:23:49.782[0m | [1mINFO    [0m | [36m__main__[0m:[36mrun_api_request_processor[0m:[36m68[0m 

### Creating vector database

In [71]:
# Directory holding one results JSONL file per embedded law.
embeddings_dir = Path("./embeddings")

# Check for the directory before listing it: iterdir() on a missing directory
# raises FileNotFoundError, which previously pre-empted the logged error.
if embeddings_dir.exists():
    embedding_paths = list(embeddings_dir.iterdir())
    if not embedding_paths:
        logger.error("No embedding files found in directory.")
else:
    # Keep embedding_paths defined so the following loop iterates over nothing.
    embedding_paths = []
    logger.error("No embeddings directory found.")

In [94]:
def load_embeddings(path: Path) -> List[Dict]:
    """Read a UTF-8 JSONL file and return one parsed JSON document per line.

    Args:
        path: Location of the JSONL file.

    Returns:
        The parsed documents in file order.

    A missing file is reported through the logger first; opening it afterwards
    still raises ``FileNotFoundError``.
    """
    if not path.exists():
        logger.error(f"File: {path} does not exist.")

    with path.open(mode="r", encoding="utf-8") as handle:
        return [json.loads(row) for row in handle]


def get_embedings_article_lookup(embedded_data: List[Dict]) -> Dict:
    """Index embedding results by article title.

    Args:
        embedded_data: Result records. Element 0 of each record holds the
            request (with its "input" text) and element 1 the response, whose
            first "data" entry carries the embedding.

    Returns:
        Mapping of title -> {"embedding": ..., "text": <full request input>}.
        If a title occurs more than once, the last record wins.
    """
    lookup = {}
    for record in embedded_data:
        full_input = record[0]["input"]
        # The input looks like "[<title>]: <text>": cut at the first "]: " and
        # drop the leading "[".
        title = full_input.partition("]: ")[0][1:]
        lookup[title] = {
            "embedding": record[1]["data"][0]["embedding"],
            "text": full_input,
        }

    return lookup


def get_data_points(raw_data: List[Dict], embeddings_lookup: Dict) -> List[PointStruct]:
    """Create one ``PointStruct`` per scraped article that has an embedding.

    Args:
        raw_data: Scraped articles, each with "title" and "link" keys.
        embeddings_lookup: Mapping of title -> {"embedding": ..., "text": ...}.

    Returns:
        Points whose id is the article's position in ``raw_data``. Articles
        without an embedding are logged as warnings and skipped, so ids may
        have gaps.
    """
    points = []

    for position, article in enumerate(raw_data):
        title = article["title"]
        link = article["link"]

        if title not in embeddings_lookup:
            logger.warning(
                f"Warning: No embedding found for title '{title}'. This item will be skipped."
            )
            continue

        entry = embeddings_lookup[title]
        points.append(
            PointStruct(
                id=position,
                vector=entry["embedding"],
                payload={"title": title, "text": entry["text"], "link": link},
            )
        )

    return points

In [95]:
# Create one collection per embedded law and fill it with that law's points.
for file_path in tqdm(
    embedding_paths,
    desc="Creating vector database collections",
    total=len(embedding_paths),
):
    embedded_data = load_embeddings(path=file_path)
    embeddings_lookup = get_embedings_article_lookup(embedded_data)

    # The scraped source shares the embedding file's name, with a .json suffix.
    raw_data_path = laws_dir / file_path.with_suffix(".json").name
    raw_data = load_json(path=raw_data_path)

    points = get_data_points(raw_data=raw_data, embeddings_lookup=embeddings_lookup)

    # Collection names use underscores in place of the hyphens in file names.
    collection_name = file_path.stem.replace("-", "_")
    create_collection(name=collection_name)
    upsert(collection=collection_name, points=points)

    # Fetch the count once and reuse it for both the check and the log line.
    points_count = get_count(collection=collection_name)
    if points_count != len(raw_data):
        logger.error(f"There are missing points in {collection_name} collection.")

    logger.info(
        f'Created "{collection_name}" collection with {points_count} data points.'
    )

Creating vector database collections:   0%|          | 0/5 [00:00<?, ?it/s]

[32m2024-04-21 22:29:29.809[0m | [1mINFO    [0m | [36mutils[0m:[36mcreate_collection[0m:[36m30[0m - [1mCreating collection: porodicni_zakon with vector size: 1536.[0m
[32m2024-04-21 22:29:33.578[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mCreated "porodicni_zakon" collection with 364 data points.[0m
[32m2024-04-21 22:29:33.658[0m | [1mINFO    [0m | [36mutils[0m:[36mcreate_collection[0m:[36m30[0m - [1mCreating collection: zakon_o_radu with vector size: 1536.[0m
[32m2024-04-21 22:29:36.784[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1mCreated "zakon_o_radu" collection with 313 data points.[0m
[32m2024-04-21 22:29:36.820[0m | [1mINFO    [0m | [36mutils[0m:[36mcreate_collection[0m:[36m30[0m - [1mCreating collection: zakon_o_zastiti_podataka_o_licnosti with vector size: 1536.[0m
[32m2024-04-21 22:29:38.111[0m | [1mINFO    [0m | [36m__main__[0m:[36m<module>[0m:[36m21[0m - [1