In [87]:
import json
import os
from pprint import pprint

import tiktoken
from config import EMBEDDING_MODEL
from dotenv import find_dotenv, load_dotenv
from qdrant_client.http.models import PointStruct
from utils import create_collection, get_collection_info, get_count, upsert, search

# Locate a .env file with find_dotenv() and load its variables into the process
# environment. The `True` shown as this cell's output is load_dotenv's return value.
# NOTE(review): presumably this supplies the credentials used by the embedding
# script and the utils helpers — confirm which variables are required.
load_dotenv(find_dotenv())

True

In [39]:
# Location of the JSON file with the labor-law articles, relative to the
# notebook's working directory. Later cells read `title`, `texts` and `link`
# from every record in it.
raw_data_path = "../scraper/srb_labor_law_data.json"

In [84]:
# Parse the whole JSON document straight from the file handle; later cells
# iterate over `raw_data` as a sequence of article records.
with open(raw_data_path, encoding="utf-8") as source_file:
    raw_data = json.load(source_file)

## Embed data

Create JSONL for parallel embedding

In [42]:
# Build one embedding request per article and store the requests as JSON Lines
# (one JSON object per line) for the parallel request script run in the next cell.
filename = "./requests_to_parallel_process.jsonl"
jobs = [
    {
        "model": EMBEDDING_MODEL,
        # Layout: "<title>. <all paragraphs joined by single spaces>".
        # A later cell relies on this layout to map results back to articles.
        "input": ". ".join([sample["title"], " ".join(sample["texts"])]),
    }
    for sample in raw_data
]
# Explicit UTF-8 so the file does not depend on the platform's default
# encoding; this matches the other open() calls in this notebook.
with open(filename, "w", encoding="utf-8") as f:
    for job in jobs:
        f.write(json.dumps(job) + "\n")

In [44]:
# Run the parallel request script on the JSONL file written by the previous cell.
# Results are saved to requests_to_parallel_process_results.jsonl, which the
# next code cell reads back.
# NOTE(review): the flag names suggest per-minute request/token limits and a
# retry cap — confirm against api_request_parallel_processor.py.
# --logging_level 20 is Python's logging.INFO, hence the INFO records below.
! python api_request_parallel_processor.py \
  --requests_filepath requests_to_parallel_process.jsonl \
  --save_filepath requests_to_parallel_process_results.jsonl \
  --request_url https://api.openai.com/v1/embeddings \
  --max_requests_per_minute 2500 \
  --max_tokens_per_minute 900000 \
  --token_encoding_name cl100k_base \
  --max_attempts 5 \
  --logging_level 20

INFO:root:Starting request #0
INFO:root:Starting request #1
INFO:root:Starting request #2
INFO:root:Starting request #3
INFO:root:Starting request #4
INFO:root:Starting request #5
INFO:root:Starting request #6
INFO:root:Starting request #7
INFO:root:Starting request #8
INFO:root:Starting request #9
INFO:root:Starting request #10
INFO:root:Starting request #11
INFO:root:Starting request #12
INFO:root:Starting request #13
INFO:root:Starting request #14
INFO:root:Starting request #15
INFO:root:Starting request #16
INFO:root:Starting request #17
INFO:root:Starting request #18
INFO:root:Starting request #19
INFO:root:Starting request #20
INFO:root:Starting request #21
INFO:root:Starting request #22
INFO:root:Starting request #23
INFO:root:Starting request #24
INFO:root:Starting request #25
INFO:root:Starting request #26
INFO:root:Starting request #27
INFO:root:Starting request #28
INFO:root:Starting request #29
INFO:root:Starting request #30
INFO:root:Starting request #31
INFO:root:Starting

Create `PointStruct` objects for the Qdrant database

In [68]:
# Read the saved results back: the file holds one JSON document per line.
embeddings_path = "./requests_to_parallel_process_results.jsonl"
with open(embeddings_path, encoding="utf-8") as results_file:
    embeddings = [json.loads(line) for line in results_file]

In [79]:
# Map each request's exact input string back to the article title it was built
# from. The input has the layout "<title>. <texts>", so splitting on ". " alone
# would truncate any title that itself contains ". " and the later
# `title in embeddings_lookup` check would then miss that article.
input_to_title = {
    ". ".join([sample["title"], " ".join(sample["texts"])]): sample["title"]
    for sample in raw_data
}

# title -> {"embedding": vector, "text": the exact string that was embedded}
embeddings_lookup = {}
for item in embeddings:
    # Each result line is a sequence: item[0] is the request, item[1] the response.
    request, response = item[0], item[1]
    text = request["input"]
    # NOTE(review): failed requests are presumably saved with a list of error
    # messages in place of the response dict — confirm against
    # api_request_parallel_processor.py. Skip them here instead of crashing; the
    # cell that builds the points warns about every title left without an embedding.
    if not isinstance(response, dict) or not response.get("data"):
        print(f"Warning: no embedding in result for input {text[:60]!r}. Skipping.")
        continue
    # Prefer the exact reverse mapping; fall back to the original split heuristic
    # for inputs that do not correspond to any record in raw_data.
    article_name = input_to_title.get(text, text.split(". ")[0])
    embedding = response["data"][0]["embedding"]
    embeddings_lookup[article_name] = {"embedding": embedding, "text": text}

In [85]:
# Build one Qdrant point per article that has an embedding. The point id is the
# article's position in raw_data; the payload keeps title, embedded text and link.
points = []

# The loop variable is `point_id`, not `id`, so the builtin id() is not shadowed
# for the rest of the kernel session.
for point_id, article in enumerate(raw_data):
    title = article["title"]
    link = article["link"]
    if title not in embeddings_lookup:
        # Articles without an embedding are reported and left out of the upload.
        print(
            f"Warning: No embedding found for title '{title}'. This item will be skipped."
        )
        continue
    entry = embeddings_lookup[title]
    embedding, text = entry["embedding"], entry["text"]
    points.append(
        PointStruct(
            id=point_id,
            vector=embedding,
            payload={"title": title, "text": text, "link": link},
        )
    )

# Create Vector database

In [88]:
# Name of the Qdrant collection used by all following cells.
collection_name = "labor_law"
# create_collection is a project helper from utils; its log output below reports
# a vector size of 1536.
# NOTE(review): behaviour when the collection already exists is not visible
# here — confirm before re-running this cell.
create_collection(name=collection_name)

2024-03-21 23:00:43.225 | INFO     | utils:create_collection:30 - Creating collection: labor_law with vector size: 1536.


True

In [89]:
# Send the points built earlier to the collection via the project's upsert helper
# (from utils). The output below shows the returned UpdateResult.
upsert(collection=collection_name, points=points)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [90]:
# Display the collection's status and configuration as a sanity check after the upload.
get_collection_info(collection=collection_name)

{'status': <CollectionStatus.GREEN: 'green'>,
 'optimizer_status': <OptimizersStatusOneOf.OK: 'ok'>,
 'vectors_count': 313,
 'indexed_vectors_count': 0,
 'points_count': 313,
 'segments_count': 2,
 'config': {'params': {'vectors': {'size': 1536,
    'distance': <Distance.COSINE: 'Cosine'>,
    'hnsw_config': None,
    'quantization_config': None,
    'on_disk': None},
   'shard_number': 1,
   'sharding_method': None,
   'replication_factor': 1,
   'write_consistency_factor': 1,
   'read_fan_out_factor': None,
   'on_disk_payload': True,
   'sparse_vectors': None},
  'hnsw_config': {'m': 16,
   'ef_construct': 100,
   'full_scan_threshold': 10000,
   'max_indexing_threads': 0,
   'on_disk': False,
   'payload_m': None},
  'optimizer_config': {'deleted_threshold': 0.2,
   'vacuum_min_vector_number': 1000,
   'default_segment_number': 0,
   'max_segment_size': None,
   'memmap_threshold': None,
   'indexing_threshold': 20000,
   'flush_interval_sec': 5,
   'max_optimization_threads': None

In [91]:
# Display the count reported by the get_count helper for the collection.
# NOTE(review): presumably this should equal len(points) — confirm.
get_count(collection=collection_name)

313