In [44]:
import sys
import json
from uuid import uuid4
import sqlite3

sys.path.append("..")

from common.aidevs_authorize import get_task, get_token, send_answer
from common.openai_requests import send_chat_completion
from common.utils import get_message, print_task

from langchain.embeddings.openai import OpenAIEmbeddings
import requests
import pandas as pd

In [6]:
# Source archive to index, and the name used both for the Qdrant collection
# and for the `collection_name` column of the local SQLite table.
URL = "https://unknow.news/archiwum.json"
COLLECTION_NAME = "C03E04"

# A timeout keeps the cell from hanging on a stalled connection, and
# raise_for_status() surfaces HTTP errors here rather than as a confusing
# JSON decoding failure on an error page.
resp = requests.get(URL, timeout=30)
resp.raise_for_status()

# Response.json() parses the body directly instead of going through the
# text-encoding guess that `resp.text` depends on.
json_to_upload = resp.json()

In [54]:
# Create database
#
# Load the downloaded records into a DataFrame, tag every row with a fresh
# UUID and the collection name, and persist the result to a local SQLite file.

df = pd.DataFrame(json_to_upload)

# One random UUID string per row. A plain list comprehension avoids the
# row-wise apply() machinery (the row itself was never used) and behaves
# predictably when the frame is empty.
df["uuid"] = [str(uuid4()) for _ in range(len(df))]
df["collection_name"] = COLLECTION_NAME

# `con` is deliberately left open: later cells read from this connection.
con = sqlite3.connect("c03e04.db")
# if_exists="replace" recreates the `embeddings` table on every run.
df.to_sql("embeddings", con, if_exists="replace", index=False)

6153

In [61]:
# Populate database
#
# Build one point (id, payload, vector) per row read back from SQLite.
# The embedded text is the row's URL; the payload carries the row's uuid,
# its URL (under "content") and the collection name (under "source").

embeddings = OpenAIEmbeddings()

local_database = pd.read_sql_query("select * from embeddings", con)

# Only the first 300 rows are embedded.
rows_to_embed = local_database[:300]
urls = rows_to_embed["url"].tolist()

# Embed every URL with a single embed_documents() call instead of one call
# per row; the result holds one vector per input text, in input order.
# The guard skips the call entirely when there is nothing to embed.
url_vectors = embeddings.embed_documents(urls) if urls else []

points = []
for row, vector in zip(rows_to_embed.itertuples(), url_vectors):
    metadata = {"uuid": row.uuid, "content": row.url, "source": row.collection_name}
    points.append(
        {
            "id": metadata["uuid"],
            "payload": metadata,
            "vector": vector,
        }
    )

In [63]:
# Prepare embeddings for batch upsert
#
# Split the list of point dicts into three parallel tuples (ids, vectors,
# payloads). Building each one explicitly also works when `points` is empty,
# whereas unpacking `zip(*...)` into three names raises ValueError then.

ids = tuple(point["id"] for point in points)
vectors = tuple(point["vector"] for point in points)
payloads = tuple(point["payload"] for point in points)

In [46]:
# CREATE COLLECTION IN QDRANT
#
# Connect to the Qdrant instance on localhost:6333 and create the collection
# only when it does not exist yet, so the cell can be re-run safely.

from qdrant_client import QdrantClient
from qdrant_client.http import models


client = QdrantClient("localhost", port=6333)
result = client.get_collections()

# Names of the collections already present on the server.
existing_collections = {collection.name for collection in result.collections}

collection_created = False
if COLLECTION_NAME not in existing_collections:
    collection_created = client.create_collection(
        collection_name=COLLECTION_NAME,
        # size=1536 has to match the length of the vectors produced by the
        # embedding model used in this notebook.
        vectors_config=models.VectorParams(
            size=1536, distance=models.Distance.COSINE, on_disk=True
        ),
    )

# Displayed as the cell result: the create_collection() return value, or
# False when the collection already existed and nothing was created.
collection_created

True

In [64]:
# Index
#
# Bundle the parallel id / payload / vector sequences into one Batch and
# upload it to the collection; the upsert result is shown as the cell output.

batch = models.Batch(ids=ids, payloads=payloads, vectors=vectors)
client.upsert(COLLECTION_NAME, points=batch)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [75]:
# Get AIDEVS task
#
# Obtain a token for the "search" task, fetch the task with it, and print the
# task. `token` is reused later when the answer is submitted, and
# `task["question"]` is the text that gets embedded for the vector search.
# NOTE(review): get_token / get_task / print_task are project helpers whose
# implementation is not visible here — confirm their exact behavior there.

token = get_token("search")
task = get_task(token)
print_task(task)

code
0
******************************************************************************************************************************************************
msg
Index all data from provided URL into vecto store and provide answer to my question - https://unknow.news/archiwum.json
******************************************************************************************************************************************************
question
Co różni pseudonimizację od anonimizowania danych?
******************************************************************************************************************************************************


In [65]:
# Create embedding for question provided in task
#
# The question text is embedded with the same `embeddings` object that
# produced the document vectors; the resulting vector is used as the query
# vector for the Qdrant search.

task_embedding = embeddings.embed_query(task["question"])

In [66]:
# Search for embedded question
#
# Query the collection with the question vector and print the hits.

# Only points whose payload field "source" equals the collection name match.
source_condition = models.FieldCondition(
    key="source",
    match=models.MatchValue(value=COLLECTION_NAME),
)
query_filter = models.Filter(must=[source_condition])

# limit=1: only the single best-scoring point is requested.
search = client.search(
    COLLECTION_NAME,
    query_vector=task_embedding,
    query_filter=query_filter,
    limit=1,
)
print(search)

[ScoredPoint(id='6d042041-1683-4c2a-94cd-d509874182c4', version=0, score=0.8953796, payload={'content': 'https://www.internet-czas-dzialac.pl/pseudonimizacja-a-anonimizacja/', 'source': 'C03E04', 'uuid': '6d042041-1683-4c2a-94cd-d509874182c4'}, vector=None)]


In [74]:
# Look up the URL stored in SQLite for the top search hit, matching on the
# uuid carried in the hit's payload.
cur = con.cursor()
# The uuid is passed as a bound parameter rather than interpolated into the
# SQL text, so quoting inside the value cannot alter the statement.
cur.execute(
    "select url from embeddings where uuid = ?",
    (search[0].payload["uuid"],),
)
# fetchone() yields a one-element row tuple, or None when no row matches.
res = cur.fetchone()
res

('https://www.internet-czas-dzialac.pl/pseudonimizacja-a-anonimizacja/',)

In [76]:
# VOILA
#
# Submit the answer for the task identified by `token`. `res` is the row
# returned by fetchone() for a single-column select, so res[0] is the URL.
# The helper's return value is displayed as the cell output.

send_answer(token, answer=res[0])

'{\n    "code": 0,\n    "msg": "OK",\n    "note": "CORRECT"\n}'