In [15]:
import sys
import json
from uuid import uuid4

sys.path.append("..")

from common.aidevs_authorize import get_task, get_token, send_answer
from common.openai_requests import send_chat_completion
from common.utils import get_message, print_task

from langchain.embeddings.openai import OpenAIEmbeddings
import requests

In [6]:
URL = "https://unknow.news/archiwum.json"

resp = requests.get(URL)
json_to_upload = json.loads(resp.text)

In [9]:
len(json_to_upload)

6153

In [7]:
json_to_upload[0]

{'title': 'Niesamowite "Roboty" sprzed setek lat - jak to działało? (film, 1h)',
 'url': 'https://www.youtube.com/watch?v=6Nt7xLAfEPs',
 'info': "INFO: Z pewnością znasz figurki poruszające się, na przykład w dawnych szopkach bożonarodzeniowych. A może uczył Ci się 'Mechaniczny Turek', który ogrywał wszystkich w szachy? Są to urządzenia sprzed setek lat. Z filmu dowiesz się, co wprawiało te mechanizmy w ruch.",
 'date': '2023-11-10'}

In [30]:
# CREATE COLLECTION IN QDRANT

from qdrant_client import QdrantClient
from qdrant_client.http import models

COLLECTION_NAME = "C03E04"

client = QdrantClient("localhost", port=6333)
result = client.get_collections()

client.create_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=models.VectorParams(
        size=1536, distance=models.Distance.COSINE, on_disk=True
    ),
)

True

In [27]:
# Create embeddings

embeddings = OpenAIEmbeddings()
points = []

for el in json_to_upload[:300]:
    metadata = {"uuid": str(uuid4()), "content": el["url"], "source": COLLECTION_NAME}
    points.append(
        {
            "id": metadata["uuid"],
            "payload": metadata,
            "vector": embeddings.embed_documents([el["url"]])[0],
        }
    )

In [31]:
# Prepare embeddings for batch upsert
ids, vectors, payloads = zip(
    *((point["id"], point["vector"], point["payload"]) for point in points)
)

# Index
client.upsert(
    COLLECTION_NAME,
    points=models.Batch(ids=ids, payloads=payloads, vectors=vectors),
)

UpdateResult(operation_id=0, status=<UpdateStatus.COMPLETED: 'completed'>)

In [32]:
# Get AIDEVS task 

token = get_token("search")
task = get_task(token)
print_task(task)

code
0
******************************************************************************************************************************************************
msg
Index all data from provided URL into vecto store and provide answer to my question - https://unknow.news/archiwum.json
******************************************************************************************************************************************************
question
Co różni pseudonimizację od anonimizowania danych?
******************************************************************************************************************************************************


In [33]:
# Create embedding for question provided in task

task_embedding = embeddings.embed_query(task["question"])

In [34]:
# Search for embedded question

query_filter = models.Filter(
    must=[
        models.FieldCondition(
            key="source",
            match=models.MatchValue(value=COLLECTION_NAME),
        )
    ]
)

search = client.search(
    COLLECTION_NAME, query_vector=task_embedding, limit=1, query_filter=query_filter
)
print(search)

[ScoredPoint(id='594dcccd-b31f-454a-a7e0-b87382313c46', version=0, score=0.8953796, payload={'content': 'https://www.internet-czas-dzialac.pl/pseudonimizacja-a-anonimizacja/', 'source': 'C03E04', 'uuid': '594dcccd-b31f-454a-a7e0-b87382313c46'}, vector=None)]


In [38]:
# VOILA 

send_answer(token, answer=search[0].payload["content"])

'{\n    "code": 0,\n    "msg": "OK",\n    "note": "CORRECT"\n}'