In [36]:
import os
import requests
import json

from dotenv import dotenv_values
import numpy as np
import weaviate
import weaviate.classes as wvc
from sentence_transformers import SentenceTransformer

In [37]:
# Read the key/value settings from the .env file one directory up into
# `config`; the Weaviate API key used when connecting is looked up in it.
config = dotenv_values("../.env")

In [38]:
# Sentence-embedding model used to encode the search query.
# NOTE(review): the stored document embeddings are presumably produced with
# this same model -- confirm, otherwise query/document similarities are not
# comparable.
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [39]:
def read_json(file):
    """Load and return the parsed contents of a JSON file.

    Args:
        file: Path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, number, string, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # JSON is UTF-8 by specification; do not rely on the platform's default
    # locale encoding, which differs between machines.
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)

In [40]:
# Connect to the locally running Weaviate instance (HTTP on 8080, gRPC on
# 50051, both without TLS), authenticating with an API key from the .env
# settings loaded into `config`.
_allowed_keys = config.get("AUTHENTICATION_APIKEY_ALLOWED_KEYS")
if not _allowed_keys:
    raise KeyError(
        "AUTHENTICATION_APIKEY_ALLOWED_KEYS is missing or empty in ../.env"
    )

# The server-side setting is a comma-separated list of accepted keys; a client
# must present exactly one of them, so use the first entry.
_api_key = _allowed_keys.split(",")[0].strip()

client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(_api_key),
)

In [41]:
# When True, drop the existing "Subject" collection so that the following
# cells recreate it and reload the embeddings from disk.
is_regenerate = True

if is_regenerate:
    client.collections.delete("Subject")

In [42]:
# Reuse the "Subject" collection when it already exists; otherwise create it.
# Ask the server about this one collection instead of listing every
# collection and scanning the names by hand.
is_subject_collection = client.collections.exists("Subject")

if is_subject_collection:
    subjects = client.collections.get("Subject")
else:
    # Vectors are supplied by this notebook, so no server-side vectorizer is
    # configured; the HNSW index compares vectors with cosine distance.
    subjects = client.collections.create(
        "Subject",
        vectorizer_config=wvc.config.Configure.Vectorizer.none(),
        vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
            distance_metric=wvc.config.VectorDistances.COSINE  # select preferred distance metric
        ),
    )

In [43]:
# The collection counts as loaded as soon as it yields one object; any()
# stops after the first item, so the rest of the collection is not iterated.
is_data_loaded = any(True for _ in subjects.iterator())

In [44]:
# Directory scanned for precomputed embeddings: every "*.json" file in it is
# loaded as one subject, with the file name providing the subject code.
document_embeddings_dir = "./document_embeddings/sentence_transformer"

In [45]:
# Populate the collection from the JSON embedding files, but only when it is
# still empty, so re-running this cell does not insert duplicates.
if not is_data_loaded:
    json_suffix = ".json"
    subject_objs = []

    # Sorted so the insertion order does not depend on the file system.
    for filename in sorted(os.listdir(document_embeddings_dir)):
        if not filename.endswith(json_suffix):
            continue

        # NOTE(review): each file is assumed to hold a single embedding vector
        # (a JSON list of floats) -- confirm against the export step.
        subject_embedding = read_json(
            os.path.join(document_embeddings_dir, filename)
        )
        # Strip only the ".json" suffix so a code containing dots stays intact.
        subject_code = filename[: -len(json_suffix)]

        subject_objs.append(
            wvc.data.DataObject(
                properties={
                    "subjectCode": subject_code,
                },
                vector=subject_embedding,
            )
        )

    # Nothing to send when the directory holds no embedding files.
    if subject_objs:
        insert_result = subjects.data.insert_many(subject_objs)

        # insert_many reports per-object failures in its return value instead
        # of raising; surface them rather than continuing with partial data.
        if insert_result.errors:
            first_errors = list(insert_result.errors.items())[:3]
            raise RuntimeError(
                f"{len(insert_result.errors)} of {len(subject_objs)} subject "
                f"embeddings failed to insert, e.g. {first_errors}"
            )

In [46]:
# Encode the free-text search query with the sentence-embedding model;
# normalize_embeddings=True requests a normalized vector from the model.
query_embedding = model.encode("Machine Learning algorithm implementation", normalize_embeddings=True)

In [47]:
# Show the encoded query vector as the cell output for a quick sanity check.
query_embedding

array([-0.10054397, -0.03073943,  0.0018512 , -0.06369264,  0.00928275,
       -0.00524856, -0.01040587, -0.01579498, -0.02429633, -0.0062831 ,
       -0.00153712,  0.06378006,  0.03238346, -0.05020822, -0.08428038,
        0.0265597 , -0.02575043,  0.00466632, -0.0280313 , -0.13789569,
       -0.0378385 , -0.0786556 ,  0.02012172, -0.00451911,  0.04885184,
       -0.00689993,  0.00508869,  0.02015312,  0.10789573, -0.04211477,
        0.10274938, -0.06954855,  0.07703526, -0.04059914, -0.06326406,
        0.00754337, -0.08215312,  0.03670678, -0.0242956 ,  0.05047772,
       -0.02577701, -0.04851747, -0.02022681,  0.02532433,  0.0278969 ,
        0.00271057, -0.06052624,  0.01481662, -0.04545474,  0.00153073,
       -0.01353212, -0.05038638,  0.00464621, -0.0889425 ,  0.02850713,
        0.0207929 ,  0.03852349,  0.00047886, -0.04809996, -0.03993021,
        0.08301346, -0.03779312,  0.01826357,  0.08036301,  0.01325889,
        0.07102425, -0.04623232, -0.07292984,  0.00275491,  0.00

In [48]:
# Nearest-neighbour search: fetch the five stored subjects closest to the
# query vector and request the certainty value for each hit.
search_vector = query_embedding.tolist()
requested_metadata = wvc.query.MetadataQuery(certainty=True)

response = subjects.query.near_vector(
    near_vector=search_vector,
    limit=5,
    return_metadata=requested_metadata,
)

print(response)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('d1e05bcb-1fc8-4e55-83ef-87973bf3330e'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.7028616666793823, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'subjectCode': '31005'}, references=None, vector={}, collection='Subject'), Object(uuid=_WeaviateUUIDInt('baeab525-fa52-4dcb-ae73-5d412777cf38'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.6897483468055725, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'subjectCode': '43024'}, references=None, vector={}, collection='Subject'), Object(uuid=_WeaviateUUIDInt('c8afb0ea-25e0-4087-bd92-36a51e67570f'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.6783087253570557, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'subjectCode': '41308'}, refe

In [49]:
# Subject codes of the returned objects, in the order the query returned them.
[match.properties["subjectCode"] for match in response.objects]

['31005', '43024', '41308', '41052', '41040']

In [50]:
# Release the connections held by the client now that the notebook is done.
# Re-run the connection cell before executing any earlier cell again.
client.close()