In [1]:
import os
import requests
import json

from dotenv import dotenv_values
import numpy as np
import weaviate
import weaviate.classes as wvc
from sentence_transformers import SentenceTransformer

In [2]:
config = dotenv_values("../.env")

In [3]:
model = SentenceTransformer('paraphrase-MiniLM-L6-v2')

In [4]:
# function that reads json file
def read_json(file):
    with open(file, "r") as f:
        data = json.load(f)
    return data

In [5]:
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(
        config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
    ),  # Set this environment variable
)

In [6]:
is_regenerate = True

if is_regenerate:
    client.collections.delete("Subject")

In [7]:
is_subject_collection = False
subjects = None

for collection in list(client.collections.list_all().keys()):
    if collection == "Subject":
        is_subject_collection = True

if is_subject_collection:
    subjects = client.collections.get("Subject")
else:
    subjects = client.collections.create(
        "Subject",
        vectorizer_config=wvc.config.Configure.Vectorizer.none(),
        vector_index_config=wvc.config.Configure.VectorIndex.hnsw(
            distance_metric=wvc.config.VectorDistances.COSINE  # select prefered distance metric
        ),
    )

In [8]:
is_data_loaded = False

for item in subjects.iterator():
    is_data_loaded = True
    break

In [9]:
document_embeddings_dir = "./document_embeddings/sentence_transformer"

In [10]:
subject_code_to_subject_name = {}

for subject in read_json("./subjects_cleaned/subjects.json"):
    subject_code_to_subject_name[subject["_id"]] = subject["name"]

In [11]:
if not is_data_loaded:
    subject_objs = []

    for filename in os.listdir(document_embeddings_dir):
        if filename.endswith(".json"):
            subject_embedding = read_json(f"{document_embeddings_dir}/{filename}")
            subject_code = filename.split(".")[0]

            subject_objs.append(
                wvc.data.DataObject(
                    properties={
                        "subjectCode": subject_code,
                        "name": subject_code_to_subject_name[subject_code],
                    },
                    vector=subject_embedding,
                )
            )

    print("x", subject_objs)

    subjects.data.insert_many(subject_objs)

x [DataObject(properties={'subjectCode': '48260', 'name': 'Engineering Project Management'}, uuid=None, vector=[-0.08499700576066971, 0.046006668359041214, 0.03395908698439598, -0.05179166793823242, 0.049656860530376434, -0.003609271952882409, -0.02815438248217106, 0.03709515556693077, -0.0020487301517277956, -0.02819828689098358, -0.058332089334726334, 0.008924535475671291, 0.04338737949728966, -0.0504881888628006, 0.0036994977854192257, 0.027710067108273506, 0.09105399996042252, -0.06168045103549957, 0.01865529455244541, -0.00926712155342102, 0.04827217757701874, 0.005424844566732645, -0.011844865046441555, -0.023156287148594856, 0.024063628166913986, 0.020896300673484802, -0.016024187207221985, 0.047486960887908936, 0.009153570979833603, -0.11524292081594467, -0.013625570572912693, 0.08079364150762558, 0.028514698147773743, -0.008456702344119549, 0.03784000128507614, 0.04536230117082596, -0.0035712390672415495, -0.013932154513895512, -0.05633928254246712, 0.03252393379807472, -0.032

In [12]:
query_embedding = model.encode("Machine Learning algorithm implementation", normalize_embeddings=True)

In [13]:
query_embedding

array([-0.10054397, -0.03073943,  0.0018512 , -0.06369264,  0.00928275,
       -0.00524856, -0.01040587, -0.01579498, -0.02429633, -0.0062831 ,
       -0.00153712,  0.06378006,  0.03238346, -0.05020822, -0.08428038,
        0.0265597 , -0.02575043,  0.00466632, -0.0280313 , -0.13789569,
       -0.0378385 , -0.0786556 ,  0.02012172, -0.00451911,  0.04885184,
       -0.00689993,  0.00508869,  0.02015312,  0.10789573, -0.04211477,
        0.10274938, -0.06954855,  0.07703526, -0.04059914, -0.06326406,
        0.00754337, -0.08215312,  0.03670678, -0.0242956 ,  0.05047772,
       -0.02577701, -0.04851747, -0.02022681,  0.02532433,  0.0278969 ,
        0.00271057, -0.06052624,  0.01481662, -0.04545474,  0.00153073,
       -0.01353212, -0.05038638,  0.00464621, -0.0889425 ,  0.02850713,
        0.0207929 ,  0.03852349,  0.00047886, -0.04809996, -0.03993021,
        0.08301346, -0.03779312,  0.01826357,  0.08036301,  0.01325889,
        0.07102425, -0.04623232, -0.07292984,  0.00275491,  0.00

In [14]:
response = subjects.query.near_vector(
    near_vector=query_embedding.tolist(),
    limit=5,
    return_metadata=wvc.query.MetadataQuery(certainty=True)
)

print(response)

QueryReturn(objects=[Object(uuid=_WeaviateUUIDInt('bd54d756-5e65-4b6f-9b76-31eba8d2d9f2'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.7028616666793823, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'subjectCode': '31005', 'name': 'Machine Learning'}, references=None, vector={}, collection='Subject'), Object(uuid=_WeaviateUUIDInt('5e8c1210-34e4-4ba1-89a2-60cef4b7080c'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.6897483468055725, score=None, explain_score=None, is_consistent=None, rerank_score=None), properties={'subjectCode': '43024', 'name': 'Introduction to Computational Intelligence'}, references=None, vector={}, collection='Subject'), Object(uuid=_WeaviateUUIDInt('767025bc-ecb1-4f1f-a6b6-2b37097f542f'), metadata=MetadataReturn(creation_time=None, last_update_time=None, distance=None, certainty=0.6783087253570557, score=None, explain_score=None,

In [15]:
[x.properties for x in response.objects]

[{'subjectCode': '31005', 'name': 'Machine Learning'},
 {'subjectCode': '43024',
  'name': 'Introduction to Computational Intelligence'},
 {'subjectCode': '41308',
  'name': 'Machine Learning and Industrial Data Science'},
 {'subjectCode': '41052', 'name': 'Advanced Algorithms'},
 {'subjectCode': '41040', 'name': 'Introduction to Artificial Intelligence'}]

In [17]:
client.close()  # Close client gracefully