In [21]:
import os
import json
from typing import Literal

from dotenv import dotenv_values
import pandas as pd
from bs4 import BeautifulSoup
import plotly.express as px
import weaviate
import weaviate.classes as wvc

In [22]:
# Parse the project-level .env file into a plain dict; the process environment is not modified.
config = dotenv_values(dotenv_path="../.env")

In [23]:
# The API key is read from the ../.env file loaded into `config`, not from the
# process environment. AUTHENTICATION_APIKEY_ALLOWED_KEYS is the server-side
# setting and may hold several comma-separated keys, so authenticate with the
# first entry instead of sending the whole list as a single key.
allowed_api_keys = config.get("AUTHENTICATION_APIKEY_ALLOWED_KEYS") or ""
weaviate_api_key = allowed_api_keys.split(",")[0].strip()
if not weaviate_api_key:
    raise KeyError(
        "AUTHENTICATION_APIKEY_ALLOWED_KEYS is missing or empty in ../.env"
    )

# Plain HTTP/gRPC connection to a local Weaviate instance.
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(weaviate_api_key),
)

  client = weaviate.connect_to_custom(


In [24]:
# Names of every collection currently defined on the connected instance.
collections = [collection_name for collection_name in client.collections.list_all()]
collections

['USYD_GLOVE_Subject',
 'UTS_INSTRUCTOR_Subject',
 'USYD_MXBAI_Subject',
 'USYD_SBERT_Subject',
 'UTS_DOC2VEC_Subject',
 'UTS_GLOVE_Subject',
 'USYD_DOC2VEC_Subject',
 'UTS_MXBAI_Subject',
 'UTS_SBERT_Subject',
 'USYD_INSTRUCTOR_Subject']

In [25]:
def get_similar_uts_subjects(
    uts_subject_collection,
    usyd_subject_vector: dict[str, list[float]],
    num_subjects: int = 5,
):
    """Return the properties of the UTS subjects nearest to a USYD subject vector.

    Args:
        uts_subject_collection: Collection object exposing ``query.near_vector``.
        usyd_subject_vector: Query vector, forwarded unchanged as ``near_vector``.
        num_subjects: Maximum number of neighbours to request (passed as ``limit``).

    Returns:
        A list holding the ``properties`` of each object in the response, in the
        order the response lists them.
    """
    # Distance metadata is requested from the server, but only the object
    # properties are handed back to the caller.
    metadata_request = wvc.query.MetadataQuery(distance=True)
    result = uts_subject_collection.query.near_vector(
        near_vector=usyd_subject_vector,
        limit=num_subjects,
        return_metadata=metadata_request,
    )

    neighbour_properties = []
    for neighbour in result.objects:
        neighbour_properties.append(neighbour.properties)
    return neighbour_properties

In [26]:
# Maps each UTS degree name to the USYD degree name it is compared against.
uts_to_usyd_degree = {
    "Bachelor of Computing Science (Honours)": "Bachelor of Advanced Computing",
    "Bachelor of Engineering (Honours)": "Bachelor of Engineering (Honours)",
    "Bachelor of Nursing": "Bachelor of Nursing (Advanced Studies)",
}

In [27]:
def is_share_degree(uts_degrees, usyd_degrees, degree_mapping=None):
    """Tell whether any UTS degree maps onto one of the given USYD degrees.

    Args:
        uts_degrees: Iterable of UTS degree names.
        usyd_degrees: Container of USYD degree names to match against.
        degree_mapping: Optional UTS-name -> USYD-name mapping. When omitted,
            the notebook-level ``uts_to_usyd_degree`` dict is used.

    Returns:
        True if at least one UTS degree has a mapped USYD degree that appears
        in ``usyd_degrees``; False otherwise, including when ``uts_degrees``
        is empty.
    """
    if degree_mapping is None:
        degree_mapping = uts_to_usyd_degree

    # A UTS degree without an entry in the mapping has no USYD counterpart, so
    # it cannot be shared; skip it instead of raising KeyError and aborting the
    # whole evaluation run.
    return any(
        uts_degree in degree_mapping
        and degree_mapping[uts_degree] in usyd_degrees
        for uts_degree in uts_degrees
    )

In [28]:
def get_model_accuracy(
    num_similar_subjects: int,
    embedding_types: tuple[str, ...] = ("MXBAI", "INSTRUCTOR", "SBERT", "GLOVE", "DOC2VEC"),
):
    """Score each embedding model by how often retrieved neighbours share a degree.

    For every subject in the ``USYD_<type>_Subject`` collection, the
    ``num_similar_subjects`` nearest subjects are fetched from the matching
    ``UTS_<type>_Subject`` collection. The score is the fraction of all
    (USYD subject, retrieved UTS subject) pairs for which ``is_share_degree``
    returns True.

    Args:
        num_similar_subjects: Number of UTS neighbours requested per USYD subject.
        embedding_types: Embedding identifiers used to build the collection
            names. Defaults to the five models evaluated in this notebook.

    Returns:
        Dict mapping each embedding type to its score in [0, 1], or to NaN when
        no pairs were compared (for example, an empty collection).
    """
    embedding_accuracy = {}

    for embedding_type in embedding_types:
        usyd_subject_collection = client.collections.get(f"USYD_{embedding_type}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{embedding_type}_Subject")

        # Counts (USYD subject, UTS neighbour) pairs, not USYD subjects.
        num_pairs = 0
        num_share_degree = 0

        for item in usyd_subject_collection.iterator(include_vector=True):
            degrees = item.properties["degrees"]
            subject_vector = item.vector

            # The vector may arrive wrapped in a mapping keyed by vector name;
            # unwrap the "default" entry so a plain list is used for the query.
            if not isinstance(subject_vector, list):
                subject_vector = subject_vector["default"]

            similar_subjects = get_similar_uts_subjects(
                uts_subject_collection, subject_vector, num_similar_subjects
            )

            num_pairs += len(similar_subjects)
            num_share_degree += sum(
                1
                for similar_subject in similar_subjects
                if is_share_degree(similar_subject["degrees"], degrees)
            )

        # Guard against ZeroDivisionError when nothing was compared.
        embedding_accuracy[embedding_type] = (
            num_share_degree / num_pairs if num_pairs else float("nan")
        )

    return embedding_accuracy

In [29]:
# Evaluate every embedding model at three neighbourhood sizes (k = 2, 5, 10).
model_accuracy_top_2, model_accuracy_top_5, model_accuracy_top_10 = (
    get_model_accuracy(top_k) for top_k in (2, 5, 10)
)

In [30]:
# Convert each accuracy fraction to a percentage rounded to 2 decimal places.
# NOTE: the names are rebound to the converted dicts, so running this cell a
# second time without first recomputing the accuracies scales them by 100 again.
model_accuracy_top_2, model_accuracy_top_5, model_accuracy_top_10 = (
    {model: round(fraction * 100, 2) for model, fraction in scores.items()}
    for scores in (model_accuracy_top_2, model_accuracy_top_5, model_accuracy_top_10)
)

In [31]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    {
        "Top 2": model_accuracy_top_2,
        "Top 5": model_accuracy_top_5,
        "Top 10": model_accuracy_top_10,
    }
)
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,93.54,94.51,93.54
INSTRUCTOR,92.99,92.36,91.73
SBERT,88.74,88.63,87.72
GLOVE,74.31,71.04,68.08
DOC2VEC,21.98,23.41,23.16
