In [27]:
import json
import os
from decimal import ROUND_HALF_UP, Decimal
from typing import Literal, Optional

from dotenv import dotenv_values
import pandas as pd
import plotly.express as px
import weaviate
import weaviate.classes as wvc

**This notebook is intended to be run after shared_year_eval, as shared_year_eval does some necessary data cleaning.**

In [28]:
# Load the key/value pairs from the .env file one directory up. `config` is
# indexed by variable name when the Weaviate client is created.
config = dotenv_values("../.env")

In [29]:
# Endpoints of the Weaviate instance on localhost (HTTP and gRPC, both without TLS).
weaviate_endpoints = dict(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
)

client = weaviate.connect_to_custom(
    **weaviate_endpoints,
    # The API key is the AUTHENTICATION_APIKEY_ALLOWED_KEYS entry of `config`,
    # i.e. it must be present in the .env file that `config` was loaded from.
    auth_credentials=weaviate.auth.AuthApiKey(
        config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
    ),
)

  client = weaviate.connect_to_custom(


In [30]:
# Names of all collections currently defined on the connected Weaviate instance.
collections = list(client.collections.list_all())
collections

['USYD_GLOVE_Subject',
 'UTS_INSTRUCTOR_Subject',
 'USYD_MXBAI_Subject',
 'UTS_DOC2VEC_Subject',
 'UTS_GLOVE_Subject',
 'USYD_DOC2VEC_Subject',
 'UTS_MXBAI_Subject',
 'UTS_SBERT_Subject',
 'USYD_SBERT_Subject',
 'USYD_INSTRUCTOR_Subject']

In [31]:
def read_json(file_name):
    """Load a JSON file and return its parsed contents.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The decoded JSON document (dict, list, str, number, bool or None).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Read as UTF-8 explicitly so the result does not depend on the platform's
    # default text encoding.
    with open(file_name, "r", encoding="utf-8") as file:
        return json.load(file)

In [32]:
# Timeline file for the UTS degree; get_subject_year looks subjects up in the
# mapping loaded from it.
UTS_DEGREE_TIMELINE_PATH = "./data/uts_degree_timeline/C09119v2 Bachelor of Computing Science (Honours).json"
uts_subject_years = read_json(UTS_DEGREE_TIMELINE_PATH)

In [33]:
def get_subject_year(
    subject_code: str,
    subject_years: Optional[dict] = None,
) -> Optional[int]:
    """Look up the year recorded for a subject in a degree timeline.

    The timeline is scanned in order and the first key that *contains*
    ``subject_code`` as a substring is used.

    Args:
        subject_code: Subject code to search for within the timeline keys.
        subject_years: Mapping from subject name to a list of entries, each of
            which has a ``"year"`` field. Defaults to the notebook-level
            ``uts_subject_years``.

    Returns:
        The ``"year"`` of the first entry of the first matching key, converted
        to ``int``, or ``None`` when no key contains ``subject_code``.
    """
    if subject_years is None:
        subject_years = uts_subject_years

    for subject_name in subject_years:
        # Substring match: presumably the keys embed the subject code alongside
        # the subject title — TODO confirm against the timeline file.
        if subject_code in subject_name:
            return int(subject_years[subject_name][0]["year"])

    # Make the "not found" outcome explicit instead of falling off the end.
    return None

In [34]:
def get_similar_uts_subjects(
    uts_subject_collection,
    usyd_subject_vector: dict[str, list[float]],
    num_subjects: int = 5,
):
    """Return the subject codes of the UTS subjects closest to a given vector.

    Args:
        uts_subject_collection: Collection object whose ``query.near_vector``
            is used for the search.
        usyd_subject_vector: Query vector passed straight to ``near_vector``.
            NOTE(review): annotated as a dict, but the caller in this notebook
            passes a plain list — confirm which is intended.
        num_subjects: Maximum number of results to request.

    Returns:
        A list with the ``"subjectCode"`` property of each returned object, in
        the order the query returned them.
    """
    distance_metadata = wvc.query.MetadataQuery(distance=True)
    nearest = uts_subject_collection.query.near_vector(
        near_vector=usyd_subject_vector,
        limit=num_subjects,
        return_metadata=distance_metadata,
    )

    subject_codes = []
    for match in nearest.objects:
        subject_codes.append(match.properties["subjectCode"])
    return subject_codes

In [35]:
# Sanity check: the character at index 4 of a code such as "DATA1001" is the
# first digit of its numeric part. get_model_accuracy reads this digit from
# each USYD subject code.
"DATA1001"[4]

'1'

In [36]:
def proper_round(num, dec=0):
    """Round ``num`` to ``dec`` decimal places, with ties going away from zero.

    Unlike the built-in ``round`` (which rounds ties to the nearest even
    digit), ``proper_round(2.5)`` is 3 and ``proper_round(0.5)`` is 1.

    The value is rounded from its ``str()`` representation, so the digits
    that are rounded are the ones a reader sees when the number is printed.

    Args:
        num: Number to round (int, float, or anything whose ``str()`` is a
            valid decimal literal).
        dec: Number of decimal places to keep. Defaults to 0.

    Returns:
        An ``int`` when ``dec`` is 0 or negative, otherwise a ``float``.
    """
    # 10 ** -dec as an exact Decimal; its exponent tells quantize() where to round.
    quantum = Decimal(1).scaleb(-dec)
    rounded = Decimal(str(num)).quantize(quantum, rounding=ROUND_HALF_UP)
    return int(rounded) if dec <= 0 else float(rounded)

In [37]:
def _usyd_subject_year(subject_code):
    """Estimate a USYD subject's degree year from the digit at index 4 of its code."""
    # Index 4 is the first digit after the prefix ("DATA1001"[4] == "1").
    # The 4/5 factor rescales that digit before rounding — presumably mapping
    # USYD levels onto the years of the UTS degree; TODO confirm.
    return proper_round(int(subject_code[4]) * (4 / 5))


def get_model_accuracy(
    num_similar_subjects,
    models=("SBERT", "INSTRUCTOR", "GLOVE", "DOC2VEC", "MXBAI"),
):
    """Measure how often each model's nearest UTS subjects share the USYD subject's year.

    For every subject in ``USYD_<model>_Subject`` the ``num_similar_subjects``
    nearest subjects from ``UTS_<model>_Subject`` are retrieved, and each
    retrieved subject counts as a hit when its timeline year equals the year
    estimated from the USYD subject code.

    Uses the notebook-level ``client`` to access the collections.

    Args:
        num_similar_subjects: Number of nearest UTS subjects to retrieve per
            USYD subject.
        models: Model names used to build the collection names. Defaults to
            the five models evaluated in this notebook.

    Returns:
        A tuple ``(model_accuracy, share_year_vals)`` where
        ``model_accuracy[model]`` is hits divided by the number of
        (USYD subject, retrieved UTS subject) pairs — ``nan`` when there were
        no pairs — and ``share_year_vals[model]`` is the list of hit counts,
        one per USYD subject.

    Raises:
        LookupError: If a retrieved UTS subject code has no year in the
            degree timeline.
    """
    model_accuracy = {}
    share_year_vals = {}

    for model in models:
        usyd_subject_collection = client.collections.get(f"USYD_{model}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{model}_Subject")

        num_pairs = 0
        num_share_year_per_subject = []

        for item in usyd_subject_collection.iterator(include_vector=True):
            subject_code = item.properties["subjectCode"]
            subject_vector = item.vector

            # The vector may arrive wrapped in a mapping keyed by vector name;
            # unwrap the "default" entry in that case.
            if not isinstance(subject_vector, list):
                subject_vector = subject_vector["default"]

            similar_uts_subjects = get_similar_uts_subjects(
                uts_subject_collection, subject_vector, num_similar_subjects
            )

            # Depends only on the USYD subject, so compute it once per subject
            # rather than once per retrieved UTS subject.
            usyd_subject_year = _usyd_subject_year(subject_code)

            current_num_share_year = 0
            for uts_subject_code in similar_uts_subjects:
                uts_subject_year = get_subject_year(uts_subject_code)
                if uts_subject_year is None:
                    raise LookupError(
                        f"No year found in the UTS degree timeline for subject "
                        f"{uts_subject_code!r} (model {model})"
                    )
                if uts_subject_year == usyd_subject_year:
                    current_num_share_year += 1

            num_pairs += len(similar_uts_subjects)
            num_share_year_per_subject.append(current_num_share_year)

        num_share_year = sum(num_share_year_per_subject)
        # An empty collection yields no pairs; report nan instead of dividing by zero.
        model_accuracy[model] = (
            num_share_year / num_pairs if num_pairs else float("nan")
        )
        share_year_vals[model] = num_share_year_per_subject

    return model_accuracy, share_year_vals

In [38]:
# Evaluate every model with the top 2, 5, 10 and 20 nearest UTS subjects.
# Each call returns (accuracy per model, per-subject same-year counts per model);
# the per-subject counts for the top-2 run are discarded.
major_model_accuracy_top_2, _ = get_model_accuracy(2)
major_model_accuracy_top_5, histogram_data_top_5 = get_model_accuracy(5)
major_model_accuracy_top_10, histogram_data_top_10 = get_model_accuracy(10)
major_model_accuracy_top_20, histogram_data_top_20 = get_model_accuracy(20)

In [39]:
# Convert each accuracy from a fraction to a percentage rounded to 2 decimal places.
# NOTE: these assignments overwrite their own inputs, so re-running this cell
# multiplies the values by 100 again. Re-run the cell that calls
# get_model_accuracy before re-running this one.
major_model_accuracy_top_2 = {k: round(v * 100, 2) for k, v in major_model_accuracy_top_2.items()}
major_model_accuracy_top_5 = {k: round(v * 100, 2) for k, v in major_model_accuracy_top_5.items()}
major_model_accuracy_top_10 = {k: round(v * 100, 2) for k, v in major_model_accuracy_top_10.items()}
major_model_accuracy_top_20 = {k: round(v * 100, 2) for k, v in major_model_accuracy_top_20.items()}

In [40]:
# Tabulate the accuracy results: one row per model, one column per top-k cutoff.
major_model_accuracy = pd.DataFrame.from_dict(
    {
        "Top 2": major_model_accuracy_top_2,
        "Top 5": major_model_accuracy_top_5,
        "Top 10": major_model_accuracy_top_10,
        "Top 20": major_model_accuracy_top_20,
    },
    orient="columns",
)
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10,Top 20
SBERT,27.98,30.45,27.98,27.35
INSTRUCTOR,27.78,26.34,25.43,25.08
GLOVE,25.93,25.93,27.2,27.41
DOC2VEC,27.37,27.82,26.54,26.15
MXBAI,32.1,30.12,29.3,26.44
