In [1]:
import os
import json
from typing import Literal

from dotenv import dotenv_values
import pandas as pd
import plotly.express as px
import weaviate
import weaviate.classes as wvc

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


**This notebook is intended to be run after shared_year_eval, as shared_year_eval does some necessary data cleaning.**

In [2]:
# Parse the key/value pairs of the .env file one directory up into a mapping.
config = dotenv_values(dotenv_path="../.env")

In [3]:
# Connect to the Weaviate instance on localhost; neither endpoint uses TLS.
client = weaviate.connect_to_custom(
    # HTTP endpoint
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    # gRPC endpoint
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    # The API key is looked up in `config`, i.e. it must be defined in ../.env.
    # NOTE(review): if this setting ever holds several comma-separated keys,
    # a single key should be passed here instead -- confirm.
    auth_credentials=weaviate.auth.AuthApiKey(
        config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
    ),
)

In [4]:
# list_all() returns a mapping keyed by collection name; keep just the names.
collections = list(client.collections.list_all())
collections

['USYD_DOC2VEC_Subject',
 'USYD_GLOVE_Subject',
 'USYD_INSTRUCTOR_Subject',
 'USYD_SBERT_Subject',
 'UTS_DOC2VEC_Subject',
 'UTS_GLOVE_Subject',
 'UTS_INSTRUCTOR_Subject',
 'UTS_SBERT_Subject']

In [5]:
def read_json(file_name):
    """Load and return the parsed contents of a JSON file.

    Args:
        file_name: Path of the JSON file to read.

    Returns:
        The decoded JSON document (dict, list or scalar, depending on the file).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the file does not contain valid JSON.
    """
    # Decode explicitly as UTF-8 rather than relying on the platform's default
    # encoding, which is not UTF-8 everywhere (e.g. on some Windows setups).
    with open(file_name, "r", encoding="utf-8") as file:
        return json.load(file)

In [6]:
# Keyword data indexed by subject code (see the lookups in get_model_accuracy).
subject_keywords = read_json(file_name="./data/subjects/subject_keywords.json")

In [7]:
def get_similar_uts_subjects(
    uts_subject_collection,
    usyd_subject_vector: list[float],
    num_subjects: int = 5,
) -> list[str]:
    """Return the subject codes of the UTS subjects nearest to a USYD vector.

    Args:
        uts_subject_collection: Weaviate collection to search; it must expose
            ``query.near_vector`` and store a ``subjectCode`` property.
        usyd_subject_vector: Embedding of the USYD subject used as the query.
            The caller in this notebook passes a plain list of floats.
        num_subjects: Maximum number of matches to request.

    Returns:
        The ``subjectCode`` of each returned object, in the order the query
        returned them.
    """
    response = uts_subject_collection.query.near_vector(
        near_vector=usyd_subject_vector,
        limit=num_subjects,
        # Distance metadata is requested with the results but is not used below.
        return_metadata=wvc.query.MetadataQuery(distance=True),
    )

    return [obj.properties["subjectCode"] for obj in response.objects]

In [8]:
def get_model_accuracy(
    num_similar_subjects: int,
    keywords_per_subject: int = 20,
) -> tuple[dict[str, float], dict[str, list[float]]]:
    """Score each embedding model by keyword overlap between matched subjects.

    For every subject in a model's USYD collection, the ``num_similar_subjects``
    nearest UTS subjects are retrieved and the number of keywords each one
    shares with the USYD subject is divided by ``keywords_per_subject``.

    Relies on the notebook-level ``client``, ``subject_keywords`` and
    ``get_similar_uts_subjects``.

    Args:
        num_similar_subjects: How many UTS matches to retrieve per USYD subject.
        keywords_per_subject: Divisor used to normalise the shared-keyword
            count. Defaults to 20, the value previously hard-coded here
            (presumably the number of keywords stored per subject -- TODO confirm).

    Returns:
        A pair ``(model_accuracy, keyword_overlap_vals)``:
        ``model_accuracy`` maps model name to the mean normalised overlap over
        all (USYD subject, UTS match) pairs, or NaN when there were no pairs;
        ``keyword_overlap_vals`` maps model name to a list holding, for each
        USYD subject, the sum of its normalised overlaps across its matches.

    Raises:
        KeyError: If a subject code is missing from ``subject_keywords``.
    """
    model_accuracy = {}
    keyword_overlap_vals = {}

    for model in ["SBERT", "INSTRUCTOR", "GLOVE", "DOC2VEC"]:
        num_pairs = 0  # number of (USYD subject, UTS match) pairs scored
        keyword_overlap_sum = 0
        keyword_overlap_per_subject = []

        usyd_subject_collection = client.collections.get(f"USYD_{model}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{model}_Subject")

        for item in usyd_subject_collection.iterator(include_vector=True):
            subject_code = item.properties["subjectCode"]
            subject_vector = item.vector

            # The vector may arrive wrapped in a mapping; unwrap its "default" entry.
            if not isinstance(subject_vector, list):
                subject_vector = subject_vector["default"]

            similar_uts_subjects = get_similar_uts_subjects(
                uts_subject_collection, subject_vector, num_similar_subjects
            )

            # Build the USYD keyword set once instead of once per UTS match.
            usyd_keywords = set(subject_keywords[subject_code])

            keyword_overlap = 0
            for uts_subject_code in similar_uts_subjects:
                uts_keywords = subject_keywords[uts_subject_code]
                shared = usyd_keywords.intersection(uts_keywords)
                keyword_overlap += len(shared) / keywords_per_subject
                num_pairs += 1

            keyword_overlap_per_subject.append(keyword_overlap)
            keyword_overlap_sum += keyword_overlap

        # With no pairs (e.g. an empty collection) the mean is undefined; report
        # NaN rather than failing with ZeroDivisionError.
        model_accuracy[model] = (
            keyword_overlap_sum / num_pairs if num_pairs else float("nan")
        )
        keyword_overlap_vals[model] = keyword_overlap_per_subject

    return model_accuracy, keyword_overlap_vals

In [9]:
# Score every model at several neighbourhood sizes. Each call re-queries all
# collections, so this cell can take a while.
# The per-subject data of the top-2 run is not kept; it is dropped by indexing
# rather than by assigning to `_`, because `_` is IPython's last-output
# variable and rebinding it stops IPython from updating it.
major_model_accuracy_top_2 = get_model_accuracy(2)[0]
major_model_accuracy_top_5, histogram_data_top_5 = get_model_accuracy(5)
major_model_accuracy_top_10, histogram_data_top_10 = get_model_accuracy(10)
major_model_accuracy_top_20, histogram_data_top_20 = get_model_accuracy(20)

In [10]:
# Convert each accuracy from a fraction to a percentage rounded to 2 decimal places.
# NOTE: the same names are rebound, so running this cell twice scales the
# values by 100 a second time; re-run the evaluation cell first if needed.
major_model_accuracy_top_2 = {
    model: round(accuracy * 100, 2)
    for model, accuracy in major_model_accuracy_top_2.items()
}
major_model_accuracy_top_5 = {
    model: round(accuracy * 100, 2)
    for model, accuracy in major_model_accuracy_top_5.items()
}
major_model_accuracy_top_10 = {
    model: round(accuracy * 100, 2)
    for model, accuracy in major_model_accuracy_top_10.items()
}
major_model_accuracy_top_20 = {
    model: round(accuracy * 100, 2)
    for model, accuracy in major_model_accuracy_top_20.items()
}

In [11]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    {
        "Top 2": major_model_accuracy_top_2,
        "Top 5": major_model_accuracy_top_5,
        "Top 10": major_model_accuracy_top_10,
        "Top 20": major_model_accuracy_top_20,
    }
)
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10,Top 20
SBERT,12.03,10.91,9.73,8.43
INSTRUCTOR,14.16,12.01,10.78,9.32
GLOVE,7.61,7.51,7.5,7.38
DOC2VEC,6.03,5.98,6.0,6.33
