In [86]:
import os
import json
from typing import Literal

from dotenv import dotenv_values
import pandas as pd
from bs4 import BeautifulSoup
import plotly.express as px
import weaviate
import weaviate.classes as wvc
from weaviate.classes.query import MetadataQuery, Filter

In [87]:
# Read the settings stored in ../.env into `config`; the Weaviate API key is
# looked up from this mapping when the client is created.
config = dotenv_values("../.env")

In [88]:
# Connect to the Weaviate instance on localhost (HTTP on 8080, gRPC on 50051,
# http_secure/grpc_secure both switched off), authenticating with an API key.
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(
        config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
    ),  # Read from `config` (loaded from ../.env); a missing entry raises KeyError
)

In [89]:
# Load the mapping keyed by subject code (values presumably list the degrees
# each subject belongs to — confirm against the JSON file).
with open("./data/degree_subject_codes/subject_to_degrees.json", "r") as subject_file:
    subject_to_degrees = json.load(subject_file)

In [90]:
# Seed every known subject code with the set of years it is associated with.
# All-digit codes start with an empty set; any other code gets a single year
# parsed from the character at index 4 (presumably the level digit of codes
# shaped like "ABCD1234" — TODO confirm).
subject_to_year = {}

for subject_code in subject_to_degrees:
    subject_to_year[subject_code] = (
        set() if subject_code.isdigit() else {int(subject_code[4])}
    )

In [91]:
# Walk every saved degree page and, for each all-digit subject code linked in
# the page's last table, record the "Year N" heading it is listed under.
# A code that is absent from `subject_to_year` raises KeyError.
for degree_page in os.listdir("./data/uts_degrees"):
    # Parse while the file is open, then release the handle before traversing.
    with open(f"./data/uts_degrees/{degree_page}", "r") as f:
        degree_soup = BeautifulSoup(f.read(), "html.parser")

    # The subject listing is taken from the last <table> on the page.
    subject_table = degree_soup.find_all("table")[-1]

    currentYear = None

    # The first row is skipped (presumably the table header — confirm).
    for table_row in subject_table.find_all("tr")[1:]:
        if "Year" in table_row.text:
            # Heading rows end with the year number, e.g. "... Year 2".
            currentYear = int(table_row.text.split(" ")[-1])
            continue

        link = table_row.find("a")

        if not link or not link.text.isdigit():
            continue

        if currentYear is None:
            # Subject listed before any "Year" heading: there is no year to
            # record. Adding None would later break sorting of the year set.
            continue

        subject_code = link.text
        subject_to_year[subject_code].add(currentYear)

In [92]:
# Inspect the subject-code -> set-of-years mapping after the degree pages
# have been processed.
subject_to_year

{'33130': {1},
 '37181': {1},
 '31266': {1},
 '41039': {1, 2},
 '31265': {1},
 '33230': {1, 2},
 '31269': {1},
 '31271': {1, 2},
 '41092': {1, 2},
 '31268': {2, 3},
 '48024': {1, 2},
 '41078': {2},
 '41080': {2},
 '31250': {1, 2, 3, 4},
 '41040': {2},
 '31251': {2, 3},
 '41079': {3},
 '42913': {3, 4},
 '31256': {3, 4},
 '32146': {3, 4},
 '42028': {3, 4},
 '41077': {3, 4},
 '43024': {3, 4},
 '41043': {3, 4},
 '43023': {3, 4},
 '31005': {2, 3, 4},
 '43025': {2, 3, 4},
 '31243': {3, 4},
 '32144': {3},
 '32931': {4},
 '31272': {3, 4},
 '41004': {4},
 '31482': {4},
 '31247': {2},
 '31257': {2},
 '31255': {2},
 '31258': {3},
 '31276': {3},
 '31280': {3},
 '31245': {4},
 '31282': {1, 4},
 '31097': {4},
 '48270': {4},
 '48730': {2, 3},
 '41181': {2},
 '41900': {3},
 '41182': {3},
 '41183': {3},
 '41184': {4},
 '41180': {4},
 '48436': {4},
 '41185': {4},
 '31777': {2, 4},
 '31242': {2, 4},
 '41889': {2, 3, 4},
 '31927': {2, 3, 4},
 '41001': {2, 3, 4},
 '31253': {2, 4},
 '41113': {2, 3, 4},
 '41

In [93]:
def get_median_year(years):
    years = list(years)
    years.sort()
    return years[len(years) // 2]

In [94]:
# Collapse each subject's set of years into one representative year.
# Subjects with no recorded year are mapped to None.
subject_to_median_year = {}

for subject_code, years in subject_to_year.items():
    subject_to_median_year[subject_code] = (
        get_median_year(years) if len(years) != 0 else None
    )

In [95]:
# Persist the subject-code -> median-year mapping as indented JSON.
with open("./data/degree_subject_codes/subject_to_median_year.json", "w") as median_year_file:
    json.dump(subject_to_median_year, median_year_file, indent=2)

In [96]:
def get_similar_uts_subjects(
    uts_subject_collection,
    usyd_subject_vector: dict[str, list[float]] | list[float],
    num_subjects: int = 5,
    degree: str | None = None
) -> list:
    """Return the properties of the UTS subjects nearest to a given vector.

    Args:
        uts_subject_collection: Collection object whose ``query.near_vector``
            is used for the search.
        usyd_subject_vector: Either the raw vector, or a mapping holding the
            vector under the ``"default"`` key.
        num_subjects: Maximum number of neighbours to request.
        degree: When given, only objects whose ``degrees`` property contains
            this value are considered.

    Returns:
        A list with the ``properties`` of each object in the query response,
        in response order.
    """
    # Unwrap the named-vector mapping. Checking for a dict (instead of "is not
    # exactly a list") lets other plain sequences pass straight through.
    if isinstance(usyd_subject_vector, dict):
        usyd_subject_vector = usyd_subject_vector["default"]

    # Build the optional degree filter once so the query is written only once.
    degree_filter = None
    if degree is not None:
        degree_filter = Filter.by_property("degrees").contains_any([degree])

    response = uts_subject_collection.query.near_vector(
        near_vector=usyd_subject_vector,
        limit=num_subjects,
        return_metadata=wvc.query.MetadataQuery(distance=True),
        filters=degree_filter,
    )

    return [o.properties for o in response.objects]

In [97]:
def get_model_accuracy(
    num_similar_subjects: int,
    embedding_types: tuple[str, ...] = ("MXBAI", "INSTRUCTOR", "SBERT", "GLOVE", "DOC2VEC"),
):
    """Score how well each embedding's nearest neighbours agree on study year.

    For every embedding type, each subject in ``USYD_<type>_Subject`` that has
    a known median year is matched against its ``num_similar_subjects``
    nearest subjects in ``UTS_<type>_Subject``. Neighbours without a known
    median year are ignored. Uses the module-level ``client``,
    ``subject_to_median_year`` and ``get_similar_uts_subjects``.

    Args:
        num_similar_subjects: Number of nearest UTS subjects requested per
            USYD subject.
        embedding_types: Collection-name infixes to evaluate.

    Returns:
        ``{embedding_type: {"mae": ..., "accuracy": ...}}`` where ``mae`` is
        the mean absolute year difference and ``accuracy`` the percentage of
        pairs with identical years, both rounded to 2 decimals. Both are
        ``None`` when no comparable pair was found for that embedding type.

    Raises:
        KeyError: If a subject code is missing from ``subject_to_median_year``.
    """
    embedding_accuracy = {}

    for embedding_type in embedding_types:
        usyd_subject_collection = client.collections.get(f"USYD_{embedding_type}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{embedding_type}_Subject")

        correct = 0
        total = 0
        absolute_error = 0

        for usyd_subject in usyd_subject_collection.iterator(include_vector=True):
            usyd_subject_code = usyd_subject.properties["subjectCode"]
            usyd_subject_year = subject_to_median_year[usyd_subject_code]

            if usyd_subject_year is None:
                continue

            uts_similar_subjects = get_similar_uts_subjects(
                uts_subject_collection,
                usyd_subject.vector,
                num_subjects=num_similar_subjects,
            )

            for uts_subject in uts_similar_subjects:
                uts_subject_year = subject_to_median_year[uts_subject["subjectCode"]]

                if uts_subject_year is None:
                    continue

                if usyd_subject_year == uts_subject_year:
                    correct += 1

                total += 1
                absolute_error += abs(usyd_subject_year - uts_subject_year)

        if total == 0:
            # No comparable pairs: report "no data" instead of dividing by zero.
            embedding_accuracy[embedding_type] = {"mae": None, "accuracy": None}
            continue

        embedding_accuracy[embedding_type] = {
            "mae": round(absolute_error / total, 2),
            "accuracy": round((correct / total) * 100, 2),
        }

    return embedding_accuracy
                

In [98]:
# Evaluate every embedding model using the 2, 5 and 10 nearest UTS subjects
# per USYD subject. Kept as separate statements so results that were already
# computed stay bound if a later call fails.
model_accuracy_top_2 = get_model_accuracy(2)
model_accuracy_top_5 = get_model_accuracy(5)
model_accuracy_top_10 = get_model_accuracy(10)

In [99]:
# Inspect the per-model MAE and accuracy for the top-5 setting.
model_accuracy_top_5

{'MXBAI': {'mae': 1.3, 'accuracy': 24.46},
 'INSTRUCTOR': {'mae': 1.32, 'accuracy': 23.98},
 'SBERT': {'mae': 1.27, 'accuracy': 23.47},
 'GLOVE': {'mae': 1.34, 'accuracy': 21.04},
 'DOC2VEC': {'mae': 1.63, 'accuracy': 18.15}}

In [100]:
# Tabulate the "accuracy" scores: one row per embedding model, one column per
# top-k setting.
major_model_accuracy = pd.DataFrame(
    [
        [
            model_accuracy_top_2[model_name]["accuracy"],
            model_accuracy_top_5[model_name]["accuracy"],
            model_accuracy_top_10[model_name]["accuracy"],
        ]
        for model_name in model_accuracy_top_2
    ],
    index=list(model_accuracy_top_2),
    columns=["Top 2", "Top 5", "Top 10"],
)
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,28.73,24.46,21.93
INSTRUCTOR,24.34,23.98,23.53
SBERT,23.31,23.47,22.84
GLOVE,21.43,21.04,20.05
DOC2VEC,20.3,18.15,17.52


In [101]:
# Tabulate the "mae" scores (mean absolute year error): one row per embedding
# model, one column per top-k setting.
# NOTE(review): this rebinds `major_model_accuracy`, replacing the accuracy
# table with the MAE table; consider a dedicated name such as
# `major_model_mae` once downstream usage has been checked.
major_model_accuracy = pd.DataFrame(
    [
        [
            model_accuracy_top_2[model_name]["mae"],
            model_accuracy_top_5[model_name]["mae"],
            model_accuracy_top_10[model_name]["mae"],
        ]
        for model_name in model_accuracy_top_2
    ],
    index=list(model_accuracy_top_2),
    columns=["Top 2", "Top 5", "Top 10"],
)
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,1.21,1.3,1.39
INSTRUCTOR,1.28,1.32,1.34
SBERT,1.23,1.27,1.32
GLOVE,1.34,1.34,1.36
DOC2VEC,1.6,1.63,1.6
