In [88]:
import os
import json
from typing import Literal

from dotenv import dotenv_values
import pandas as pd
import plotly.express as px
import weaviate
import weaviate.classes as wvc

In [89]:
config = dotenv_values("../.env")

In [90]:
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(
        config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
    ),  # Set this environment variable
)


unclosed <socket.socket fd=75, family=AddressFamily.AF_INET6, type=SocketKind.SOCK_STREAM, proto=6, laddr=('::1', 62056, 0, 0), raddr=('::1', 8080, 0, 0)>



In [91]:
collections = list(client.collections.list_all().keys())

In [92]:
if "Subject" in collections:
    client.collections.delete("Subject")

if "UTS_Subject" in collections:
    client.collections.delete("UTS_Subject")

In [93]:
collections = list(client.collections.list_all().keys())
collections

['USYD_DOC2VEC_Subject',
 'USYD_GLOVE_Subject',
 'USYD_INSTRUCTOR_Subject',
 'USYD_SBERT_Subject',
 'UTS_DOC2VEC_Subject',
 'UTS_GLOVE_Subject',
 'UTS_INSTRUCTOR_Subject',
 'UTS_SBERT_Subject']

In [94]:
def read_json(file_path):
    with open(file_path, "r") as file:
        data = json.load(file)
    return data

In [95]:
uni_major_equivalents = read_json("./data/major_rels/uni_major_equivalents.json")

In [96]:
uni_major_equivalents

{'MAJ10053': ['Computational Data Science'],
 'MAJ01156': ['Computational Data Science'],
 'MAJ02900': ['Cyber Security'],
 'MAJ03445': ['Cyber Security'],
 'MAJ03519': ['Software Development'],
 'MAJ02080': ['Software Development'],
 'MAJ02901': [],
 'MAJ02092': [],
 'Computational Data Science': ['MAJ10053', 'MAJ01156'],
 'Cyber Security': ['MAJ02900', 'MAJ03445'],
 'Software Development': ['MAJ03519', 'MAJ02080']}

In [97]:
uts_subject_to_majors = read_json("./data/major_rels/uts_subject_to_majors.json")
usyd_subject_to_majors = read_json("./data/major_rels/usyd_subject_to_majors.json")

In [98]:
modelType = Literal["SBERT", "INSTRUCTOR", "GLOVE", "DOC2VEC"]

In [99]:
def get_similar_uts_subjects(
    uts_subject_collection,
    usyd_subject_vector: dict[str, list[float]],
    num_subjects: int = 5,
):
    response = uts_subject_collection.query.near_vector(
        near_vector=usyd_subject_vector,
        limit=num_subjects,
        return_metadata=wvc.query.MetadataQuery(distance=True),
    )
    
    return [o.properties["subjectCode"] for o in response.objects]

In [100]:
def is_share_major(uts_majors: list[str], usyd_majors: list[str]):
    if len(uts_majors) == 0 and len(usyd_majors) == 0:
        # core subject / non-major subject case
        return True
  
    for usyd_major in usyd_majors:
        for uts_major in uts_majors:
            if usyd_major in uni_major_equivalents[uts_major]:
                return True
              
    return False

In [101]:
def remove_non_included_subjects(uts_subject_collection):
    """
    Remove all uts subjects not in uts_subject_to_majors
    need to do this as weaviate db includes non-computer science subjects as well
    also need to remove subjects from majors that don't have equivalent majors in usyd.
    """

    non_computer_science_subjects = []
    non_equivalent_majors = [
        major
        for major in uni_major_equivalents
        if len(uni_major_equivalents[major]) == 0
    ]

    for item in uts_subject_collection.iterator():
        subject_code = item.properties["subjectCode"]

        if subject_code not in uts_subject_to_majors:
            non_computer_science_subjects.append(subject_code)
            continue

        uts_majors = uts_subject_to_majors[subject_code]

        if len(uts_majors) > 0:
            is_remove = True

            for major in uts_majors:
                if major not in non_equivalent_majors:
                    is_remove = False
                    break

            if is_remove:
                non_computer_science_subjects.append(subject_code)

    if len(non_computer_science_subjects) > 0:
        uts_subject_collection.data.delete_many(
            where=wvc.query.Filter.by_property("subjectCode").contains_any(
                non_computer_science_subjects
            )
        )

In [102]:
def get_model_accuracy(num_similar_subjects):
    model_accuracy = {}
    share_major_vals = {}

    for model in ["SBERT", "INSTRUCTOR", "GLOVE", "DOC2VEC"]:
        num_subjects = 0
        num_share_major = 0
        num_share_major_per_subject = []

        usyd_subject_collection = client.collections.get(f"USYD_{model}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{model}_Subject")

        remove_non_included_subjects(uts_subject_collection)

        for item in usyd_subject_collection.iterator(include_vector=True):
            subject_code = item.properties["subjectCode"]
            subject_vector = item.vector

            if type(subject_vector) != list:
                subject_vector = subject_vector["default"]

            usyd_majors = usyd_subject_to_majors[subject_code]

            similar_uts_subjects = get_similar_uts_subjects(
                uts_subject_collection, subject_vector, num_similar_subjects
            )
            
            current_num_share_major = 0

            for uts_subject_code in similar_uts_subjects:
                uts_majors = uts_subject_to_majors[uts_subject_code]

                if is_share_major(uts_majors, usyd_majors):
                    current_num_share_major += 1

                num_subjects += 1
                
            num_share_major_per_subject.append(current_num_share_major)
            num_share_major += current_num_share_major

        model_accuracy[model] = num_share_major / num_subjects
        share_major_vals[model] = num_share_major_per_subject
        
    return model_accuracy, share_major_vals

In [103]:
major_model_accuracy_top_2, _ = get_model_accuracy(2)
major_model_accuracy_top_5, histogram_data_top_5 = get_model_accuracy(5)
major_model_accuracy_top_10, histogram_data_top_10 = get_model_accuracy(10)
major_model_accuracy_top_20, histogram_data_top_20 = get_model_accuracy(20)

In [110]:
fig = px.histogram(
    x=histogram_data_top_5["INSTRUCTOR"],
    nbins=5,
    title="Number of UTS subjects that share major with USYD subjects (Top 5)",
    labels={"x": "Number of UTS subjects that share major with USYD subjects"},
)
fig.show()

In [105]:
fig = px.histogram(
    x=histogram_data_top_10["INSTRUCTOR"],
    nbins=10,
    title="Number of UTS subjects that share major with USYD subjects (Top 10)",
    labels={"x": "Number of UTS subjects that share major with USYD subjects"},
)
fig.show()

In [106]:
max(histogram_data_top_20)

'SBERT'

In [107]:
fig = px.histogram(
    x=histogram_data_top_20["INSTRUCTOR"],
    nbins=20,
    title="Number of UTS subjects that share major with USYD subjects (Top 20)",
    labels={"x": "Number of UTS subjects that share major with USYD subjects"},
)
fig.show()

In [108]:
# convert major accuracy to percentage with 4 decimal places
major_model_accuracy_top_2 = {k: round(v * 100, 4) for k, v in major_model_accuracy_top_2.items()}
major_model_accuracy_top_5 = {k: round(v * 100, 4) for k, v in major_model_accuracy_top_5.items()}
major_model_accuracy_top_10 = {k: round(v * 100, 4) for k, v in major_model_accuracy_top_10.items()}
major_model_accuracy_top_20 = {k: round(v * 100, 4) for k, v in major_model_accuracy_top_20.items()}

In [109]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    {
        "Top 2": major_model_accuracy_top_2,
        "Top 5": major_model_accuracy_top_5,
        "Top 10": major_model_accuracy_top_10,
        "Top 20": major_model_accuracy_top_20,
    }
)
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10,Top 20
SBERT,28.1893,31.358,29.5473,29.3827
INSTRUCTOR,37.4486,42.3045,37.8601,33.7243
GLOVE,34.1564,31.5226,33.0453,28.642
DOC2VEC,33.1276,28.1481,26.8724,26.9959
