In [51]:
import os
import json
from typing import Literal

from dotenv import dotenv_values
import pandas as pd
from bs4 import BeautifulSoup
import plotly.express as px
import weaviate
import weaviate.classes as wvc

In [52]:
# Read the key/value settings stored in `../.env` (path is relative to the
# working directory) into a dict-like object without touching os.environ.
config = dotenv_values("../.env")

In [53]:
# Connect to the Weaviate instance on localhost (HTTP on 8080, gRPC on 50051,
# both with the "secure" flag switched off). The API key is read from the
# settings loaded out of the .env file, so AUTHENTICATION_APIKEY_ALLOWED_KEYS
# has to be defined there.
weaviate_credentials = weaviate.auth.AuthApiKey(
    config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
)
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate_credentials,
)


unclosed <socket.socket fd=80, family=AddressFamily.AF_INET6, type=SocketKind.SOCK_STREAM, proto=6, laddr=('::1', 59117, 0, 0), raddr=('::1', 8080, 0, 0)>



In [54]:
# Names of every collection currently defined in the Weaviate instance
# (the keys of the mapping returned by list_all()).
collections = list(client.collections.list_all().keys())

In [55]:
# Drop the "Subject" and "UTS_Subject" collections when they are present.
for collection_name in ("Subject", "UTS_Subject"):
    if collection_name in collections:
        client.collections.delete(collection_name)

In [56]:
# Re-list the collection names after the deletions and display them.
collections = list(client.collections.list_all().keys())
collections

['USYD_GLOVE_Subject',
 'UTS_INSTRUCTOR_Subject',
 'USYD_MXBAI_Subject',
 'UTS_DOC2VEC_Subject',
 'UTS_GLOVE_Subject',
 'USYD_DOC2VEC_Subject',
 'UTS_MXBAI_Subject',
 'UTS_SBERT_Subject',
 'USYD_SBERT_Subject',
 'USYD_INSTRUCTOR_Subject']

In [57]:
def read_json(file_path):
    """Load and return the JSON document stored at `file_path`.

    The file is decoded as UTF-8 explicitly so the result does not depend on
    the platform's default locale encoding.

    Args:
        file_path: path of the JSON file to read.

    Returns:
        The decoded JSON value (dict, list, str, number, bool or None).

    Raises:
        OSError: if the file cannot be opened.
        json.JSONDecodeError: if the content is not valid JSON.
    """
    with open(file_path, "r", encoding="utf-8") as file:
        return json.load(file)

In [58]:
# Mapping from a major identifier to the list of majors treated as its
# equivalents; an empty list means the major has no equivalent.
uni_major_equivalents = read_json("./data/major_rels/uni_major_equivalents.json")

In [59]:
# Display the loaded mapping for inspection.
uni_major_equivalents

{'MAJ10053': ['Computational Data Science'],
 'MAJ01156': ['Computational Data Science'],
 'MAJ02900': ['Cyber Security'],
 'MAJ03445': ['Cyber Security'],
 'MAJ03519': ['Software Development'],
 'MAJ02080': ['Software Development'],
 'MAJ02901': [],
 'MAJ02092': [],
 'Computational Data Science': ['MAJ10053', 'MAJ01156'],
 'Cyber Security': ['MAJ02900', 'MAJ03445'],
 'Software Development': ['MAJ03519', 'MAJ02080']}

In [60]:
# Per-university lookup tables: subject code -> list of majors the subject
# belongs to (an empty list is treated elsewhere as a core / non-major subject).
uts_subject_to_majors = read_json("./data/major_rels/uts_subject_to_majors.json")
usyd_subject_to_majors = read_json("./data/major_rels/usyd_subject_to_majors.json")

In [61]:
# Names of the embedding models evaluated in this notebook; each one backs a
# pair of `USYD_<model>_Subject` / `UTS_<model>_Subject` collections.
# "MXBAI" is included because the accuracy evaluation iterates over it too.
modelType = Literal["SBERT", "INSTRUCTOR", "GLOVE", "DOC2VEC", "MXBAI"]

In [62]:
def get_similar_uts_subjects(
    uts_subject_collection,
    usyd_subject_vector: "list[float] | dict[str, list[float]]",
    num_subjects: int = 5,
) -> list:
    """Return the subject codes of the UTS subjects closest to a USYD vector.

    Runs a near-vector query against `uts_subject_collection` and extracts the
    `subjectCode` property of every returned object, preserving the order in
    which the query returns them.

    Args:
        uts_subject_collection: Weaviate collection holding the UTS subjects.
        usyd_subject_vector: embedding of the USYD subject. The caller in this
            notebook passes a flat list of floats; the value is forwarded to
            the query unchanged.
        num_subjects: maximum number of matches to request.

    Returns:
        List of `subjectCode` values, one per returned object.
    """
    response = uts_subject_collection.query.near_vector(
        near_vector=usyd_subject_vector,
        limit=num_subjects,
        # Distance metadata is requested with the results but not used below.
        return_metadata=wvc.query.MetadataQuery(distance=True),
    )

    return [match.properties["subjectCode"] for match in response.objects]

In [63]:
def is_share_major(uts_majors: list[str], usyd_majors: list[str]):
    """Tell whether a UTS subject and a USYD subject have a major in common.

    Two subjects with no majors at all (core / non-major subjects) count as
    sharing. Otherwise the subjects share a major when some USYD major is
    listed among the equivalents of some UTS major in `uni_major_equivalents`.

    Raises:
        KeyError: if a UTS major is absent from `uni_major_equivalents`.
    """
    if not (uts_majors or usyd_majors):
        # Neither side belongs to a major.
        return True

    return any(
        usyd_major in uni_major_equivalents[uts_major]
        for usyd_major in usyd_majors
        for uts_major in uts_majors
    )

In [64]:
def is_usyd_subject_code(text: str) -> bool:
    """Check whether `text` has the shape of a USYD subject code.

    A string qualifies when it is exactly 8 characters long and everything
    after the first four characters is numeric. The first four characters are
    not validated.

    Always returns a real bool (never None), as the annotation promises.
    """
    return len(text) == 8 and text[4:].isnumeric()

In [65]:
# Read the HTML file ./data/usyd-comp-sci-degree.html into memory.
with open("./data/usyd-comp-sci-degree.html", "r") as file:
    usyd_comp_sci_degree_html = file.read()

# Parse the markup with the "html.parser" backend so it can be searched by tag.
usyd_comp_sci_degree_soup = BeautifulSoup(usyd_comp_sci_degree_html, "html.parser")

In [66]:
# Collect the subject codes listed on the degree page: for every <strong>
# element take its first nested <a> (if any) and keep the link text when it
# looks like a USYD subject code.
current_usyd_computer_science_subjects = [
    link.text
    for link in (
        strong.find("a") for strong in usyd_comp_sci_degree_soup.find_all("strong")
    )
    if link and is_usyd_subject_code(link.text)
]

In [67]:
def _delete_by_subject_codes(subject_collection, subject_codes):
    """Delete the objects whose `subjectCode` is in `subject_codes`; no-op when the list is empty."""
    if len(subject_codes) > 0:
        subject_collection.data.delete_many(
            where=wvc.query.Filter.by_property("subjectCode").contains_any(
                subject_codes
            )
        )


def remove_non_included_subjects(uts_subject_collection, usyd_subject_collection, is_remove_usyd: bool = False):
    """
    Prune subjects that must not take part in the comparison. This deletes
    objects from the given Weaviate collections.

    UTS subjects are removed when either
      * their code is not a key of `uts_subject_to_majors` (the database also
        holds non-computer-science subjects), or
      * they belong to at least one major and every one of their majors has no
        equivalent (an empty list in `uni_major_equivalents`).
    UTS subjects with an empty major list are kept.

    When `is_remove_usyd` is True, USYD subjects whose code is not in
    `current_usyd_computer_science_subjects` are removed as well.

    Args:
        uts_subject_collection: Weaviate collection of UTS subjects.
        usyd_subject_collection: Weaviate collection of USYD subjects; only
            read and modified when `is_remove_usyd` is True.
        is_remove_usyd: whether to prune the USYD collection too.
    """
    # A set gives constant-time membership checks inside the loop below.
    non_equivalent_majors = {
        major
        for major, equivalents in uni_major_equivalents.items()
        if len(equivalents) == 0
    }

    # Gather all codes first and delete afterwards, so the collection is not
    # modified while it is being iterated.
    uts_codes_to_remove = []
    for item in uts_subject_collection.iterator():
        subject_code = item.properties["subjectCode"]

        if subject_code not in uts_subject_to_majors:
            uts_codes_to_remove.append(subject_code)
            continue

        uts_majors = uts_subject_to_majors[subject_code]
        if len(uts_majors) > 0 and all(
            major in non_equivalent_majors for major in uts_majors
        ):
            uts_codes_to_remove.append(subject_code)

    _delete_by_subject_codes(uts_subject_collection, uts_codes_to_remove)

    if is_remove_usyd:
        current_codes = set(current_usyd_computer_science_subjects)
        usyd_codes_to_remove = [
            item.properties["subjectCode"]
            for item in usyd_subject_collection.iterator()
            if item.properties["subjectCode"] not in current_codes
        ]
        _delete_by_subject_codes(usyd_subject_collection, usyd_codes_to_remove)

In [68]:
def get_model_accuracy(
    num_similar_subjects,
    models=("SBERT", "INSTRUCTOR", "GLOVE", "DOC2VEC", "MXBAI"),
):
    """Score each embedding model by how often nearest UTS subjects share a major.

    For every model, the `USYD_<model>_Subject` and `UTS_<model>_Subject`
    collections are fetched and the UTS collection is pruned with
    `remove_non_included_subjects` (which deletes objects from the database).
    Then, for each USYD subject, the `num_similar_subjects` closest UTS
    subjects are retrieved and the ones sharing a major are counted.

    Args:
        num_similar_subjects: number of UTS neighbours requested per USYD subject.
        models: model names to evaluate; defaults to the five models used in
            this notebook.

    Returns:
        A pair `(model_accuracy, share_major_vals)`:
          * `model_accuracy[model]`: fraction of all retrieved
            (USYD subject, UTS neighbour) pairs that share a major, or 0.0
            when no pair was retrieved at all.
          * `share_major_vals[model]`: list with one entry per USYD subject,
            the number of its neighbours that share a major.

    Raises:
        KeyError: if a subject code is missing from `usyd_subject_to_majors`
            or `uts_subject_to_majors`.
    """
    model_accuracy = {}
    share_major_vals = {}

    for model in models:
        usyd_subject_collection = client.collections.get(f"USYD_{model}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{model}_Subject")

        # Only the UTS side is pruned; pass is_remove_usyd=True to prune USYD too.
        remove_non_included_subjects(uts_subject_collection, usyd_subject_collection)

        num_pairs = 0
        num_share_major_per_subject = []

        for item in usyd_subject_collection.iterator(include_vector=True):
            subject_vector = item.vector
            # The vector may arrive wrapped in a mapping; unwrap its "default" entry.
            if not isinstance(subject_vector, list):
                subject_vector = subject_vector["default"]

            usyd_majors = usyd_subject_to_majors[item.properties["subjectCode"]]

            similar_uts_subjects = get_similar_uts_subjects(
                uts_subject_collection, subject_vector, num_similar_subjects
            )

            current_num_share_major = sum(
                1
                for uts_subject_code in similar_uts_subjects
                if is_share_major(uts_subject_to_majors[uts_subject_code], usyd_majors)
            )

            num_pairs += len(similar_uts_subjects)
            num_share_major_per_subject.append(current_num_share_major)

        num_share_major = sum(num_share_major_per_subject)
        # An empty collection yields no pairs; report 0.0 instead of dividing by zero.
        model_accuracy[model] = num_share_major / num_pairs if num_pairs else 0.0
        share_major_vals[model] = num_share_major_per_subject

    return model_accuracy, share_major_vals

In [69]:
# Evaluate every model at four neighbourhood sizes (top 2, 5, 10 and 20).
# Each call returns (accuracy per model, per-subject shared-major counts);
# the per-subject counts for the top-2 run are discarded.
major_model_accuracy_top_2, _ = get_model_accuracy(2)
major_model_accuracy_top_5, histogram_data_top_5 = get_model_accuracy(5)
major_model_accuracy_top_10, histogram_data_top_10 = get_model_accuracy(10)
major_model_accuracy_top_20, histogram_data_top_20 = get_model_accuracy(20)

In [70]:
# Distribution, across USYD subjects, of how many of the 5 retrieved UTS
# subjects share a major (INSTRUCTOR model).
shared_major_axis_label = "Number of UTS subjects that share major with USYD subjects"
fig = px.histogram(
    x=histogram_data_top_5["INSTRUCTOR"],
    nbins=5,
    title=f"{shared_major_axis_label} (Top 5)",
    labels={"x": shared_major_axis_label},
)
fig.show()

In [71]:
# Distribution, across USYD subjects, of how many of the 10 retrieved UTS
# subjects share a major (INSTRUCTOR model).
shared_major_axis_label = "Number of UTS subjects that share major with USYD subjects"
fig = px.histogram(
    x=histogram_data_top_10["INSTRUCTOR"],
    nbins=10,
    title=f"{shared_major_axis_label} (Top 10)",
    labels={"x": shared_major_axis_label},
)
fig.show()

In [72]:
# NOTE(review): max() over a dict compares its keys, so this evaluates to the
# lexicographically greatest model name rather than to a largest count.
# Confirm whether max(histogram_data_top_20["INSTRUCTOR"]) was intended.
max(histogram_data_top_20)

'SBERT'

In [73]:
# Distribution, across USYD subjects, of how many of the 20 retrieved UTS
# subjects share a major (INSTRUCTOR model).
shared_major_axis_label = "Number of UTS subjects that share major with USYD subjects"
fig = px.histogram(
    x=histogram_data_top_20["INSTRUCTOR"],
    nbins=20,
    title=f"{shared_major_axis_label} (Top 20)",
    labels={"x": shared_major_axis_label},
)
fig.show()

In [74]:
# Express every accuracy as a percentage rounded to 2 decimal places.
# The results are bound back to the same names, so running this cell a second
# time would scale the already-converted values by 100 again.
(
    major_model_accuracy_top_2,
    major_model_accuracy_top_5,
    major_model_accuracy_top_10,
    major_model_accuracy_top_20,
) = [
    {model: round(fraction * 100, 2) for model, fraction in accuracy_by_model.items()}
    for accuracy_by_model in (
        major_model_accuracy_top_2,
        major_model_accuracy_top_5,
        major_model_accuracy_top_10,
        major_model_accuracy_top_20,
    )
]

In [75]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    {
        "Top 2": major_model_accuracy_top_2,
        "Top 5": major_model_accuracy_top_5,
        "Top 10": major_model_accuracy_top_10,
        "Top 20": major_model_accuracy_top_20,
    }
)
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10,Top 20
SBERT,28.19,31.36,29.55,29.38
INSTRUCTOR,37.45,42.3,37.86,33.72
GLOVE,34.16,31.52,33.05,28.64
DOC2VEC,33.13,28.15,26.87,27.0
MXBAI,41.77,45.35,38.72,35.29
