In [3]:
import os
import json
from typing import Literal

from dotenv import dotenv_values
import pandas as pd
from bs4 import BeautifulSoup
import plotly.express as px
import weaviate
import weaviate.classes as wvc
from weaviate.classes.query import MetadataQuery, Filter

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


In [4]:
config = dotenv_values("../.env")

In [5]:
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(
        config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
    ),  # Set this environment variable
)

In [6]:
collections = list(client.collections.list_all().keys())
collections

['USYD_DOC2VEC_Subject',
 'USYD_GLOVE_Subject',
 'USYD_INSTRUCTOR_Subject',
 'USYD_MXBAI_Subject',
 'USYD_SBERT_Subject',
 'UTS_DOC2VEC_Subject',
 'UTS_GLOVE_Subject',
 'UTS_INSTRUCTOR_Subject',
 'UTS_MXBAI_Subject',
 'UTS_SBERT_Subject']

In [7]:
def get_similar_uts_subjects(
    uts_subject_collection,
    usyd_subject_vector: dict[str, list[float]],
    num_subjects: int = 5,
    degree: str | None = None
):
    if type(usyd_subject_vector) != list:
        usyd_subject_vector = usyd_subject_vector["default"]
    
    response = None
    
    if degree is not None:
        response = uts_subject_collection.query.near_vector(
            near_vector=usyd_subject_vector,
            limit=num_subjects,
            return_metadata=wvc.query.MetadataQuery(distance=True),
            filters=Filter.by_property("degrees").contains_any([degree])
        )
    else:
        response = uts_subject_collection.query.near_vector(
            near_vector=usyd_subject_vector,
            limit=num_subjects,
            return_metadata=wvc.query.MetadataQuery(distance=True)
        )
    
    return [o.properties for o in response.objects]

In [8]:
uts_to_usyd_major = {
  "Biomedical Engineering": "Biomedical",
  "Civil Engineering": "Civil",
  "Electrical Engineering": "Electrical",
  "Electronic Engineering": "Electrical",
  "Mechanical Engineering": "Mechanical",
  "Mechatronic Engineering": "Mechatronic",
  "Software Engineering": "Software",
  "Civil and Environmental Engineering": "Civil",
  "Mechanical and Mechatronic Engineering": "Mechatronic",
  "Electrical and Electronic Engineering": "Electrical",
  "Renewable Energy Engineering": None,
  "Chemical Process Engineering": None,
  "Data Science Engineering": None,
  "Cybersecurity and Privacy" : "Cybersecurity",
  "Enterprise Software Development" : "Software Development",
  "Networking and Cybersecurity" : "Cybersecurity",
  "Quantum Information Science" : None,
  "Interaction Design": None,
  "Mathematical Analysis": None,
  "Business Information Systems Management": None,
  "Artificial Intelligence and Data Analytics": "Computational Data Science",
}

In [9]:
comp_majors = ["Computational Data Science", "Cybersecurity", "Software Development", "Computer Science"]

In [76]:
def get_model_accuracy(num_similar_subjects: int):
    embedding_accuracy = {}

    for embedding_type in ["MXBAI", "INSTRUCTOR", "SBERT", "GLOVE", "DOC2VEC"]:
        usyd_subject_collection = client.collections.get(f"USYD_{embedding_type}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{embedding_type}_Subject")
        
        embedding_accuracy[embedding_type] = {}
        
        for item in usyd_subject_collection.iterator(include_vector=True):
            usyd_majors = item.properties["majors"]
            subject_vector = item.vector
                    
            similar_subjects = get_similar_uts_subjects(uts_subject_collection, subject_vector, num_subjects=num_similar_subjects)
            
            for uts_subject in similar_subjects:
                uts_majors = uts_subject["majors"]
                
                if len(usyd_majors) == 0:
                    if "No major" not in embedding_accuracy[embedding_type]:
                            embedding_accuracy[embedding_type]["No major"] = {
                                "correct": 0,
                                "total": 0
                            }
                    
                    if len(uts_majors) == 0:
                        embedding_accuracy[embedding_type]["No major"]["correct"] += 1    
                    
                    embedding_accuracy[embedding_type]["No major"]["total"] += 1
                    continue
                
                for uts_major in uts_majors:
                    usyd_equivalent_major = uts_to_usyd_major[uts_major]
                    
                    if usyd_equivalent_major is None:
                        continue
                    
                    if usyd_equivalent_major not in embedding_accuracy[embedding_type]:
                        embedding_accuracy[embedding_type][usyd_equivalent_major] = {
                            "correct": 0,
                            "total": 0
                        }
                    
                    if usyd_equivalent_major in usyd_majors:
                        embedding_accuracy[embedding_type][usyd_equivalent_major]["correct"] += 1
                        
                    embedding_accuracy[embedding_type][usyd_equivalent_major]["total"] += 1
    
    with open("./data/test-accuracy.json", "w") as f:
        json.dump(embedding_accuracy, f, indent=4)
    
    embedding_accuracy_combined = {}
    
    for embedding_type in embedding_accuracy:
        embedding_accuracy_combined[embedding_type] = {}
        
        for major in embedding_accuracy[embedding_type]:
            major_type = None
            
            if major == "No major":
                major_type = "No major"
            elif major in comp_majors:
                major_type = "COMP"
            else:
                major_type = "ENG"
            
            if major_type not in embedding_accuracy_combined:
                embedding_accuracy_combined[embedding_type][major_type] = {
                    "correct": 0,
                    "total": 0
                }
            
            embedding_accuracy_combined[embedding_type][major_type]["correct"] += embedding_accuracy[embedding_type][major]["correct"]
            embedding_accuracy_combined[embedding_type][major_type]["total"] += embedding_accuracy[embedding_type][major]["total"]
    
    for embedding_type in embedding_accuracy_combined:
        for major_type in embedding_accuracy_combined[embedding_type]:
            correct = embedding_accuracy_combined[embedding_type][major_type]["correct"]
            total = embedding_accuracy_combined[embedding_type][major_type]["total"]
            
            embedding_accuracy_combined[embedding_type][major_type]["accuracy"] = round((correct / total) * 100, 2)
            del embedding_accuracy_combined[embedding_type][major_type]["correct"]
            del embedding_accuracy_combined[embedding_type][major_type]["total"]
    
    return embedding_accuracy_combined

In [77]:
model_accuracy_top_2 = get_model_accuracy(2)
model_accuracy_top_5 = get_model_accuracy(5)
model_accuracy_top_10 = get_model_accuracy(10)

In [12]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["COMP"]["accuracy"] for model in model_accuracy_top_2.values()],
     [model["COMP"]["accuracy"] for model in model_accuracy_top_5.values()],
     [model["COMP"]["accuracy"] for model in model_accuracy_top_10.values()]],
    columns=model_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,32.0,26.6,29.78
INSTRUCTOR,23.53,23.68,22.01
SBERT,39.02,36.0,23.88
GLOVE,0.0,0.0,5.19
DOC2VEC,0.0,14.29,9.09


In [13]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["ENG"]["accuracy"] for model in model_accuracy_top_2.values()],
     [model["ENG"]["accuracy"] for model in model_accuracy_top_5.values()],
     [model["ENG"]["accuracy"] for model in model_accuracy_top_10.values()]],
    columns=model_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,0.0,7.14,74.56
INSTRUCTOR,0.0,10.0,13.33
SBERT,25.0,69.15,65.68
GLOVE,37.5,57.0,0.0
DOC2VEC,0.0,9.09,12.5


In [14]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["No major"]["accuracy"] for model in model_accuracy_top_2.values()],
     [model["No major"]["accuracy"] for model in model_accuracy_top_5.values()],
     [model["No major"]["accuracy"] for model in model_accuracy_top_10.values()]],
    columns=model_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,62.5,64.05,61.55
INSTRUCTOR,65.48,67.62,63.81
SBERT,62.5,60.48,59.88
GLOVE,74.4,57.62,48.57
DOC2VEC,44.64,40.71,47.26


In [15]:
with open("./data/major_rels/subject_to_majors.json", "r") as f:
    subject_to_majors = json.load(f)

In [16]:
with open("./data/degree_subject_codes/subject_to_degrees.json", "r") as f:
    subject_to_degrees = json.load(f)

In [17]:
num_majors = [len(majors) for majors in subject_to_majors.values()]

In [18]:
fig = px.histogram(
    x=num_majors,
    # nbins=10,
    title="Number of Majors per Subject (UTS and USYD)",
    labels={"x": "Number of Majors per Subject"},
)
fig.update_layout(
  xaxis=dict(
    tickmode='linear',
    tick0=0,
    dtick=1
  ),
  yaxis_title="Number of Subjects",
)
fig.show()

In [19]:
num_major_eng_subjects = []

In [20]:
for subject, degrees in subject_to_degrees.items():
    if "Bachelor of Engineering (Honours)" in degrees:
        num_major_eng_subjects.append(len(subject_to_majors[subject]))

In [21]:
fig = px.histogram(
    x=num_major_eng_subjects,
    # nbins=10,
    title="Number of Majors per Engineering Subject (UTS and USYD)",
    labels={"x": "Number of Majors per Engineering Subject"},
)
fig.update_layout(
  xaxis=dict(
    tickmode='linear',
    tick0=0,
    dtick=1
  ),
  # set range of y-axis to be between 0 and 300
  yaxis=dict(
    range=[0, 300]
  ),
  yaxis_title="Number of Subjects",
)
fig.show()

In [22]:
num_comp_eng_subjects = 0
num_subjects = 0

for degrees in subject_to_degrees.values():
    if "Bachelor of Engineering (Honours)" in degrees or "Bachelor of Computing Science (Honours)" in degrees or "Bachelor of Advanced Computing" in degrees:
        num_subjects += 1
  
    if "Bachelor of Engineering (Honours)" in degrees and ("Bachelor of Computing Science (Honours)" in degrees or "Bachelor of Advanced Computing" in degrees):
        num_comp_eng_subjects += 1

In [23]:
print(num_comp_eng_subjects, num_subjects, round((num_comp_eng_subjects / num_subjects) * 100, 2))

89 583 15.27


In [36]:
def get_model_accuracy_by_degree(num_similar_subjects: int):
    embedding_accuracy = {}

    for embedding_type in ["MXBAI", "INSTRUCTOR", "SBERT", "GLOVE", "DOC2VEC"]:
        usyd_subject_collection = client.collections.get(f"USYD_{embedding_type}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{embedding_type}_Subject")
        
        embedding_accuracy[embedding_type] = {}
        
        for item in usyd_subject_collection.iterator(include_vector=True):
            usyd_majors = item.properties["majors"]
            degrees = item.properties["degrees"]
            subject_vector = item.vector
                    
            for degree in degrees:
              similar_subjects = get_similar_uts_subjects(uts_subject_collection, subject_vector, num_subjects=num_similar_subjects, degree=degree)
            
              for uts_subject in similar_subjects:
                  uts_majors = uts_subject["majors"]
                  
                  if len(usyd_majors) == 0:
                      if "No major" not in embedding_accuracy[embedding_type]:
                              embedding_accuracy[embedding_type]["No major"] = {
                                  "correct": 0,
                                  "total": 0
                              }
                      
                      if len(uts_majors) == 0:
                          embedding_accuracy[embedding_type]["No major"]["correct"] += 1    
                      
                      embedding_accuracy[embedding_type]["No major"]["total"] += 1
                      continue
                  
                  for uts_major in uts_majors:
                      usyd_equivalent_major = uts_to_usyd_major[uts_major]
                      
                      if usyd_equivalent_major is None:
                          continue
                      
                      if usyd_equivalent_major not in embedding_accuracy[embedding_type]:
                          embedding_accuracy[embedding_type][usyd_equivalent_major] = {
                              "correct": 0,
                              "total": 0
                          }
                      
                      if usyd_equivalent_major in usyd_majors:
                          embedding_accuracy[embedding_type][usyd_equivalent_major]["correct"] += 1
                      embedding_accuracy[embedding_type][usyd_equivalent_major]["total"] += 1
                    
    
    with open("./data/test-accuracy.json", "w") as f:
        json.dump(embedding_accuracy, f, indent=4)
    
    embedding_accuracy_combined = {}
    
    for embedding_type in embedding_accuracy:
        embedding_accuracy_combined[embedding_type] = {}
        
        for major in embedding_accuracy[embedding_type]:
            major_type = None
            
            if major == "No major":
                major_type = "No major"
            elif major in comp_majors:
                major_type = "COMP"
            else:
                major_type = "ENG"
            
            if major_type not in embedding_accuracy_combined:
                embedding_accuracy_combined[embedding_type][major_type] = {
                    "correct": 0,
                    "total": 0
                }
            
            embedding_accuracy_combined[embedding_type][major_type]["correct"] += embedding_accuracy[embedding_type][major]["correct"]
            embedding_accuracy_combined[embedding_type][major_type]["total"] += embedding_accuracy[embedding_type][major]["total"]
    
    for embedding_type in embedding_accuracy_combined:
        for major_type in embedding_accuracy_combined[embedding_type]:
            correct = embedding_accuracy_combined[embedding_type][major_type]["correct"]
            total = embedding_accuracy_combined[embedding_type][major_type]["total"]
            
            embedding_accuracy_combined[embedding_type][major_type]["accuracy"] = round((correct / total) * 100, 2)
            del embedding_accuracy_combined[embedding_type][major_type]["correct"]
            del embedding_accuracy_combined[embedding_type][major_type]["total"]
    
    return embedding_accuracy_combined

In [40]:
model_accuracy_by_degree_top_2 = get_model_accuracy_by_degree(2)
model_accuracy_by_degree_top_5 = get_model_accuracy_by_degree(5)
model_accuracy_by_degree_top_10 = get_model_accuracy_by_degree(10)

In [41]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["COMP"]["accuracy"] for model in model_accuracy_by_degree_top_2.values()],
     [model["COMP"]["accuracy"] for model in model_accuracy_by_degree_top_5.values()],
     [model["COMP"]["accuracy"] for model in model_accuracy_by_degree_top_10.values()]],
    columns=model_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,40.0,19.05,19.51
INSTRUCTOR,35.29,11.11,11.11
SBERT,0.0,0.0,0.0
GLOVE,0.0,100.0,0.0
DOC2VEC,0.0,50.0,0.0


In [44]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["ENG"]["accuracy"] for model in model_accuracy_by_degree_top_2.values()],
     [model["ENG"]["accuracy"] for model in model_accuracy_by_degree_top_5.values()],
     [model["ENG"]["accuracy"] for model in model_accuracy_by_degree_top_10.values()]],
    columns=model_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,0.0,58.78,59.3
INSTRUCTOR,0.0,10.0,11.11
SBERT,60.22,62.67,62.35
GLOVE,0.0,0.0,0.0
DOC2VEC,100.0,50.0,0.0


In [45]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["No major"]["accuracy"] for model in model_accuracy_by_degree_top_2.values()],
     [model["No major"]["accuracy"] for model in model_accuracy_by_degree_top_5.values()],
     [model["No major"]["accuracy"] for model in model_accuracy_by_degree_top_10.values()]],
    columns=model_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,48.44,64.38,60.62
INSTRUCTOR,62.5,65.0,65.62
SBERT,84.38,75.0,68.75
GLOVE,87.5,78.75,60.0
DOC2VEC,23.44,31.87,37.19


In [72]:
def shared_major_accuracy(num_similar_subjects: int):
    embedding_accuracy = {}
    degree_to_abbrievation = {
      "Bachelor of Engineering (Honours)": "ENG",
      "Bachelor of Computing Science (Honours)": "COMP",
    }
    usyd_to_uts_degree = {
                          'Bachelor of Advanced Computing': 'Bachelor of Computing Science (Honours)',
                          'Bachelor of Engineering (Honours)': 'Bachelor of Engineering (Honours)',
                          }

    for embedding_type in ["MXBAI", "INSTRUCTOR", "SBERT", "GLOVE", "DOC2VEC"]:
        usyd_subject_collection = client.collections.get(f"USYD_{embedding_type}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{embedding_type}_Subject")
        
        embedding_accuracy[embedding_type] = {
          "COMP": {
              "correct": 0,
              "total": 0
          },
          "ENG": {
              "correct": 0,
              "total": 0,
          }
        }
        
        for item in usyd_subject_collection.iterator(include_vector=True):
            usyd_majors = item.properties["majors"]
            degrees = item.properties["degrees"]
            subject_vector = item.vector
                    
            for degree in degrees:
              if degree == "Bachelor of Nursing (Advanced Studies)":
                continue
              
              equivalent_uts_degree = usyd_to_uts_degree[degree]
              
              similar_subjects = get_similar_uts_subjects(uts_subject_collection, subject_vector, num_subjects=num_similar_subjects, degree=equivalent_uts_degree)
            
              for uts_subject in similar_subjects:
                  uts_majors = uts_subject["majors"]
                  equivalent_usyd_majors = [uts_to_usyd_major[major] for major in uts_majors]
                  
                  if len(usyd_majors) == 0:
                      if len(uts_majors) == 0:
                          embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["correct"] += 1    
                      
                      embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["total"] += 1
                      continue
                  
                  num_shared_majors = len(set(usyd_majors).intersection(equivalent_usyd_majors))
                  embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["correct"] += 1 if num_shared_majors >= 1 else 0
                  embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["total"] += 1
    
    # add accuracy to each embedding degree
    for embedding_type in embedding_accuracy:
        for degree in embedding_accuracy[embedding_type]:
            correct = embedding_accuracy[embedding_type][degree]["correct"]
            total = embedding_accuracy[embedding_type][degree]["total"]
            
            embedding_accuracy[embedding_type][degree]["accuracy"] = round((correct / total) * 100, 2)
            del embedding_accuracy[embedding_type][degree]["correct"]
            del embedding_accuracy[embedding_type][degree]["total"]
    
    return embedding_accuracy

In [73]:
shared_major_accuracy_top_2 = shared_major_accuracy(2)
shared_major_accuracy_top_5 = shared_major_accuracy(5)
shared_major_accuracy_top_10 = shared_major_accuracy(10)

In [74]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["COMP"]["accuracy"] for model in shared_major_accuracy_top_2.values()],
     [model["COMP"]["accuracy"] for model in shared_major_accuracy_top_5.values()],
     [model["COMP"]["accuracy"] for model in shared_major_accuracy_top_10.values()]],
    columns=model_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,34.86,32.29,29.17
INSTRUCTOR,36.7,34.5,31.28
SBERT,29.36,30.09,30.18
GLOVE,22.48,20.55,20.73
DOC2VEC,6.42,6.42,10.83


In [75]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["ENG"]["accuracy"] for model in shared_major_accuracy_top_2.values()],
     [model["ENG"]["accuracy"] for model in shared_major_accuracy_top_5.values()],
     [model["ENG"]["accuracy"] for model in shared_major_accuracy_top_10.values()]],
    columns=model_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,34.7,34.87,32.14
INSTRUCTOR,43.09,38.62,34.57
SBERT,31.74,29.93,28.12
GLOVE,23.19,20.26,17.86
DOC2VEC,10.03,11.91,12.07
