In [32]:
import os
import json
from typing import Literal

from dotenv import dotenv_values
import pandas as pd
from bs4 import BeautifulSoup
import plotly.express as px
import weaviate
import weaviate.classes as wvc
from weaviate.classes.query import MetadataQuery, Filter

In [33]:
config = dotenv_values("../.env")

In [34]:
client = weaviate.connect_to_custom(
    http_host="localhost",
    http_port=8080,
    http_secure=False,
    grpc_host="localhost",
    grpc_port=50051,
    grpc_secure=False,
    auth_credentials=weaviate.auth.AuthApiKey(
        config["AUTHENTICATION_APIKEY_ALLOWED_KEYS"]
    ),  # Set this environment variable
)


unclosed <socket.socket fd=75, family=AddressFamily.AF_INET6, type=SocketKind.SOCK_STREAM, proto=6, laddr=('::1', 61700, 0, 0), raddr=('::1', 8080, 0, 0)>



In [35]:
collections = list(client.collections.list_all().keys())
collections

['USYD_DOC2VEC_Subject',
 'USYD_GLOVE_Subject',
 'USYD_INSTRUCTOR_Subject',
 'USYD_MXBAI_Subject',
 'USYD_SBERT_Subject',
 'UTS_DOC2VEC_Subject',
 'UTS_GLOVE_Subject',
 'UTS_INSTRUCTOR_Subject',
 'UTS_MXBAI_Subject',
 'UTS_SBERT_Subject']

In [36]:
def get_similar_uts_subjects(
    uts_subject_collection,
    usyd_subject_vector: dict[str, list[float]],
    num_subjects: int = 5,
    degree: str | None = None
):
    if type(usyd_subject_vector) != list:
        usyd_subject_vector = usyd_subject_vector["default"]
    
    response = None
    
    if degree is not None:
        response = uts_subject_collection.query.near_vector(
            near_vector=usyd_subject_vector,
            limit=num_subjects,
            return_metadata=wvc.query.MetadataQuery(distance=True),
            filters=Filter.by_property("degrees").contains_any([degree])
        )
    else:
        response = uts_subject_collection.query.near_vector(
            near_vector=usyd_subject_vector,
            limit=num_subjects,
            return_metadata=wvc.query.MetadataQuery(distance=True)
        )
    
    return [o.properties for o in response.objects]

In [37]:
uts_to_usyd_major = {
  "Biomedical Engineering": "Biomedical",
  "Civil Engineering": "Civil",
  "Electrical Engineering": "Electrical",
  "Electronic Engineering": "Electrical",
  "Mechanical Engineering": "Mechanical",
  "Mechatronic Engineering": "Mechatronic",
  "Software Engineering": "Software",
  "Civil and Environmental Engineering": "Civil",
  "Mechanical and Mechatronic Engineering": "Mechatronic",
  "Electrical and Electronic Engineering": "Electrical",
  "Renewable Energy Engineering": None,
  "Chemical Process Engineering": None,
  "Data Science Engineering": None,
  "Cybersecurity and Privacy" : "Cybersecurity",
  "Enterprise Software Development" : "Software Development",
  "Networking and Cybersecurity" : "Cybersecurity",
  "Quantum Information Science" : None,
  "Interaction Design": None,
  "Mathematical Analysis": None,
  "Business Information Systems Management": None,
  "Artificial Intelligence and Data Analytics": "Computational Data Science",
}

In [38]:
comp_majors = ["Computational Data Science", "Cybersecurity", "Software Development", "Computer Science"]

In [39]:
with open("./data/major_rels/subject_to_majors.json", "r") as f:
    subject_to_majors = json.load(f)

In [40]:
with open("./data/degree_subject_codes/subject_to_degrees.json", "r") as f:
    subject_to_degrees = json.load(f)

In [41]:
num_majors = [len(majors) for majors in subject_to_majors.values()]

In [42]:
fig = px.histogram(
    x=num_majors,
    # nbins=10,
    title="Number of Majors per Subject (UTS and USYD)",
    labels={"x": "Number of Majors per Subject"},
)
fig.update_layout(
  xaxis=dict(
    tickmode='linear',
    tick0=0,
    dtick=1
  ),
  yaxis_title="Number of Subjects",
)
fig.show()

In [43]:
num_major_eng_subjects = []

In [44]:
for subject, degrees in subject_to_degrees.items():
    if "Bachelor of Engineering (Honours)" in degrees:
        num_major_eng_subjects.append(len(subject_to_majors[subject]))

In [45]:
fig = px.histogram(
    x=num_major_eng_subjects,
    # nbins=10,
    title="Number of Majors per Engineering Subject (UTS and USYD)",
    labels={"x": "Number of Majors per Engineering Subject"},
)
fig.update_layout(
  xaxis=dict(
    tickmode='linear',
    tick0=0,
    dtick=1
  ),
  # set range of y-axis to be between 0 and 300
  yaxis=dict(
    range=[0, 300]
  ),
  yaxis_title="Number of Subjects",
)
fig.show()

In [46]:
num_comp_eng_subjects = 0
num_subjects = 0

for degrees in subject_to_degrees.values():
    if "Bachelor of Engineering (Honours)" in degrees or "Bachelor of Computing Science (Honours)" in degrees or "Bachelor of Advanced Computing" in degrees:
        num_subjects += 1
  
    if "Bachelor of Engineering (Honours)" in degrees and ("Bachelor of Computing Science (Honours)" in degrees or "Bachelor of Advanced Computing" in degrees):
        num_comp_eng_subjects += 1

In [47]:
print(num_comp_eng_subjects, num_subjects, round((num_comp_eng_subjects / num_subjects) * 100, 2))

89 583 15.27


In [48]:
def shared_major_accuracy(num_similar_subjects: int, is_core_always_correct: bool):
    embedding_accuracy = {}
    degree_to_abbrievation = {
      "Bachelor of Engineering (Honours)": "ENG",
      "Bachelor of Computing Science (Honours)": "COMP",
    }
    usyd_to_uts_degree = {
                          'Bachelor of Advanced Computing': 'Bachelor of Computing Science (Honours)',
                          'Bachelor of Engineering (Honours)': 'Bachelor of Engineering (Honours)',
                          }

    for embedding_type in ["MXBAI", "INSTRUCTOR", "SBERT", "GLOVE", "DOC2VEC"]:
        usyd_subject_collection = client.collections.get(f"USYD_{embedding_type}_Subject")
        uts_subject_collection = client.collections.get(f"UTS_{embedding_type}_Subject")
        
        embedding_accuracy[embedding_type] = {
          "COMP": {
              "correct": 0,
              "total": 0
          },
          "ENG": {
              "correct": 0,
              "total": 0,
          }
        }
        
        for item in usyd_subject_collection.iterator(include_vector=True):
            usyd_majors = item.properties["majors"]
            degrees = item.properties["degrees"]
            subject_vector = item.vector
                    
            for degree in degrees:
              if degree == "Bachelor of Nursing (Advanced Studies)":
                continue
              
              equivalent_uts_degree = usyd_to_uts_degree[degree]
              
              similar_subjects = get_similar_uts_subjects(uts_subject_collection, subject_vector, num_subjects=num_similar_subjects, degree=equivalent_uts_degree)
            
              for uts_subject in similar_subjects:
                  uts_majors = uts_subject["majors"]
                  equivalent_usyd_majors = [uts_to_usyd_major[major] for major in uts_majors]
                  
                  if is_core_always_correct:
                    if len(usyd_majors) == 0 or len(uts_majors) == 0:
                      embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["correct"] += 1
                      embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["total"] += 1
                      continue
                  else:
                    if len(usyd_majors) == 0:
                      if len(uts_majors) == 0:
                          embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["correct"] += 1    
                      
                      embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["total"] += 1
                      continue
                  
                  num_shared_majors = len(set(usyd_majors).intersection(equivalent_usyd_majors))
                  embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["correct"] += 1 if num_shared_majors >= 1 else 0
                  embedding_accuracy[embedding_type][degree_to_abbrievation[equivalent_uts_degree]]["total"] += 1
    
    # add accuracy to each embedding degree
    for embedding_type in embedding_accuracy:
        for degree in embedding_accuracy[embedding_type]:
            correct = embedding_accuracy[embedding_type][degree]["correct"]
            total = embedding_accuracy[embedding_type][degree]["total"]
            
            embedding_accuracy[embedding_type][degree]["accuracy"] = round((correct / total) * 100, 2)
            del embedding_accuracy[embedding_type][degree]["correct"]
            del embedding_accuracy[embedding_type][degree]["total"]
    
    return embedding_accuracy

In [49]:
shared_major_accuracy_top_2 = shared_major_accuracy(2, True)
shared_major_accuracy_top_5 = shared_major_accuracy(5, True)
shared_major_accuracy_top_10 = shared_major_accuracy(10, True)

In [55]:
# creating a dataframe from the accuracy results
major_model_accuracy_comp = pd.DataFrame(
    [[model["COMP"]["accuracy"] for model in shared_major_accuracy_top_2.values()],
     [model["COMP"]["accuracy"] for model in shared_major_accuracy_top_5.values()],
     [model["COMP"]["accuracy"] for model in shared_major_accuracy_top_10.values()]],
    columns=shared_major_accuracy_top_2.keys(),
)
major_model_accuracy_comp = major_model_accuracy_comp.T
major_model_accuracy_comp.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy_comp

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,57.34,60.37,59.27
INSTRUCTOR,58.26,57.43,57.43
SBERT,58.26,58.72,57.16
GLOVE,54.13,52.48,52.11
DOC2VEC,35.32,33.39,40.28


In [56]:
# creating a dataframe from the accuracy results
major_model_accuracy_eng = pd.DataFrame(
    [[model["ENG"]["accuracy"] for model in shared_major_accuracy_top_2.values()],
     [model["ENG"]["accuracy"] for model in shared_major_accuracy_top_5.values()],
     [model["ENG"]["accuracy"] for model in shared_major_accuracy_top_10.values()]],
    columns=shared_major_accuracy_top_2.keys(),
)
major_model_accuracy_eng = major_model_accuracy_eng.T
major_model_accuracy_eng.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy_eng

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,75.99,73.82,75.2
INSTRUCTOR,78.45,77.04,75.76
SBERT,72.2,74.54,72.66
GLOVE,76.32,75.26,69.61
DOC2VEC,43.09,48.36,49.54


In [63]:
# make df with top 2 next to each other from major_model_accuracy_comp and major_model_accuracy_eng
major_model_accuracy = pd.concat([major_model_accuracy_comp, major_model_accuracy_eng], axis=1)
major_model_accuracy[["Top 2", "Top 5", "Top 10"]]

Unnamed: 0,Top 2,Top 2.1,Top 5,Top 5.1,Top 10,Top 10.1
MXBAI,57.34,75.99,60.37,73.82,59.27,75.2
INSTRUCTOR,58.26,78.45,57.43,77.04,57.43,75.76
SBERT,58.26,72.2,58.72,74.54,57.16,72.66
GLOVE,54.13,76.32,52.48,75.26,52.11,69.61
DOC2VEC,35.32,43.09,33.39,48.36,40.28,49.54


In [29]:
shared_major_accuracy_top_2 = shared_major_accuracy(2, False)
shared_major_accuracy_top_5 = shared_major_accuracy(5, False)
shared_major_accuracy_top_10 = shared_major_accuracy(10, False)

In [30]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["COMP"]["accuracy"] for model in shared_major_accuracy_top_2.values()],
     [model["COMP"]["accuracy"] for model in shared_major_accuracy_top_5.values()],
     [model["COMP"]["accuracy"] for model in shared_major_accuracy_top_10.values()]],
    columns=shared_major_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,34.86,32.29,29.17
INSTRUCTOR,36.7,34.5,31.28
SBERT,29.36,30.09,30.18
GLOVE,22.48,20.55,20.73
DOC2VEC,6.42,6.42,10.83


In [31]:
# creating a dataframe from the accuracy results
major_model_accuracy = pd.DataFrame(
    [[model["COMP"]["accuracy"] for model in shared_major_accuracy_top_2.values()],
     [model["COMP"]["accuracy"] for model in shared_major_accuracy_top_5.values()],
     [model["COMP"]["accuracy"] for model in shared_major_accuracy_top_10.values()]],
    columns=shared_major_accuracy_top_2.keys(),
)
major_model_accuracy = major_model_accuracy.T
major_model_accuracy.columns = ["Top 2", "Top 5", "Top 10"]
major_model_accuracy

Unnamed: 0,Top 2,Top 5,Top 10
MXBAI,34.86,32.29,29.17
INSTRUCTOR,36.7,34.5,31.28
SBERT,29.36,30.09,30.18
GLOVE,22.48,20.55,20.73
DOC2VEC,6.42,6.42,10.83
