In [116]:
import os
import json

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px

In [117]:
# Lookup keyed by subject code; each value is the collection of degrees for that subject.
with open("./data/degree_subject_codes/subject_to_degrees.json", "r") as f:
    subject_to_degrees = json.loads(f.read())

In [118]:
# Lookup keyed by subject code; each value is the collection of majors for that subject.
with open("./data/major_rels/subject_to_majors.json", "r") as f:
    subject_to_majors = json.loads(f.read())

In [119]:
# Lookup keyed by subject code; values are stored later under "median_years".
with open("./data/degree_subject_codes/subject_to_median_year.json", "r") as f:
    subject_to_median_year = json.loads(f.read())

In [120]:
# Accumulator keyed by embedding type; filled by the loading loop in the next cell.
subjects = dict()

In [121]:
# Load every embedding under ./data/embeddings/<uni>/<embedding_type>/<subject_code>.json
# into `subjects`, one bucket of parallel lists per embedding type.

# Start from an empty accumulator so re-running this cell does not append
# every subject a second time.
subjects.clear()

for uni in ["uts", "usyd"]:
  uni_dir = f"./data/embeddings/{uni}"

  # os.listdir returns entries in arbitrary order; sorting keeps the row order
  # of every downstream array reproducible across runs and machines.
  for embedding_type in sorted(os.listdir(uni_dir)):
    type_dir = f"{uni_dir}/{embedding_type}"

    # Skip stray files (e.g. .DS_Store) that are not embedding-type folders.
    if not os.path.isdir(type_dir):
      continue

    if embedding_type not in subjects:
      subjects[embedding_type] = {
        "subject_codes": [],
        "degrees": [],
        "majors": [],
        "median_years": [],
        "embeddings": []
      }
    bucket = subjects[embedding_type]

    for embedding_file in sorted(os.listdir(type_dir)):
      if not embedding_file.endswith(".json"):
        continue

      # Strip only the trailing extension; str.replace would also remove any
      # ".json" occurring earlier in the file name.
      subject_code = os.path.splitext(embedding_file)[0]

      with open(f"{type_dir}/{embedding_file}", "r") as f:
        embedding = json.load(f)

      # Resolve all lookups before appending, so a missing subject code raises
      # KeyError without leaving the parallel lists at different lengths.
      subject_degrees = subject_to_degrees[subject_code]
      subject_majors = subject_to_majors[subject_code]
      subject_median_year = subject_to_median_year[subject_code]

      bucket["subject_codes"].append(subject_code)
      bucket["degrees"].append(subject_degrees)
      bucket["majors"].append(subject_majors)
      bucket["median_years"].append(subject_median_year)
      bucket["embeddings"].append(embedding)

In [122]:
# Degree names grouped by the short label they collapse to.
_degrees_by_abbreviation = {
  "ENGI": ["Bachelor of Engineering (Honours)"],
  "COMP": [
    "Bachelor of Computing Science (Honours)",
    "Bachelor of Advanced Computing",
  ],
  "NURS": [
    "Bachelor of Nursing (Advanced Studies)",
    "Bachelor of Nursing",
  ],
}

# Flattened lookup: full degree name -> short label.
degree_abbreviations = {
  degree_name: abbreviation
  for abbreviation, degree_names in _degrees_by_abbreviation.items()
  for degree_name in degree_names
}

In [123]:
def get_degree_abbr(degrees, abbreviations=None):
  """Return a short label for the degrees a subject belongs to.

  Args:
    degrees: sequence of full degree names.
    abbreviations: optional mapping of full degree name -> short label.
      Defaults to the notebook-level ``degree_abbreviations``.

  Returns:
    "None" when ``degrees`` is empty. Otherwise the distinct short labels
    joined with " - ", in the order the labels first appear in
    ``abbreviations`` (so Engineering + Computing is still "ENGI - COMP").

  Raises:
    KeyError: if a degree name is missing from ``abbreviations``.
  """
  if abbreviations is None:
    abbreviations = degree_abbreviations

  if len(degrees) == 0:
    return "None"

  # Derive the label from the actual degrees rather than assuming any pair is
  # Engineering + Computing; two degrees with the same label collapse to one.
  present = {abbreviations[degree] for degree in degrees}

  # dict.fromkeys de-duplicates the labels while keeping first-seen order.
  ordered = [abbr for abbr in dict.fromkeys(abbreviations.values()) if abbr in present]

  return " - ".join(ordered)

In [124]:
# Maps each UTS major name to a USYD major name, or to None.
# NOTE(review): None presumably means "no USYD counterpart" - confirm.
# Key order matters: this dict is inverted into usyd_to_uts_major, and where
# several keys share a value the key listed last wins.
uts_to_usyd_major = {
  "Biomedical Engineering": "Biomedical",
  "Civil Engineering": "Civil",
  "Electrical Engineering": "Electrical",
  "Electronic Engineering": "Electrical",
  "Mechanical Engineering": "Mechanical",
  "Mechatronic Engineering": "Mechatronic",
  "Software Engineering": "Software",

  "Civil and Environmental Engineering": "Civil",
  "Mechanical and Mechatronic Engineering": "Mechatronic",
  "Electrical and Electronic Engineering": "Electrical",

  "Renewable Energy Engineering": None,
  "Chemical Process Engineering": None,
  "Data Science Engineering": None,

  "Cybersecurity and Privacy": "Cybersecurity",
  "Enterprise Software Development": "Software Development",
  "Networking and Cybersecurity": "Cybersecurity",

  "Quantum Information Science": None,
  "Interaction Design": None,
  "Mathematical Analysis": None,
  "Business Information Systems Management": None,

  "Artificial Intelligence and Data Analytics": "Computational Data Science",
}

In [125]:
# Reverse lookup: USYD major name -> UTS major name.
# Entries whose USYD side is None are skipped; otherwise the reverse map would
# gain a None key pointing at whichever UTS major happened to be listed last.
# Where several UTS majors share one USYD name, the one listed last wins.
usyd_to_uts_major = {
  usyd_major: uts_major
  for uts_major, usyd_major in uts_to_usyd_major.items()
  if usyd_major is not None
}

In [126]:
def get_major_label(majors, major_lookup=None):
  """Return a short, upper-case label for the majors a subject belongs to.

  Each major is first translated through ``major_lookup`` when it appears
  there as a key, then truncated to its first four characters and upper-cased.

  Args:
    majors: sequence of major names.
    major_lookup: optional mapping used to translate major names before
      abbreviating. Defaults to the notebook-level ``usyd_to_uts_major``.

  Returns:
    "CORE/NONE" when ``majors`` is empty. Otherwise the distinct four-letter
    labels, sorted and joined with " - ".
  """
  if major_lookup is None:
    major_lookup = usyd_to_uts_major

  if len(majors) == 0:
    return "CORE/NONE"

  # Upper-case before de-duplicating and sorting, so names that differ only in
  # letter case collapse to a single label instead of e.g. "SOFT - SOFT".
  labels = {
    str(major_lookup.get(major, major))[:4].upper()
    for major in majors
  }

  return " - ".join(sorted(labels))

In [127]:
# Pull the SBERT bucket once, then derive the parallel plotting inputs from it.
sbert_subjects = subjects["sbert"]

sbert_embeddings = np.array(list(sbert_subjects["embeddings"]))
degrees = list(map(get_degree_abbr, sbert_subjects["degrees"]))
majors = list(map(get_major_label, sbert_subjects["majors"]))

# All-digit subject codes are labelled UTS; every other code is labelled USYD.
unis = [
  "USYD" if not code.isdigit() else "UTS"
  for code in sbert_subjects["subject_codes"]
]

In [128]:
# Project the SBERT embeddings to two dimensions; the seed is fixed via random_state.
tsne = TSNE(random_state=42, n_components=2)
X_tsne = tsne.fit(sbert_embeddings).embedding_

# Last expression of the cell: shows the KL divergence of the fitted model.
tsne.kl_divergence_

0.6710178852081299

In [129]:
# One row per subject: the two t-SNE coordinates followed by the label columns.
sbert_subject_df = pd.DataFrame(X_tsne, columns=["x", "y"]).assign(
  degree=degrees,
  major=majors,
  subject_code=subjects["sbert"]["subject_codes"],
  uni=unis,
)

In [130]:
# t-SNE scatter: colour encodes the degree label, marker symbol encodes the university.
fig = px.scatter(
    sbert_subject_df,
    x="x",
    y="y",
    color="degree",
    symbol="uni",
    hover_name="subject_code",
    hover_data=["subject_code", "uni"],
)
fig.update_layout(
    # Title reworded: the previous text read "... subjects using by Degree".
    title="t-SNE visualization of SBERT embeddings of UTS and USYD subjects by Degree",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()

In [131]:
# t-SNE scatter: colour encodes the major label, marker symbol encodes the university.
fig = px.scatter(
    sbert_subject_df,
    x="x",
    y="y",
    color="major",
    symbol="uni",
    # `labels` is keyed by column name; the previous key "color" matched no
    # column, so the "Major" display name was never applied.
    labels={"major": "Major"},
    hover_data=["subject_code", "uni"],
)
fig.update_layout(
    title="t-SNE visualization of SBERT embeddings of UTS and USYD subjects by Major",
    xaxis_title="First t-SNE",
    yaxis_title="Second t-SNE",
)
fig.show()