In [153]:
import os
import json
from typing import Literal

import pandas as pd
import numpy as np
from sklearn.manifold import TSNE
import plotly.express as px

In [154]:
# Subject code -> degrees the subject belongs to (indexed by subject code when
# the embeddings are loaded; each value is later passed to get_degree_abbr).
# JSON is read as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("./data/degree_subject_codes/subject_to_degrees.json", encoding="utf-8") as f:
    subject_to_degrees = json.load(f)

In [155]:
# Subject code -> majors the subject belongs to (indexed by subject code when
# the embeddings are loaded; each value is later passed to get_major_label).
# JSON is read as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("./data/major_rels/subject_to_majors.json", encoding="utf-8") as f:
    subject_to_majors = json.load(f)

In [156]:
# Subject code -> median year value (indexed by subject code when the
# embeddings are loaded; stringified later for use as a colour category).
# JSON is read as UTF-8 explicitly so decoding does not depend on the
# platform's default locale encoding.
with open("./data/degree_subject_codes/subject_to_median_year.json", encoding="utf-8") as f:
    subject_to_median_year = json.load(f)

In [157]:
# Registry keyed by embedding type; it starts empty and is filled by the
# embedding-loading loop.
subjects = dict()

In [158]:
# Rebuild the registry from scratch so that re-running this cell never appends
# a second copy of every subject to the existing lists.
subjects = {}

for uni in ["uts", "usyd"]:
  uni_dir = os.path.join("./data/embeddings", uni)

  # os.listdir returns entries in arbitrary order. The order in which subjects
  # are appended becomes the row order handed to t-SNE, so it is pinned with
  # sorted() to keep the plots reproducible across machines.
  for embedding_type in sorted(os.listdir(uni_dir)):
    type_dir = os.path.join(uni_dir, embedding_type)
    if not os.path.isdir(type_dir):
      # A stray file next to the embedding folders would make the inner
      # listdir raise; skip anything that is not a directory.
      continue

    if embedding_type not in subjects:
      subjects[embedding_type] = {
        "subject_codes": [],
        "degrees": [],
        "majors": [],
        "median_years": [],
        "embeddings": []
      }
    bucket = subjects[embedding_type]

    for embedding_file in sorted(os.listdir(type_dir)):
      # The file stem is the subject code; only the trailing extension is
      # stripped, so a ".json" occurring inside the stem is left alone.
      subject_code, extension = os.path.splitext(embedding_file)
      if extension != ".json":
        continue

      with open(os.path.join(type_dir, embedding_file), "r", encoding="utf-8") as f:
        embedding = json.load(f)

      # The five lists are parallel: index i in each describes the same subject.
      # A subject code missing from any metadata mapping raises KeyError here.
      bucket["subject_codes"].append(subject_code)
      bucket["degrees"].append(subject_to_degrees[subject_code])
      bucket["majors"].append(subject_to_majors[subject_code])
      bucket["median_years"].append(subject_to_median_year[subject_code])
      bucket["embeddings"].append(embedding)

In [159]:
# Full degree name -> short code used as the plot legend label.
# Declared per code because several degree names share the same code.
degree_abbreviations = {
  degree_name: code
  for code, degree_names in (
    ("ENGI", ("Bachelor of Engineering (Honours)",)),
    ("COMP", (
      "Bachelor of Computing Science (Honours)",
      "Bachelor of Advanced Computing",
    )),
    ("NURS", (
      "Bachelor of Nursing (Advanced Studies)",
      "Bachelor of Nursing",
    )),
  )
  for degree_name in degree_names
}

In [160]:
def get_degree_abbr(degrees):
  """Return the legend label for the degrees a subject belongs to.

  The label is derived from the degrees actually passed in, rather than
  assuming that every two-degree subject is an engineering/computing one.

  Args:
    degrees: sequence of full degree names (keys of ``degree_abbreviations``).

  Returns:
    ``"None"`` for an empty sequence; otherwise the distinct degree codes
    joined with ``" - "``, e.g. ``"ENGI"`` or ``"ENGI - COMP"``.

  Raises:
    KeyError: if a degree name is not a key of ``degree_abbreviations``.
  """
  if len(degrees) == 0:
    return "None"

  present = {degree_abbreviations[degree] for degree in degrees}

  # Emit codes in the order they are first declared in degree_abbreviations,
  # so the label does not depend on the order of the input list and an
  # engineering + computing subject still reads "ENGI - COMP".
  declared_order = dict.fromkeys(degree_abbreviations.values())
  return " - ".join(code for code in declared_order if code in present)

In [161]:
# UTS major name -> USYD major name. A value of None marks a UTS major for
# which no USYD counterpart is recorded here.
#
# Entry order matters: the reverse mapping derived from this dict keeps, for
# each USYD name, the UTS name that appears last below.
uts_to_usyd_major = {
  # Majors with "Engineering" in the name.
  "Biomedical Engineering": "Biomedical",
  "Civil Engineering": "Civil",
  "Electrical Engineering": "Electrical",
  "Electronic Engineering": "Electrical",
  "Mechanical Engineering": "Mechanical",
  "Mechatronic Engineering": "Mechatronic",
  "Software Engineering": "Software",
  "Civil and Environmental Engineering": "Civil",
  "Mechanical and Mechatronic Engineering": "Mechatronic",
  "Electrical and Electronic Engineering": "Electrical",
  "Renewable Energy Engineering": None,
  "Chemical Process Engineering": None,
  "Data Science Engineering": None,

  # Remaining majors.
  "Cybersecurity and Privacy": "Cybersecurity",
  "Enterprise Software Development": "Software Development",
  "Networking and Cybersecurity": "Cybersecurity",
  "Quantum Information Science": None,
  "Interaction Design": None,
  "Mathematical Analysis": None,
  "Business Information Systems Management": None,
  "Artificial Intelligence and Data Analytics": "Computational Data Science",
}

In [162]:
# Reverse lookup: USYD major name -> UTS major name.
# - Entries whose USYD side is None are skipped; inverting them would create a
#   None key pointing at whichever unmapped UTS major happens to come last.
# - Several UTS majors share one USYD name; the UTS name declared last in
#   uts_to_usyd_major wins, as in a plain dict inversion.
usyd_to_uts_major = {
  usyd_major: uts_major
  for uts_major, usyd_major in uts_to_usyd_major.items()
  if usyd_major is not None
}

In [163]:
def get_major_label(majors):
  """Return a short, upper-case legend label for a subject's majors.

  Each major name found in ``usyd_to_uts_major`` is first replaced by the UTS
  name it maps to. Every name is then cut to its first four characters and
  upper-cased. Distinct abbreviations are sorted and joined with ``" - "``.

  Upper-casing happens before de-duplication and sorting, so abbreviations
  that differ only in letter case collapse into one and the ordering is
  case-insensitive.

  Args:
    majors: sequence of major names; ``None`` entries are ignored.

  Returns:
    The joined label (e.g. ``"CIVI"`` or ``"CIVI - SOFT"``), or
    ``"CORE/NONE"`` when there is no major to label.
  """
  abbreviations = set()

  for major in majors:
    if major is None:
      # A null entry carries no name to abbreviate.
      continue

    canonical_name = usyd_to_uts_major.get(major, major)
    abbreviations.add(canonical_name[:4].upper())

  if not abbreviations:
    return "CORE/NONE"

  return " - ".join(sorted(abbreviations))

In [164]:
# Names accepted as the embedding_type argument of the helpers that use this
# alias; each is used as a key into the `subjects` registry.
EmbeddingType = Literal[
  "mxbai",
  "instructor",
  "sbert",
  "glove",
  "doc2vec",
]

In [165]:
def get_subject_features(embedding_type: EmbeddingType):
  """Collect the plotting inputs for one embedding type.

  Args:
    embedding_type: key into the module-level ``subjects`` registry.

  Returns:
    A 5-tuple of parallel sequences, one entry per subject:
    the embeddings as a NumPy array, degree labels, major labels,
    university labels and the median years converted to strings.
  """
  bucket = subjects[embedding_type]

  embedding_matrix = np.array(list(bucket["embeddings"]))
  degree_labels = list(map(get_degree_abbr, bucket["degrees"]))
  major_labels = list(map(get_major_label, bucket["majors"]))

  # A subject code made up only of digits is labelled UTS; anything else USYD.
  uni_labels = []
  for code in bucket["subject_codes"]:
    uni_labels.append("UTS" if code.isdigit() else "USYD")

  year_labels = list(map(str, bucket["median_years"]))

  return embedding_matrix, degree_labels, major_labels, uni_labels, year_labels

In [166]:
def show_degree_plot(embedding_type: EmbeddingType):
  """Show a 2-D t-SNE scatter of one embedding type, coloured by degree.

  Each point is a subject. Colour encodes the degree label and the marker
  symbol encodes the university; hovering shows the subject code and
  university.

  Args:
    embedding_type: key into the module-level ``subjects`` registry.
  """
  embeddings, degrees, majors, unis, _ = get_subject_features(embedding_type)

  # Fixed seed so the same input gives the same layout on every run.
  tsne = TSNE(n_components=2, random_state=42)
  X_tsne = tsne.fit_transform(embeddings)

  subject_df = pd.DataFrame({
    "x": X_tsne[:, 0],
    "y": X_tsne[:, 1],
    "degree": degrees,
    "major": majors,
    "subject_code": subjects[embedding_type]["subject_codes"],
    "uni": unis
  })

  fig = px.scatter(subject_df, x="x", y="y", color="degree", symbol="uni", hover_name="subject_code", hover_data=["subject_code", "uni"])
  fig.update_layout(
      title=f"t-SNE of {embedding_type.upper()} embeddings of UTS and USYD subjects by Degree",
      xaxis_title="First t-SNE",
      yaxis_title="Second t-SNE",
  )
  fig.show()

In [167]:
def show_major_plot(embedding_type: Literal["mxbai", "instructor", "sbert", "glove", "doc2vec"]):
  """Show a 2-D t-SNE scatter of one embedding type, coloured by major.

  Each point is a subject. Colour encodes the major label and the marker
  symbol encodes the university; hovering shows the subject code and
  university.

  Args:
    embedding_type: key into the module-level ``subjects`` registry.
  """
  embeddings, degrees, majors, unis, _ = get_subject_features(embedding_type)

  # Fixed seed so the same input gives the same layout on every run.
  tsne = TSNE(n_components=2, random_state=42)
  X_tsne = tsne.fit_transform(embeddings)

  subject_df = pd.DataFrame({
    "x": X_tsne[:, 0],
    "y": X_tsne[:, 1],
    "degree": degrees,
    "major": majors,
    "subject_code": subjects[embedding_type]["subject_codes"],
    "uni": unis
  })

  # `labels` is keyed by column name, so the display name is attached to the
  # "major" column rather than to the name of the `color` argument.
  fig = px.scatter(subject_df, x="x", y="y", color="major", labels={"major": "Major"}, hover_data=["subject_code", "uni"], symbol="uni")
  fig.update_layout(
      title=f"t-SNE visualization of {embedding_type.upper()} embeddings of UTS and USYD subjects by Major",
      xaxis_title="First t-SNE",
      yaxis_title="Second t-SNE",
  )
  fig.show()

In [168]:
def show_year_plot(embedding_type: Literal["mxbai", "instructor", "sbert", "glove", "doc2vec"]):
  """Show a 2-D t-SNE scatter of one embedding type, coloured by median year.

  Each point is a subject. Colour encodes the median year (already converted
  to a string by get_subject_features) and the marker symbol encodes the
  university; hovering shows the subject code and university.

  Args:
    embedding_type: key into the module-level ``subjects`` registry.
  """
  embeddings, degrees, majors, unis, years = get_subject_features(embedding_type)

  # Fixed seed so the same input gives the same layout on every run.
  tsne = TSNE(n_components=2, random_state=42)
  X_tsne = tsne.fit_transform(embeddings)

  subject_df = pd.DataFrame({
    "x": X_tsne[:, 0],
    "y": X_tsne[:, 1],
    "degree": degrees,
    "major": majors,
    "subject_code": subjects[embedding_type]["subject_codes"],
    "uni": unis,
    "year": years
  })

  # `labels` is keyed by column name, so the display name is attached to the
  # "year" column rather than to the name of the `color` argument.
  fig = px.scatter(subject_df, x="x", y="y", color="year", labels={"year": "Year"}, hover_data=["subject_code", "uni"], symbol="uni")
  fig.update_layout(
      title=f"t-SNE visualization of {embedding_type.upper()} embeddings of UTS and USYD subjects by Year",
      xaxis_title="First t-SNE",
      yaxis_title="Second t-SNE",
  )
  fig.show()

In [169]:
# Degree-coloured t-SNE scatter for the "mxbai" embeddings.
show_degree_plot("mxbai")

In [170]:
# Degree-coloured t-SNE scatter for the "doc2vec" embeddings.
show_degree_plot("doc2vec")

In [171]:
# Major-coloured t-SNE scatter for the "instructor" embeddings.
show_major_plot("instructor")

In [172]:
# Major-coloured t-SNE scatter for the "glove" embeddings.
show_major_plot("glove")

In [173]:
# Major-coloured t-SNE scatter for the "doc2vec" embeddings.
show_major_plot("doc2vec")

In [174]:
# Median-year-coloured t-SNE scatter for the "mxbai" embeddings.
show_year_plot("mxbai")

In [175]:
# Median-year-coloured t-SNE scatter for the "doc2vec" embeddings.
show_year_plot("doc2vec")