In [10]:
import os
import json

import pandas as pd
from tqdm.notebook import tqdm
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, recall_score, precision_score

In [11]:
# Subject code -> degree names the subject is listed under.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale default encoding.
with open("./data/degree_subject_codes/subject_to_degrees.json", encoding="utf-8") as f:
    subject_to_degrees = json.load(f)

In [12]:
# Subject code -> minimum year value; entries may be null (None), which the
# pair-building function checks for when it reads this mapping.
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale default encoding.
with open("./data/degree_subject_codes/subject_to_min_year.json", encoding="utf-8") as f:
    subject_to_min_year = json.load(f)

In [13]:
# Subject code -> majors associated with the subject (subjects without an
# entry are treated as having no majors where this mapping is read).
# JSON is UTF-8 by specification, so decode it explicitly instead of relying
# on the platform's locale default encoding.
with open("./data/major_rels/subject_to_majors.json", encoding="utf-8") as f:
    subject_to_majors = json.load(f)

In [14]:
# Translation table from UTS major names (keys) to the USYD major names
# (values) they are matched against in is_share_major.
# Several UTS majors collapse onto the same USYD major (e.g. the Electrical /
# Electronic variants all map to "Electrical").
# NOTE(review): a value of None presumably means the UTS major has no USYD
# counterpart — confirm with whoever curated this table.
uts_to_usyd_major = {
  "Biomedical Engineering": "Biomedical",
  "Civil Engineering": "Civil",
  "Electrical Engineering": "Electrical",
  "Electronic Engineering": "Electrical",
  "Mechanical Engineering": "Mechanical",
  "Mechatronic Engineering": "Mechatronic",
  "Software Engineering": "Software",
  "Civil and Environmental Engineering": "Civil",
  "Mechanical and Mechatronic Engineering": "Mechatronic",
  "Electrical and Electronic Engineering": "Electrical",
  "Renewable Energy Engineering": None,
  "Chemical Process Engineering": None,
  "Data Science Engineering": None,
  "Cybersecurity and Privacy" : "Cybersecurity",
  "Enterprise Software Development" : "Software Development",
  "Networking and Cybersecurity" : "Cybersecurity",
  "Quantum Information Science" : None,
  "Interaction Design": None,
  "Mathematical Analysis": None,
  "Business Information Systems Management": None,
  "Artificial Intelligence and Data Analytics": "Computational Data Science",
}

In [15]:
# Per-discipline buckets of subject codes, one list per university.
uts_comp_sci_codes, usyd_comp_sci_codes = [], []
uts_eng_codes, usyd_eng_codes = [], []
uts_nursing_codes, usyd_nursing_codes = [], []

# Each subject lands in at most one bucket: the first matching degree wins,
# checked in the order computing -> engineering -> nursing.
for subject, degrees in subject_to_degrees.items():
    if "Bachelor of Computing Science (Honours)" in degrees:
        uts_comp_sci_codes.append(subject)
    elif "Bachelor of Advanced Computing" in degrees:
        usyd_comp_sci_codes.append(subject)
    elif "Bachelor of Engineering (Honours)" in degrees:
        # The engineering degree name is the same for both universities, so
        # all-digit codes go to the UTS list and everything else to USYD.
        (uts_eng_codes if subject.isdigit() else usyd_eng_codes).append(subject)
    elif "Bachelor of Nursing" in degrees:
        uts_nursing_codes.append(subject)
    elif "Bachelor of Nursing (Advanced Studies)" in degrees:
        usyd_nursing_codes.append(subject)

In [16]:
def is_share_major(uts_majors, usyd_majors):
    """Tell whether a UTS subject and a USYD subject are compatible by major.

    Args:
        uts_majors: Major names attached to the UTS subject.
        usyd_majors: Major names attached to the USYD subject.

    Returns:
        True when either collection is empty (an empty side is treated as
        compatible with anything), or when at least one UTS major translates,
        through ``uts_to_usyd_major``, to a major present in ``usyd_majors``.
        False otherwise.
    """
    if len(uts_majors) == 0 or len(usyd_majors) == 0:
        return True

    for uts_major in uts_majors:
        # .get() so that a UTS major missing from the translation table counts
        # as "no USYD counterpart" instead of aborting the run with KeyError.
        usyd_equivalent = uts_to_usyd_major.get(uts_major)
        # Majors explicitly mapped to None have no counterpart either; skip
        # them rather than testing whether None is in usyd_majors.
        if usyd_equivalent is not None and usyd_equivalent in usyd_majors:
            return True

    return False

In [17]:
def get_non_rpl_subject_pairs(uts_degree_subject_codes, usyd_degree_subject_codes, is_major_criterion=True):
  """Label every (UTS subject, USYD subject) combination with an ``is_not_rpl`` flag.

  A pair is flagged True when the UTS subject's minimum year is less than or
  equal to the USYD subject's minimum year and, if ``is_major_criterion`` is
  set, ``is_share_major`` also accepts the two subjects' majors.

  Args:
    uts_degree_subject_codes: Iterable of UTS subject codes.
    usyd_degree_subject_codes: Iterable of USYD subject codes; it is iterated
      once per UTS code, so it must be re-iterable (e.g. a list).
    is_major_criterion: When False, only the minimum-year comparison is used.

  Returns:
    pandas.DataFrame with columns ``uts_subject``, ``usyd_subject`` and
    ``is_not_rpl`` (bool), one row per combination, UTS-major order.

  NOTE(review): the original local variable for the flag was called
  ``is_equivalent`` while the output column is ``is_not_rpl``; the two names
  suggest opposite meanings — confirm which one the rule is meant to express.
  """
  # Fallback years used when a subject has no entry or a null entry. With the
  # `<=` comparison below, 10 on the UTS side / -1 on the USYD side make the
  # year test fail for any realistic year value, so such pairs are labelled False.
  uts_unknown_year = 10
  usyd_unknown_year = -1

  precedence_table = []

  for uts_subject_code in tqdm(uts_degree_subject_codes):
    uts_min_year = subject_to_min_year.get(uts_subject_code)
    if uts_min_year is None:
      uts_min_year = uts_unknown_year

    uts_majors = subject_to_majors.get(uts_subject_code, [])

    for usyd_subject_code in usyd_degree_subject_codes:
      usyd_min_year = subject_to_min_year.get(usyd_subject_code)
      # Mirror the UTS-side handling: a null year in the JSON would otherwise
      # reach the comparison as None and raise TypeError.
      if usyd_min_year is None:
        usyd_min_year = usyd_unknown_year

      usyd_majors = subject_to_majors.get(usyd_subject_code, [])

      is_not_rpl = uts_min_year <= usyd_min_year
      # Only consult the major rule when the year rule already holds.
      if is_not_rpl and is_major_criterion:
        is_not_rpl = is_share_major(uts_majors, usyd_majors)

      precedence_table.append([uts_subject_code, usyd_subject_code, is_not_rpl])

  non_rpl_subject_pairs_df = pd.DataFrame(precedence_table, columns=["uts_subject", "usyd_subject", "is_not_rpl"])
  return non_rpl_subject_pairs_df

In [18]:
# Build one labelled pair table per discipline. Computing and engineering use
# the default major criterion; nursing is labelled on minimum year alone.
comp_non_rpl_subject_pairs = get_non_rpl_subject_pairs(
    uts_degree_subject_codes=uts_comp_sci_codes,
    usyd_degree_subject_codes=usyd_comp_sci_codes,
)
eng_non_rpl_subject_pairs = get_non_rpl_subject_pairs(
    uts_degree_subject_codes=uts_eng_codes,
    usyd_degree_subject_codes=usyd_eng_codes,
)
nursing_non_rpl_subject_pairs = get_non_rpl_subject_pairs(
    uts_degree_subject_codes=uts_nursing_codes,
    usyd_degree_subject_codes=usyd_nursing_codes,
    is_major_criterion=False,
)

  0%|          | 0/104 [00:00<?, ?it/s]

  0%|          | 0/139 [00:00<?, ?it/s]

  0%|          | 0/50 [00:00<?, ?it/s]

In [19]:
# Preview the first rows of the computing pair table as a sanity check.
comp_non_rpl_subject_pairs.head()

Unnamed: 0,uts_subject,usyd_subject,is_not_rpl
0,33130,DATA1001,True
1,33130,DATA1901,True
2,33130,ELEC1601,True
3,33130,INFO1110,True
4,33130,INFO1111,True


In [20]:
# Report, per discipline, how many candidate pairs are labelled not-RPL
# (count/total and the resulting fraction). One loop replaces three
# copy-pasted print statements; the printed text is unchanged.
for label, pairs in (
    ("comp", comp_non_rpl_subject_pairs),
    ("eng", eng_non_rpl_subject_pairs),
    ("nursing", nursing_non_rpl_subject_pairs),
):
    n_not_rpl = len(pairs[pairs["is_not_rpl"]])
    n_pairs = len(pairs)
    # An empty pair table reports nan instead of raising ZeroDivisionError.
    fraction = n_not_rpl / n_pairs if n_pairs else float("nan")
    print(f"Number not-rpl {label} pairs:", f"{n_not_rpl}/{n_pairs}", "=", fraction)

Number not-rpl comp pairs: 3611/11336 = 0.3185426958362738
Number not-rpl eng pairs: 14517/32109 = 0.4521162290946464
Number not-rpl nursing pairs: 880/1200 = 0.7333333333333333


In [21]:
# Subject code -> embedding, loaded from one JSON file per subject for both
# universities. The file name without its extension is used as the key.
subject_to_embedding = {}

for uni in ["uts", "usyd"]:
  embedding_dir = f"./data/embeddings/{uni}/mxbai"

  for subject_filename in os.listdir(embedding_dir):
    # splitext strips only the trailing extension; str.replace would also
    # remove a ".json" occurring elsewhere in the file name.
    subject_code, extension = os.path.splitext(subject_filename)
    if extension != ".json":
      continue

    # JSON is UTF-8 by specification; do not depend on the locale default.
    with open(os.path.join(embedding_dir, subject_filename), encoding="utf-8") as f:
      subject_to_embedding[subject_code] = json.load(f)

In [22]:
# One feature vector per pair: the UTS subject's embedding followed by the
# USYD subject's embedding (assuming each embedding was loaded as a JSON list,
# `+` concatenates them). Rows are visited in table order: comp, eng, nursing.
subject_pair_embeddings = []

for df in (comp_non_rpl_subject_pairs, eng_non_rpl_subject_pairs, nursing_non_rpl_subject_pairs):
  for uts_code, usyd_code in zip(df["uts_subject"], df["usyd_subject"]):
    subject_pair_embeddings.append(
      subject_to_embedding[uts_code] + subject_to_embedding[usyd_code]
    )

In [23]:
# Labels for every pair, flattened in the order comp -> eng -> nursing so they
# line up row-for-row with the pair feature vectors.
is_not_rpl_vals = [
    flag
    for pairs in (
        comp_non_rpl_subject_pairs,
        eng_non_rpl_subject_pairs,
        nursing_non_rpl_subject_pairs,
    )
    for flag in pairs["is_not_rpl"].values.tolist()
]

In [24]:
# Sanity check: the number of feature vectors and labels should be equal.
print(len(subject_pair_embeddings), len(is_not_rpl_vals))

44645 44645


In [25]:
# Hold out 20% of the pairs for evaluation; random_state fixes the shuffle.
# NOTE(review): the split is over pairs, and each UTS subject is paired with
# every USYD subject of its discipline, so the same subject embeddings appear
# on both sides of the split. The test score may therefore overstate how well
# the model generalises to unseen subjects — confirm this is intended.
X_train, X_test, y_train, y_test = train_test_split(
    subject_pair_embeddings, is_not_rpl_vals, test_size=0.2, random_state=42
)

In [26]:
# Fit a random forest on the training pairs and score the held-out pairs.
# random_state pins the forest's internal randomness so that a
# Restart-and-Run-All reproduces the same model and metrics; without it every
# run yields slightly different numbers.
clf = RandomForestClassifier(n_estimators=100, random_state=42)
clf.fit(X_train, y_train)
preds = clf.predict(X_test)    # hard True/False predictions
probas = clf.predict_proba(X_test)  # per-class probabilities for the same rows

In [27]:
# Per-class precision/recall/F1 on the held-out pairs.
report = classification_report(y_test, preds)
print(report)

              precision    recall  f1-score   support

       False       0.97      0.98      0.98      5178
        True       0.98      0.95      0.97      3751

    accuracy                           0.97      8929
   macro avg       0.97      0.97      0.97      8929
weighted avg       0.97      0.97      0.97      8929



In [28]:
# Recall on the held-out pairs; the value matches the True (is_not_rpl) row
# of the classification report.
recall_score(y_test, preds)

0.9541455611836843

In [29]:
# Precision on the held-out pairs; the value matches the True (is_not_rpl) row
# of the classification report.
precision_score(y_test, preds)

0.9762684124386252