In [1]:
import os
import json

from bs4 import BeautifulSoup

In [2]:
# Display names for each university's majors, keyed by major id.
# USYD majors are keyed by their own name, so every entry maps to itself;
# the UTS entry starts empty and is filled in by a later cell.
# NOTE(review): recorded outputs further down this notebook spell the USYD
# major "Cybersecurity" (no space), which would not match the
# "Cyber Security" key here - confirm which spelling is intended.
usyd_major_names = (
    "Computational Data Science",
    "Computer Science",
    "Cyber Security",
    "Software Development",
)

uni_major_to_major_name = {
    "uts": {},
    "usyd": {name: name for name in usyd_major_names},
}

In [3]:
# Parsed major pages per university: {university: {major id: BeautifulSoup}}.
# Each university gets its own (initially empty) dict.
major_id_to_soup = {uni: {} for uni in ("uts", "usyd")}

In [4]:
# Parse every saved UTS major page and register it under the major id found
# at the start of the page's <h1> title ("<major id> <major name>").
uts_pages_dir = "./data/major_pages/uts"

# os.listdir returns entries in arbitrary order; sorting keeps the insertion
# order of the dicts (and of the major lists derived from them) reproducible.
for file in sorted(os.listdir(uts_pages_dir)):
    if not file.endswith(".html"):
        continue

    # Read raw bytes and let BeautifulSoup detect the page encoding instead of
    # decoding with the platform's default text encoding.
    with open(os.path.join(uts_pages_dir, file), "rb") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    heading = soup.find("h1")
    if heading is None:
        raise ValueError(f"No <h1> title found in {file}")

    # Split on the first run of whitespace only: the first token is the major
    # id and the remainder is the name. This leaves the name intact even if it
    # happens to contain the id text again, and ignores leading whitespace.
    title_parts = heading.text.strip().split(maxsplit=1)
    if not title_parts:
        raise ValueError(f"Empty <h1> title in {file}")

    major_id = title_parts[0]
    major_name = title_parts[1] if len(title_parts) > 1 else ""

    major_id_to_soup["uts"][major_id] = soup
    uni_major_to_major_name["uts"][major_id] = major_name

In [5]:
# Parse every saved USYD major page and register it under the major id taken
# from the filename (the text before the first hyphen).
usyd_pages_dir = "./data/major_pages/usyd"

# os.listdir returns entries in arbitrary order; sorting keeps the insertion
# order of the dict (and of the major lists derived from it) reproducible.
for file in sorted(os.listdir(usyd_pages_dir)):
    if not file.endswith(".html"):
        continue

    # Read raw bytes and let BeautifulSoup detect the page encoding instead of
    # decoding with the platform's default text encoding.
    with open(os.path.join(usyd_pages_dir, file), "rb") as f:
        soup = BeautifulSoup(f.read(), "html.parser")

    major_id = file.split("-")[0].strip()
    major_id_to_soup["usyd"][major_id] = soup

In [6]:
# Load the previously scraped degree -> subject-code listing.
degree_subjects_path = "./data/degree_subject_codes/uni_degree_subjects.json"

with open(degree_subjects_path, "r") as degree_subjects_file:
    uni_degree_subjects = json.loads(degree_subjects_file.read())

In [7]:
# Pull out the subject collections for the two degrees being compared.
# NOTE(review): presumably entry 0 is UTS and entry 1 is USYD, each with its
# degree lookup at index 1 - confirm against the JSON file.
uts_degrees = uni_degree_subjects[0][1]
uts_computer_science_subjects = uts_degrees["c09119"]

usyd_degrees = uni_degree_subjects[1][1]
usyd_computer_science_subjects = usyd_degrees["advanced_computing_table"]

In [8]:
# Map each UTS subject code to the majors whose page links mention it.

# Collect every major's link texts once up front; the page contents do not
# depend on the subject, so there is no need to re-scan the soup per subject.
uts_major_link_texts = {
    major_id: [link.text for link in major_soup.find_all("a")]
    for major_id, major_soup in major_id_to_soup["uts"].items()
}

uts_subject_to_majors = {}

for subject_code in uts_computer_science_subjects:
    # A major is listed at most once per subject, even when several links on
    # its page contain the subject code.
    uts_subject_to_majors[subject_code] = [
        major_id
        for major_id, link_texts in uts_major_link_texts.items()
        if any(subject_code in text for text in link_texts)
    ]

In [9]:
def is_usyd_subject_code(text: str) -> bool:
    """Return whether ``text`` has the shape of a USYD subject code.

    The check is purely structural: the text must be exactly eight characters
    long and its last four characters must be numeric. The first four
    characters are not inspected.

    Args:
        text: Candidate string, e.g. the text of a link on a major page.

    Returns:
        ``True`` if the text passes both checks, otherwise ``False`` (always a
        real ``bool``, never ``None``).
    """
    return len(text) == 8 and text[4:].isnumeric()

In [10]:
# Map each USYD subject code to the majors whose page lists it.

# Collect, once per major, the texts of links that sit inside <strong> tags
# and look like subject codes; the page contents do not depend on the subject,
# so there is no need to re-scan the soup for every subject.
usyd_major_subject_links = {}

for major_id, major_soup in major_id_to_soup["usyd"].items():
    link_texts = []

    for strong in major_soup.find_all("strong"):
        link = strong.find("a")

        if link and is_usyd_subject_code(link.text):
            link_texts.append(link.text)

    usyd_major_subject_links[major_id] = link_texts

usyd_subject_to_majors = {}

for subject_code in usyd_computer_science_subjects:
    # A major is listed at most once per subject, even when its page links
    # the same subject code more than once.
    usyd_subject_to_majors[subject_code] = [
        major_id
        for major_id, link_texts in usyd_major_subject_links.items()
        if any(subject_code in text for text in link_texts)
    ]

In [11]:
# Share of USYD subjects that belong to at least one major.
# The denominator is the number of keys in usyd_subject_to_majors (one per
# distinct subject code), the same basis the UTS calculation uses, so repeated
# codes in the source listing cannot understate the share.
# Note: the value printed is a fraction between 0 and 1, not multiplied by 100.
usyd_subjects_with_major = sum(
    1 for majors in usyd_subject_to_majors.values() if len(majors) >= 1
)

print(
    "Percentage of USYD subjects with 1 or more majors:",
    usyd_subjects_with_major / len(usyd_subject_to_majors),
)

Percentage of USYD subjects with 1 or more majors: 0.1522633744855967


In [12]:
# Share of UTS subjects that belong to at least one major.
# Note: the value printed is a fraction between 0 and 1, not multiplied by 100.
uts_subjects_with_major = sum(
    1 for majors in uts_subject_to_majors.values() if len(majors) >= 1
)
uts_major_share = uts_subjects_with_major / len(uts_subject_to_majors)

print("Percentage of UTS subjects with 1 or more majors:", uts_major_share)

Percentage of UTS subjects with 1 or more majors: 0.7857142857142857


## Verifying all USYD Computer Science Subjects have been Scraped

The low percentage for USYD is mainly because there are far more core-only subjects than major subjects.

Another reason is that the USYD dataset includes a small number of non-Computer Science subjects: subject codes were also scraped from requisites and anti-requisites, which are not necessarily Computer Science subjects.

In [13]:
# Count, for every USYD major, how many entries it has across all subjects'
# major lists.
major_frequency = {}

for majors in usyd_subject_to_majors.values():
    for major in majors:
        major_frequency[major] = major_frequency.get(major, 0) + 1

In [14]:
# Show the per-major counts as this cell's output.
major_frequency

{'Computational Data Science': 18,
 'Computer Science': 26,
 'Cybersecurity': 11,
 'Software Development': 12}

In [15]:
# For every USYD major page, count the subject-code links (inside <strong>
# tags) whose code is missing from usyd_computer_science_subjects. A count of
# zero for a major means every subject code linked on its page was found.
usyd_num_non_core_subjects = {}

for major_id, major_soup in major_id_to_soup["usyd"].items():
    anchors = (strong.find("a") for strong in major_soup.find_all("strong"))

    usyd_num_non_core_subjects[major_id] = sum(
        1
        for anchor in anchors
        if anchor
        and is_usyd_subject_code(anchor.text)
        and anchor.text not in usyd_computer_science_subjects
    )

usyd_num_non_core_subjects

{'Computational Data Science': 0,
 'Computer Science': 0,
 'Cybersecurity': 0,
 'Software Development': 0}

## Creating a Major Equivalents Dictionary

The mapping needs to be bidirectional, so I manually created one direction (UTS to USYD) and generated the other direction from it.

In [16]:
# Hand-written UTS -> USYD major equivalents, one row per UTS major id.
# None marks a UTS major with no USYD counterpart (it maps to an empty list).
# NOTE(review): recorded outputs elsewhere in this notebook spell the USYD
# major "Cybersecurity" (no space), which would not match "Cyber Security"
# used here - confirm which spelling is intended.
uts_major_usyd_counterpart = [
    ("MAJ10053", "Computational Data Science"),
    ("MAJ01156", "Computational Data Science"),
    ("MAJ02900", "Cyber Security"),
    ("MAJ03445", "Cyber Security"),
    ("MAJ03519", "Software Development"),
    ("MAJ02080", "Software Development"),
    ("MAJ02901", None),
    ("MAJ02092", None),
]

uni_major_equivalents = {
    uts_major: [] if usyd_major is None else [usyd_major]
    for uts_major, usyd_major in uts_major_usyd_counterpart
}

In [17]:
# Invert the UTS -> USYD mapping: each USYD major collects the UTS majors that
# name it as an equivalent, in the order they are encountered.
usyd_to_uts_majors = {}

for uts_major, usyd_majors in uni_major_equivalents.items():
    for usyd_major in usyd_majors:
        usyd_to_uts_majors.setdefault(usyd_major, []).append(uts_major)

In [18]:
# Combine both directions into one lookup: the UTS-keyed entries come first,
# then the USYD-keyed entries (which win if a key appears in both).
merged_major_equivalents = dict(uni_major_equivalents)
merged_major_equivalents.update(usyd_to_uts_majors)

uni_major_equivalents = merged_major_equivalents
uni_major_equivalents

{'MAJ10053': ['Computational Data Science'],
 'MAJ01156': ['Computational Data Science'],
 'MAJ02900': ['Cyber Security'],
 'MAJ03445': ['Cyber Security'],
 'MAJ03519': ['Software Development'],
 'MAJ02080': ['Software Development'],
 'MAJ02901': [],
 'MAJ02092': [],
 'Computational Data Science': ['MAJ10053', 'MAJ01156'],
 'Cyber Security': ['MAJ02900', 'MAJ03445'],
 'Software Development': ['MAJ03519', 'MAJ02080']}

In [19]:
# Create the output directory for the JSON files written below; exist_ok=True
# makes this a no-op when the directory is already there.
os.makedirs("./data/major_rels", exist_ok=True)

In [20]:
# Persist the USYD subject -> majors mapping as JSON.
usyd_subject_to_majors_json = json.dumps(usyd_subject_to_majors)

with open("./data/major_rels/usyd_subject_to_majors.json", "w") as out_file:
    out_file.write(usyd_subject_to_majors_json)

In [21]:
# Persist the UTS subject -> majors mapping as JSON.
uts_subject_to_majors_json = json.dumps(uts_subject_to_majors)

with open("./data/major_rels/uts_subject_to_majors.json", "w") as out_file:
    out_file.write(uts_subject_to_majors_json)

In [22]:
# Persist the combined major-equivalents lookup as JSON.
uni_major_equivalents_json = json.dumps(uni_major_equivalents)

with open("./data/major_rels/uni_major_equivalents.json", "w") as out_file:
    out_file.write(uni_major_equivalents_json)