In [1]:
import os
import requests

from tqdm.notebook import tqdm
from bs4 import BeautifulSoup

In [2]:
# Raw HTML text of every degree page; filled by the loading cell below
degree_pages = list()

In [3]:
# Load the saved degree pages from ./test-degrees into `degree_pages`.
# The list is rebuilt from scratch so re-running this cell does not append
# the same pages a second time.
degree_pages = []

# Sort the listing so pages are always processed in the same order
# (os.listdir returns entries in arbitrary order).
for degree_page_file in sorted(os.listdir("./test-degrees")):
  degree_page_path = os.path.join("./test-degrees", degree_page_file)

  # Skip sub-directories (e.g. notebook checkpoint folders); open() would fail on them
  if not os.path.isfile(degree_page_path):
    continue

  # Decode explicitly instead of relying on the platform default encoding;
  # undecodable bytes are replaced rather than aborting the whole load.
  with open(degree_page_path, "r", encoding="utf-8", errors="replace") as f:
    degree_page = f.read()
    degree_pages.append(degree_page)

In [4]:
def get_last_char_index(text: str) -> int:
  """Return the index of the last alphabetic character in `text`.

  The string is scanned from its end towards its start; -1 is returned when
  `text` is empty or contains no alphabetic character.
  """
  backwards_positions = reversed(range(len(text)))
  return next(
    (position for position in backwards_positions if text[position].isalpha()),
    -1,
  )

In [5]:
# Quick check of the prefix/number split that get_course_program applies to link text
x = "MAJ02900"
split_at = get_last_char_index(x) + 1
x[:split_at], x[split_at:]

('MAJ', '02900')

In [6]:
def get_course_program(soup: BeautifulSoup) -> dict:
  """Collect the subject and program links listed on a parsed degree page.

  The subject table is the last <table> that contains at least one link whose
  href includes "https://handbook.uts.edu.au/subjects". Every link inside that
  table is classified by its stripped text:

  - text made only of digits is a subject code and is stored under "subjects";
  - any other text is split right after its last alphabetic character: the
    leading part becomes the program type (e.g. "MAJ" for "MAJ02900") and the
    remainder becomes the code stored under that type.

  Links with empty text are skipped, since they carry no code to file them under.

  Args:
    soup: parsed HTML of one degree page.

  Returns:
    A dict mapping "subjects" and each program type found to a dict of
    {code: href}. When no subject table is found the result only contains
    an empty "subjects" dict.
  """
  course_program = {
    "subjects": {},
  }
  subject_table = None

  for table in soup.find_all("table"):
    # Anchors without an href return None from .get(); fall back to "" so the
    # substring test cannot raise TypeError.
    has_subject_link = any(
      "https://handbook.uts.edu.au/subjects" in (a.get("href") or "")
      for a in table.find_all("a")
    )
    if has_subject_link:
      # No break: the last matching table wins
      subject_table = table

  # Page without a recognisable subject table: nothing to extract
  if subject_table is None:
    return course_program

  for a in subject_table.find_all("a"):
    code = a.text.strip()

    # Link without visible text (nothing to use as a code)
    if not code:
      continue

    # is a subject
    if code.isdigit():
      course_program["subjects"][code] = a.get("href")
      continue

    # could be a major, sub-major, elective group etc
    last_char_index = get_last_char_index(code)
    program_type, code = code[:last_char_index+1], code[last_char_index+1:]

    course_program.setdefault(program_type, {})[code] = a.get("href")

  return course_program

In [7]:
# degree_code_title: degree code -> full <h1> title of the page
# course_programs:   degree code -> result of get_course_program for the page
degree_code_title = {}
course_programs = {}

for degree_page in degree_pages:
  soup = BeautifulSoup(degree_page, "html.parser")

  degree_title = soup.find("h1").text.strip()

  # The code is the first space-separated token of the title, cut at its first "v"
  first_token = degree_title.split(" ", 1)[0]
  degree_code = first_token.partition("v")[0]

  program = get_course_program(soup)

  degree_code_title[degree_code] = degree_title
  course_programs[degree_code] = program

In [8]:
# Subjects shared by several degrees are counted once per degree here
subject_count = sum(len(entry["subjects"]) for entry in course_programs.values())
print("Total number of subjects:", subject_count)

Total number of subjects: 281


In [9]:
# combine list of sets
all_subjects = set()
for program in course_programs.values():
  all_subjects = all_subjects.union(set(program["subjects"].keys()))

print("Total number of unique subjects:", len(all_subjects))

Total number of unique subjects: 265


In [10]:
# Root folder for the downloaded pages; exist_ok keeps this cell safe to re-run
os.makedirs(name="course_programs", exist_ok=True)

In [11]:
def _download_page(session, url, path, timeout=30):
  """Fetch `url` with `session` and save the response text to `path`.

  Args:
    session: requests.Session used to perform the GET request.
    url: address of the page to download; a missing (None/empty) url is
      reported as a failure.
    path: destination file, written as UTF-8.
    timeout: seconds to wait for the server before giving up.

  Returns:
    True when the page was downloaded and written, False otherwise. Failures
    are printed instead of raised so one bad page does not stop the run.
  """
  if not url:
    print(f"No URL available for {path}, skipping")
    return False

  try:
    response = session.get(url, timeout=timeout)
    # Treat HTTP error statuses as failures instead of saving the error page
    response.raise_for_status()
  except requests.RequestException as exc:
    print(f"Failed to download {url}: {exc}")
    return False

  # Open the file only after the download succeeded so a failed request
  # never leaves an empty or truncated file behind.
  with open(path, "w", encoding="utf-8") as f:
    f.write(response.text)
  return True


# URLs that could not be downloaded, kept for inspection / retry
failed_downloads = []

# A single session reuses connections across the many requests below
with requests.Session() as session:
  for degree_code, program in course_programs.items():
    dir_path = f"course_programs/{degree_code}"

    os.makedirs(dir_path, exist_ok=True)

    for program_type, program_data in program.items():
      os.makedirs(f"{dir_path}/{program_type}", exist_ok=True)

      if program_type == "subjects":
        # Subject pages are fetched from a URL rebuilt from the subject code;
        # the href stored alongside the code is not used.
        for subject_code in tqdm(program_data, total=len(program_data)):
          subject_page_url = f"https://handbook.uts.edu.au/subjects/details/{subject_code}.html"
          if not _download_page(session, subject_page_url, f"{dir_path}/subjects/{subject_code}.html"):
            failed_downloads.append(subject_page_url)
        continue

      # Majors, sub-majors, etc. are fetched from the href found on the degree page
      for program_code, program_url in program_data.items():
        if not _download_page(session, program_url, f"{dir_path}/{program_type}/{program_code}.html"):
          failed_downloads.append(program_url)

if failed_downloads:
  print(f"{len(failed_downloads)} page(s) could not be downloaded; see failed_downloads")

  0%|          | 0/98 [00:00<?, ?it/s]

  0%|          | 0/155 [00:00<?, ?it/s]

  0%|          | 0/28 [00:00<?, ?it/s]