In [150]:
import os
import json
from dataclasses import dataclass, asdict
from typing import Literal

from bs4 import BeautifulSoup

In [151]:
@dataclass
class Subject:
  code: str
  name: str
  
  def __str__(self) -> str:
    return f"{self.code} - {self.name}"
  
  def __hash__(self) -> int:
    return hash(self.code)
  
  def __eq__(self, other) -> bool:
    return self.code == other.code

In [152]:
@dataclass
class SuggestedTiming:
    year: int
    session: Literal["Autumn", "Spring", "Summer"]

    def __str__(self) -> str:
        return f"{self.year} - {self.session}"

    def __hash__(self) -> int:
        return hash(str(self))

    def __eq__(self, other) -> bool:
        return str(self) == str(other)

    def __lt__(self, other):
        if self.year != other.year:
            return self.year < other.year

        sessions = ["Autumn", "Spring", "Summer"]
        return sessions.index(self.session) < sessions.index(other.session)

In [153]:
# function to convert dict key to str
def convert_key_to_str(obj):
  if isinstance(obj, dict):
    return {str(k): convert_key_to_str(v) for k, v in obj.items()}
  elif isinstance(obj, list):
    return [convert_key_to_str(i) for i in obj]
  else:
    return obj

In [154]:
def serialize_subject_to_suggested_timing(subject_to_suggested_timing: dict) -> dict:
    subject_to_suggested_timing = convert_key_to_str(subject_to_suggested_timing)

    for subject, suggestedTimings in subject_to_suggested_timing.items():
        print(suggestedTimings)
        subject_to_suggested_timing[subject] = [
            asdict(suggestedTiming) for suggestedTiming in suggestedTimings
        ]

    return subject_to_suggested_timing

In [155]:
def get_preferred_subject_order(degree_html: str):
    subject_to_suggested_timing = {}
    degree_soup = BeautifulSoup(degree_html, "html.parser")
    
    degree_name = degree_soup.find("h1").text.strip()

    current_year = None
    current_session = None

    for table_row in degree_soup.find_all("table")[-1].find_all("tr"):
        row_text = table_row.text.strip()

        if "Year" in row_text:
            current_year = int(row_text.split(" ")[-1])
            continue

        if "session" in row_text:
            current_session = row_text.split(" ")[0]
            continue

        subject_link = table_row.find("a")

        if (
            subject_link is not None
            and "https://handbook.uts.edu.au/subjects/" in subject_link.get("href")
        ):
            subject_code = subject_link.text.strip()
            subject_name = (
                table_row.find("td").text.strip().replace(subject_code, "").strip()
            )
            subject = Subject(subject_code, subject_name)
            suggested_timing = SuggestedTiming(current_year, current_session)

            if subject not in subject_to_suggested_timing:
                subject_to_suggested_timing[subject] = set()

            if current_year is not None and current_session is not None:
                subject_to_suggested_timing[subject].add(suggested_timing)

    for subject, suggestedTimings in subject_to_suggested_timing.items():
        subject_to_suggested_timing[subject] = sorted(list(suggestedTimings))
        
    subject_to_suggested_timing = serialize_subject_to_suggested_timing(subject_to_suggested_timing)

    return degree_name, subject_to_suggested_timing

In [156]:
# function that saves dict as json with indent = 2
def save_dict_as_json(dict, filename):
    with open(filename, 'w') as file:
        json.dump(dict, file, indent=2)

In [157]:
for html_file in os.listdir("./test-degrees"):
    degree_name, degree_preferred_subject_order = get_preferred_subject_order(open(f"./test-degrees/{html_file}").read())
    save_dict_as_json(degree_preferred_subject_order, f"./degree_timelines/{degree_name}.json")

[SuggestedTiming(year=1, session='Autumn')]
[SuggestedTiming(year=1, session='Autumn')]
[SuggestedTiming(year=1, session='Autumn')]
[SuggestedTiming(year=1, session='Autumn')]
[SuggestedTiming(year=1, session='Spring')]
[SuggestedTiming(year=1, session='Spring')]
[SuggestedTiming(year=1, session='Spring')]
[SuggestedTiming(year=1, session='Spring')]
[SuggestedTiming(year=2, session='Autumn'), SuggestedTiming(year=2, session='Spring')]
[SuggestedTiming(year=2, session='Autumn'), SuggestedTiming(year=2, session='Spring'), SuggestedTiming(year=3, session='Autumn')]
[SuggestedTiming(year=2, session='Autumn')]
[SuggestedTiming(year=2, session='Autumn')]
[SuggestedTiming(year=2, session='Spring')]
[SuggestedTiming(year=2, session='Spring'), SuggestedTiming(year=3, session='Autumn'), SuggestedTiming(year=3, session='Spring'), SuggestedTiming(year=4, session='Autumn'), SuggestedTiming(year=4, session='Spring')]
[SuggestedTiming(year=2, session='Spring')]
[SuggestedTiming(year=3, session='Autum