In [551]:
import os
import json
from tqdm.notebook import tqdm

In [552]:
# function to read a file
def read_file(file):
    with open(file, 'r') as f:
        return f.read()

In [553]:
# Scratch check of slice semantics: [1:] drops the first element of the list.
[1, 2, 3, 4, 5][1:]

[2, 3, 4, 5]

In [554]:
def remove_unecessary_info(text: str) -> str:
    """Trim a page down to the part between the description heading and the footer link.

    The kept region starts at the last line that is exactly ``### Description``
    before the footer link line, and stops just before the first line that is
    exactly the "UTS: Handbook 2024" link. If the heading is missing the text
    is kept from its first line; if the footer link is missing it is kept to
    the end.

    Args:
        text: Markdown text of one page.

    Returns:
        The kept lines joined with newlines.
    """
    description_heading = "### Description"
    footer_link = "[UTS: Handbook 2024](https://www.handbook.uts.edu.au/index.html)"

    rows = text.split("\n")

    # Scanning stops at the first footer link, so only headings before it count.
    try:
        stop_before = rows.index(footer_link)
    except ValueError:
        stop_before = None

    heading_positions = [
        position
        for position, row in enumerate(rows[:stop_before])
        if row == description_heading
    ]
    first_kept = heading_positions[-1] if heading_positions else None

    return "\n".join(rows[first_kept:stop_before])

In [555]:
def fix_slos(slos_text: str) -> str | None:
    try:
        is_even = False
        chars = list(slos_text)

        for i, char in enumerate(chars):
            if char == "|":
                if is_even:
                    chars[i] = "\n"
                    chars[i - 1] = ""
                    chars[i + 1] = ""
                else:
                    chars[i] = ""

                is_even = not is_even

        return "".join(chars)
    except:
        return None

In [556]:
def get_slos(text: str) -> str:
    lines = text.split("\n")
    
    start_line_index = None
    end_line_index = None
    
    for i, line in enumerate(lines):
      if line == "### Subject learning objectives (SLOs)":
        start_line_index = i
        continue
        
      if start_line_index is not None and "###" in line:
        end_line_index = i
        break
      
    slos = None
    
    for line in lines[start_line_index:end_line_index]:
      if "|" in line:
        slos = line
        break
        
    return slos

In [557]:
def fix_bold_text(text: str) -> str:
    """Rejoin bold spans that were broken across two consecutive lines.

    A line holding an odd number of ``**`` markers is extended with a space
    plus the following line, and that following line is then emptied (it stays
    in the output as a blank line). Only the immediately following line is
    merged; the last line is never extended.

    Args:
        text: Markdown text.

    Returns:
        The text with the affected line pairs merged.
    """
    rows = text.split("\n")

    # The final row has no successor, so it is left out of the scan.
    for idx in range(len(rows) - 1):
        has_unclosed_bold = rows[idx].count("**") % 2 == 1
        if not has_unclosed_bold:
            continue

        follower = idx + 1
        rows[idx] = rows[idx] + " " + rows[follower]
        rows[follower] = ""

    return "\n".join(rows)

In [558]:
def remove_greater_than_two_line_break(text: str):
    """Collapse every run of blank lines down to a single blank line.

    A line counts as blank when it is empty after ``strip()``. The first blank
    line of a run is kept exactly as written; the rest of the run is dropped.

    Args:
        text: Markdown text.

    Returns:
        The text with the surplus blank lines removed.
    """
    kept_rows = []
    previous_was_blank = False

    for row in text.split("\n"):
        row_is_blank = row.strip() == ""

        # Only a blank row that directly follows another blank row is dropped.
        if not (row_is_blank and previous_was_blank):
            kept_rows.append(row)

        previous_was_blank = row_is_blank

    return "\n".join(kept_rows)

In [559]:
# Load two sample subject pages; they are used for the get_subject spot checks
# in the cells below.
test_file = read_file("./subjects_cleaned/markdown/33130.md")
test_file_2 = read_file("./subjects_cleaned/markdown/33230.md")

In [560]:
def clean_markdown(text: str) -> str:
    """Run the full clean-up pipeline over one subject page.

    Steps, in order: trim the page with ``remove_unecessary_info``, rejoin
    split bold spans, collapse repeated blank lines, rewrite the SLO line via
    ``get_slos`` / ``fix_slos`` (skipped when ``fix_slos`` returns ``None``),
    and finally shorten ``https://handbook.uts.edu.au/`` links to start at
    ``/``.

    Args:
        text: Raw markdown of one subject page.

    Returns:
        The cleaned markdown.
    """
    cleanup_steps = (
        remove_unecessary_info,
        fix_bold_text,
        remove_greater_than_two_line_break,
    )
    for step in cleanup_steps:
        text = step(text)

    slos_line = get_slos(text)
    rewritten_slos = fix_slos(slos_line)

    if rewritten_slos is not None:
        # Every occurrence of the raw SLO line is swapped for the fixed form.
        text = text.replace(slos_line, rewritten_slos)

    return text.replace("https://handbook.uts.edu.au/", "/")

In [561]:
def get_requisite_subjects(requisite_text: str) -> list[list[dict[str, str]]]:
    """Parse a requisite expression into groups of alternative subjects.

    Newlines are first turned into spaces. The text is then split on the
    literal substring ``AND`` into groups, and each group on the literal
    substring ``OR`` into alternatives. These are plain substring splits, so
    the same letters inside a longer uppercase word split too.

    Each alternative is read in one of two ways:

    * If it contains ``https`` it is treated as a markdown link: the id is the
      text before the first ``]`` with ``[`` removed, and the name is the text
      between the first and second ``)``.
    * Otherwise the first space-separated word is the id and the remaining
      words are the name.

    Underscores are removed from names.

    Args:
        requisite_text: The requisite expression to parse.

    Returns:
        One inner list per ``AND`` group, each holding one
        ``{"subjectId", "subjectName"}`` dict per ``OR`` alternative.
    """
    flattened = requisite_text.replace("\n", " ")
    groups = []

    for and_part in flattened.split("AND"):
        alternatives = []

        for option in and_part.split("OR"):
            if "https" in option:
                subject_id = option.split("]")[0].replace("[", "").strip()
                raw_name = option.split(")")[1].strip()
            else:
                code, *name_words = option.strip().split(" ")
                subject_id = code
                raw_name = " ".join(name_words)

            alternatives.append({
                "subjectId": subject_id,
                "subjectName": raw_name.replace("_", "").strip(),
            })

        groups.append(alternatives)

    return groups

In [562]:
def get_requisites(
    requisite_text: str,
) -> tuple[list[list[dict[str, str]]], list[list[dict[str, str]]]]:
    """Split a requisite block into requisites and anti-requisites and parse both.

    A section starts right after its marker (``_Requisite(s):`` or
    ``_Anti-requisite(s):``) and ends where the other marker begins, or at the
    end of the text. Each section is parsed with ``get_requisite_subjects``.

    Args:
        requisite_text: Text that may contain either marker, both, or neither.

    Returns:
        ``(requisites, antirequisites)``. A side whose marker is absent is an
        empty list.
    """
    requisite_marker = "_Requisite(s):"
    antirequisite_marker = "_Anti-requisite(s):"

    requisites = []
    antirequisites = []

    if requisite_marker in requisite_text:
        # Cut the section at the other marker so the marker text itself never
        # reaches the parser. Previously, when both markers were present, the
        # "_Requisite(s):" prefix was passed along and ended up in the first
        # subjectId.
        requisite_section = requisite_text.split(requisite_marker)[1]
        requisite_section = requisite_section.split(antirequisite_marker)[0]
        requisites = get_requisite_subjects(requisite_section)

    if antirequisite_marker in requisite_text:
        antirequisite_section = requisite_text.split(antirequisite_marker)[1]
        antirequisite_section = antirequisite_section.split(requisite_marker)[0]
        antirequisites = get_requisite_subjects(antirequisite_section)

    return requisites, antirequisites

In [563]:
def get_subject(subject_code: str, text: str) -> dict[str, any]:
    """Build the record for one subject from its markdown page.

    The page is scanned line by line:

    * ``name`` comes from the first line containing ``#``, with ``"# "`` and
      the ``"<subject_code> "`` prefix removed.
    * ``creditPoints`` is the first space-separated token after
      ``_Credit points:_``, converted to ``int``.
    * ``resultType`` is the text after ``_Result type:_``; it stays
      ``"Grade and marks"`` when no such line is seen.
    * The requisite block runs from the first line containing
      ``_Requisite(s):`` or ``_Anti-requisite(s):`` up to the next blank line.
      Scanning stops at that blank line, so fields located after the block are
      not read.

    ``sessions`` is always ``["AUTUMN"]``; nothing on the page changes it.
    ``content`` is ``clean_markdown(text)``.

    Args:
        subject_code: Subject code, stored as ``_id`` and stripped from the name.
        text: Raw markdown of the subject page.

    Returns:
        A dict with the keys ``_id``, ``name``, ``sessions``, ``creditPoints``,
        ``resultType``, ``content``, ``requisites`` and ``antirequisites``.

    Raises:
        ValueError: If the token after ``_Credit points:_`` is not an integer.
    """
    name = None
    sessions = ["AUTUMN"]
    credit_points = None
    result_type = "Grade and marks"
    content = clean_markdown(text)

    lines = text.split("\n")

    req_start_index = None
    req_end_index = None

    for i, line in enumerate(lines):
        if name is None and "#" in line:
            name = line.replace("# ", "").strip()
            name = name.replace(f"{subject_code} ", "")

        if "_Credit points:_" in line:
            credit_points = int(
                line.replace("_Credit points:_", "").strip().split(" ")[0]
            )

        if "_Result type:_" in line:
            result_type = line.replace("_Result type:_", "").strip()

        # Record only the FIRST marker line. Previously every marker line
        # overwrote the start, so an anti-requisite line directly below a
        # requisite line pushed the requisites out of the block.
        if req_start_index is None and (
            "_Requisite(s):" in line or "_Anti-requisite(s):" in line
        ):
            req_start_index = i

        if req_start_index is not None and line.strip() == "":
            req_end_index = i
            break

    if req_start_index is None:
        # No marker line anywhere on the page, so there is nothing to parse.
        requisites, antirequisites = [], []
    else:
        requisites, antirequisites = get_requisites(
            "\n".join(lines[req_start_index:req_end_index])
        )

    return {
        "_id": subject_code,
        "name": name,
        "sessions": sessions,
        "creditPoints": credit_points,
        "resultType": result_type,
        "content": content,
        "requisites": requisites,
        "antirequisites": antirequisites,
    }

In [564]:
get_subject("33130", test_file)

{'_id': '33130',
 'name': 'Mathematics 1',
 'sessions': ['AUTUMN'],
 'creditPoints': 6,
 'resultType': 'Grade and marks',
 'content': '### Description\n\nThis subject develops the knowledge and skills necessary for problem-solving\nand mathematical modelling at an introductory level. Differential calculus is\napplied to model situations in science and engineering that involve\noscillations. Integral calculus is used to solve selected problems involving\nfirst- and second-order differential equations, and to calculate areas,\nvolumes, lengths and other physical quantities. Vectors, matrix multiplication\nand determinants are introduced and applied to problem-solving and modelling.\nSequences and series are reviewed and power series introduced where power\nseries are used to approximate more functions.\n\n### Subject learning objectives (SLOs)\n\nUpon successful completion of this subject students should be able to:\n\n1.  Describe the relevance of mathematics to engineering and science 

In [565]:
get_subject("33230", test_file_2)

{'_id': '33230',
 'name': 'Mathematics 2',
 'sessions': ['AUTUMN'],
 'creditPoints': 6,
 'resultType': 'Grade and marks',
 'content': '### Description\n\nThis subject consists of two parts: multivariate calculus and an introduction\nto statistics. The mathematical part develops the mathematical skills required\nfor mathematical modelling of systems involving more than one independent\nvariable. The statistics part is an introduction to descriptive statistics,\nstatistical inference and simple linear regression. Topics include linear\nalgebra, solutions to sets of equations resulting from particular problems,\neigenvectors and eigenvalues, partial derivatives, optimisation, multiple\nintegrals and their applications, and probability with a focus on the\ndetermination of the reliability of a system of components in various\nengineering contexts.\n\n### Subject learning objectives (SLOs)\n\nUpon successful completion of this subject students should be able to:\n\n1.  model real world prob

In [566]:
# Parse every markdown page in the folder into a subject record.
markdown_dir = "./subjects_cleaned/markdown"

# os.listdir returns entries in arbitrary order, so sort to make the order of
# the records reproducible between runs. Filtering up front keeps the progress
# bar total equal to the number of pages actually parsed.
files = sorted(
    entry for entry in os.listdir(markdown_dir) if entry.endswith(".md")
)

subjects = []

for filename in tqdm(files, total=len(files)):
    # splitext removes only the trailing extension; str.replace would also
    # strip a ".md" that appears elsewhere in the file name.
    subject_code = os.path.splitext(filename)[0]
    text = read_file(os.path.join(markdown_dir, filename))
    subjects.append(get_subject(subject_code, text))

  0%|          | 0/265 [00:00<?, ?it/s]

In [567]:
# Write all parsed subject records to one JSON file, indented by two spaces.
with open("./subjects_cleaned/subjects.json", "w") as f:
    json.dump(subjects, f, indent=2)