In [412]:
import os
import json
from dataclasses import dataclass, asdict
import uuid

import pandas as pd

In [413]:
def read_file(file):
    """Return the entire contents of a text file as a string.

    Args:
        file: Path of the file to read.

    Returns:
        The file's contents, decoded as UTF-8.

    Raises:
        OSError: If the file cannot be opened.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    # Pin the encoding: without it open() falls back to the platform locale,
    # so the same file could decode differently (or fail) on another machine.
    with open(file, "r", encoding="utf-8") as f:
        return f.read()

In [414]:
def read_json(file):
    """Parse a JSON file and return the decoded Python object.

    Args:
        file: Path of the JSON file to read.

    Returns:
        Whatever ``json.load`` produces for the document (a dict, list, ...).

    Raises:
        OSError: If the file cannot be opened.
        json.JSONDecodeError: If the contents are not valid JSON.
    """
    # JSON text is UTF-8 by convention; be explicit so decoding does not
    # depend on the platform's default locale encoding.
    with open(file, "r", encoding="utf-8") as f:
        return json.load(f)

In [415]:
# Map each subject id to the id of the degree folder that contains its page.
# Layout walked here: ./course_programs/<degree_id>/subjects/<subject_id>.html
# If the same subject file name appears under several degrees, the degree
# listed last overwrites the earlier ones.
subject_id_to_degree_id = {}

for degree_id in os.listdir("./course_programs"):
    # Skip stray files that sit next to the per-degree folders.
    if not os.path.isdir(f"./course_programs/{degree_id}"):
        continue

    for subject_file in os.listdir(f"./course_programs/{degree_id}/subjects"):
        if not subject_file.endswith(".html"):
            continue

        # Strip only the trailing extension; str.replace would also delete any
        # ".html" that happens to occur earlier in the file name.
        subject_id_to_degree_id[subject_file.removesuffix(".html")] = degree_id

### Subject Nodes

In [416]:
# Build the subject node table: load the cleaned subjects, discard the
# relationship columns, and rename the id / credit-point columns to the typed
# headers used in the exported CSV.
subject_nodes = (
    pd.read_json("./subjects_cleaned/subjects.json")
    .drop(columns=["sessions", "requisites", "antiRequisites"])
    .rename(
        columns={
            "_id": "subjectId:ID(Subject-ID)",
            "creditPoints": "creditPoints:int",
        }
    )
)

# Attach each subject's degree. Ids are stringified before the lookup because
# the mapping is keyed by file-name stems; unmatched subjects get "".
subject_nodes["degreeId"] = [
    subject_id_to_degree_id.get(str(node_id), "")
    for node_id in subject_nodes["subjectId:ID(Subject-ID)"].tolist()
]
subject_nodes[":LABEL"] = "subject"
subject_nodes.head()

Unnamed: 0,subjectId:ID(Subject-ID),name,creditPoints:int,resultType,content,degreeId,:LABEL
0,41384,Chemical Thermodynamics and Reactor Design,6,Grade and marks,### Description\n\nThis subject provides stude...,C09066,subject
1,41021,Interaction Design Studio,12,Grade and marks,### Description\n\nThis is a capstone project ...,C09119,subject
2,43124,Renewable Energy Technology,6,Grade and marks,### Description\n\nThe world is heavily depend...,C09066,subject
3,48850,Environmental Planning and Law,6,Grade and marks,### Description\n\nThe objectives of this subj...,C09066,subject
4,31260,Fundamentals of Interaction Design,6,Grade and marks,### Description\n\nThis subject focuses on the...,C09119,subject


In [417]:
# Export the subject nodes as CSV; index=False keeps the DataFrame index out of the file.
# NOTE(review): the column headers look like Neo4j bulk-import headers — confirm the consumer.
subject_nodes.to_csv("./degree_timelines/dag/subjects.csv", index=False)

### Requisite and Antirequisite Relations

In [418]:
# Load the same subjects file again, this time as parsed JSON rather than a
# DataFrame, for the relation-building cells below.
subjects = read_json("./subjects_cleaned/subjects.json")

In [419]:
# Ids of every known subject; relations pointing at ids outside this set are dropped later.
subject_ids = {entry["_id"] for entry in subjects}

In [420]:
# Accumulators filled by the relation-building loop. Each appended row is
# [start subject id, end id, relationship type].
requisite_relations = []
requisite_group_relations = []
anti_requisite_relations = [] # anti requisites can only have AND relations

In [421]:
@dataclass
class RequisiteGroup:
    """A collection of subject ids forming one requisite group, tagged with a generated id.

    Two groups compare equal when they contain the same subject ids; the
    generated ``requisite_group_id`` is ignored by the comparison, which is
    what allows an equal group created later to be matched to an earlier one.
    Because ``__eq__`` is defined without ``__hash__``, instances are not
    hashable.

    Attributes:
        subject_ids: The subject ids that make up the group.
        requisite_group_id: A random UUID4 string assigned at construction.
    """

    subject_ids: set[str]
    requisite_group_id: str

    def __init__(self, subject_ids: set[str]):
        """Store ``subject_ids`` and assign a fresh UUID4 string as the group id."""
        self.subject_ids = subject_ids
        self.requisite_group_id = str(uuid.uuid4())

    def __eq__(self, other):
        """Return True when ``other`` is a group with the same subject ids."""
        # Defer to Python's default handling for unrelated types instead of
        # raising AttributeError on objects that have no ``subject_ids``.
        if not isinstance(other, RequisiteGroup):
            return NotImplemented
        return set(self.subject_ids) == set(other.subject_ids)

In [422]:
# Registry of the distinct requisite groups created so far; searched by get_requisite_group.
existing_requisite_groups = []

In [423]:
def get_requisite_group(group: RequisiteGroup):
    """Find an already-registered group equal to ``group``.

    Scans ``existing_requisite_groups`` in order and returns the first entry
    that compares equal to ``group``, or ``None`` when there is no such entry.
    """
    return next(
        (known for known in existing_requisite_groups if known == group),
        None,
    )

In [424]:
def clean_req_code(string):
    """Return ``string`` with every character that is not an ASCII digit removed.

    Args:
        string: Raw requisite code text.

    Returns:
        The digits 0-9 of ``string`` in their original order (possibly "").
    """
    # str.isdigit() alone also accepts non-ASCII digits such as superscripts
    # ("²"); require ASCII as well so the result is always a plain 0-9 code.
    return "".join(ch for ch in string if ch.isascii() and ch.isdigit())

In [425]:
# Counts every multi-subject requisite group reference encountered, including repeats of an equal group.
num_requisite_groups = 0

In [426]:
# Rebuild every accumulator from scratch so that re-running this cell does not
# append duplicate relations on top of the results of a previous run.
requisite_relations = []
requisite_group_relations = []
anti_requisite_relations = []  # anti requisites can only have AND relations
existing_requisite_groups = []
num_requisite_groups = 0

for subject in subjects:
    subject_id = subject["_id"]
    requisites = subject["requisites"]
    anti_requisites = subject["antiRequisites"]

    for requisite_group in requisites:
        # Normalise the raw codes and keep only subjects present in the data set.
        requisite_codes = [
            code
            for code in (clean_req_code(req["subjectId"]) for req in requisite_group)
            if code in subject_ids
        ]

        if not requisite_codes:
            continue

        # A single remaining subject is a plain subject-to-subject requisite.
        if len(requisite_codes) == 1:
            requisite_relations.append([subject_id, requisite_codes[0], "requisite"])
            continue

        # Several subjects: link to a group node, reusing the id of an equal
        # group registered earlier so identical groups are only created once.
        candidate_group = RequisiteGroup(set(requisite_codes))
        registered_group = get_requisite_group(candidate_group)

        if registered_group is None:
            existing_requisite_groups.append(candidate_group)
            registered_group = candidate_group

        requisite_group_relations.append(
            [subject_id, registered_group.requisite_group_id, "requisite"]
        )
        num_requisite_groups += 1

    for anti_requisite_group in anti_requisites:
        # Only the first entry of each group is used; an empty group is skipped
        # instead of raising IndexError.
        if not anti_requisite_group:
            continue

        # NOTE(review): unlike requisite codes, this id is not passed through
        # clean_req_code before the lookup — confirm that is intentional.
        anti_requisite = anti_requisite_group[0]["subjectId"]

        if anti_requisite in subject_ids:
            anti_requisite_relations.append(
                [subject_id, anti_requisite, "anti-requisite"]
            )

In [427]:
# Distinct requisite groups vs. total group references; the gap is how often an equal group was reused.
len(existing_requisite_groups), num_requisite_groups

(31, 40)

In [428]:
# Write one relationship CSV per relation kind. All three share the start and
# type columns and differ only in the end-id column and the target file.
for _rows, _end_column, _csv_path in (
    (
        requisite_relations,
        "subjectId:END_ID(Subject-ID)",
        "./degree_timelines/dag/requisites.csv",
    ),
    (
        requisite_group_relations,
        "requisiteGroupId:END_ID(Requisite-Group-ID)",
        "./degree_timelines/dag/requisite_groups_rels.csv",
    ),
    (
        anti_requisite_relations,
        "subjectId:END_ID(Subject-ID)",
        "./degree_timelines/dag/anti_requisites.csv",
    ),
):
    pd.DataFrame(
        _rows,
        columns=["subjectId:START_ID(Subject-ID)", _end_column, ":TYPE"],
    ).to_csv(_csv_path, index=False)

In [429]:
# Peek at the first few registered groups as a sanity check.
existing_requisite_groups[:5]

[RequisiteGroup(subject_ids={'48531', '41278'}, requisite_group_id='d5c46c01-50fe-4915-88e5-a38cdbf4fc70'),
 RequisiteGroup(subject_ids={'31266', '41082', '48430', '48023', '41092', '31268', '31257', '41039'}, requisite_group_id='8ff2faf9-c29c-4cbd-9b65-87b2d3840434'),
 RequisiteGroup(subject_ids={'48641', '41057'}, requisite_group_id='927757d3-c902-4703-9716-784a3bb9cdf4'),
 RequisiteGroup(subject_ids={'41039', '48023', '48430'}, requisite_group_id='becf8b1c-14e0-4b1f-b063-2c1af45cce38'),
 RequisiteGroup(subject_ids={'41039', '48023'}, requisite_group_id='30c5b4f5-d920-4cce-83d5-aebff8610d77')]

### Requisite Groups

In [430]:
# One node row per distinct requisite group: [group id, label].
requisite_group_nodes = [
    [group.requisite_group_id, "requisite-group"]
    for group in existing_requisite_groups
]
pd.DataFrame(
    requisite_group_nodes,
    columns=["requisiteGroupId:ID(Requisite-Group-ID)", ":LABEL"],
).to_csv("./degree_timelines/dag/requisite_groups.csv", index=False)