In [1]:
import PyPDF2
import pandas as pd
from tabula import read_pdf
import regex as re

In [2]:
# Open the schedule PDF as a binary stream and wrap it in a PyPDF2 reader.
# `PdfReader` replaces `PdfFileReader`, which is deprecated and was removed
# in PyPDF2 3.0.
# NOTE(review): `reader` is left open on purpose here because a later cell
# still reads pages through `pdfReader`; nothing in the visible cells closes it.
reader = open("webregMain.pdf", "rb")
pdfReader = PyPDF2.PdfReader(reader)

In [3]:
# Extract the plain text of the first page (only page 0 is read here) and
# split it into lines. `pages[0]` replaces the deprecated `getPage(0)`.
pageObj = pdfReader.pages[0]
raw_data = pageObj.extract_text()
raw_data_lines = raw_data.split("\n")

# Find the index where the first course begins, by finding the first line that
# starts with "Action". Fail with a clear message instead of a bare IndexError
# when the page does not contain such a line.
course_start_index = next(
    (i for i, line in enumerate(raw_data_lines) if line.startswith("Action")),
    None,
)
if course_start_index is None:
    raise ValueError('No line starting with "Action" was found on the first page of the PDF')

# Keep only the lines after the "Action" line.
raw_data_lines = raw_data_lines[course_start_index + 1:]

In [4]:
# Ask tabula for the tables on every page, then keep only the first table;
# the rest of the notebook works from this single frame.
extracted_tables = read_pdf("webregMain.pdf", pages="all")
raw_df = extracted_tables[0]

In [5]:
# Show the table exactly as tabula parsed it, before any repair is applied.
raw_df

Unnamed: 0.1,Unnamed: 0,Subject\rCourse,Title,Section\rCode,Type,Instructor,Grade\rOption,Units,Days,Time,BLDG,Room,Status /\r(Position),Action,Unnamed: 1
0,DSC 80,ractice of Data Science,A00,LE,"au, Samuel Ethan",L,4.0,TuTh,3:30p-4:50p,CENTR,216,Enrolled,,,
1,,,A01,DI,,,,F,10:00a-10:50a,CENTR,212,,,,
2,,inal Exam,,FI,,,,M 12/11/2023,3:00p-5:59p,TBA,TBA,,,,
3,JAPN 10A,irst Year Japanese I,D00,LE,"wamoto, Naoki",L,5.0,TuTh,9:30a-10:50a,CENTR,207,Enrolled,,,
4,,,D01,TU,,,,MWF,9:00a-9:50a,CENTR,202,,,,
5,,inal Exam,,FI,,,,Th 12/14/2023,8:00a-10:59a,TBA,TBA,,,,
6,MATH 20D,ntro/Differential Equations,B00,LE,"hm, Ko Woon",L,4.0,MWF,11:00a-11:50a,PCYNH,106,Enrolled,,,
7,,,B01,DI,,,,Th,6:00p-6:50p,APM,6402,,,,
8,,Lab,,,,,,,,,,,,,
9,,,,LA,,,,TBA,TBA,TBA,TBA,,,,


In [6]:
def repair_column(df: pd.DataFrame, col: str, raw_data_lines: list) -> list:
    """
    Restore the leading character that tabula drops from some cell values.

    Tabula does not always parse the columns correctly due to the way webreg
    formats their tables, and PyPDF2 only reads the text in the pdf, not the
    tables. A cell is treated as "broken" when it is a string whose first
    character is lower case. For such a cell in row ``i`` the text is looked up
    literally inside ``raw_data_lines[i]``; if the full text is not found, its
    first 5 characters are tried instead. The character immediately before the
    match is prepended to the cell.

    Cells are left unchanged when they are not strings (NaN, None, numbers),
    are empty, have no corresponding raw line, cannot be found in the raw
    line, or are found at the very start of the line (there is no preceding
    character to recover).

    Parameters
    ----------
    df : pd.DataFrame
        The table whose column should be repaired. It is not modified.
    col : str
        Name of the column to repair.
    raw_data_lines : list
        Text lines; line ``i`` is assumed to describe row ``i`` of ``df``.

    Returns
    -------
    list
        The repaired values, one per row of ``df``. For ``col == "Units"`` the
        original ``df[col]`` Series is returned untouched instead.
    """
    if col == "Units":
        return df[col]

    repaired = df[col].tolist()

    # zip() stops at the shorter sequence, so rows without a raw line are
    # simply left as they are instead of raising IndexError.
    for i, (value, line) in enumerate(zip(repaired, raw_data_lines)):
        # Only non-empty strings can have lost a leading character.
        if not isinstance(value, str) or not value or not isinstance(line, str):
            continue
        if not value[0].islower():
            continue

        # Literal substring search: cell text may contain characters such as
        # "(", "+" or "." that must not be interpreted as a pattern.
        position = line.find(value)
        if position == -1:
            # The cell may be longer than what the raw line holds; fall back
            # to matching only its first 5 characters.
            position = line.find(value[:5])

        # -1: not found. 0: found at the start of the line, so there is no
        # preceding character (indexing with -1 would pick the last one).
        if position <= 0:
            continue

        repaired[i] = line[position - 1] + value
    return repaired

In [7]:
df = raw_df.copy()

# The header labels parsed by tabula sit one column to the right of their data,
# so shift every label one position to the left and pad the end with a dummy.
df.columns = list(df.columns[1:]) + ["dummy"]
# Header labels contain carriage returns where the PDF wrapped them.
df.columns = [title.replace("\r", " ") for title in df.columns]
# Drop the last three columns. The explicit copy makes the column assignments
# below operate on an independent frame (no SettingWithCopyWarning).
df = df[df.columns[:-3]].copy()

# If the last row is all NaN, drop it (guarded so an empty table does not raise)
if len(df) and df.iloc[-1].isnull().all():
    df = df.iloc[:-1].copy()

# For each NaN in the "Subject Course" column, fill it with the value above it.
# `.ffill()` replaces the deprecated `fillna(method="ffill")`.
df["Subject Course"] = df["Subject Course"].ffill()

# Restore the leading characters that tabula dropped, one column at a time.
for col in df.columns:
    df[col] = repair_column(df, col, raw_data_lines)

# Reorder the columns for display and downstream use.
df = df[["Subject Course", "Title", "Section Code", "Type", "Days", "Time", "Instructor", "Units", "Status / (Position)", "BLDG", "Room", "Grade Option"]]
df

Unnamed: 0,Subject Course,Title,Section Code,Type,Days,Time,Instructor,Units,Status / (Position),BLDG,Room,Grade Option
0,DSC 80,Practice of Data Science,A00,LE,TuTh,3:30p-4:50p,"Lau, Samuel Ethan",4.0,Enrolled,CENTR,216,L
1,DSC 80,,A01,DI,F,10:00a-10:50a,,,,CENTR,212,
2,DSC 80,Final Exam,,FI,M 12/11/2023,3:00p-5:59p,,,,TBA,TBA,
3,JAPN 10A,First Year Japanese I,D00,LE,TuTh,9:30a-10:50a,"Iwamoto, Naoki",5.0,Enrolled,CENTR,207,L
4,JAPN 10A,,D01,TU,MWF,9:00a-9:50a,,,,CENTR,202,
5,JAPN 10A,Final Exam,,FI,Th 12/14/2023,8:00a-10:59a,,,,TBA,TBA,
6,MATH 20D,Intro/Differential Equations,B00,LE,MWF,11:00a-11:50a,"Ohm, Ko Woon",4.0,Enrolled,PCYNH,106,L
7,MATH 20D,,B01,DI,Th,6:00p-6:50p,,,,APM,6402,
8,MATH 20D,Lab,,,,,,,,,,
9,MATH 20D,,,LA,TBA,TBA,,,,TBA,TBA,


In [8]:
weekly_df = df.copy()

# Both lists hold row POSITIONS (0-based offsets into `days`), not index labels.
drop_list, non_weekly_events = [], []
days = weekly_df["Days"].tolist()

for i, day in enumerate(days):
    # Rows with no day (NaN/None) or a "TBA" day cannot be scheduled: drop them.
    if pd.isna(day) or day == "TBA":
        drop_list.append(i)
    # A digit in the day means it carries a date, i.e. a one-off event.
    elif any(char.isdigit() for char in day):
        non_weekly_events.append(i)

# Save the non weekly events as non_weekly_df. The copy keeps the original row
# labels but detaches the frame, so later edits to it do not trigger
# SettingWithCopyWarning.
non_weekly_df = weekly_df.iloc[non_weekly_events].copy()

# Keep every row that is neither dropped nor a non weekly event. Selection is
# positional throughout, so it does not depend on `df` having a 0..n-1 index.
excluded_positions = set(drop_list) | set(non_weekly_events)
kept_positions = [i for i in range(len(days)) if i not in excluded_positions]

# Reset the index of weekly_df so the remaining rows are numbered 0..n-1
weekly_df = weekly_df.iloc[kept_positions].reset_index(drop=True)

In [9]:
# For each row in weekly_df, split the "Days" column by uppercase letters into a list.
# Only strings are split, so re-running this cell leaves the already-split
# lists (and any missing values) untouched instead of raising.
weekly_df["Days"] = weekly_df["Days"].apply(
    lambda day: re.findall('[A-Z][^A-Z]*', day) if isinstance(day, str) else day
)

# Split the values in the "Time" column to get the start and end time
# (same re-run guard as above).
weekly_df["Time"] = weekly_df["Time"].apply(
    lambda time: time.split("-") if isinstance(time, str) else time
)

In [10]:
# Label for the node parser to use: the "Subject Course" and "Type" columns
# joined by a space.
node_titles = weekly_df["Subject Course"] + " " + weekly_df["Type"]

# For the "Title" column of weekly_df, if the value is NaN then replace it with
# the Subject Course value and add the type value at the end. This is done for
# the whole column at once and aligned on the index, so it does not rely on the
# index being 0..n-1, and a missing "Type" leaves the title missing instead of
# raising.
weekly_df["Title"] = weekly_df["Title"].fillna(node_titles)

# Add the label as a new column.
weekly_df["Node Title"] = node_titles
weekly_df

Unnamed: 0,Subject Course,Title,Section Code,Type,Days,Time,Instructor,Units,Status / (Position),BLDG,Room,Grade Option,Node Title
0,DSC 80,Practice of Data Science,A00,LE,"[Tu, Th]","[3:30p, 4:50p]","Lau, Samuel Ethan",4.0,Enrolled,CENTR,216,L,DSC 80 LE
1,DSC 80,DSC 80 DI,A01,DI,[F],"[10:00a, 10:50a]",,,,CENTR,212,,DSC 80 DI
2,JAPN 10A,First Year Japanese I,D00,LE,"[Tu, Th]","[9:30a, 10:50a]","Iwamoto, Naoki",5.0,Enrolled,CENTR,207,L,JAPN 10A LE
3,JAPN 10A,JAPN 10A TU,D01,TU,"[M, W, F]","[9:00a, 9:50a]",,,,CENTR,202,,JAPN 10A TU
4,MATH 20D,Intro/Differential Equations,B00,LE,"[M, W, F]","[11:00a, 11:50a]","Ohm, Ko Woon",4.0,Enrolled,PCYNH,106,L,MATH 20D LE
5,MATH 20D,MATH 20D DI,B01,DI,[Th],"[6:00p, 6:50p]",,,,APM,6402,,MATH 20D DI
6,MATH 140A,Foundations of Real Analysis I,B00,LE,"[M, W, F]","[2:00p, 2:50p]","Sheng, Hongyi",4.0,Enrolled,WLH,2111,L,MATH 140A LE
7,MATH 140A,MATH 140A DI,B01,DI,[Tu],"[6:00p, 6:50p]",,,,PODEM,133,,MATH 140A DI
8,MATH 173A,Optimization/Data Science I,A00,LE,"[Tu, Th]","[2:00p, 3:20p]","Cloninger, Alexander",4.0,Planned,CENTR,119,L,MATH 173A LE
9,MATH 173A,MATH 173A DI,A02,DI,[F],"[6:00p, 6:50p]",,,,APM,5402,,MATH 173A DI


In [11]:

# The "Days" column only exists before this cell has run (it is renamed to
# "Date" below), so the guard makes the cell safe to run more than once.
if "Days" in non_weekly_df.columns:
    non_weekly_df = (
        non_weekly_df
        .assign(
            # Format the "Days" column to only contain the date: values look
            # like "M 12/11/2023", so keep the second space-separated token.
            Days=lambda frame: frame["Days"].str.split(" ").str[1],
            # Split "start-end" into a [start, end] list.
            Time=lambda frame: frame["Time"].str.split("-"),
        )
        # Rename the "Days" into "Date"
        .rename(columns={"Days": "Date"})
        # Drop the columns that are not kept for non weekly events.
        .drop(columns=["Section Code", "Instructor", "Units", "Status / (Position)", "Grade Option"])
    )
non_weekly_df

Unnamed: 0,Subject Course,Title,Type,Date,Time,BLDG,Room
2,DSC 80,Final Exam,FI,12/11/2023,"[3:00p, 5:59p]",TBA,TBA
5,JAPN 10A,Final Exam,FI,12/14/2023,"[8:00a, 10:59a]",TBA,TBA
10,MATH 20D,Midterm,MI,10/27/2023,"[7:00p, 7:50p]",PCYNH,106
11,MATH 20D,Midterm,MI,11/29/2023,"[7:00p, 7:50p]",PCYNH,106
12,MATH 20D,Final Exam,FI,12/09/2023,"[8:00a, 10:59a]",TBA,TBA
15,MATH 140A,Final Exam,FI,12/13/2023,"[3:00p, 5:59p]",TBA,TBA
18,MATH 173A,Final Exam,FI,12/14/2023,"[3:00p, 5:59p]",TBA,TBA
