In [175]:
import pandas as pd
import os

# Define a function to extract semester and year from file name
def extract_semester_year(filename):
    """Parse the semester code and four-digit year from a schedule file name.

    The part of the base name before the first ``.`` must end with a
    two-character semester code followed by a two-digit year, for example
    ``'schedule_fa19.csv'`` -> ``('FA', 2019)``.

    Args:
        filename: File name of a schedule export. A leading directory part,
            if present, is ignored.

    Returns:
        A ``(semester, year)`` tuple: the upper-cased two-character semester
        code and the year as an ``int`` in the 2000s.

    Raises:
        ValueError: If the name is too short to hold a semester code and a
            year, or if its last two characters are not decimal digits.
    """
    # Only the text before the first dot carries the semester/year suffix.
    stem = os.path.basename(filename).split('.')[0]
    suffix = stem[-4:]

    # Reject names such as '.DS_Store' or 'fa_1.csv' up front; without this
    # check they would yield a nonsense semester/year instead of an error.
    if len(suffix) < 4 or not suffix[2:].isdecimal():
        raise ValueError(
            f"Cannot extract semester and year from file name {filename!r}: "
            "expected the name to end with a two-character semester code "
            "followed by a two-digit year (e.g. 'fa19.csv')."
        )

    semester = suffix[:2].upper()
    year = 2000 + int(suffix[2:])
    return semester, year

# Directory holding one schedule CSV per semester
DATA_DIR = './data'

# Create an empty list to store dataframes
dfs = []

# os.listdir() returns entries in arbitrary order, so sort them to make the row
# order of the combined dataframe reproducible between runs and machines.
for filename in sorted(os.listdir(DATA_DIR)):
    # Skip anything that is not a CSV (e.g. .DS_Store, .ipynb_checkpoints),
    # which pd.read_csv would otherwise choke on.
    if not filename.lower().endswith('.csv'):
        continue
    # Read the CSV file into a dataframe
    df = pd.read_csv(os.path.join(DATA_DIR, filename))
    # Tag every row with the semester and year encoded in the file name
    semester, year = extract_semester_year(filename)
    df['Semester'] = semester
    df['Year'] = year
    # Append the modified dataframe to the list
    dfs.append(df)

# Concatenate all dataframes into a single dataframe; columns that are missing
# from a given file are filled with NaN for that file's rows.
combined_df = pd.concat(dfs, ignore_index=True)

# Coalesce the differently-named header variants into one canonical column each.
# Every entry is (target column, source columns in priority order): for each row
# the first non-null value across the sources wins. Note that 'Instructors' is a
# brand-new column built purely from its two source columns.
# NOTE(review): the 'Unnamed: N' sources are presumably header-less columns in
# some of the input files -- confirm against the raw CSVs.
column_merges = [
    ('Location', [
        'Location',
        'Location: MAC\n(unless noted\notherwise)',
        'Location: MAC\r\n(unless noted\r\notherwise)',
        'Location: MAC\n(unless noted otherwise)',
        'Location for Instr Delivery or Staging of Materials',
    ]),
    ('Instructors', ['Instructor / Teaching Team', 'Instructor']),
    ('Credits', ['Credits', 'Credit s']),
    ('Notes', ['Notes', 'Registration Notes']),
    ('Section #', ['Section #', 'Sect #', 'Sec #']),
    ('Curriculum Category', [
        'Curriculum Category',
        'Curriculum Notes',
        'Degree Requirement Note',
        'Unnamed: 11',
        'Unnamed: 12',
        'Unnamed: 13',
        'Curriculum Role',
    ]),
]
for target, sources in column_merges:
    merged = combined_df[sources[0]]
    for fallback in sources[1:]:
        merged = merged.fillna(combined_df[fallback])
    combined_df[target] = merged

# Drop the variant columns now that their values live in the canonical columns
redundant_columns = [
    'Location: MAC\n(unless noted\notherwise)',
    'Location: MAC\r\n(unless noted\r\notherwise)',
    'Location: MAC\n(unless noted otherwise)',
    'Location for Instr Delivery or Staging of Materials',
    'Instructor / Teaching Team',
    'Instructor',
    'Credit s',
    'Unnamed: 11',
    'Unnamed: 12',
    'Unnamed: 13',
    'Curriculum Role',
    'Curriculum Notes',
    'Degree Requirement Note',
    'Registration Notes',
    'Sect #',
    'Sec #',
]
combined_df = combined_df.drop(columns=redundant_columns)

# Keep only the canonical columns, in presentation order
final_columns = [
    'Semester',
    'Year',
    'Area',
    'Course #',
    'Course Title',
    'Time',
    'Location',
    'Credits',
    'Enroll Limits',
    'Instructors',
    'Section #',
    'Waitlist',
    'Curriculum Category',
    'Notes',
    'Delivery Mode',
]
combined_df = combined_df[final_columns]

# Write the combined dataframe to a CSV file
# combined_df.to_csv('combined_schedule.csv', index=False)

In [176]:
# column = 'Curriculum Role'
# print(combined_df[combined_df[column].notnull()][column])

In [177]:
# Tally the missing (NaN) entries in each column of the combined schedule
num_missing = combined_df.isna().sum()
print(num_missing)
# combined_df["Location"].value_counts(dropna = False)

Semester                  0
Year                      0
Area                    143
Course #                 10
Course Title             11
Time                     73
Location                168
Credits                  22
Enroll Limits            85
Instructors              17
Section #                10
Waitlist                688
Curriculum Category     448
Notes                   637
Delivery Mode          1149
dtype: int64


In [178]:
# Preview the first five rows of the combined schedule as a sanity check
combined_df.head(n=5)

Unnamed: 0,Semester,Year,Area,Course #,Course Title,Time,Location,Credits,Enroll Limits,Instructors,Section #,Waitlist,Curriculum Category,Notes,Delivery Mode
0,FA,2019,AHS,AHSE0112,AHSE0112: The Olin Conductorless Orchestra,T 7:30pm-9:00pm; R 6:30-9:00pm,AC318,1,26,"Dabby, Diana",1,"yes, small",AHS Elective,,
1,FA,2019,AHS,AHSE2170,AHSE2170: Teaching and Learning in Undergradua...,T 12:50-3:15pm; T 6:30pm-7:15pm,"CC209, 211\r\nCrescent Room",4,15,"Zastavker, Yevgeniya; Burger, Jordyn",1,"yes, large",AHS Elective,,
2,FA,2019,AHS,AHSE2170,AHSE2170: Teaching and Learning in Undergradua...,T 12:50-3:15pm; W 6:30pm-7:15pm,"CC209, 211\r\nCrescent Room",4,15,"Zastavker, Yevgeniya; Burger, Jordyn",2,"yes, large",AHS Elective,,
3,FA,2019,AHS,AHSE3130,AHSE3130: Advanced Digital Photography,TF 1:30-3:10pm,AC313,4,12,"Donis-Keller, Helen",1,"yes, small",AHS Elective,,
4,FA,2019,AHS,AHSE3190,"AHSE3190: Arts, Humanities, Social Science Pre...",,,1,50,"Epstein, Gillian",1,,AHS Capstone Prereq,,
