In [1]:
import numpy as np
import pandas as pd

In [2]:
# Both enrolment-number sheets live in the same workbook, so the path is shared.
enrol_numbers_workbook = "Raw Data/Course Enrollment Numbers.xlsx"

student_data = pd.read_excel("Raw Data/Anon Enrollment Data.xlsx")
enrol_nums_data = pd.read_excel(enrol_numbers_workbook, sheet_name="Course Numbers")
enrol_nums_additional = pd.read_excel(enrol_numbers_workbook, sheet_name="abbreviated course names")
timetable_data = pd.read_excel("Raw Data/School of Mathematics - Timetable Data.xlsx")
room_data = pd.read_excel("Raw Data/Timetabling KB Rooms_5205.xlsx")

In [3]:
# Distinct values of each enrolment column of interest, unpacked in the order listed below.
(
    unique_students,
    unique_years,
    unique_degrees,
    unique_schools,
    unique_years_taken,
    unique_course_codes,
    unique_courses,
) = (
    student_data[column].unique()
    for column in (
        "UUN",
        "Year Of Programme",
        "Programme Of Study Sought Title",
        "Programme School Name",
        "Normal Year Taken",
        "Course Code",
        "Course Name",
    )
)

In [4]:
# Column labels of the enrolment-numbers sheet as a plain Python list.
enrol_nums_cols = enrol_nums_data.columns.tolist()

In [5]:
# Column labels of the student enrolment sheet as a plain Python list.
student_cols = student_data.columns.tolist()

In [6]:
# Keep only enrolment rows whose course code starts with "MATH".
# na=False: a missing course code makes .str.startswith return NaN, and a mask
# containing NaN cannot be used for boolean indexing.
is_math_course = student_data["Course Code"].str.startswith("MATH", na=False)
math_courses_only = student_data[is_math_course]
math_courses = math_courses_only["Course Name"].unique()

enrol_nums_courses = list(enrol_nums_data["Course"])
# Maths course names present in the student data but absent from the enrolment-numbers sheet.
# Membership is tested against a set so each lookup does not rescan the whole list.
known_course_names = set(enrol_nums_courses)
uncommon_maths_courses = [i for i in math_courses if i not in known_course_names]

In [7]:
# Display the maths course names that have no matching "Course" entry in enrol_nums_data.
uncommon_maths_courses

['Dissertation (CAM)',
 'Dissertation (CMF)',
 'Dissertation (FMO)',
 'Dissertation (Op. Res.)',
 'Dissertation (SDS)',
 'Dissertation (Statistics)',
 'Dissertation (Stats. Op. Res.)',
 'Mathematics Dissertation',
 'Mathematics Project',
 'Mathematics Reading Course - S1',
 'Mathematics Reading Course - S2',
 'MIGS: Advanced Course 1',
 'MIGS: Asymptotic and Analytical Methods',
 'MIGS: Computational Methods for Data Driven Modelling',
 'MIGS: Continuum Mechanics',
 'MIGS: Dynamical Systems and Conservation Laws',
 'MIGS: Elliptic and Parabolic PDEs',
 'MIGS: Extended Project',
 'MIGS: Foundations of Probability',
 'MIGS: Functional Analysis',
 'MIGS: Mathematical Modelling and Applied Analysis',
 'MIGS: Measure and Integration',
 'MIGS: Modern Regression and Bayesian Methods',
 'MIGS: Numerical Methods',
 'MIGS: Project 1',
 'MIGS: Project 2',
 'MIGS: Regression and Simulation Methods',
 'MIGS: Stochastic Processes',
 'Project in Mathematics (Double)']

In [8]:
# Read the abbreviation key and keep one stripped entry per non-blank line.
# The `with` block closes the file on exit, so no explicit close() call is needed.
with open('Keys/course_abbreviation_key.txt', 'r') as file:
    abbr_key = [line.strip() for line in file.read().split("\n") if line.strip()]

In [9]:
# Independent copy, so the added abbreviation column does not touch enrol_nums_data.
enrol_nums_copy = enrol_nums_data.copy(deep=True)

In [10]:
# Every key entry is split on ' = ' into exactly three fields
# (abbreviation, description, code); the abbreviation is indexed by the code.
abbr_map = {
    code: abbreviation
    for abbreviation, _description, code in (key_line.split(' = ') for key_line in abbr_key)
}

# Codes missing from the key are left as NaN by Series.map.
enrol_nums_copy["Abbreviated Course Name"] = enrol_nums_copy["Code"].map(abbr_map)

In [11]:
# Sanity check: the derived column and the workbook column have the same number of rows.
enrol_nums_copy["Abbreviated Course Name"].shape[0] == enrol_nums_additional["Abbreviated Course Name"].shape[0]

True

In [12]:
# Check that the abbreviations derived from the key file match the ones stored in the workbook.
# Series.all() collapses a whole column to a single truth value, so comparing two .all()
# results says nothing about whether individual rows agree. Series.equals compares the two
# columns position by position and treats NaNs in the same location as equal.
enrol_nums_copy["Abbreviated Course Name"].equals(enrol_nums_additional["Abbreviated Course Name"])

True

# Changing Room Data

In [13]:
# Work on a copy so room_data keeps the headers read from the workbook.
room_copy = room_data.copy()

# Replacement headers, assigned to the columns by position.
new_room_cols = [
    "campus",
    "building",
    "room_name",
    "capacity",
    "ownership",
    "gt_or_ts",
    "school_priority",
    "room_layout",
    "furniture_config",
    "control_system",
    "desktop",
    "induction_system",
    "lec_recording",
    "microphone",
    "pres_facilities",
    "sound_system",
    "wall_mounted_writing",
]

room_copy.columns = new_room_cols

In [14]:
# Distinct values of the categorical room columns, unpacked in the order listed.
unique_campuses, unique_buildings, unique_rooms, unique_ownership = (
    room_copy[column].unique()
    for column in ("campus", "building", "room_name", "ownership")
)

In [15]:
# Abbreviate the campus name when the sheet covers exactly one campus.
# The sole campus name is read with unique_campuses[0]. Calling .all() on the array only
# yields that name through NumPy's legacy object-array reduction; where .all() returns a
# plain boolean instead, the comparison with the string is never true and the rename is
# silently skipped.
if len(unique_campuses) == 1:
    if unique_campuses[0] == "King's Buildings Campus":
        room_copy["campus"] = room_copy["campus"].replace("King's Buildings Campus", "KB")
else:
    print("There is more than one campus in the dataset.")


# Short codes for the building names.
building_replacements = {
    "Alrick": "ALR",
    "Ashworth": "ASH",
    "Daniel Rutherford Building": "DRB",
    "Eng Sanderson Building": "ESB",
    "Grant Institute": "GRA",
    "Hudson Beare Building": "HBB",
    "Joseph Black Building": "JBB",
    "JCMB": "JCMB",
    "Murchison House": "MH",
    "Nucleus": "NUC",
    "Swann Building": "SB",
}

# Snake-case labels for the ownership values.
allocation_replacements = {
    "1. Centrally Allocated Space": "centrally_allocated",
    "1. Locally Allocated Space": "locally_allocated",
}

# Two-letter codes for the room type values.
gt_ts_replacements = {
    "2. General Teaching": "GT",
    "2. Teaching Studio": "TS",
}

# Values without an entry in the mapping are left unchanged by Series.replace.
for column, mapping in (
    ("building", building_replacements),
    ("ownership", allocation_replacements),
    ("gt_or_ts", gt_ts_replacements),
):
    room_copy[column] = room_copy[column].replace(mapping)

In [16]:
# Split each value on " - " and keep the second piece; values without the separator become NaN.
for column in ("school_priority", "room_layout"):
    room_copy[column] = room_copy[column].str.split(" - ").str[1]

In [17]:
# Short labels for the school that has priority on a room.
priority_replacements = {
    "Engineering": "eng",
    "Biological": "bio",
    "Geosciences": "geo",
    "Chemistry": "chem",
    "Mathematics": "maths",
    "Physics and Astronomy": "phys",
    "Mathematics/ Physics and Astronomy": "maths_phys",
}

# Short labels for the room layout.
layout_replacements = {
    "Classroom Style": "classroom",
    "Theatre Style": "theatre",
    "Boardroom Style": "boardroom",
}

# Missing priorities get an explicit "no priority" label before the names are shortened.
school_priority = room_copy["school_priority"].fillna("no priority")
room_copy["school_priority"] = school_priority.replace(priority_replacements)

room_copy["room_layout"] = room_copy["room_layout"].replace(layout_replacements)

In [18]:
# Split on a literal dot followed by a space and keep the second piece.
# The pattern is a raw string: "\." in an ordinary string literal is an invalid escape
# sequence, which recent Python versions warn about at compile time.
room_copy["furniture_config"] = room_copy["furniture_config"].str.split(r"\. ").str[1]

In [19]:
# Snake-case labels for the furniture configurations; unmapped values stay as they are.
furniture_replacements = {
    "Tables and Chairs": "tables_chairs",
    "Flexible seating (not tables)": "flex_seating",
    "Retractable Seating": "retract_seating",
}
furniture_config = room_copy["furniture_config"]
room_copy["furniture_config"] = furniture_config.replace(furniture_replacements)

In [20]:
# Display the distinct furniture labels left after the replacement.
room_copy["furniture_config"].unique().tolist()

['tables_chairs', 'flex_seating', nan, 'retract_seating']

In [21]:
true_false_cols = ["control_system", "desktop", "induction_system", "lec_recording", "microphone", "pres_facilities", "sound_system", "wall_mounted_writing"]


def _has_value(cell):
    """Return True when a facility cell holds something other than blank or missing.

    Booleans are passed through unchanged, so re-running this cell on columns that were
    already converted neither fails nor flips their values. Any other non-missing value is
    converted to text before the whitespace check, so numeric cells do not raise.
    """
    if isinstance(cell, (bool, np.bool_)):
        return bool(cell)
    if pd.isna(cell):
        return False
    return bool(str(cell).strip())


# Turn each facility column into a boolean flag.
for col in true_false_cols:
    room_copy[col] = room_copy[col].apply(_has_value)

In [22]:
dummy_enc_cols = []

# One indicator frame per categorical column. The room type and ownership frames are built
# without a prefix, so their columns are named after the category values themselves.
room_furniture_encoded = pd.get_dummies(room_copy["furniture_config"], prefix="furniture")
room_layout_encoded = pd.get_dummies(room_copy["room_layout"], prefix="layout")
gt_ts_encoded = pd.get_dummies(room_copy["gt_or_ts"])
ownership_encoded = pd.get_dummies(room_copy["ownership"])
priority_encoded = pd.get_dummies(room_copy["school_priority"], prefix="priority")

encoded_frames = [
    room_furniture_encoded,
    room_layout_encoded,
    gt_ts_encoded,
    ownership_encoded,
    priority_encoded,
]
to_drop = ["furniture_config", "room_layout", "gt_or_ts", "ownership", "school_priority"]

# Append the indicator columns, then remove the categorical columns they were built from.
room_copy = pd.concat([room_copy, *encoded_frames], axis=1).drop(columns=to_drop)

In [23]:
# Bind the encoded frame to a descriptive name; this is the same object as room_copy, not a copy.
encoded_room_data = room_copy

# Write the encoded room table to Excel without the DataFrame index column.
encoded_room_data.to_excel('Processed Data/encoded_room_data.xlsx', index=False)

# Cleaning Timetabling Data

In [51]:
# NOTE(review): the following cell rebinds timetable_copy to timetable_data.rename(...),
# so the copy made here is replaced before it is used.
timetable_copy = timetable_data.copy()

In [52]:
# Rename the course columns to "Course" / "Code"; rename() returns a new frame and leaves
# timetable_data untouched.
timetable_column_names = {"Course Name": "Course", "Course Code": "Code"}
timetable_copy = timetable_data.rename(columns=timetable_column_names)

In [53]:
# Attach delivery period and normal year taken to every timetable row, matched on "Code".
enrolment_details = enrol_nums_additional[["Delivery Period", "Normal Year Taken", "Code"]]
timetable_copy = timetable_copy.merge(enrolment_details, how="left", on="Code")

# Display whether the merge left the number of timetable rows unchanged.
len(timetable_copy) == len(timetable_data)

True

In [54]:
# Look up each course code in abbr_map; codes without an entry map to NaN.
timetable_copy["Abbreviated Course Name"] = timetable_copy["Code"].map(abbr_map)

def split_on_star(entry):
    """Return the segment of ``entry`` that follows its first ``*``.

    Only the text between the first and second ``*`` is kept. Values that are not strings,
    and strings that contain no ``*``, are returned unchanged.
    """
    if not isinstance(entry, str):
        return entry
    segments = entry.split('*')
    return segments[1] if len(segments) > 1 else entry

# Columns cleaned with split_on_star.
cols_w_star = ["Activity Type Name", "Delivery Semester", "Zone Name"]

for star_col in cols_w_star:
    cleaned = timetable_copy[star_col].apply(split_on_star)
    timetable_copy[star_col] = cleaned

In [55]:
def num_list(entry):
    """Expand a week-pattern value such as ``"1-3,5"`` into ``[1, 2, 3, 5]``.

    Parameters
    ----------
    entry : str, int, float or None
        Comma-separated week numbers and inclusive ``start-end`` ranges. Spreadsheet cells
        holding a single number may arrive as ``int``/``float`` rather than ``str``, and
        blank cells as ``None``/NaN; all of these are accepted.

    Returns
    -------
    list of int
        The week numbers in the order written. Missing input gives an empty list.

    Raises
    ------
    ValueError
        If a part is not an integer or a ``start-end`` pair, or a float is not whole.
    """
    # Blank cells: None, or NaN (the only float that is not equal to itself).
    if entry is None or (isinstance(entry, float) and entry != entry):
        return []
    if isinstance(entry, float):
        if not entry.is_integer():
            raise ValueError(f"week pattern must be a whole number, got {entry!r}")
        return [int(entry)]

    numbers = []
    for part in str(entry).split(','):
        part = part.strip()
        if not part:
            # Tolerate stray separators such as "1,,2" or a trailing comma.
            continue
        if '-' in part:
            start, end = map(int, part.split('-'))
            numbers.extend(range(start, end + 1))
        else:
            numbers.append(int(part))
    return numbers

# Replace each week-pattern value with the list of week numbers produced by num_list.
timetable_copy["Teaching Week Pattern"] = timetable_copy["Teaching Week Pattern"].apply(num_list)


# Build the week-number -> label lookup from "number = label" lines.
week_key = {}
with open("Keys/week_codes.txt", 'r', encoding='utf-8') as file:
    for line in file:
        # Remove WORD JOINER (U+2060) characters, which str.strip() does not treat as whitespace.
        line = line.replace('\u2060', '').strip()
        if not line:  # Skip empty lines
            continue
        # Split on the first '=' only, so a label that itself contains '=' is kept whole.
        week_number, week_label = line.split('=', 1)
        week_key[int(week_number)] = week_label.strip()

def convert_to_strings(numbers):
    """Return the ``week_key`` entry for every item of ``numbers``, in the same order.

    Raises ``KeyError`` when an item has no entry in ``week_key``.
    """
    labels = []
    for week_number in numbers:
        labels.append(week_key[week_number])
    return labels

# Translate every list of week numbers into labels, then collapse each list into one
# comma-separated string.
week_labels = timetable_copy["Teaching Week Pattern"].apply(convert_to_strings)
timetable_copy["Teaching Week Pattern"] = week_labels.apply(", ".join)

In [56]:
# Build {period number: [start, end]} from lines of the form "<number> = [<start>, <end>]".
time_periods = {}
with open('Keys/time_periods.txt', 'r') as file:
    for line in file:
        line = line.strip()
        if not line:
            # A blank line would otherwise fail when unpacking the split result.
            continue
        period, times = line.split('=', 1)
        # Drop the surrounding brackets and any spacing around the comma, so the parse does
        # not depend on the exact whitespace used in the key file.
        start, end = (bound.strip() for bound in times.strip().strip('[]').split(','))
        time_periods[int(period)] = [start, end]
time_periods

{1: ['09:00', '10:00'],
 2: ['10:00', '11:00'],
 3: ['11:00', '12:00'],
 4: ['12:00', '13:00'],
 5: ['13:00', '14:00'],
 6: ['14:00', '15:00'],
 7: ['15:00', '16:00'],
 8: ['16:00', '17:00'],
 9: ['17:00', '18:00']}

In [57]:
def _to_minutes(clock_time):
    """Convert a clock time to minutes after midnight.

    Accepts ``"HH:MM"`` or ``"HH:MM:SS"`` text (seconds are ignored) as well as objects
    exposing ``hour`` and ``minute`` attributes, such as ``datetime.time``.
    """
    if hasattr(clock_time, "hour") and hasattr(clock_time, "minute"):
        return clock_time.hour * 60 + clock_time.minute
    hours, minutes = str(clock_time).strip().split(':')[:2]
    return int(hours) * 60 + int(minutes)


def get_time_periods(start, end, periods=None):
    """Return the period numbers whose time window overlaps the class running start-end.

    Parameters
    ----------
    start, end : str or time-like
        Class start and end time, in any form accepted by ``_to_minutes``.
    periods : dict, optional
        Mapping of period number to ``[period_start, period_end]``. Defaults to the
        module-level ``time_periods``.

    Returns
    -------
    list
        Period numbers in the mapping's iteration order. A period is included when the
        class starts before the period ends and ends after the period starts, so a class
        that merely touches a period boundary does not count as overlapping it.
    """
    if periods is None:
        periods = time_periods

    # The class times do not change between periods, so parse them once.
    class_start = _to_minutes(start)
    class_end = _to_minutes(end)

    # An exact match with a period is already covered by the overlap test for any period
    # of positive length, so no separate equality check is needed.
    return [
        period
        for period, (period_start, period_end) in periods.items()
        if class_start < _to_minutes(period_end) and class_end > _to_minutes(period_start)
    ]

def _row_time_periods(row):
    """Return the time periods covered by one timetable row's scheduled start/end times."""
    return get_time_periods(row['Scheduled Start Time'], row['Scheduled End Time'])


# Evaluate every row and store the resulting period lists in a new column.
timetable_copy['time_periods'] = timetable_copy.apply(_row_time_periods, axis=1)

In [58]:
# Write the processed timetable to Excel without the DataFrame index column.
timetable_copy.to_excel("Processed Data/encoded_timetable_data.xlsx", index = False)