#### Phase 1: Data Import and Preparation

In [None]:
# Import necessary libraries and packages
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
from datetime import datetime
import warnings
warnings.filterwarnings('ignore')

# Open the workbook so its sheet names can be inspected
file_path = "../data/raw/df.xlsx"
data = pd.ExcelFile(file_path)

# Every sheet is a course except the two metadata sheets named below
metadata_sheets = ['List of Courses', 'Data Points Defined']
course_list = [name for name in data.sheet_names if name not in metadata_sheets]

print(f"There are {len(course_list)} courses in this excel file, which are:")
for position, course_name in enumerate(course_list, start=1):
    print(f"{position}. {course_name}")

#### Phase 2: Combine all courses (sheets) into one sheet

This phase combines all course sheets into a single DataFrame and then parses each sheet name into a course code, term, and year. Course codes are accepted with or without a space (e.g., "XXXX2000" or "XXXX 2000").

In [None]:
# Combining all courses (sheets)

# `data` is still the open pd.ExcelFile at this point. Keep a second handle to
# it so the workbook can be closed after `data` is rebound to the DataFrame;
# otherwise the file handle is left open with no name referring to it.
workbook = data

all_courses = []
for course in course_list:
    course_df = workbook.parse(course)
    # Remember which sheet each row came from
    course_df['course_identifier'] = course
    all_courses.append(course_df)

data = pd.concat(all_courses, ignore_index=True)

# Close only after a successful combine, so a failed run can be retried
# against the still-open workbook.
workbook.close()

In [None]:
# Preview the first five rows of the combined DataFrame
data.head(n=5)

In [None]:
def parse_course_identifier(df, col_name):
    """Split a course identifier column into course code, term and year.

    Each value of ``df[col_name]`` is expected to look like
    ``"XXXX2000 Fall 2021"`` or ``"XXXX 2000 Fall 2021"``: upper-case letters,
    an optional single whitespace character, digits, a term made of letters,
    and a four-digit year. Leading and trailing whitespace is ignored.

    Args:
        df: DataFrame holding the identifier column. It is modified in place.
        col_name: Name of the column containing the identifiers.

    Returns:
        The same ``df`` object with three added (or overwritten) columns:
        ``course_code`` (letters and digits joined by one space), ``term``
        and ``year`` (kept as a string). Missing or non-matching identifiers
        get ``'Unknown'`` in all three columns.
    """
    new_columns = ['course_code', 'term', 'year']
    unknown = ('Unknown', 'Unknown', 'Unknown')
    # Compiled once here instead of being looked up again for every row
    pattern = re.compile(r'^([A-Z]+)\s?(\d+)\s+([A-Za-z]+)\s+(\d{4})$')

    def parse_single(identifier):
        """Return a (course_code, term, year) tuple for one identifier."""
        if pd.isna(identifier):
            return unknown

        match = pattern.match(str(identifier).strip())
        if match is None:
            return unknown

        subject, number, term, year = match.groups()
        return (f"{subject} {number}", term, year)

    parsed_rows = [parse_single(value) for value in df[col_name]]

    # Building the frame with explicit columns and index keeps the shape at
    # (n_rows, 3) even when there are no rows; Series.apply would return an
    # empty Series there and the three-column assignment would raise.
    parsed = pd.DataFrame(parsed_rows, columns=new_columns, index=df.index)
    df[new_columns] = parsed
    return df

In [None]:
def standardize_columns(df):
    """Rename the columns of ``df`` in place to lower-case names with underscores.

    Column names are lower-cased, then every space and every hyphen is
    replaced by an underscore. The same ``df`` object is returned.
    """
    names = df.columns.str.lower()
    for separator in (' ', '-'):
        names = names.str.replace(separator, '_')
    df.columns = names
    return df

In [None]:
# Parse the sheet names into course_code / term / year, then standardize the column names
data = parse_course_identifier(data, 'course_identifier')
data = standardize_columns(data)
df = data  # `df` and `data` refer to the same DataFrame
data['course_code'].unique()

In [None]:
# Print a numbered list of the column names
print("\nList of new dataset columns:")
for position, column_name in enumerate(data.columns, start=1):
    print(f"{position}. {column_name}")
                

In [None]:
# Preview the first five rows of the DataFrame
data.head(n=5)

#### Phase 3: Data Quality Assessment

- Look for negative and missing values, and spot outliers by checking each column's range (min, max).
- Write a summary of what the table shows.

In [None]:
# Build and display one summary row per column: missing-value count,
# negative-value count, numeric min/max and dtype
summary_table = []

for col in data.columns:
    column = data[col]
    # Values that cannot be read as numbers become NaN
    as_numeric = pd.to_numeric(column, errors='coerce')
    lowest, highest = as_numeric.min(), as_numeric.max()

    row = {
        'Column Name': col,
        'Empty Value': column.isna().sum(),
        'Negative Value': (as_numeric < 0).sum(),
        'Min Value': 'N/A' if pd.isna(lowest) else f'{lowest:.2f}',
        'Max Value': 'N/A' if pd.isna(highest) else f'{highest:.2f}',
        'Column Data Type': column.dtype,
    }
    summary_table.append(row)

summary = pd.DataFrame(summary_table)
display(summary)

#### Phase 4: Analysis


