In [1]:
 #=== IMPORT LIBRARIES ===
import pandas as pd

# === LOAD DATASET ===
df = pd.read_csv("Final_Education_Dataset.csv")

# Drop index column if exists
df.drop(columns=["Unnamed: 0"], errors="ignore", inplace=True)

In [2]:
# === HANDLE MISSING DATA ===

# Drop columns with more than 90% missing values
missing_ratio = df.isnull().mean()
columns_to_drop = missing_ratio[missing_ratio > 0.90].index.tolist()
df.drop(columns=columns_to_drop, inplace=True)

# Convert REF_DATE to datetime and sort
df["REF_DATE"] = pd.to_datetime(df["REF_DATE"], format="%d-%m-%Y", errors='coerce')
df.sort_values(by=["GEO", "REF_DATE"], inplace=True)

# Interpolate numeric columns only within each GEO group
numeric_cols = df.select_dtypes(include="number").columns
df[numeric_cols] = df.groupby("GEO")[numeric_cols].transform(lambda group: group.interpolate(method='linear'))

# Fill remaining missing values with group-wise (province-level) mean
df[numeric_cols] = df.groupby("GEO")[numeric_cols].transform(lambda group: group.fillna(group.mean()))

# Final fallback: fill any remaining NaNs with overall column means
df[numeric_cols] = df[numeric_cols].fillna(df[numeric_cols].mean())


In [3]:
# === FEATURE ENGINEERING ===

# Avoid division by zero
df.replace(0, pd.NA, inplace=True)

# 1. Educator-to-operating spending ratio
df["Educator_to_OperatingSpending"] = df["Total, work status"] / df["Total operating expenditures"]

# 2. Salary-to-EPI ratio
df["Salary_to_EPI"] = df["Teachers salaries"] / df["Education price index (EPI)"]

# 3. Operational spend per educator
df["OpSpend_per_Educator"] = df["Total operating expenditures"] / df["Total, work status"]

# 4. Education Access Index: average of participation rates
participation_cols = ["College", "Elementary and/or High School", "University"]
if all(col in df.columns for col in participation_cols):
    df["Education_Access_Index"] = df[participation_cols].mean(axis=1)

# 5. Capital efficiency
if "Teachers' salaries sub-index" in df.columns and "Total expenditures" in df.columns:
    df["Capital_Efficiency"] = df["Total expenditures"] / df["Teachers' salaries sub-index"]


In [5]:
df.head()

Unnamed: 0,REF_DATE,GEO,Full-time educators,Part-time educators,"Total, work status",Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,...,Total expenditures,Total operating expenditures,College,Elementary and/or High School,University,Educator_to_OperatingSpending,Salary_to_EPI,OpSpend_per_Educator,Education_Access_Index,Capital_Efficiency
0,01-01-1947,Alberta,31970.454545,8851.636364,40822.227273,123.073208,147.908302,148.026226,138.573962,111.846226,...,528.0,528.0,10.517241,5.689655,20.275862,77.314824,1.511296,0.012934,12.16092,4.36559
11,01-01-1948,Alberta,31970.454545,8851.636364,40822.227273,123.073208,147.908302,148.026226,138.573962,111.846226,...,5432.666667,499.083333,10.517241,5.689655,20.275862,81.794411,0.826067,0.012226,12.16092,44.918174
22,01-01-1949,Alberta,31970.454545,8851.636364,40822.227273,123.073208,147.908302,148.026226,138.573962,111.846226,...,1050.0,1050.0,10.517241,5.689655,20.275862,38.878312,2.49445,0.025721,12.16092,8.681571
33,01-01-1950,Alberta,31970.454545,8851.636364,40822.227273,123.073208,147.908302,148.026226,138.573962,111.846226,...,1072.0,1072.0,10.517241,5.689655,20.275862,38.080436,2.673206,0.02626,12.16092,8.863471
44,01-01-1951,Alberta,31970.454545,8851.636364,40822.227273,123.073208,147.908302,148.026226,138.573962,111.846226,...,1116.0,1116.0,10.517241,5.689655,20.275862,36.579057,2.591953,0.027338,12.16092,9.22727


In [6]:
df.sort_values(by="REF_DATE", inplace=True)

In [7]:
df.head()

Unnamed: 0,REF_DATE,GEO,Full-time educators,Part-time educators,"Total, work status",Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,...,Total expenditures,Total operating expenditures,College,Elementary and/or High School,University,Educator_to_OperatingSpending,Salary_to_EPI,OpSpend_per_Educator,Education_Access_Index,Capital_Efficiency
0,01-01-1947,Alberta,31970.454545,8851.636364,40822.227273,123.073208,147.908302,148.026226,138.573962,111.846226,...,528.0,528.0,10.517241,5.689655,20.275862,77.314824,1.511296,0.012934,12.16092,4.36559
5,01-01-1947,Newfoundland and Labrador,5406.272727,1131.545455,6538.772727,121.117736,149.788302,148.026226,133.266792,123.15,...,2532.5,2532.5,12.517241,4.827586,26.965517,2.581944,5.519423,0.387305,14.770115,21.233638
6,01-01-1947,Nova Scotia,9786.954545,15979.881818,9787.5,115.090943,149.907736,148.026226,129.900943,116.503208,...,3520.75,3520.75,9.172414,6.689655,26.689655,2.779947,8.25217,0.359719,14.183908,31.178372
4,01-01-1947,New Brunswick,7382.318182,317.454545,7699.772727,119.046226,151.00434,148.026226,130.646604,117.013962,...,1544.25,1544.25,9.37931,5.344828,23.551724,4.986092,3.252938,0.200558,12.758621,13.261722
7,01-01-1947,Ontario,129052.909091,23818.363636,152870.454545,124.026038,149.176792,148.026226,141.104528,114.254151,...,4509.0,4509.0,14.482759,8.482759,27.551724,33.903405,9.925335,0.029496,16.83908,37.250987


In [9]:
# === FINAL CLEANUP ===
df["REF_DATE"] = pd.to_datetime(df["REF_DATE"], errors='coerce')  # Ensure REF_DATE is in datetime format
df["REF_DATE"] = df["REF_DATE"].dt.strftime("%d-%m-%Y")

# === EXPORT CLEANED AND ENHANCED DATA ===
df.to_csv("Enhanced_Education_Dataset.csv", index=False)
print("✅ Cleaned and enhanced dataset saved as 'Enhanced_Education_Dataset.csv'")

✅ Cleaned and enhanced dataset saved as 'Enhanced_Education_Dataset.csv'
