In [4]:
import pandas as pd
import numpy as np
from prophet import Prophet
from functools import reduce

# Read the raw education dataset from disk.
file_path = "Enhanced_Education_Dataset.csv"
edu_df = pd.read_csv(file_path)

# Parse the day-month-year REF_DATE strings, derive the calendar year from them,
# and rename the Province column to GEO.
edu_df = edu_df.assign(REF_DATE=pd.to_datetime(edu_df['REF_DATE'], format='%d-%m-%Y'))
edu_df = edu_df.assign(Year=edu_df['REF_DATE'].dt.year)
edu_df = edu_df.rename(columns={'Province': 'GEO'})

# Keep only observations dated 1991 or later.
edu_df = edu_df.loc[edu_df['Year'].ge(1991)]

# Every numeric column other than the date/region keys is a forecasting target.
exclude_cols = {'REF_DATE', 'Year', 'GEO'}
candidate_cols = [name for name in edu_df.columns if name not in exclude_cols]
forecast_columns = [name for name in candidate_cols if pd.api.types.is_numeric_dtype(edu_df[name])]

# One (initially empty) list of per-province forecast frames per target column.
forecast_by_column = dict((name, []) for name in forecast_columns)

# Forecast each numeric column by province.
#
# For every (column, GEO) pair the history is upsampled to month-start frequency
# with linear interpolation, values outside the 1.5 * IQR fences are dropped, a
# Prophet model is fitted, and yearly predictions are appended to
# forecast_by_column[column]. Pairs with fewer than 10 usable rows are skipped.

# Forecast for 01-01-YYYY from 2020 to 2035.
# These dates do not depend on the column or the province, so they are built
# once here instead of once per fitted model.
custom_dates = [f"01-01-{year}" for year in range(2020, 2036)]
future_df = pd.DataFrame({'ds': pd.to_datetime(custom_dates, format='%d-%m-%Y')})

for column in forecast_columns:
    # Wide layout: one row per REF_DATE, one column per GEO.
    # (pivot never mutates its input, so no defensive copy is needed.)
    temp_df = edu_df[['REF_DATE', 'GEO', column]]
    pivot_df = temp_df.pivot(index='REF_DATE', columns='GEO', values=column)
    # resample() requires a datetime-like index; this is a no-op when it already is one.
    pivot_df.index = pd.to_datetime(pivot_df.index)
    monthly_df = pivot_df.resample('MS').interpolate(method='linear')

    for geo in monthly_df.columns:
        # Prophet expects exactly these column names: 'ds' (date) and 'y' (value).
        df_geo = monthly_df[[geo]].reset_index()
        df_geo.columns = ['ds', 'y']

        # Remove outliers using IQR. Rows whose y is NaN fail both comparisons,
        # so they are dropped here as well.
        q1, q3 = df_geo['y'].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        df_geo = df_geo[(df_geo['y'] >= lower) & (df_geo['y'] <= upper)]

        if len(df_geo) < 10:
            continue  # skip if insufficient data

        # Built-in seasonalities are switched off and replaced by two custom
        # ones (periods are given in days).
        model = Prophet(
            yearly_seasonality=False,
            weekly_seasonality=False,
            daily_seasonality=False,
            changepoint_prior_scale=0.5
        )
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
        model.add_seasonality(name='yearly', period=365.25, fourier_order=10)
        model.fit(df_geo)

        forecast = model.predict(future_df)

        result = forecast[['ds', 'yhat']].copy()
        result.columns = ['REF_DATE', column]
        result['GEO'] = geo
        result['REF_DATE'] = result['REF_DATE'].dt.strftime('%d-%m-%Y')  # match original format

        # Replace negative/zero with NaN first (vectorised; NaN > 0 is False,
        # so existing NaN values stay NaN).
        result[column] = result[column].where(result[column] > 0)

        # Fill NaN using linear interpolation
        result[column] = result[column].interpolate(method='linear', limit_direction='both')

        # If still NaN (e.g. all values were bad), fall back to the 5th percentile
        # of the cleaned history.
        # NOTE(review): nothing guarantees this percentile is itself positive.
        fallback_value = df_geo['y'].quantile(0.05)
        result[column] = result[column].fillna(fallback_value)

        forecast_by_column[column].append(result)

# Merge all forecasts: stack the per-province frames of each column into one
# long frame keyed by (REF_DATE, GEO).
merged_forecasts = []
skipped_columns = []
for column in forecast_columns:
    province_frames = forecast_by_column[column]
    if not province_frames:
        # pd.concat raises ValueError on an empty list; this happens when every
        # province of this column was skipped by the forecasting loop.
        skipped_columns.append(column)
        continue
    df = pd.concat(province_frames, ignore_index=True)
    df = df[['REF_DATE', 'GEO', column]]
    merged_forecasts.append(df)

if skipped_columns:
    print(f"Warning: no forecast was produced for {skipped_columns}; omitted from the output.")

if not merged_forecasts:
    # reduce() on an empty sequence would fail with an unhelpful TypeError.
    raise ValueError("No forecasts were produced for any column; nothing to merge or save.")

# Combine all columns into one DataFrame. The inner join keeps only the
# (REF_DATE, GEO) pairs that have a forecast for every remaining column.
final_df = reduce(lambda left, right: pd.merge(left, right, on=['REF_DATE', 'GEO'], how='inner'), merged_forecasts)

# Final formatting. REF_DATE holds day-first strings, whose lexicographic order
# is not chronological in general, so sort on the parsed dates instead.
final_df = (
    final_df
    .assign(_sort_date=pd.to_datetime(final_df['REF_DATE'], format='%d-%m-%Y'))
    .sort_values(by=['GEO', '_sort_date'])
    .drop(columns='_sort_date')
    .reset_index(drop=True)
)
final_df.to_csv("Education_Forecast_2020_2035.csv", index=False)

# Preview
print(final_df.head())
print("✅ Forecast (with filled values) saved to 'Education_Forecast_2020_2035.csv'")


00:32:55 - cmdstanpy - INFO - Chain [1] start processing
00:32:55 - cmdstanpy - INFO - Chain [1] done processing
00:32:55 - cmdstanpy - INFO - Chain [1] start processing
00:32:55 - cmdstanpy - INFO - Chain [1] done processing
00:32:55 - cmdstanpy - INFO - Chain [1] start processing
00:32:56 - cmdstanpy - INFO - Chain [1] done processing
00:32:56 - cmdstanpy - INFO - Chain [1] start processing
00:32:56 - cmdstanpy - INFO - Chain [1] done processing
00:32:56 - cmdstanpy - INFO - Chain [1] start processing
00:32:56 - cmdstanpy - INFO - Chain [1] done processing
00:32:56 - cmdstanpy - INFO - Chain [1] start processing
00:32:56 - cmdstanpy - INFO - Chain [1] done processing
00:32:57 - cmdstanpy - INFO - Chain [1] start processing
00:32:57 - cmdstanpy - INFO - Chain [1] done processing
00:32:57 - cmdstanpy - INFO - Chain [1] start processing
00:32:57 - cmdstanpy - INFO - Chain [1] done processing
00:32:57 - cmdstanpy - INFO - Chain [1] start processing
00:32:57 - cmdstanpy - INFO - Chain [1]

     REF_DATE      GEO  Full-time educators  Part-time educators  \
0  01-01-2020  Alberta         35524.541544          5279.565150   
1  01-01-2021  Alberta         35639.497579          4988.207004   
2  01-01-2022  Alberta         35906.567155          4728.811482   
3  01-01-2023  Alberta         36008.784388          4632.090324   
4  01-01-2024  Alberta         35907.245112          4652.888292   

   Total, work status  Education price index (EPI)  \
0        40814.165459                   165.015826   
1        40537.143097                   165.083508   
2        40401.381353                   165.034848   
3        40449.732004                   165.004076   
4        40514.889260                   165.075646   

   Fees and contractual services sub-index  Instructional supplies sub-index  \
0                               210.366844                        206.529843   
1                               210.191678                        206.126231   
2                         

In [5]:
# Display the first rows of edu_df (head() defaults to five) as a quick sanity check.
edu_df.head()

Unnamed: 0,REF_DATE,GEO,Full-time educators,Part-time educators,"Total, work status",Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,...,Total operating expenditures,College,Elementary and/or High School,University,Educator_to_OperatingSpending,Salary_to_EPI,OpSpend_per_Educator,Education_Access_Index,Capital_Efficiency,Year
484,1991-01-01,Quebec,68160.681818,33531.545455,101692.090909,125.03,136.25,130.64,131.11,121.9,...,578801.0,21.413793,5.551724,21.655172,0.175694,2559.617692,5.691701,16.206897,4833.292919,1991
485,1991-01-01,Prince Edward Island,1462.363636,200.590909,1662.136364,128.12,135.35,130.64,122.65,128.26,...,489735.5,10.034483,5.413793,26.62069,0.003394,2101.838121,294.642191,14.022989,3955.440334,1991
486,1991-01-01,Canada,307115.863636,80013.136364,387129.136364,126.27,136.11,130.64,127.69,121.21,...,1343906.0,14.517241,6.758621,24.793103,0.288063,5956.094084,3.471467,15.356322,11220.992728,1991
487,1991-01-01,Nova Scotia,9786.954545,15979.881818,9787.5,122.4,138.06,130.64,125.75,120.87,...,311969.5,9.172414,6.689655,26.689655,0.031373,1392.757353,31.874278,14.183908,2711.026219,1991
488,1991-01-01,Saskatchewan,10467.409091,1696.636364,12164.181818,119.44,134.61,130.64,126.33,119.14,...,13701.0,6.137931,6.517241,24.793103,0.887832,49.497656,1.12634,12.482759,122.385384,1991
