In [2]:
import pandas as pd
import numpy as np
from prophet import Prophet
from functools import reduce

# Load dataset
file_path = "D:\Personal Projects\IRCC_Project\machine learning\Education\Enhanced_Education_Dataset.csv"
edu_df = pd.read_csv(file_path)

# Prepare REF_DATE and GEO
edu_df['REF_DATE'] = pd.to_datetime(edu_df['REF_DATE'], format='%d-%m-%Y')
edu_df['Year'] = edu_df['REF_DATE'].dt.year
edu_df = edu_df.rename(columns={'Province': 'GEO'})

# Filter from 1991 onward
edu_df = edu_df[edu_df['Year'] >= 1991]

# Define educator columns to forecast
educator_types = {
    'Full-time educators': 'Full-time',
    'Part-time educators': 'Part-time',
    'Total, work status': 'Total'
}

# Prepare results container
forecast_by_type = {label: [] for label in educator_types.values()}

# Forecast for each educator type
for column, label in educator_types.items():
    temp_df = edu_df[['REF_DATE', 'GEO', column]].copy()
    pivot_df = temp_df.pivot(index='REF_DATE', columns='GEO', values=column)
    pivot_df.index = pd.to_datetime(pivot_df.index)
    monthly_df = pivot_df.resample('MS').interpolate(method='linear')

    for geo in monthly_df.columns:
        df_geo = monthly_df[[geo]].reset_index()
        df_geo.columns = ['ds', 'y']

        # Remove outliers using IQR
        q1, q3 = df_geo['y'].quantile([0.25, 0.75])
        iqr = q3 - q1
        lower, upper = q1 - 1.5 * iqr, q3 + 1.5 * iqr
        df_geo = df_geo[(df_geo['y'] >= lower) & (df_geo['y'] <= upper)]

        if len(df_geo) < 10:
            continue

        model = Prophet(
            yearly_seasonality=False,
            weekly_seasonality=False,
            daily_seasonality=False,
            changepoint_prior_scale=0.5
        )
        model.add_seasonality(name='monthly', period=30.5, fourier_order=5)
        model.add_seasonality(name='yearly', period=365.25, fourier_order=10)
        model.fit(df_geo)

        future = pd.date_range(start='2024-01-01', end='2035-12-01', freq='MS')
        future_df = pd.DataFrame({'ds': future})
        forecast = model.predict(future_df)

        result = forecast[['ds', 'yhat']].copy()
        result.columns = ['REF_DATE', label]
        result['GEO'] = geo
        result['REF_DATE'] = result['REF_DATE'].dt.strftime('%d-%m-%Y')  # match original format

        forecast_by_type[label].append(result)

# Merge forecasts side-by-side
merged_forecasts = []
for label in ['Full-time', 'Part-time', 'Total']:
    df = pd.concat(forecast_by_type[label], ignore_index=True)
    df = df[['REF_DATE', 'GEO', label]]
    merged_forecasts.append(df)

# Combine all forecast types into one DataFrame
final_df = reduce(lambda left, right: pd.merge(left, right, on=['REF_DATE', 'GEO']), merged_forecasts)

# Ensure final columns and sort
final_df = final_df[['REF_DATE', 'GEO', 'Full-time', 'Part-time', 'Total']]
final_df = final_df.sort_values(by='REF_DATE').reset_index(drop=True)

# Export to CSV
final_df.to_csv("Educator_Typewise_Forecast_2024_2035.csv", index=False)

# Preview
print(final_df.head())
print("✅ Forecast saved to 'Educator_Typewise_Forecast_2024_2035.csv'")


23:25:11 - cmdstanpy - INFO - Chain [1] start processing
23:25:12 - cmdstanpy - INFO - Chain [1] done processing
23:25:12 - cmdstanpy - INFO - Chain [1] start processing
23:25:12 - cmdstanpy - INFO - Chain [1] done processing
23:25:13 - cmdstanpy - INFO - Chain [1] start processing
23:25:13 - cmdstanpy - INFO - Chain [1] done processing
23:25:14 - cmdstanpy - INFO - Chain [1] start processing
23:25:14 - cmdstanpy - INFO - Chain [1] done processing
23:25:14 - cmdstanpy - INFO - Chain [1] start processing
23:25:15 - cmdstanpy - INFO - Chain [1] done processing
23:25:15 - cmdstanpy - INFO - Chain [1] start processing
23:25:16 - cmdstanpy - INFO - Chain [1] done processing
23:25:16 - cmdstanpy - INFO - Chain [1] start processing
23:25:16 - cmdstanpy - INFO - Chain [1] done processing
23:25:17 - cmdstanpy - INFO - Chain [1] start processing
23:25:17 - cmdstanpy - INFO - Chain [1] done processing
23:25:18 - cmdstanpy - INFO - Chain [1] start processing
23:25:18 - cmdstanpy - INFO - Chain [1]

     REF_DATE                   GEO     Full-time     Part-time          Total
0  01-01-2024               Alberta  35907.245112   4652.888292   40514.889260
1  01-01-2024         New Brunswick   8450.497391    283.563945    8242.948148
2  01-01-2024  Prince Edward Island   1557.889957     29.059087    1525.720408
3  01-01-2024              Manitoba  12403.003253   1423.057544   13789.903216
4  01-01-2024                Quebec  82636.828439  37372.943900  121321.826114
✅ Forecast saved to 'Educator_Typewise_Forecast_2024_2035.csv'


In [5]:
edu_df.head()

Unnamed: 0,REF_DATE,GEO,Full-time educators,Part-time educators,"Total, work status",Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,...,Total operating expenditures,College,Elementary and/or High School,University,Educator_to_OperatingSpending,Salary_to_EPI,OpSpend_per_Educator,Education_Access_Index,Capital_Efficiency,Year
484,1991-01-01,Quebec,68160.681818,33531.545455,101692.090909,125.03,136.25,130.64,131.11,121.9,...,578801.0,21.413793,5.551724,21.655172,0.175694,2559.617692,5.691701,16.206897,4833.292919,1991
485,1991-01-01,Prince Edward Island,1462.363636,200.590909,1662.136364,128.12,135.35,130.64,122.65,128.26,...,489735.5,10.034483,5.413793,26.62069,0.003394,2101.838121,294.642191,14.022989,3955.440334,1991
486,1991-01-01,Canada,307115.863636,80013.136364,387129.136364,126.27,136.11,130.64,127.69,121.21,...,1343906.0,14.517241,6.758621,24.793103,0.288063,5956.094084,3.471467,15.356322,11220.992728,1991
487,1991-01-01,Nova Scotia,9786.954545,15979.881818,9787.5,122.4,138.06,130.64,125.75,120.87,...,311969.5,9.172414,6.689655,26.689655,0.031373,1392.757353,31.874278,14.183908,2711.026219,1991
488,1991-01-01,Saskatchewan,10467.409091,1696.636364,12164.181818,119.44,134.61,130.64,126.33,119.14,...,13701.0,6.137931,6.517241,24.793103,0.887832,49.497656,1.12634,12.482759,122.385384,1991
