In [None]:
import pandas as pd

# Load datasets
edu_df = pd.read_csv("Enhanced_Education_Dataset.csv")
pop_df = pd.read_csv("D:\Personal Projects\IRCC_Project\datasets\Population\Population_Demographics_by_Year_and_Province_and_Canada.csv")

# Extract year from REF_DATE and clean up columns for merge
edu_df['Year'] = pd.to_datetime(edu_df['REF_DATE'], errors='coerce').dt.year
edu_df = edu_df.rename(columns={'GEO': 'Province'})
pop_df['Province'] = pop_df['Province'].str.strip()  # Avoid mismatch from extra spaces

# Filter both datasets to include only from 1991 onwards
edu_df = edu_df[edu_df['Year'] >= 1991]
pop_df = pop_df[pop_df['Year'] >= 1991]

# Merge on Year and Province
merged_df = pd.merge(edu_df, pop_df, on=['Year', 'Province'], how='left')

# Create population-based metrics
merged_df['Educators_per_1000'] = (
    merged_df['Total, work status'].astype(float) / merged_df['Population Estimate'].astype(float)
) * 1000
merged_df['OpSpend_per_Capita'] = merged_df['Total operating expenditures'] / merged_df['Population Estimate']
merged_df['Access_Index_per_Capita'] = merged_df['Education_Access_Index'] / merged_df['Population Estimate']

# Replace 0 with NaN and forward-fill by province
merged_df['Population Estimate'] = merged_df['Population Estimate'].replace(0, pd.NA)
merged_df['Population Estimate'] = merged_df.groupby('Province')['Population Estimate'].transform(lambda x: x.ffill().bfill())

merged_df['Data_Quality_Flag'] = (
    (merged_df['Population Estimate'].isna()) |
    (merged_df['Total, work status'].isna()) |
    (merged_df['Total, work status'] <= 0)
)

merged_df_clean = merged_df[~merged_df['Data_Quality_Flag']]
merged_df['Educators_per_1000'] = merged_df['Educators_per_1000'].round(2)


# # Optional: Save to CSV
# merged_df.to_csv("Merged_Education_Population_Data_1991_onwards.csv", index=False)
# print("✅ Merged dataset saved to 'Merged_Education_Population_Data_1991_onwards.csv'")

✅ Merged dataset saved to 'Merged_Education_Population_Data_1991_onwards.csv'


In [9]:
merged_df.head(20
            )

Unnamed: 0,REF_DATE,Province,Full-time educators,Part-time educators,"Total, work status",Education price index (EPI),Fees and contractual services sub-index,Instructional supplies sub-index,Non-salary sub-index,Non-teaching salaries sub-index,...,Capital_Efficiency,Year,Total PRs,Total TRs,Total Births,Total Deaths,Population Estimate,Educators_per_1000,OpSpend_per_Capita,Access_Index_per_Capita
0,01-01-1991,Quebec,68160.681818,33531.545455,101692.090909,125.03,136.25,130.64,131.11,121.9,...,4833.292919,1991,0.0,0.0,581654.0,98242.0,7026241.0,14.473186,0.082377,2.306624e-06
1,01-01-1991,Prince Edward Island,1462.363636,200.590909,1662.136364,128.12,135.35,130.64,122.65,128.26,...,3955.440334,1991,0.0,0.0,11296.0,2376.0,130477.0,12.738922,3.753424,0.0001074748
2,01-01-1991,Canada,307115.863636,80013.136364,387129.136364,126.27,136.11,130.64,127.69,121.21,...,11220.992728,1991,0.0,0.0,2413304.0,391138.0,27790000.0,13.930519,0.048359,5.525844e-07
3,01-01-1991,Nova Scotia,9786.954545,15979.881818,9787.5,122.4,138.06,130.64,125.75,120.87,...,2711.026219,1991,0.0,0.0,72112.0,14510.0,912792.0,10.722596,0.341775,1.553904e-05
4,01-01-1991,Saskatchewan,10467.409091,1696.636364,12164.181818,119.44,134.61,130.64,126.33,119.14,...,122.385384,1991,0.0,0.0,91862.0,16196.0,1002651.0,12.13202,0.013665,1.244975e-05
5,01-01-1991,British Columbia,29352.818182,8329.909091,37682.454545,128.46,128.43,130.64,119.38,120.16,...,1757.021277,1991,0.0,0.0,273628.0,47954.0,3339935.0,11.282392,0.064098,4.263969e-06
6,01-01-1991,Newfoundland and Labrador,5406.272727,1131.545455,6538.772727,126.44,136.73,130.64,126.91,128.26,...,1878.301961,1991,0.0,0.0,42968.0,7596.0,577377.0,11.324962,0.386695,2.558141e-05
7,01-01-1991,New Brunswick,7382.318182,317.454545,7699.772727,122.23,137.62,130.64,122.7,120.87,...,1180.505038,1991,0.0,0.0,56978.0,10938.0,743210.0,10.360158,0.181064,1.716691e-05
8,01-01-1991,Manitoba,12008.045455,1908.0,13916.590909,122.47,136.62,130.64,124.74,120.95,...,415.160445,1991,0.0,0.0,103642.0,17886.0,1106196.0,12.580583,0.041465,1.157534e-05
9,01-01-1991,Alberta,31970.454545,8851.636364,40822.227273,120.7,135.07,130.64,125.78,120.18,...,744.255426,1991,0.0,0.0,256630.0,28902.0,2572947.0,15.865942,0.030589,4.726456e-06


In [5]:
import pandas as pd
from prophet import Prophet

# Prepare to store all forecasts
forecast_all = []

# Get province list + 'Canada' for national forecast
geo_list = merged_df['Province'].unique().tolist()
geo_list.append('Canada')

# Loop through each province (and Canada)
for geo in geo_list:
    if geo == 'Canada':
        df_geo = merged_df.groupby('Year')['Educators_per_1000'].mean().reset_index()
    else:
        df_geo = merged_df[merged_df['Province'] == geo].groupby('Year')['Educators_per_1000'].mean().reset_index()

    # Ensure column names match Prophet requirements
    df_geo.columns = ['ds', 'y']
    df_geo['ds'] = pd.to_datetime(df_geo['ds'], format='%Y')

    # Skip if insufficient data
    if df_geo['y'].isnull().sum() > 5 or len(df_geo) < 5:
        continue

    # Fit Prophet model
    model = Prophet(yearly_seasonality=True, daily_seasonality=False, weekly_seasonality=False)
    model.fit(df_geo)

    # Create monthly future dataframe (from Jan 2024 to Dec 2035)
    future_months = pd.date_range(start='2024-01-01', end='2035-12-01', freq='MS')
    future_df = pd.DataFrame({'ds': future_months})

    # Predict
    forecast = model.predict(future_df)

    # Extract relevant columns
    result = forecast[['ds', 'yhat']].copy()
    result.columns = ['REF_DATE', 'Educators_per_1000']
    result['GEO'] = geo

    # Store result
    forecast_all.append(result)

# Combine all province forecasts
final_forecast_df = pd.concat(forecast_all, ignore_index=True)
final_forecast_df = final_forecast_df[['REF_DATE', 'GEO', 'Educators_per_1000']]

# Optional: Save to CSV
final_forecast_df.to_csv("Monthly_Educators_per_1000_Forecast_2024_2035.csv", index=False)

# Display preview
print(final_forecast_df.head())
print("✅ Forecast dataset saved to 'Monthly_Educators_per_1000_Forecast_2024_2035.csv'")

15:36:01 - cmdstanpy - INFO - Chain [1] start processing
15:36:02 - cmdstanpy - INFO - Chain [1] done processing
15:36:03 - cmdstanpy - INFO - Chain [1] start processing
15:36:03 - cmdstanpy - INFO - Chain [1] done processing
15:36:03 - cmdstanpy - INFO - Chain [1] start processing
15:36:04 - cmdstanpy - INFO - Chain [1] done processing
15:36:04 - cmdstanpy - INFO - Chain [1] start processing
15:36:04 - cmdstanpy - INFO - Chain [1] done processing
15:36:05 - cmdstanpy - INFO - Chain [1] start processing
15:36:05 - cmdstanpy - INFO - Chain [1] done processing
15:36:06 - cmdstanpy - INFO - Chain [1] start processing
15:36:06 - cmdstanpy - INFO - Chain [1] done processing
15:36:06 - cmdstanpy - INFO - Chain [1] start processing
15:36:07 - cmdstanpy - INFO - Chain [1] done processing
15:36:07 - cmdstanpy - INFO - Chain [1] start processing
15:36:07 - cmdstanpy - INFO - Chain [1] done processing
15:36:07 - cmdstanpy - INFO - Chain [1] start processing
15:36:08 - cmdstanpy - INFO - Chain [1]

    REF_DATE     GEO  Educators_per_1000
0 2024-01-01  Quebec           14.133378
1 2024-02-01  Quebec            2.258106
2 2024-03-01  Quebec           16.069003
3 2024-04-01  Quebec            6.757504
4 2024-05-01  Quebec           -4.309188
✅ Forecast dataset saved to 'Monthly_Educators_per_1000_Forecast_2024_2035.csv'
