In [7]:
import pandas as pd

def correct_population_dataset(
    file_path,
    canada_population_file,
    provinces_age_gender_file,
    canada_age_gender_file,
    output_path=None
):
    """Build the combined population / age-gender dataset for 1991 onward.

    Processing steps:

    1. Read the base summary CSV, keep the first row for each
       (Year, Province) pair and keep only years >= 1991.
    2. Discard the existing "Canada" rows and rebuild them from the official
       Canada file: the flow columns (Total PRs, Total TRs, Total Births,
       Total Deaths) are replaced by the per-year sum over the remaining
       rows (the official value is kept only where no sum exists), while
       "Population Estimate" is taken from the official file unchanged.
    3. Read both age-gender CSVs, keep the 'Men+' / 'Women+' rows that have a
       value, and pivot them into Male_Population / Female_Population columns.
    4. Left-join the summary onto the age-gender rows on (Year, Province) and
       sort by REF_DATE, then Province.

    Args:
        file_path: CSV with the per-year, per-province summary columns.
        canada_population_file: CSV with the official Canada rows (same
            summary columns).
        provinces_age_gender_file: CSV with REF_DATE, GEO, Gender,
            'Age group' and VALUE columns for the provinces.
        canada_age_gender_file: CSV with the same columns for Canada.
        output_path: Optional destination; when truthy the result is also
            written there as CSV (without the index).

    Returns:
        pandas.DataFrame: one row per (REF_DATE, Year, Province, Age_Group)
        with the gender columns followed by the summary columns.
    """
    summary_columns = ["Year", "Province", "Total PRs", "Total TRs", "Total Births", "Total Deaths", "Population Estimate"]
    flow_columns = ['Total PRs', 'Total TRs', 'Total Births', 'Total Deaths']
    first_year = 1991

    # Base summary: first row per (Year, Province), restricted to the study period.
    base = pd.read_csv(file_path).drop_duplicates(subset=["Year", "Province"])
    base = base[base["Year"] >= first_year]

    # Official Canada rows, limited to the years present in the base data.
    official = pd.read_csv(canada_population_file)
    official = official[official["Year"].isin(base["Year"].unique())]

    base = base[summary_columns]
    official = official[summary_columns]

    # Replace the pre-existing 'Canada' rows with totals computed from the rest.
    provinces_only = base[base["Province"] != "Canada"]
    # numeric_only=True keeps the text 'Province' column out of the sum
    # (otherwise it is concatenated or rejected, depending on pandas version).
    yearly_totals = provinces_only.groupby('Year').sum(numeric_only=True)

    # Computed totals take priority; the official figure only fills gaps.
    official = official.set_index('Year')
    for col in flow_columns:
        official[col] = yearly_totals[col].combine_first(official[col])
    official = official.reset_index()

    summary = pd.concat([provinces_only, official], ignore_index=True)
    summary = summary.sort_values(by=["Year", "Province"]).reset_index(drop=True)

    # -------------------------------
    # 🔹 Load & Merge Age-Gender Data
    # -------------------------------
    def clean_age_gender(raw):
        """Normalise one age-gender table to Year/Province/Gender/Age_Group/Population."""
        tidy = raw[['REF_DATE', 'GEO', 'Gender', 'Age group', 'VALUE']].copy()
        tidy.columns = ['Year', 'Province', 'Gender', 'Age_Group', 'Population']
        keep = tidy['Gender'].isin(['Men+', 'Women+'])
        # Independent copy so the column assignments below never target a view.
        tidy = tidy[keep].dropna(subset=['Population']).copy()
        # REF_DATE is rebuilt as the first of January of the reference year.
        as_date = pd.to_datetime(tidy['Year'].astype(str) + '-01-01')
        tidy['REF_DATE'] = as_date.dt.strftime('01-%m-%Y')
        tidy['Year'] = as_date.dt.year
        return tidy[tidy['Year'] >= first_year]

    # low_memory=False makes pandas infer each column's dtype from the whole
    # file instead of chunk by chunk.
    # NOTE(review): the recorded run showed a warning on the provinces read,
    # assumed to be the mixed-type DtypeWarning — confirm.
    provinces_clean = clean_age_gender(pd.read_csv(provinces_age_gender_file, low_memory=False))
    canada_clean = clean_age_gender(pd.read_csv(canada_age_gender_file, low_memory=False))

    combined = pd.concat([provinces_clean, canada_clean], ignore_index=True)

    # One row per date/province/age group with a column per gender.
    gender_pivoted = combined.pivot_table(
        index=['REF_DATE', 'Year', 'Province', 'Age_Group'],
        columns='Gender',
        values='Population',
        aggfunc='sum'
    ).reset_index()
    gender_pivoted = gender_pivoted.rename(columns={'Men+': 'Male_Population', 'Women+': 'Female_Population'})

    # Attach the yearly summary to every age-group row.
    final_merged = pd.merge(gender_pivoted, summary, how='left', on=['Year', 'Province'])
    final_sorted = final_merged.sort_values(by=['REF_DATE', 'Province']).reset_index(drop=True)

    # Optionally export
    if output_path:
        final_sorted.to_csv(output_path, index=False)
        print(f"✅ Final dataset saved to {output_path}")

    return final_sorted


# === Usage ===
# Input files: yearly summary, official Canada figures, and the two
# age-gender distribution tables (provinces and Canada).
# NOTE(review): these are absolute, machine-specific paths.
file_path = "D:/Personal Projects/IRCC_Project/datasets/Population/Metadata/Population Timeseries.csv"
canada_population_file = "D:/Personal Projects/IRCC_Project/datasets/Population/Metadata/canada_population_data.csv"
provinces_age_gender_file = "D:/Personal Projects/IRCC_Project/datasets/Population/Metadata/Demographic Data/Age Gender Distribution Provinces.csv"
canada_age_gender_file = "D:/Personal Projects/IRCC_Project/datasets/Population/Metadata/Demographic Data/Age Gender Distribution Canada.csv"

# Destination of the merged dataset.
output_csv = "D:/Personal Projects/IRCC_Project/datasets/Population/Population_Demographics_by_Year_and_Province_and_Canada.csv"

# Build the merged dataset and write it to output_csv.
final_df = correct_population_dataset(
    file_path=file_path,
    canada_population_file=canada_population_file,
    provinces_age_gender_file=provinces_age_gender_file,
    canada_age_gender_file=canada_age_gender_file,
    output_path=output_csv,
)
print("✅ Data correction and merging completed.")

  provinces_df = pd.read_csv(provinces_age_gender_file)


✅ Final dataset saved to D:/Personal Projects/IRCC_Project/datasets/Population/Population_Demographics_by_Year_and_Province_and_Canada.csv
✅ Data correction and merging completed.


In [36]:
# Show only the national ("Canada") rows of corrected_df.
# NOTE(review): corrected_df is not created in the cells shown here — confirm
# where it is loaded so the notebook runs from a fresh kernel.
corrected_df.loc[corrected_df["Province"] == "Canada"]

Unnamed: 0,Year,Province,Total PRs,Total TRs,Total Births,Total Deaths,Population Estimate
2,1991,Canada,0,0,2413304,391138,27790000
73,1996,Canada,0,0,2196976,425760,29671000
149,2001,Canada,0,145900,2005556,439076,31190000
235,2006,Canada,0,172280,2142873,456158,32399000
321,2011,Canada,0,248440,2269536,487022,34474000
407,2016,Canada,27275,426455,2302890,534426,36565000
460,2019,Canada,18630,662365,2234104,570602,37590000
479,2020,Canada,24720,554370,2163832,616824,38005238
498,2021,Canada,24700,651840,2219960,623280,36991981
517,2022,Canada,35455,829425,2104800,668162,38250000


In [37]:
# Preview the first 20 rows of corrected_df.
corrected_df.head(20)

Unnamed: 0,Year,Province,Total PRs,Total TRs,Total Births,Total Deaths,Population Estimate
0,1991,Alberta,0,0,256630,28902,2572947
1,1991,British Columbia,0,0,273628,47954,3339935
2,1991,Canada,0,0,2413304,391138,27790000
3,1991,Manitoba,0,0,103642,17886,1106196
4,1991,New Brunswick,0,0,56978,10938,743210
5,1991,Newfoundland and Labrador,0,0,42968,7596,577377
6,1991,Northwest Territories including Nunavut,0,0,9812,474,59711
7,1991,Nova Scotia,0,0,72112,14510,912792
8,1991,Ontario,0,0,909316,145834,10355101
9,1991,Prince Edward Island,0,0,11296,2376,130477


In [38]:
def calculate_metrics(df):
    """Add migration, natural-increase and growth metrics to ``df``.

    The six metric columns are written onto ``df`` itself (the frame is
    modified in place) and the same object is returned. A value of 0 in an
    input column means the metric depending on it is left at 0:

    * ``Net Migration`` = Total PRs - Total TRs, only where both are non-zero.
    * ``Natural Increase`` = Total Births - Total Deaths, only where both are
      non-zero.
    * ``Net Migration Rate`` / ``Natural Growth Rate`` = the respective
      quantity per 1000 of ``Population Estimate``.
    * ``Net Population Change`` = Net Migration + Natural Increase.
    * ``Population Growth Rate (%)`` = Net Population Change as a percentage
      of ``Population Estimate``.

    The last four are only computed where ``Population Estimate`` is
    non-zero; elsewhere they stay 0.

    Args:
        df: DataFrame with numeric 'Total PRs', 'Total TRs', 'Total Births',
            'Total Deaths' and 'Population Estimate' columns.

    Returns:
        The same DataFrame, with the metric columns added (or overwritten).
    """
    prs = df['Total PRs']
    trs = df['Total TRs']
    births = df['Total Births']
    deaths = df['Total Deaths']
    population = df['Population Estimate']

    has_population = population != 0
    # Zero populations become NaN in the denominator so no row divides by
    # zero; those rows are masked back to 0 below.
    denominator = population.where(has_population)

    net_migration = (prs - trs).where((prs != 0) & (trs != 0), 0)
    natural_increase = (births - deaths).where((births != 0) & (deaths != 0), 0)

    net_migration_rate = (net_migration / denominator * 1000).where(
        has_population & (net_migration != 0), 0.0
    )
    natural_growth_rate = (natural_increase / denominator * 1000).where(
        has_population & (natural_increase != 0), 0.0
    )
    net_population_change = (net_migration + natural_increase).where(
        has_population & ((net_migration != 0) | (natural_increase != 0)), 0
    )
    population_growth_rate = (net_population_change / denominator * 100).where(
        has_population & (net_population_change != 0), 0.0
    )

    # Assign whole columns at once (rates are always float), in the same
    # column order the row-by-row version produced.
    df['Net Migration Rate'] = net_migration_rate
    df['Natural Growth Rate'] = natural_growth_rate
    df['Net Migration'] = net_migration
    df['Natural Increase'] = natural_increase
    df['Net Population Change'] = net_population_change
    df['Population Growth Rate (%)'] = population_growth_rate

    return df

# Calculate metrics: calculate_metrics adds the derived columns to the frame
# it receives and returns that same frame, so corrected_df is rebound to it.
corrected_df = calculate_metrics(corrected_df)
# Preview the first 20 rows including the new metric columns.
corrected_df.head(20)

Unnamed: 0,Year,Province,Total PRs,Total TRs,Total Births,Total Deaths,Population Estimate,Net Migration Rate,Natural Growth Rate,Net Migration,Natural Increase,Net Population Change,Population Growth Rate (%)
0,1991,Alberta,0,0,256630,28902,2572947,0.0,88.508625,0,227728,227728,8.850862
1,1991,British Columbia,0,0,273628,47954,3339935,0.0,67.568381,0,225674,225674,6.756838
2,1991,Canada,0,0,2413304,391138,27790000,0.0,72.765959,0,2022166,2022166,7.276596
3,1991,Manitoba,0,0,103642,17886,1106196,0.0,77.523332,0,85756,85756,7.752333
4,1991,New Brunswick,0,0,56978,10938,743210,0.0,61.947498,0,46040,46040,6.19475
5,1991,Newfoundland and Labrador,0,0,42968,7596,577377,0.0,61.263265,0,35372,35372,6.126326
6,1991,Northwest Territories including Nunavut,0,0,9812,474,59711,0.0,156.386595,0,9338,9338,15.63866
7,1991,Nova Scotia,0,0,72112,14510,912792,0.0,63.105286,0,57602,57602,6.310529
8,1991,Ontario,0,0,909316,145834,10355101,0.0,73.730039,0,763482,763482,7.373004
9,1991,Prince Edward Island,0,0,11296,2376,130477,0.0,68.364539,0,8920,8920,6.836454


In [39]:
# Write the frame with the metric columns to CSV, omitting the index.
# NOTE(review): absolute, machine-specific output path.
corrected_df.to_csv("D:/Personal Projects/IRCC_Project/datasets/Population/Population_Metrics_by_Year_and_Province.csv", index=False)
print("Dataset with metrics saved successfully!")

Dataset with metrics saved successfully!
