In [3]:
import pandas as pd

# Demographic measures carried on every (Year, Province) row.
_DEMOGRAPHIC_COLUMNS = ["Total PRs", "Total TRs", "Total Births", "Total Deaths", "Population Estimate"]

# Measures whose 'Canada' value is rebuilt by summing the provincial rows.
# 'Population Estimate' is deliberately absent: it is taken from the Canada file as-is.
_PROVINCE_SUMMED_COLUMNS = ["Total PRs", "Total TRs", "Total Births", "Total Deaths"]

# Source age-group label -> reporting category. Rows with any other label are dropped.
# NOTE(review): no listed label covers ages 65-89, and 'All ages' overlaps the other
# three categories - confirm this selection is intended.
_AGE_GROUP_CATEGORIES = {
    '0 to 14 years': 'Children (0–14)',
    '15 to 64 years': 'Working Age (15–64)',
    '90 years and older': 'Elderly (90+)',
    'All ages': 'All Ages'
}


def _clean_age_gender(raw, min_year):
    """Normalize one age/gender extract to Year, Province, Gender, Age_Group, Population, REF_DATE."""
    tidy = raw[['REF_DATE', 'GEO', 'Gender', 'Age group', 'VALUE']].copy()
    tidy.columns = ['Year', 'Province', 'Gender', 'Age_Group', 'Population']
    tidy['Province'] = tidy['Province'].str.strip()

    # Keep only the two gender series that are later pivoted into columns,
    # and drop rows without a population value.
    tidy = tidy[tidy['Gender'].isin(['Men+', 'Women+'])].dropna(subset=['Population'])

    # Appending '-01-01' assumes REF_DATE holds a bare year (e.g. 1991) - TODO confirm.
    as_date = pd.to_datetime(tidy['Year'].astype(str) + '-01-01')
    tidy = tidy.assign(
        REF_DATE=as_date.dt.strftime('01-%m-%Y'),
        Year=as_date.dt.year,
    )
    return tidy[tidy['Year'] >= min_year]


def _fill_missing_from_all_ages(merged):
    """Fill missing demographic values from the 'All Ages' row of the same Year/Province."""
    reference = merged.loc[
        merged['Age_Group_Category'] == 'All Ages',
        ['Year', 'Province'] + _DEMOGRAPHIC_COLUMNS,
    ]
    # One reference row per key, so the left merge below can never multiply rows.
    reference = reference.drop_duplicates(subset=['Year', 'Province'])

    filled = pd.merge(
        merged,
        reference,
        on=['Year', 'Province'],
        suffixes=('', '_ref'),
        how='left'
    )
    for col in _DEMOGRAPHIC_COLUMNS:
        filled[col] = filled[col].fillna(filled[f'{col}_ref'])

    return filled.drop(columns=[f'{col}_ref' for col in _DEMOGRAPHIC_COLUMNS])


def correct_population_dataset(
    file_path,
    canada_population_file,
    provinces_age_gender_file,
    canada_age_gender_file,
    output_path=None,
    min_year=1991
):
    """Build a per-year, per-province demographic table with age-group breakdowns.

    The function
      1. loads the base table, strips province names, keeps the first row per
         (Year, Province) and drops years before ``min_year``;
      2. discards the base table's own 'Canada' rows and replaces them with rows
         from ``canada_population_file``, whose PR/TR/birth/death totals are
         overwritten by the sum of the provincial rows for that year (the file's
         own value is kept only when no provincial sum exists);
      3. cleans both age/gender extracts, pivots 'Men+'/'Women+' into
         ``Male_Population``/``Female_Population`` and adds ``Combined_Population``;
      4. keeps only the age groups listed in ``_AGE_GROUP_CATEGORIES``;
      5. left-joins the demographic totals onto the age-group rows by
         (Year, Province) and back-fills gaps from the 'All Ages' row.

    Parameters
    ----------
    file_path : str or path-like
        CSV with ``Year``, ``Province`` and the five demographic columns.
    canada_population_file : str or path-like
        CSV with the same columns, supplying the 'Canada' rows.
    provinces_age_gender_file, canada_age_gender_file : str or path-like
        CSVs with ``REF_DATE``, ``GEO``, ``Gender``, ``Age group`` and ``VALUE``.
    output_path : str or path-like, optional
        When truthy, the result is also written there as CSV (without the index)
        and a confirmation line is printed.
    min_year : int, default 1991
        Earliest year kept from the base table and the age/gender extracts.

    Returns
    -------
    pandas.DataFrame
        One row per (REF_DATE, Year, Province, Age_Group_Category) carrying the
        three population columns followed by the five demographic columns.
    """
    # --- Base demographic data -------------------------------------------
    base = pd.read_csv(file_path)
    base['Province'] = base['Province'].str.strip()
    base = base.drop_duplicates(subset=["Year", "Province"])
    base = base[base["Year"] >= min_year]

    # --- Official Canada rows, limited to years present in the base table --
    canada_official = pd.read_csv(canada_population_file)
    canada_official = canada_official[canada_official["Year"].isin(base["Year"].unique())]
    # Same de-duplication rule as the base table. Without it a repeated row gives
    # a non-unique Year index below and multiplies rows in the later merges.
    canada_official = canada_official.drop_duplicates(subset=["Year", "Province"])

    common_columns = ["Year", "Province"] + _DEMOGRAPHIC_COLUMNS
    base = base[common_columns]
    canada_official = canada_official[common_columns]

    # --- Recalculate the 'Canada' totals from the provinces ---------------
    provinces_only = base[base["Province"] != "Canada"]
    # Sum only the numeric measures that are actually reused; the string
    # 'Province' column is left out of the aggregation.
    provincial_sums = provinces_only.groupby('Year')[_PROVINCE_SUMMED_COLUMNS].sum()

    canada_official = canada_official.set_index('Year')
    for col in _PROVINCE_SUMMED_COLUMNS:
        # Provincial sum wins; the official figure is only a fallback.
        canada_official[col] = provincial_sums[col].combine_first(canada_official[col])
    canada_official = canada_official.reset_index()

    population_totals = (
        pd.concat([provinces_only, canada_official], ignore_index=True)
        .sort_values(by=["Year", "Province"])
        .reset_index(drop=True)
    )

    # --- Age / gender breakdown -------------------------------------------
    provinces_clean = _clean_age_gender(pd.read_csv(provinces_age_gender_file), min_year)
    canada_clean = _clean_age_gender(pd.read_csv(canada_age_gender_file), min_year)
    age_gender = pd.concat([provinces_clean, canada_clean], ignore_index=True)

    # One column per gender.
    by_gender = age_gender.pivot_table(
        index=['REF_DATE', 'Year', 'Province', 'Age_Group'],
        columns='Gender',
        values='Population',
        aggfunc='sum'
    ).reset_index().rename(columns={'Men+': 'Male_Population', 'Women+': 'Female_Population'})
    by_gender['Combined_Population'] = by_gender['Male_Population'] + by_gender['Female_Population']

    # Keep only the mapped age groups.
    by_gender['Age_Group_Category'] = by_gender['Age_Group'].map(_AGE_GROUP_CATEGORIES)
    by_gender = by_gender.dropna(subset=['Age_Group_Category'])

    aggregated = by_gender.groupby(['REF_DATE', 'Year', 'Province', 'Age_Group_Category']).agg({
        'Male_Population': 'sum',
        'Female_Population': 'sum',
        'Combined_Population': 'sum'
    }).reset_index()

    # --- Attach demographic totals ----------------------------------------
    demographic_info = population_totals[['Year', 'Province'] + _DEMOGRAPHIC_COLUMNS]
    final_df = pd.merge(aggregated, demographic_info, on=['Year', 'Province'], how='left')
    final_df = _fill_missing_from_all_ages(final_df)

    # --- Optional export ---------------------------------------------------
    if output_path:
        final_df.to_csv(output_path, index=False)
        print(f"✅ Final cleaned dataset saved to {output_path}")

    return final_df

# === Usage ===
# All inputs live under one dataset folder; build each location from shared
# directory prefixes so the common part is written only once.
POPULATION_DIR = "D:/Personal Projects/IRCC_Project/datasets/Population"
METADATA_DIR = POPULATION_DIR + "/Metadata"
DEMOGRAPHIC_DIR = METADATA_DIR + "/Demographic Data"

file_path = METADATA_DIR + "/Population Timeseries.csv"
canada_population_file = METADATA_DIR + "/canada_population_data.csv"
provinces_age_gender_file = DEMOGRAPHIC_DIR + "/Age Gender Distribution Provinces.csv"
canada_age_gender_file = DEMOGRAPHIC_DIR + "/Age Gender Distribution Canada.csv"
output_csv = POPULATION_DIR + "/Population_Demographics_by_Year_and_Province_and_Canada.csv"

# Run the full correction/merge pipeline and also write the result to `output_csv`.
final_df = correct_population_dataset(
    file_path=file_path,
    canada_population_file=canada_population_file,
    provinces_age_gender_file=provinces_age_gender_file,
    canada_age_gender_file=canada_age_gender_file,
    output_path=output_csv,
)
print("✅ Data correction, cleaning, and merging completed.")


✅ Final cleaned dataset saved to D:/Personal Projects/IRCC_Project/datasets/Population/Population_Demographics_by_Year_and_Province_and_Canada.csv
✅ Data correction, cleaning, and merging completed.


In [4]:
# Preview the first 20 rows of the merged dataset (already in memory as `final_df`)
# to check that the 'Canada' rows and their population figures are present.
final_df.head(20)

Unnamed: 0,REF_DATE,Year,Province,Age_Group_Category,Male_Population,Female_Population,Combined_Population,Total PRs,Total TRs,Total Births,Total Deaths,Population Estimate
0,01-01-1991,1991,Alberta,All Ages,1306905,1285401,2592306,0.0,0.0,256630.0,28902.0,2572947.0
1,01-01-1991,1991,Alberta,Children (0–14),312956,296828,609784,0.0,0.0,256630.0,28902.0,2572947.0
2,01-01-1991,1991,Alberta,Elderly (90+),2183,4900,7083,0.0,0.0,256630.0,28902.0,2572947.0
3,01-01-1991,1991,Alberta,Working Age (15–64),892557,857414,1749971,0.0,0.0,256630.0,28902.0,2572947.0
4,01-01-1991,1991,British Columbia,All Ages,1681631,1692156,3373787,0.0,0.0,273628.0,47954.0,3339935.0
5,01-01-1991,1991,British Columbia,Children (0–14),345371,329419,674790,0.0,0.0,273628.0,47954.0,3339935.0
6,01-01-1991,1991,British Columbia,Elderly (90+),3598,9040,12638,0.0,0.0,273628.0,47954.0,3339935.0
7,01-01-1991,1991,British Columbia,Working Age (15–64),1151071,1119838,2270909,0.0,0.0,273628.0,47954.0,3339935.0
8,01-01-1991,1991,Canada,All Ages,13904391,14133029,28037420,0.0,0.0,2413304.0,391138.0,27790000.0
9,01-01-1991,1991,Canada,Children (0–14),2969226,2826334,5795560,0.0,0.0,2413304.0,391138.0,27790000.0


In [38]:
def calculate_metrics(df):
    """Add migration, natural-increase and growth metrics to ``df`` in place.

    Reads ``Total PRs``, ``Total TRs``, ``Total Births``, ``Total Deaths`` and
    ``Population Estimate`` and writes six columns, in this order:

    * ``Net Migration Rate``         - ``Net Migration / Population Estimate * 1000``
    * ``Natural Growth Rate``        - ``Natural Increase / Population Estimate * 1000``
    * ``Net Migration``              - ``Total PRs - Total TRs``
    * ``Natural Increase``           - ``Total Births - Total Deaths``
    * ``Net Population Change``      - ``Net Migration + Natural Increase``
    * ``Population Growth Rate (%)`` - ``Net Population Change / Population Estimate * 100``

    Masking rules:

    * A difference is computed only when *both* of its inputs are non-zero;
      otherwise it is 0 (presumably a zero marks missing data - confirm).
    * When ``Population Estimate`` is 0 the three rates and
      ``Net Population Change`` are 0.
    * A rate is 0 whenever its numerator is 0.
    * NaN inputs compare as "non-zero" and therefore propagate as NaN.

    NOTE(review): net migration is defined here as PRs minus TRs - confirm that
    this is the intended definition.

    The computation is vectorized, so it does not depend on the index being
    unique, and the rate columns are always floating point.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the five input columns. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, with the six metric columns added or overwritten.
    """
    permanent = df['Total PRs']
    temporary = df['Total TRs']
    births = df['Total Births']
    deaths = df['Total Deaths']
    population = df['Population Estimate']

    has_population = population.ne(0)

    # Differences are kept only where both operands are non-zero.
    net_migration = (permanent - temporary).where(permanent.ne(0) & temporary.ne(0), 0)
    natural_increase = (births - deaths).where(births.ne(0) & deaths.ne(0), 0)

    # With no population estimate the combined change is reported as 0.
    net_change = (net_migration + natural_increase).where(has_population, 0)

    def per_capita(amount, scale):
        # Division by a zero population gives inf/NaN; the mask replaces those
        # rows (and rows with a zero numerator) with 0.0.
        return (amount / population * scale).where(has_population & amount.ne(0), 0.0)

    # Assign in the original column order so the output layout is unchanged.
    df['Net Migration Rate'] = per_capita(net_migration, 1000)
    df['Natural Growth Rate'] = per_capita(natural_increase, 1000)
    df['Net Migration'] = net_migration
    df['Natural Increase'] = natural_increase
    df['Net Population Change'] = net_change
    df['Population Growth Rate (%)'] = per_capita(net_change, 100)

    return df

# Calculate metrics
# NOTE(review): `corrected_df` is not defined in any cell visible here (the earlier
# cell produces `final_df`), and this cell rebinds its own input - confirm which
# frame is intended and that the notebook survives Restart Kernel -> Run All.
corrected_df = calculate_metrics(corrected_df)
# Show the first 20 rows, including the newly added metric columns.
corrected_df.head(20)

Unnamed: 0,Year,Province,Total PRs,Total TRs,Total Births,Total Deaths,Population Estimate,Net Migration Rate,Natural Growth Rate,Net Migration,Natural Increase,Net Population Change,Population Growth Rate (%)
0,1991,Alberta,0,0,256630,28902,2572947,0.0,88.508625,0,227728,227728,8.850862
1,1991,British Columbia,0,0,273628,47954,3339935,0.0,67.568381,0,225674,225674,6.756838
2,1991,Canada,0,0,2413304,391138,27790000,0.0,72.765959,0,2022166,2022166,7.276596
3,1991,Manitoba,0,0,103642,17886,1106196,0.0,77.523332,0,85756,85756,7.752333
4,1991,New Brunswick,0,0,56978,10938,743210,0.0,61.947498,0,46040,46040,6.19475
5,1991,Newfoundland and Labrador,0,0,42968,7596,577377,0.0,61.263265,0,35372,35372,6.126326
6,1991,Northwest Territories including Nunavut,0,0,9812,474,59711,0.0,156.386595,0,9338,9338,15.63866
7,1991,Nova Scotia,0,0,72112,14510,912792,0.0,63.105286,0,57602,57602,6.310529
8,1991,Ontario,0,0,909316,145834,10355101,0.0,73.730039,0,763482,763482,7.373004
9,1991,Prince Edward Island,0,0,11296,2376,130477,0.0,68.364539,0,8920,8920,6.836454


In [39]:
# Write the metrics table to disk as CSV, leaving out the row index.
metrics_csv = "D:/Personal Projects/IRCC_Project/datasets/Population/Population_Metrics_by_Year_and_Province.csv"
corrected_df.to_csv(metrics_csv, index=False)
print("Dataset with metrics saved successfully!")

Dataset with metrics saved successfully!
