In [1]:
# Snapshot the conda environment to census_environment.yml so the package
# versions used by this notebook can be recreated later. The file is
# overwritten every time this cell runs.
!conda env export > census_environment.yml

In [2]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import codecs
import csv
import openpyxl

In [3]:
# Location of the census workbook. Set the CENSUS_XLSX_PATH environment
# variable to point at a different copy of the file; when it is not set the
# original local path is used, so existing runs behave exactly as before.
# NOTE: despite its name this variable holds a *file* path, not a directory;
# the name is kept because later cells reference it.
data_directory_census = os.environ.get(
    "CENSUS_XLSX_PATH",
    "/home/paulharford/college/project/project_data/census/population_age_group_gender2016_2022_hse_region.xlsx",
)

In [4]:
# Normalise the configured path to an absolute one; a relative path would be
# resolved against the current working directory.
full_path_census = os.path.abspath(data_directory_census)

In [5]:
# Load the census workbook into a DataFrame (no sheet is named, so pandas
# uses its default sheet selection).
df_census = pd.read_excel(full_path_census)

In [6]:
# Render floats as whole numbers with thousands separators (e.g. 1,187,082).
# This only affects how values are displayed, but it applies to every float
# shown afterwards, so fractional values appear rounded to zero decimals.
pd.options.display.float_format = '{:,.0f}'.format


In [7]:
# Preview the first rows to confirm the workbook loaded with the expected columns.
df_census.head()

Unnamed: 0,HSE Regions,Sex,Age Group,UNIT,VALUE,Age group pop 2022,Age group pop 2016,Population in 2022,Population in 2016
0,HSE Dublin and North East,Male,60 - 64 years,%,0,55793,50859,1187082,1082117
1,HSE Dublin and North East,Male,65 - 69 years,%,0,46296,42203,1187082,1082117
2,HSE Dublin and North East,Male,70 - 74 years,%,0,39174,35710,1187082,1082117
3,HSE Dublin and North East,Male,75 - 79 years,%,0,29677,27053,1187082,1082117
4,HSE Dublin and North East,Male,80 - 84 years,%,0,17806,16232,1187082,1082117


In [8]:
# Age bands that together make up the 60+ population.
# NOTE(review): these strings must match the 'Age Group' values in the
# workbook exactly; a band whose label is spelled differently is silently
# left out by the filter below -- confirm the labels of the oldest bands.
age_60plus_labels = [
    "60 - 64 years", "65 - 69 years", "70 - 74 years", "75 - 79 years",
    "80 - 84 years", "85 - 89 years", "90+ years",
]

# Keep only the rows that belong to one of the 60+ bands.
is_60plus = df_census['Age Group'].isin(age_60plus_labels)
df_census_region = df_census.loc[is_60plus]

# Aggregate per HSE region: every other dimension of the filtered rows
# (sex, age band) is collapsed by the sum.
group_cols = ['HSE Regions']

# Only the two head-count columns are summed; UNIT and VALUE are left out.
pop_cols = ['Age group pop 2022', 'Age group pop 2016']

df_agg_60plus = df_census_region.groupby(group_cols, as_index=False)[pop_cols].sum()

# One row per region with its total 60+ population in 2022 and 2016.
df_agg_60plus


Unnamed: 0,HSE Regions,Age group pop 2022,Age group pop 2016
0,HSE Dublin and Midlands,356699,327547
1,HSE Dublin and North East,414292,377659
2,HSE Dublin and South East,358333,333333
3,HSE Midwest,167289,155908
4,HSE South West,295505,275658
5,HSE West and North West,319054,298740


In [9]:
## Estimate the other years with a straight line through the two census counts
def estimate_population_linear(df_agg, years, base_year=2016, end_year=2022):
    """Estimate each region's population for ``years`` from two census counts.

    A straight line is drawn through the ``base_year`` and ``end_year`` counts
    of every row of ``df_agg``. Years between the two censuses are
    interpolated; years before ``base_year`` or after ``end_year`` are
    extrapolated along the same line, so they assume the inter-census trend
    also holds outside the census window.

    Parameters
    ----------
    df_agg : pandas.DataFrame
        One row per region, with the columns 'HSE Regions',
        'Age group pop <base_year>' and 'Age group pop <end_year>'.
    years : iterable of int
        Calendar years to produce an estimate for.
    base_year, end_year : int
        The two census years that anchor the line (must differ).

    Returns
    -------
    pandas.DataFrame
        One row per (region, year) with the columns 'HSE Regions', 'Year'
        and 'Population (Linear Est)' (rounded to a whole number), ordered
        by region as in ``df_agg`` and then by ``years``.
    """
    span = end_year - base_year
    years = list(years)  # allow one-shot iterables to be reused per region

    # .tolist() yields plain Python numbers, which is what the arithmetic and
    # the built-in round() below operate on.
    regions = df_agg['HSE Regions'].tolist()
    pops_base = df_agg['Age group pop {}'.format(base_year)].tolist()
    pops_end = df_agg['Age group pop {}'.format(end_year)].tolist()

    records = []
    for region, pop_base, pop_end in zip(regions, pops_base, pops_end):
        for year in years:
            # Position of `year` on the base->end line: 0.0 at base_year,
            # 1.0 at end_year, below 0 / above 1 for extrapolated years.
            fraction = (year - base_year) / span
            pop_est = pop_base + (pop_end - pop_base) * fraction
            records.append({
                'HSE Regions': region,
                'Year': year,
                'Population (Linear Est)': round(pop_est),
            })

    return pd.DataFrame(
        records, columns=['HSE Regions', 'Year', 'Population (Linear Est)']
    )


# Years we want estimates for: 2014 through 2023 inclusive. Only 2016 and 2022
# are census counts; 2017-2021 are interpolated and 2014, 2015 and 2023 are
# extrapolated.
years = range(2014, 2024)

df_linear = estimate_population_linear(df_agg_60plus, years)
df_linear.head(15)


Unnamed: 0,HSE Regions,Year,Population (Linear Est)
0,HSE Dublin and Midlands,2014,317830
1,HSE Dublin and Midlands,2015,322688
2,HSE Dublin and Midlands,2016,327547
3,HSE Dublin and Midlands,2017,332405
4,HSE Dublin and Midlands,2018,337264
5,HSE Dublin and Midlands,2019,342123
6,HSE Dublin and Midlands,2020,346981
7,HSE Dublin and Midlands,2021,351840
8,HSE Dublin and Midlands,2022,356699
9,HSE Dublin and Midlands,2023,361557


In [10]:
# Use a short lower-case column name for the region.
df_linear.rename({'HSE Regions': 'region'}, axis='columns', inplace=True)

In [11]:
# 'HSE Midwest' is a value in the 'region' column, not a column label, so it
# has to be replaced in the data itself; rename(columns=...) only looks at
# column labels and would silently leave the value unchanged.
df_linear['region'] = df_linear['region'].replace({'HSE Midwest': 'HSE Mid West'})

In [12]:
# Use a short lower-case column name for the estimated population.
df_linear.rename({'Population (Linear Est)': 'population'}, axis='columns', inplace=True)

In [13]:
# Use a lower-case column name for the year.
df_linear.rename({'Year': 'year'}, axis='columns', inplace=True)

In [14]:
# Write the per-region yearly estimates to CSV; index=False keeps the
# DataFrame's row index out of the file.
census_estimates_csv = '/home/paulharford/college/project/project_data/processed/census_estimated_per_region_2014_2023.csv'
df_linear.to_csv(census_estimates_csv, index=False)