### Set styling for plotting

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()


### Step 1: save environment file

In [4]:
!conda env export > census_environment.yml

### Step 2: import modules


In [9]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import codecs
import csv
import openpyxl

### Step 3: import data files

In [10]:
data_directory_census = "/home/paulharford/college/project/project_data/census/WEATHERED_cenusu_2022_2016_population_by_agegroup_gender.csv"

In [11]:
full_path_census = os.path.abspath(data_directory_census)

In [12]:
df_census = pd.read_csv(full_path_census)

In [13]:
pd.set_option('display.float_format', '{:,.0f}'.format)


In [14]:
df_census.head()

Unnamed: 0,region,gender,age_group,percent_value,total_population_2022,demographic_pop_number,age_group_gender_population_2022,total_population_2016,age_group_gender_population_2016
0,HSE Dublin and North East,Male,0 - 4 years,6,1187082,73599,36873,1082117,33613
1,HSE Dublin and North East,Male,5 - 9 years,7,1187082,83096,41631,1082117,37950
2,HSE Dublin and North East,Male,10 - 14 years,8,1187082,91405,45794,1082117,41745
3,HSE Dublin and North East,Male,15 - 19 years,7,1187082,79534,39847,1082117,36323
4,HSE Dublin and North East,Male,20 - 24 years,6,1187082,75973,38063,1082117,34697


In [19]:
# Drop the percent_value column
df_census = df_census.drop(['percent_value'], axis=1)

In [20]:
# Filter to 60+ age groups
age_group_labels = [
    "0 - 4 years" 
    "5 - 9 years", 
    "10 - 14 years", 
    "15 - 19 years", 
    "20 - 24 years", 
    "25 - 29 years", 
    "30 - 34 years", 
    "35 - 39 years", 
    "40 - 44 years", 
    "45 - 49 years", 
    "50 - 54 years", 
    "55 - 59 years",
    "60 - 64 years", 
    "65 - 69 years", 
    "70 - 74 years", 
    "75 - 79 years", 
    "80 - 84 years", 
    "85 years and over"  # Update this to match your exact label
]

# Filter to 60+ age groups
df_census_region = df_census[df_census['age_group'].isin(age_group_labels)]

# Group by region, age group, and sex
group_cols = ['region', 'age_group', 'gender']

# Sum the numeric population columns
#pop_cols = ['Age group pop 2022', 'Age group pop 2016']

# Aggregate the data
df_agg_detailed = (
    df_census_region
    .groupby(group_cols, as_index=False)
    .sum()
)

# Display the result
df_agg_detailed

Unnamed: 0,region,age_group,gender,total_population_2022,demographic_pop_number,age_group_gender_population_2022,total_population_2016,age_group_gender_population_2016
0,HSE Dublin and Midlands,10 - 14 years,Female,1077639,77590,38717,989567,35553
1,HSE Dublin and Midlands,10 - 14 years,Male,1077639,81901,45794,989567,37679
2,HSE Dublin and Midlands,15 - 19 years,Female,1077639,67891,33878,989567,31109
3,HSE Dublin and Midlands,15 - 19 years,Male,1077639,73279,39847,989567,33713
4,HSE Dublin and Midlands,20 - 24 years,Female,1077639,65736,32802,989567,30121
...,...,...,...,...,...,...,...,...
187,HSE West and North West,75 - 79 years,Male,759652,25828,12914,711285,12092
188,HSE West and North West,80 - 84 years,Female,759652,16712,8373,711285,7840
189,HSE West and North West,80 - 84 years,Male,759652,15953,7976,711285,7468
190,HSE West and North West,85 years and over,Female,759652,15953,7992,711285,7483


In [21]:
##Estimate otehr years linear interpolation 
# Years we want estimates for
years = range(2014, 2024)  # 2014 through 2023
rows = []

# Group by region, age group, and gender
grouped_data = df_census.groupby(['region', 'age_group', 'gender'])

# Iterate through each group
for (region, age_group, gender), group in grouped_data:
    # Get 2016 and 2022 population values
    row = group.iloc[0]  # Take the first (and only) row from this group
    pop_2016 = row['age_group_gender_population_2016']
    pop_2022 = row['age_group_gender_population_2022']
    
    for y in years:
        # Linear interpolation formula
        fraction = (y - 2016) / (2022 - 2016)  # 0.0 to 1.0
        
        # For years before 2016 or after 2022, extend the trend
        if y < 2016:
            fraction = (y - 2016) / (2022 - 2016)
        elif y > 2022:
            fraction = (y - 2016) / (2022 - 2016)
            
        pop_est = pop_2016 + (pop_2022 - pop_2016) * fraction
        
        rows.append({
            'region': region,
            'year': y,
            'age_group': age_group,
            'gender': gender,
            'population': round(pop_est)
        })

# Create DataFrame with interpolated values
df_interpolated = pd.DataFrame(rows)

# Create a pivot table to get total population by region, year, and age group
df_total_by_age = df_interpolated.pivot_table(
    index=['region', 'year', 'age_group'],
    columns='gender',
    values='population',
    aggfunc='sum'
).reset_index()

# Add total column (male + female)
df_total_by_age['total'] = df_total_by_age['Male'] + df_total_by_age['Female']

# Sort for readability
df_total_by_age = df_total_by_age.sort_values(['region', 'year', 'age_group'])

# Filter to only 60+ age groups, NOCA only record inceidents of over 60's
age_60plus_labels = [ 
    "60 - 64 years", 
    "65 - 69 years", 
    "70 - 74 years", 
    "75 - 79 years", 
    "80 - 84 years", 
    "85 years and over"
]

df_60plus = df_total_by_age[df_total_by_age['age_group'].isin(age_60plus_labels)]

# Display first 15 rows
df_60plus.head(15)


gender,region,year,age_group,Female,Male,total
12,HSE Dublin and Midlands,2014,60 - 64 years,22999,23073,46072
13,HSE Dublin and Midlands,2014,65 - 69 years,19166,20031,39197
14,HSE Dublin and Midlands,2014,70 - 74 years,16770,16594,33364
15,HSE Dublin and Midlands,2014,75 - 79 years,12936,10909,23845
16,HSE Dublin and Midlands,2014,80 - 84 years,8624,5620,14244
17,HSE Dublin and Midlands,2014,85 years and over,8624,4430,13054
30,HSE Dublin and Midlands,2015,60 - 64 years,23350,23683,47033
31,HSE Dublin and Midlands,2015,65 - 69 years,19459,20427,39886
32,HSE Dublin and Midlands,2015,70 - 74 years,17027,16973,34000
33,HSE Dublin and Midlands,2015,75 - 79 years,13134,11404,24538


In [33]:
# Create an explicit copy and then rename
df_census = df_60plus.copy()
df_census.loc[df_60plus['region'] == 'HSE Midwest', 'region'] = 'HSE Mid West'

In [34]:
unique_region = df_census['region'].unique()

# Display the unique values
print("Unique values in Region:")
for region in sorted(unique_region):
    print(f"- {region}")

Unique values in Region:
- HSE Dublin and Midlands
- HSE Dublin and North East
- HSE Dublin and South East
- HSE Mid West
- HSE South West
- HSE West and North West


In [35]:
df_census.head(10)

gender,region,year,age_group,Female,Male,total
12,HSE Dublin and Midlands,2014,60 - 64 years,22999,23073,46072
13,HSE Dublin and Midlands,2014,65 - 69 years,19166,20031,39197
14,HSE Dublin and Midlands,2014,70 - 74 years,16770,16594,33364
15,HSE Dublin and Midlands,2014,75 - 79 years,12936,10909,23845
16,HSE Dublin and Midlands,2014,80 - 84 years,8624,5620,14244
17,HSE Dublin and Midlands,2014,85 years and over,8624,4430,13054
30,HSE Dublin and Midlands,2015,60 - 64 years,23350,23683,47033
31,HSE Dublin and Midlands,2015,65 - 69 years,19459,20427,39886
32,HSE Dublin and Midlands,2015,70 - 74 years,17027,16973,34000
33,HSE Dublin and Midlands,2015,75 - 79 years,13134,11404,24538


In [36]:
df_census.to_csv('/home/paulharford/college/project/project_data/processed/WEATHERED_census_pop_age_grp_gender_region.csv', index=False)