### Set styling for plotting

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()


### Step 1: save environment file

In [4]:
!conda env export > census_environment.yml

### Step 2: import modules


In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import codecs
import csv
import openpyxl

### Step 3: import data files

In [8]:
data_directory_census = "/home/paulharford/college/project/project_data/census/WEATHERED_census_2022_2016_population_by_agegroup_gender_v2.0.xlsx"

In [9]:
full_path_census = os.path.abspath(data_directory_census)


In [10]:
df_census = pd.read_excel(full_path_census)


In [11]:
pd.set_option('display.float_format', '{:,.0f}'.format)


In [12]:
df_census.head()

Unnamed: 0,region,gender,age_group,percent_value,gender_ratio,total_population_2022,demographic_pop_number_2022,age_group_gender_population_2022,total_population_2016,demographic_pop_number_2016,age_group_gender_population_2016
0,HSE Dublin and North East,Male,0 - 4 years,6,1,1187082,73600,39002,1082117,67092,33613
1,HSE Dublin and North East,Male,5 - 9 years,7,1,1187082,83096,43409,1082117,75749,37950
2,HSE Dublin and North East,Male,10 - 14 years,8,1,1187082,91406,47880,1082117,83324,41745
3,HSE Dublin and North East,Male,15 - 19 years,7,1,1187082,79535,40992,1082117,72502,36323
4,HSE Dublin and North East,Male,20 - 24 years,6,1,1187082,75974,39856,1082117,69256,34697


In [13]:
# Drop the percent_value column
df_census = df_census.drop(['percent_value'], axis=1)

### Step 4.1: Filter the data to age groups 60 and over

In [15]:
# Filter to 60+ age groups
age_group_labels = [
    "0 - 4 years" 
    "5 - 9 years", 
    "10 - 14 years", 
    "15 - 19 years", 
    "20 - 24 years", 
    "25 - 29 years", 
    "30 - 34 years", 
    "35 - 39 years", 
    "40 - 44 years", 
    "45 - 49 years", 
    "50 - 54 years", 
    "55 - 59 years",
    "60 - 64 years", 
    "65 - 69 years", 
    "70 - 74 years", 
    "75 - 79 years", 
    "80 - 84 years", 
    "85 years and over"
]

# Filter to 60+ age groups
df_census_region = df_census[df_census['age_group'].isin(age_group_labels)]


# Group by region, age group, and sex
group_cols = ['region', 'age_group', 'gender']

# Aggregate the data
df_agg_detailed = (
    df_census_region
    .groupby(group_cols, as_index=False)
    .sum()
)
# Display the result
df_agg_detailed


Unnamed: 0,region,age_group,gender,gender_ratio,total_population_2022,demographic_pop_number_2022,age_group_gender_population_2022,total_population_2016,demographic_pop_number_2016,age_group_gender_population_2016
0,HSE Dublin and Midlands,10 - 14 years,Female,0,1077639,77591,37747,989567,71249,35553
1,HSE Dublin and Midlands,10 - 14 years,Male,1,1077639,81901,42058,989567,75208,37679
2,HSE Dublin and Midlands,15 - 19 years,Female,0,1077639,67892,32651,989567,62343,31109
3,HSE Dublin and Midlands,15 - 19 years,Male,1,1077639,73280,38039,989567,67291,33713
4,HSE Dublin and Midlands,20 - 24 years,Female,0,1077639,65736,32080,989567,60364,30121
...,...,...,...,...,...,...,...,...,...,...
187,HSE West and North West,75 - 79 years,Male,0,759652,25829,12728,711285,24184,12092
188,HSE West and North West,80 - 84 years,Female,1,759652,16713,8551,711285,15649,7840
189,HSE West and North West,80 - 84 years,Male,0,759652,15953,7791,711285,14937,7468
190,HSE West and North West,85 years and over,Female,1,759652,15953,9306,711285,14937,7483


### Step 4.2: Use linear interpolation to estimate the population numbers for other years

In [17]:
##Estimate other years linear interpolation 
# Years we want estimates for
years = range(2014, 2024)  # 2014 through 2023
rows = []

# Group by region, age group, and gender
grouped_data = df_census.groupby(['region', 'age_group', 'gender'])

# Iterate through each group
for (region, age_group, gender), group in grouped_data:
    # Get 2016 and 2022 population values
    row = group.iloc[0]  # Take the first (and only) row from this group
    pop_2016 = row['age_group_gender_population_2016']
    pop_2022 = row['age_group_gender_population_2022']
    
    for y in years:
        # Linear interpolation formula
        fraction = (y - 2016) / (2022 - 2016)  # 0.0 to 1.0
        
        # For years before 2016 or after 2022, extend the trend
        if y < 2016:
            fraction = (y - 2016) / (2022 - 2016)
        elif y > 2022:
            fraction = (y - 2016) / (2022 - 2016)
            
        pop_est = pop_2016 + (pop_2022 - pop_2016) * fraction
        
        rows.append({
            'region': region,
            'year': y,
            'age_group': age_group,
            'gender': gender,
            'population': round(pop_est)
        })

# Create DataFrame with interpolated values
df_interpolated = pd.DataFrame(rows)

# Create a pivot table to get total population by region, year, and age group
df_total_by_age = df_interpolated.pivot_table(
    index=['region', 'year', 'age_group'],
    columns='gender',
    values='population',
    aggfunc='sum'
).reset_index()

# Add total column (male + female)
df_total_by_age['total'] = df_total_by_age['Male'] + df_total_by_age['Female']

# Sort for readability
df_total_by_age = df_total_by_age.sort_values(['region', 'year', 'age_group'])

# Filter to only 60+ age groups, NOCA only record inceidents of over 60's
age_60plus_labels = [ 
    "60 - 64 years", 
    "65 - 69 years", 
    "70 - 74 years", 
    "75 - 79 years", 
    "80 - 84 years", 
    "85 years and over"
]

df_60plus = df_total_by_age[df_total_by_age['age_group'].isin(age_60plus_labels)]

# Display first 15 rows
df_60plus.head(15)


gender,region,year,age_group,Female,Male,total
12,HSE Dublin and Midlands,2014,60 - 64 years,23070,23499,46569
13,HSE Dublin and Midlands,2014,65 - 69 years,19327,20035,39362
14,HSE Dublin and Midlands,2014,70 - 74 years,16758,16850,33608
15,HSE Dublin and Midlands,2014,75 - 79 years,12641,11808,24449
16,HSE Dublin and Midlands,2014,80 - 84 years,8096,6635,14731
17,HSE Dublin and Midlands,2014,85 years and over,7694,5328,13022
30,HSE Dublin and Midlands,2015,60 - 64 years,23386,23896,47282
31,HSE Dublin and Midlands,2015,65 - 69 years,19539,20428,39967
32,HSE Dublin and Midlands,2015,70 - 74 years,17020,17101,34121
33,HSE Dublin and Midlands,2015,75 - 79 years,12986,11854,24840


### Step 4.3 : Update region names to match the other datasets

In [19]:
# Create an explicit copy and then rename
df_census = df_60plus.copy()
df_census.loc[df_60plus['region'] == 'HSE Midwest', 'region'] = 'HSE Mid West'

In [20]:
unique_region = df_census['region'].unique()

# Display the unique values
print("Unique values in Region:")
for region in sorted(unique_region):
    print(f"- {region}")

Unique values in Region:
- HSE Dublin and Midlands
- HSE Dublin and North East
- HSE Dublin and South East
- HSE Mid West
- HSE South West
- HSE West and North West


In [21]:
df_census.rename(columns={"Male": "male"}, inplace=True)
df_census.rename(columns={"Female": "female"}, inplace=True)
df_census.head(10)

gender,region,year,age_group,female,male,total
12,HSE Dublin and Midlands,2014,60 - 64 years,23070,23499,46569
13,HSE Dublin and Midlands,2014,65 - 69 years,19327,20035,39362
14,HSE Dublin and Midlands,2014,70 - 74 years,16758,16850,33608
15,HSE Dublin and Midlands,2014,75 - 79 years,12641,11808,24449
16,HSE Dublin and Midlands,2014,80 - 84 years,8096,6635,14731
17,HSE Dublin and Midlands,2014,85 years and over,7694,5328,13022
30,HSE Dublin and Midlands,2015,60 - 64 years,23386,23896,47282
31,HSE Dublin and Midlands,2015,65 - 69 years,19539,20428,39967
32,HSE Dublin and Midlands,2015,70 - 74 years,17020,17101,34121
33,HSE Dublin and Midlands,2015,75 - 79 years,12986,11854,24840


In [22]:
def clean_age_group(age_text):
    if pd.isna(age_text):
        return age_text
    
    # Replace " years and over" with " and over"
    if 'years and over' in age_text:
        return age_text.replace('years ', '')
    
    # Remove " years" from strings like "70-74 years"
    return age_text.replace(' years', '')


# Apply the function to the column
df_census['age_group'] = df_census['age_group'].apply(clean_age_group)#


In [23]:
from datetime import datetime
def expand_to_daily(census_df):
    expanded_rows = []
    
    # Iterate through each row in the census data
    for _, row in census_df.iterrows():
        year = int(row['year'])
        
        # Create all dates for this year
        start_date = datetime(year, 1, 1)
        end_date = datetime(year, 12, 31)
        all_dates = pd.date_range(start=start_date, end=end_date)
        
        # Create a new row for each date
        for date in all_dates:
            new_row = {
                'region': row['region'],
                'date': date,
                'age_group': row['age_group'],
                'female': row['female'],
                'male': row['male'],
                'total': row['total'],
                'year': year  # Keeping the year column for reference
            }
            expanded_rows.append(new_row)
    
    # Convert to DataFrame
    return pd.DataFrame(expanded_rows)

df_census_final = expand_to_daily(df_census)



In [24]:
df_census_final.head(30)

Unnamed: 0,region,date,age_group,female,male,total,year
0,HSE Dublin and Midlands,2014-01-01,60 - 64,23070,23499,46569,2014
1,HSE Dublin and Midlands,2014-01-02,60 - 64,23070,23499,46569,2014
2,HSE Dublin and Midlands,2014-01-03,60 - 64,23070,23499,46569,2014
3,HSE Dublin and Midlands,2014-01-04,60 - 64,23070,23499,46569,2014
4,HSE Dublin and Midlands,2014-01-05,60 - 64,23070,23499,46569,2014
5,HSE Dublin and Midlands,2014-01-06,60 - 64,23070,23499,46569,2014
6,HSE Dublin and Midlands,2014-01-07,60 - 64,23070,23499,46569,2014
7,HSE Dublin and Midlands,2014-01-08,60 - 64,23070,23499,46569,2014
8,HSE Dublin and Midlands,2014-01-09,60 - 64,23070,23499,46569,2014
9,HSE Dublin and Midlands,2014-01-10,60 - 64,23070,23499,46569,2014


In [25]:
df_census_final.to_csv('/home/paulharford/college/project/project_data/processed/WEATHERED_census_pop_age_grp_gender_region.csv', index=False)