### Set styling for plotting

In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()


### Step 1: save environment file

In [4]:
!conda env export > census__2024_environment.yml

### Step 2: import modules


In [6]:
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import numpy as np
import os
import codecs
import csv
import openpyxl

### Step 3: import data files

In [8]:

data_directory_census_2024 = "/home/paulharford/college/project/project_data/census/WEATHERED_census_2024_population_by_agegroup_gender_v1.0.xlsx"

In [9]:

full_path_census_2024 = os.path.abspath(data_directory_census_2024)

In [10]:

df_census_2024 = pd.read_excel(full_path_census_2024)

In [11]:
pd.set_option('display.float_format', '{:,.0f}'.format)


In [12]:
df_census_2024.head()

Unnamed: 0,region,gender,age_group,percent_value,gender_ratio,total_population_2024,demographic_pop_number_2024,age_group_gender_population_2024
0,HSE Dublin and North East,Male,0 - 4 years,6,1,1242843,77057,40834
1,HSE Dublin and North East,Male,5 - 9 years,7,1,1242843,87000,45448
2,HSE Dublin and North East,Male,10 - 14 years,8,1,1242843,95699,50129
3,HSE Dublin and North East,Male,15 - 19 years,7,1,1242843,83271,42917
4,HSE Dublin and North East,Male,20 - 24 years,6,1,1242843,79542,41727


### Step 4.1: Filter the data to age groups 60 and over

In [14]:
# Filter to 60+ age groups
age_group_labels = [
    "0 - 4 years" 
    "5 - 9 years", 
    "10 - 14 years", 
    "15 - 19 years", 
    "20 - 24 years", 
    "25 - 29 years", 
    "30 - 34 years", 
    "35 - 39 years", 
    "40 - 44 years", 
    "45 - 49 years", 
    "50 - 54 years", 
    "55 - 59 years",
    "60 - 64 years", 
    "65 - 69 years", 
    "70 - 74 years", 
    "75 - 79 years", 
    "80 - 84 years", 
    "85 years and over"
]

# Filter to 60+ age groups
df_census_region_2024 = df_census_2024[df_census_2024['age_group'].isin(age_group_labels)]

# Group by region, age group, and sex
group_cols = ['region', 'age_group', 'gender']

# Aggregate the data for 2024
df_agg_detailed_2024 = (
    df_census_region_2024
    .groupby(group_cols, as_index=False)
    .sum()
)

In [15]:
df_agg_detailed_2024

Unnamed: 0,region,age_group,gender,percent_value,gender_ratio,total_population_2024,demographic_pop_number_2024,age_group_gender_population_2024
0,HSE Dublin and Midlands,10 - 14 years,Female,7,0,1124497,80964,39388
1,HSE Dublin and Midlands,10 - 14 years,Male,8,1,1124497,85462,43886
2,HSE Dublin and Midlands,15 - 19 years,Female,6,0,1124497,70844,34071
3,HSE Dublin and Midlands,15 - 19 years,Male,7,1,1124497,76466,39693
4,HSE Dublin and Midlands,20 - 24 years,Female,6,0,1124497,68595,33475
...,...,...,...,...,...,...,...,...
187,HSE West and North West,75 - 79 years,Male,3,0,791076,26897,13254
188,HSE West and North West,80 - 84 years,Female,2,1,791076,17404,8905
189,HSE West and North West,80 - 84 years,Male,2,0,791076,16613,8114
190,HSE West and North West,85 years and over,Female,2,1,791076,16613,9691


### Step 4.2: Use linear interpolation to estimate the population numbers for other years

In [17]:
# Process 2024 data separately
rows_2024 = []

# Assuming df_agg_detailed_2024 has your raw 2024 data
for _, row in df_agg_detailed_2024.iterrows():
    rows_2024.append({
        'region': row['region'],
        'year': 2024,
        'age_group': row['age_group'],
        'gender': row['gender'],
        'population': row['age_group_gender_population_2024']
    })

# Create DataFrame for 2024 data with the same structure as interpolated data
df_2024 = pd.DataFrame(rows_2024)

# Filter to 60+ age groups
df_60plus_2024 = df_2024[df_2024['age_group'].isin(age_60plus_labels)]

# If you need the pivot table format for analysis
df_total_by_age_2024 = df_60plus_2024.pivot_table(
    index=['region', 'year', 'age_group'],
    columns='gender',
    values='population',
    aggfunc='sum'
).reset_index()

# Add total column
df_total_by_age_2024['total'] = df_total_by_age_2024['Male'] + df_total_by_age_2024['Female']

# Sort for readability
df_total_by_age_2024 = df_total_by_age_2024.sort_values(['region', 'year', 'age_group'])

NameError: name 'age_60plus_labels' is not defined

In [None]:
df_census_2024 = df_total_by_age_2024.copy()
df_census_2024.rename(columns={"Male": "male"}, inplace=True)
df_census_2024.rename(columns={"Female": "female"}, inplace=True)
df_census_2024.head()

In [None]:
df_census_2024.loc[df_census_2024['region'] == 'HSE Midwest', 'region'] = 'HSE Mid West'

### Step 4.3 : Update region names to match the other datasets

In [None]:
df_census.rename(columns={"Male": "male"}, inplace=True)
df_census.rename(columns={"Female": "female"}, inplace=True)
df_census.head(10)

In [None]:
def clean_age_group(age_text):
    if pd.isna(age_text):
        return age_text
    
    # Replace " years and over" with " and over"
    if 'years and over' in age_text:
        return age_text.replace('years ', '')
    
    # Remove " years" from strings like "70-74 years"
    return age_text.replace(' years', '')


# Apply the function to the column
df_census_2024['age_group'] = df_census_2024['age_group'].apply(clean_age_group)

In [None]:
from datetime import datetime
def expand_to_daily(census_df):
    expanded_rows = []
    
    # Iterate through each row in the census data
    for _, row in census_df.iterrows():
        year = int(row['year'])
        
        # Create all dates for this year
        start_date = datetime(year, 1, 1)
        end_date = datetime(year, 12, 31)
        all_dates = pd.date_range(start=start_date, end=end_date)
        
        # Create a new row for each date
        for date in all_dates:
            new_row = {
                'region': row['region'],
                'date': date,
                'age_group': row['age_group'],
                'female': row['female'],
                'male': row['male'],
                'total': row['total'],
                'year': year  # Keeping the year column for reference
            }
            expanded_rows.append(new_row)
    
    # Convert to DataFrame
    return pd.DataFrame(expanded_rows)

df_census_final = expand_to_daily(df_census)
df_census_2024_final = expand_to_daily(df_census_2024)


In [None]:
df_census_2024_final.head(10)

In [None]:
df_census_2024_final.to_csv('/home/paulharford/college/project/project_data/processed/WEATHERED_census_estimated_2024_pop_age_grp_gender_region.csv', index=False)