## Initial analysis of weather warnings and IHFD hip-fracture data

In [11]:
import pandas as pd
import matplotlib as plt
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv
!pip install openpyxl

Collecting openpyxl
  Using cached openpyxl-3.1.5-py2.py3-none-any.whl.metadata (2.5 kB)
Collecting et-xmlfile (from openpyxl)
  Using cached et_xmlfile-2.0.0-py3-none-any.whl.metadata (2.7 kB)
Using cached openpyxl-3.1.5-py2.py3-none-any.whl (250 kB)
Using cached et_xmlfile-2.0.0-py3-none-any.whl (18 kB)
Installing collected packages: et-xmlfile, openpyxl
Successfully installed et-xmlfile-2.0.0 openpyxl-3.1.5


In [12]:
# Input locations. Each raw path is paired with its absolute form, which is
# what the loading cell reads from.

# Cleaned weather-warning records.
data_directory_weather = "/mnt/hgfs/shared/project_data/processed/WEATHERED_warnings_2014-2023_cleaned.csv"
full_path_weather = os.path.abspath(data_directory_weather)

# Cleaned IHFD records.
data_directory_ihfd = "/mnt/hgfs/shared/project_data/processed/ihfd_clean.csv"
full_path_ihfd = os.path.abspath(data_directory_ihfd)

# Census population by age group, sex and HSE region (Excel workbook).
data_directory_census = "/mnt/hgfs/shared/project_data/census/population_age_group_gender2016_2022_hse_region.xlsx"
full_path_census = os.path.abspath(data_directory_census)


In [13]:
# Read the three inputs into DataFrames with default parsing options
# (no date parsing happens here; date columns are converted in later cells).
df_census = pd.read_excel(full_path_census)   # Excel workbook
df_weather = pd.read_csv(full_path_weather)   # weather warnings
df_ihfd = pd.read_csv(full_path_ihfd)         # IHFD records


### Aggregate data

In [16]:
## Census data: total 60+ population per HSE region and sex

# Census age bands that are summed together below.
desired_age_groups = [
    "60 - 64 years",
    "65 - 69 years",
    "70 - 74 years",
    "75 - 79 years",
    "80 - 84 years",
    "85 years and over",
]

# Keep only the census rows belonging to those age bands.
df_filtered = df_census.loc[df_census["Age Group"].isin(desired_age_groups)]

# Sum the 2016 and 2022 counts per (HSE region, sex), give the columns short
# names, and round the totals to whole numbers.
df_summed = (
    df_filtered
    .groupby(["HSE Regions", "Sex"], as_index=False)
    .agg(
        pop_2022=("Age group pop 2022", "sum"),
        pop_2016=("Age group pop 2016", "sum"),
    )
    .rename(columns={"HSE Regions": "region"})
    .round(0)
)
df_summed.head()


Unnamed: 0,region,Sex,pop_2022,pop_2016
0,HSE Dublin and Midlands,Female,200441.0,184060.0
1,HSE Dublin and Midlands,Male,186432.0,171195.0
2,HSE Dublin and North East,Female,249287.0,227245.0
3,HSE Dublin and North East,Male,201804.0,183960.0
4,HSE Dublin and South East,Female,181594.0,168925.0


In [18]:
## Linearly interpolate the 60+ population for every year between the censuses
years = range(2016, 2023)  # 2016 through 2022 inclusive

# One record per (region, sex, year). Each year's value lies on the straight
# line between the 2016 and 2022 totals; (year - 2016) / 6 is the fraction of
# the way from 2016 to 2022.
rows = [
    {
        "region": region_name,
        "sex": sex_label,
        "year": year,
        "population_60plus": start_pop + (end_pop - start_pop) * ((year - 2016) / (2022 - 2016)),
    }
    for region_name, sex_label, start_pop, end_pop in zip(
        df_summed["region"],
        df_summed["Sex"],
        df_summed["pop_2016"],
        df_summed["pop_2022"],
    )
    for year in years
]

# Round the interpolated populations to whole numbers.
df_region_sex_year = pd.DataFrame(rows).round(0)
df_region_sex_year.head(15)


Unnamed: 0,region,sex,year,population_60plus
0,HSE Dublin and Midlands,Female,2016,184060.0
1,HSE Dublin and Midlands,Female,2017,186790.0
2,HSE Dublin and Midlands,Female,2018,189520.0
3,HSE Dublin and Midlands,Female,2019,192250.0
4,HSE Dublin and Midlands,Female,2020,194981.0
5,HSE Dublin and Midlands,Female,2021,197711.0
6,HSE Dublin and Midlands,Female,2022,200441.0
7,HSE Dublin and Midlands,Male,2016,171195.0
8,HSE Dublin and Midlands,Male,2017,173734.0
9,HSE Dublin and Midlands,Male,2018,176274.0


In [20]:
# Parse the warning start timestamps in place.
df_weather["Valid From"] = pd.to_datetime(df_weather["Valid From"])
# Derive a `date` column: the calendar date of "Valid From", converted back to
# a pandas datetime so it can be compared and merged like the IHFD `date`.
df_weather["date"] = pd.to_datetime(df_weather["Valid From"].dt.date)

In [22]:
# Parse the first-presentation timestamps in place.
df_ihfd["Adm_First_Pres_Hosp_DateTime"] = pd.to_datetime(df_ihfd["Adm_First_Pres_Hosp_DateTime"])
# Derive a `date` column holding just the calendar date, stored as a pandas
# datetime for consistency with the weather table's `date`.
df_ihfd["date"] = pd.to_datetime(df_ihfd["Adm_First_Pres_Hosp_DateTime"].dt.date)

In [21]:
# HSE region columns of the wide weather table; each becomes a value of the
# new `region` column after melting.
weather_region_cols = [
    "HSE Dublin and North East",
    "HSE Dublin and Midlands",
    "HSE Dublin and South East",
    "HSE Mid West",
    "HSE South West",
    "HSE West and North West",
    # any others you have
]

# Reshape wide -> long: one row per (warning, region). The region column's
# original cell value is stored in `weather_flag`; the listed id columns are
# repeated on every melted row.
df_weather_long = pd.melt(
    df_weather,
    id_vars=[
        "date",
        "Issue Time",
        "Valid From",
        "Valid To",
        "Warning Colour",
        "Warning Element",
        "Warning Text",
        "Duration_hours",
        "Year",
    ],
    value_vars=weather_region_cols,
    var_name="region",
    value_name="weather_flag",
)



In [24]:
# Preview the first five rows of the long-format weather table.
df_weather_long.head(5)

Unnamed: 0,date,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Warning Text,Duration_hours,Year,region,weather_flag
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,Orange,Moderate Wind warning,Becoming stormy this evening and tonight and c...,21.0,2014,HSE Dublin and North East,0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,Yellow,Moderate Wind warning,Becoming extremely windy or stormy this evenin...,21.0,2014,HSE Dublin and North East,1
2,2014-01-03,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,Yellow,Moderate Wind warning,Blustery for the rest of the afternoon with so...,5.0,2014,HSE Dublin and North East,1
3,2014-01-04,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,Yellow,Moderate Snow-ice warning,Scattered outbreaks of rain and sleet spreadin...,11.0,2014,HSE Dublin and North East,1
4,2014-01-05,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,Yellow,Moderate Wind warning,Southeast winds mean speeds of 50 to 65 km/h w...,12.0,2014,HSE Dublin and North East,1


In [30]:
# Keep only warnings whose calendar date falls inside the 2016-2022 study
# window.
#
# The filter uses `date` (derived from "Valid From") rather than "Issue Time":
#   * `date` is the key the hip-fracture data is filtered and merged on, so a
#     warning issued in late 2015 but valid from 1 January 2016 must be kept;
#   * "Issue Time" is never parsed to datetime in this notebook, so comparing
#     it with date strings was a lexicographic string comparison.
# `date` holds midnight timestamps, so both bounds include the whole day.
df_weather_filtered = df_weather_long[
    df_weather_long["date"].between("2016-01-01", "2022-12-31")
]

In [35]:
# Preview the first five IHFD rows.
# NOTE(review): the cells that rename columns to `region` / `sex` come after
# this one in file order, so on a top-to-bottom run this preview may still show
# the original column names.
df_ihfd.head(5)

Unnamed: 0,region,NOCA_TraumaPeriodDay,Adm_First_Pres_Hosp_DateTime,NOCA_FirstPresPeriodDay,NOCA_AgeRange,NOCA_Gender,LOS,Adm_Trauma_TYPE,Adm_Ward_Type,Adm_Pre_Frac_Indoor,...,Adm_Operation,Adm_Asa_Grade,Adm_Anaesthesia,Adm_Surgery_Delay_Reason,Adm_Mobilised,Adm_Pressure_Ulcers,Adm_Spec_Falls_Assess,Adm_Bone_Protect_Med,Adm_Multi_Rehab_Assess,date
0,HSE Mid West,AM,2016-09-06,PM,70-74,Female,12,2.0,1.0,2.0,...,8.0,3.0,5.0,7.0,1.0,2.0,1.0,1.0,1.0,2016-09-06
1,HSE Mid West,AM,2016-09-15,Night,90-94,Female,4,2.0,1.0,2.0,...,1.0,2.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0,2016-09-15
2,HSE Mid West,AM,2016-09-14,AM,75-79,Male,6,2.0,1.0,2.0,...,8.0,2.0,5.0,0.0,1.0,2.0,1.0,5.0,1.0,2016-09-14
3,HSE Mid West,AM,2016-09-04,AM,90-94,Female,16,2.0,1.0,2.0,...,1.0,3.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0,2016-09-04
4,HSE Mid West,AM,2016-08-23,Night,65-69,Male,29,2.0,1.0,2.0,...,8.0,3.0,5.0,2.0,1.0,2.0,1.0,1.0,1.0,2016-08-23


In [23]:
# Use the same `region` column name as the weather and population tables.
# If "New Health Regions" is not present, rename leaves the frame unchanged.
df_ihfd = df_ihfd.rename(columns={"New Health Regions": "region"})


In [38]:
# Use the same `sex` column name as the population table so the two can be joined.
df_ihfd = df_ihfd.rename(columns={"NOCA_Gender": "sex"})

In [39]:
# Preview the first five IHFD rows after the column renames.
df_ihfd.head(5)

Unnamed: 0,region,NOCA_TraumaPeriodDay,Adm_First_Pres_Hosp_DateTime,NOCA_FirstPresPeriodDay,NOCA_AgeRange,sex,LOS,Adm_Trauma_TYPE,Adm_Ward_Type,Adm_Pre_Frac_Indoor,...,Adm_Operation,Adm_Asa_Grade,Adm_Anaesthesia,Adm_Surgery_Delay_Reason,Adm_Mobilised,Adm_Pressure_Ulcers,Adm_Spec_Falls_Assess,Adm_Bone_Protect_Med,Adm_Multi_Rehab_Assess,date
0,HSE Mid West,AM,2016-09-06,PM,70-74,Female,12,2.0,1.0,2.0,...,8.0,3.0,5.0,7.0,1.0,2.0,1.0,1.0,1.0,2016-09-06
1,HSE Mid West,AM,2016-09-15,Night,90-94,Female,4,2.0,1.0,2.0,...,1.0,2.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0,2016-09-15
2,HSE Mid West,AM,2016-09-14,AM,75-79,Male,6,2.0,1.0,2.0,...,8.0,2.0,5.0,0.0,1.0,2.0,1.0,5.0,1.0,2016-09-14
3,HSE Mid West,AM,2016-09-04,AM,90-94,Female,16,2.0,1.0,2.0,...,1.0,3.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0,2016-09-04
4,HSE Mid West,AM,2016-08-23,Night,65-69,Male,29,2.0,1.0,2.0,...,8.0,3.0,5.0,2.0,1.0,2.0,1.0,1.0,1.0,2016-08-23


In [40]:
# Restrict IHFD records to the 2016-2022 study window (both bounds inclusive).
in_study_window = df_ihfd['date'].between('2016-01-01', '2022-12-31')
df_ihfd_filtered = df_ihfd.loc[in_study_window]

In [41]:
## Hip fracture counts
# Number of IHFD records per (region, sex, date). Only combinations that occur
# in the data appear, so days with no records have no row.
df_hip_daily = (
    df_ihfd_filtered
    .groupby(["region", "sex", "date"])
    .size()
    .reset_index(name="fracture_count")
)

In [43]:
# Attach weather-warning rows to the daily fracture counts.
#   left:  df_hip_daily        -> (region, sex, date, fracture_count)
#   right: df_weather_filtered -> (region, date, weather columns)
# A left join keeps every fracture row; rows with no matching warning get NaN
# in the weather columns. A (region, date) with several warning rows yields
# several output rows, each repeating the same fracture_count.
df_merged_all = df_hip_daily.merge(
    df_weather_filtered,
    how="left",
    on=["region", "date"],
)

In [44]:
# Preview the first five rows of the fracture/weather merge.
df_merged_all.head(5)

Unnamed: 0,region,sex,date,fracture_count,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Warning Text,Duration_hours,Year,weather_flag
0,HSE Dublin and Midlands,Female,2016-01-04,3,,NaT,,,,,,,
1,HSE Dublin and Midlands,Female,2016-01-07,1,,NaT,,,,,,,
2,HSE Dublin and Midlands,Female,2016-01-09,3,,NaT,,,,,,,
3,HSE Dublin and Midlands,Female,2016-01-10,1,,NaT,,,,,,,
4,HSE Dublin and Midlands,Female,2016-01-11,1,,NaT,,,,,,,


In [45]:
# Calendar year of each row's date; used as a join key for the yearly
# population table.
df_merged_all = df_merged_all.assign(year=df_merged_all['date'].dt.year)

In [47]:
# Add the interpolated 60+ population for each row's (region, sex, year).
# A left join keeps every daily row even when no population figure matches.
df_merged_all = df_merged_all.merge(
    df_region_sex_year,  # yearly population per region and sex
    how="left",
    on=["region", "sex", "year"],
)


In [48]:
# Preview the first ten rows, now including `year` and `population_60plus`.
df_merged_all.head(n=10)

Unnamed: 0,region,sex,date,fracture_count,Issue Time,Valid From,Valid To,Warning Colour,Warning Element,Warning Text,Duration_hours,Year,weather_flag,year,population_60plus
0,HSE Dublin and Midlands,Female,2016-01-04,3,,NaT,,,,,,,,2016,184060.0
1,HSE Dublin and Midlands,Female,2016-01-07,1,,NaT,,,,,,,,2016,184060.0
2,HSE Dublin and Midlands,Female,2016-01-09,3,,NaT,,,,,,,,2016,184060.0
3,HSE Dublin and Midlands,Female,2016-01-10,1,,NaT,,,,,,,,2016,184060.0
4,HSE Dublin and Midlands,Female,2016-01-11,1,,NaT,,,,,,,,2016,184060.0
5,HSE Dublin and Midlands,Female,2016-01-12,1,,NaT,,,,,,,,2016,184060.0
6,HSE Dublin and Midlands,Female,2016-01-15,4,,NaT,,,,,,,,2016,184060.0
7,HSE Dublin and Midlands,Female,2016-01-16,2,,NaT,,,,,,,,2016,184060.0
8,HSE Dublin and Midlands,Female,2016-01-17,3,,NaT,,,,,,,,2016,184060.0
9,HSE Dublin and Midlands,Female,2016-01-18,2,,NaT,,,,,,,,2016,184060.0


In [50]:
# Remove the warning issue time and free-text columns from the merged table.
# Re-running this cell raises KeyError because the columns are already gone.
df_merged_all = df_merged_all.drop(columns=["Issue Time", "Warning Text"])

In [51]:
## Check weather data
# Rows whose weather_flag is non-null, i.e. rows that matched a weather record.
has_weather = df_merged_all['weather_flag'].notna()

# Report how many merged rows carry weather data out of the total.
num_with_weather = has_weather.sum()
total_rows = df_merged_all.shape[0]
print(f"Rows with weather data: {num_with_weather} / {total_rows}")

# Inspect a few of the rows that have weather data.
df_weather_present = df_merged_all[has_weather]
print(df_weather_present.head())


Rows with weather data: 13110 / 25342
                     region     sex       date  fracture_count  \
11  HSE Dublin and Midlands  Female 2016-01-21               2   
14  HSE Dublin and Midlands  Female 2016-01-28               2   
15  HSE Dublin and Midlands  Female 2016-01-29               3   
16  HSE Dublin and Midlands  Female 2016-01-29               3   
17  HSE Dublin and Midlands  Female 2016-01-29               3   

11 2016-01-21 20:00:00+00:00  2016-01-22 05:00:00+00:00         Yellow   
14 2016-01-28 21:00:00+00:00  2016-01-29 09:00:00+00:00         Yellow   
15 2016-01-29 00:01:00+00:00  2016-01-29 09:00:00+00:00         Yellow   
16 2016-01-29 00:01:00+00:00  2016-01-29 09:00:00+00:00         Orange   
17 2016-01-29 00:01:00+00:00  2016-01-29 09:00:00+00:00         Orange   


    population_60plus  
11           184060.0  
14           184060.0  
15           184060.0  
16           184060.0  
17           184060.0  


In [52]:
# Replace missing warning labels (e.g. fracture rows with no matching warning)
# with the placeholder "unknown".
warning_label_cols = ["Warning Colour", "Warning Element"]
df_merged_all[warning_label_cols] = df_merged_all[warning_label_cols].fillna("unknown")

In [53]:
# Rows with a missing warning flag or duration are set to 0 for both columns.
df_merged_all = df_merged_all.fillna({"weather_flag": 0, "Duration_hours": 0})

In [57]:
# Fill any missing population with the median of the column, taken over all
# rows regardless of region, sex or year.
# NOTE(review): a global median may not suit a specific region/sex - confirm
# this is the intended fallback.
median_population = df_merged_all["population_60plus"].median()
df_merged_all["population_60plus"] = df_merged_all["population_60plus"].fillna(median_population)

In [58]:
# Share of missing values in each column, as a percentage.
missing_percentage = df_merged_all.isna().mean().mul(100)

# Show the per-column percentages.
print(missing_percentage)

region                0.000000
sex                   0.000000
date                  0.000000
fracture_count        0.000000
Valid From           48.267698
Valid To             48.267698
Duration_hours        0.000000
Year                 48.267698
weather_flag          0.000000
year                  0.000000
population_60plus     0.000000
dtype: float64


In [60]:
# Write the merged table to the processed-data folder, without the index column.
merged_output_path = '/mnt/hgfs/shared/project_data/processed/WEATHERED_met-eirenn_ihfd_merged.csv'
df_merged_all.to_csv(merged_output_path, index=False)