## Initial analysis of weather and ihfd

In [1]:
import pandas as pd
import matplotlib as plt
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv
!pip install openpyxl



In [2]:
data_directory_weather = "/mnt/hgfs/shared/project_data/processed/WEATHERED_warnings_2014-2023_cleaned.csv"
data_directory_ihfd = "/mnt/hgfs/shared/project_data/processed/ihfd_clean.csv"
data_directory_census = "/mnt/hgfs/shared/project_data/census/population_age_group_gender2016_2022_hse_region.xlsx"
full_path_ihfd = os.path.abspath(data_directory_ihfd)
full_path_weather = os.path.abspath(data_directory_weather)
full_path_census = os.path.abspath(data_directory_census)


In [3]:
df_ihfd = pd.read_csv(full_path_ihfd)
df_weather = pd.read_csv(full_path_weather)
df_census = pd.read_excel(full_path_census)


###Aggregate data

#### Step 1 CENSUS data 

In [4]:
##census data
desired_age_groups = [
    "60 - 64 years", "65 - 69 years", "70 - 74 years",
    "75 - 79 years", "80 - 84 years", "85 years and over"
]
# Filter age groups
df_filtered = df_census[df_census["Age Group"].isin(desired_age_groups)]

# Group by HSE region and Sex, summing the 2016 and 2022 pops
df_summed = (
    df_filtered
    .groupby(["HSE Regions"], as_index=False)
    .agg({
        "Age group pop 2022": "sum",
        "Age group pop 2016": "sum"
    })
)

# Rename for clarity
df_summed.rename(columns={
    "HSE Regions": "region",
    "Age group pop 2022": "pop_2022",
    "Age group pop 2016": "pop_2016"
}, inplace=True)

df_summed = df_summed.round(0)
df_summed.head()


Unnamed: 0,region,pop_2022,pop_2016
0,HSE Dublin and Midlands,386872.0,355255.0
1,HSE Dublin and North East,451091.0,411204.0
2,HSE Dublin and South East,388437.0,361337.0
3,HSE Midwest,181333.0,168996.0
4,HSE South West,322908.0,301220.0


In [6]:
##
years = range(2016, 2023)  # 2016 through 2022 inclusive

rows = []

for idx, row in df_summed.iterrows():
    r = row["region"]
    #s = row["Sex"]
    pop_2016 = row["pop_2016"]
    pop_2022 = row["pop_2022"]
    
    for y in years:
        # fraction of the way from 2016
        fraction = (y - 2016) / (2022 - 2016)  # denominator = 6
        pop_y = pop_2016 + (pop_2022 - pop_2016) * fraction
        
        rows.append({
            "region": r,
           # "sex": s,
            "year": y,
            "population_60plus": pop_y
        })

df_region_sex_year = pd.DataFrame(rows)
df_region_sex_year = df_region_sex_year.round(0)
df_region_sex_year.head(15)


Unnamed: 0,region,year,population_60plus
0,HSE Dublin and Midlands,2016,355255.0
1,HSE Dublin and Midlands,2017,360524.0
2,HSE Dublin and Midlands,2018,365794.0
3,HSE Dublin and Midlands,2019,371064.0
4,HSE Dublin and Midlands,2020,376333.0
5,HSE Dublin and Midlands,2021,381602.0
6,HSE Dublin and Midlands,2022,386872.0
7,HSE Dublin and North East,2016,411204.0
8,HSE Dublin and North East,2017,417852.0
9,HSE Dublin and North East,2018,424500.0


#### Step 2  Weather Data

In [12]:
df_weather["Valid From"] = pd.to_datetime(df_weather["Valid From"])
df_weather["date"] = df_weather["Valid From"].dt.date
df_weather["date"] = pd.to_datetime(df_weather["date"])

In [13]:
##create new column with lag of 12 hours from original Valid To timestamp
df_weather["Valid To"] = pd.to_datetime(df_weather["Valid To"])


In [18]:
df_weather['lag_valid_to'] = df_weather['Valid To'] + pd.Timedelta(hours=12)

In [22]:
# List out all the HSE region columns in weather
weather_region_cols = [
    "HSE Dublin and North East",
    "HSE Dublin and Midlands",
    "HSE Dublin and South East",
    "HSE Mid West",
    "HSE South West",
    "HSE West and North West",
    # any others you have
]

df_weather_long = df_weather.melt(
    id_vars=["date", "Issue Time", "Valid From", "Valid To", "lag_valid_to", "Warning Colour", 
             "Warning Element", "Warning Text", "Duration_hours", "Year"],
    value_vars=weather_region_cols,
    var_name="region",
    value_name="weather_flag"
)



In [23]:
df_weather_filtered = df_weather_long[
    (df_weather_long['Issue Time'] >= '2016-01-01') & 
    (df_weather_long['Issue Time'] <= '2022-12-31 23:59:59')
]

In [24]:
df_weather_long.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16392 entries, 0 to 16391
Data columns (total 12 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   date             16392 non-null  datetime64[ns]     
 1   Issue Time       16392 non-null  object             
 2   Valid From       16392 non-null  datetime64[ns, UTC]
 3   Valid To         16380 non-null  datetime64[ns, UTC]
 4   lag_valid_to     16380 non-null  datetime64[ns, UTC]
 8   Duration_hours   16380 non-null  float64            
 9   Year             16392 non-null  int64              
 10  region           16392 non-null  object             
 11  weather_flag     16392 non-null  int64              
dtypes: datetime64[ns, UTC](3), datetime64[ns](1), float64(1), int64(2), object(5)
memory usage: 1.5+ MB


#### Step 3 IHFD data

In [35]:
df_ihfd["Adm_First_Pres_Hosp_DateTime"] = pd.to_datetime(df_ihfd["Adm_First_Pres_Hosp_DateTime"])
df_ihfd["date"] = df_ihfd["Adm_First_Pres_Hosp_DateTime"].dt.date
# convert to a pandas datetime if you prefer consistency
df_ihfd["date"] = pd.to_datetime(df_ihfd["date"])

In [36]:
df_ihfd.head()

Unnamed: 0,region,NOCA_TraumaPeriodDay,Adm_First_Pres_Hosp_DateTime,NOCA_FirstPresPeriodDay,NOCA_AgeRange,sex,LOS,Adm_Trauma_TYPE,Adm_Ward_Type,Adm_Pre_Frac_Indoor,...,Adm_Operation,Adm_Asa_Grade,Adm_Anaesthesia,Adm_Surgery_Delay_Reason,Adm_Mobilised,Adm_Pressure_Ulcers,Adm_Spec_Falls_Assess,Adm_Bone_Protect_Med,Adm_Multi_Rehab_Assess,date
0,HSE Mid West,AM,2016-09-06,PM,70-74,Female,12,2.0,1.0,2.0,...,8.0,3.0,5.0,7.0,1.0,2.0,1.0,1.0,1.0,2016-09-06
1,HSE Mid West,AM,2016-09-15,Night,90-94,Female,4,2.0,1.0,2.0,...,1.0,2.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0,2016-09-15
2,HSE Mid West,AM,2016-09-14,AM,75-79,Male,6,2.0,1.0,2.0,...,8.0,2.0,5.0,0.0,1.0,2.0,1.0,5.0,1.0,2016-09-14
3,HSE Mid West,AM,2016-09-04,AM,90-94,Female,16,2.0,1.0,2.0,...,1.0,3.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0,2016-09-04
4,HSE Mid West,AM,2016-08-23,Night,65-69,Male,29,2.0,1.0,2.0,...,8.0,3.0,5.0,2.0,1.0,2.0,1.0,1.0,1.0,2016-08-23


In [37]:
##check if all ihfd dates are mon to friday 
df_ihfd['Adm_First_Pres_Hosp_DateTime'] = pd.to_datetime(df_ihfd['Adm_First_Pres_Hosp_DateTime'])

# Create a boolean Series: True if the date is a weekday (Mon=0 to Fri=4)
weekdays = df_ihfd['Adm_First_Pres_Hosp_DateTime'].dt.dayofweek < 5

# Check if all dates are weekdays
if weekdays.all():
    print("All dates are Monday through Friday.")
else:
    print("Some dates fall on a weekend.")
    
    # Optional: To see which dates are weekends
    weekend_dates = df_ihfd.loc[~weekdays, 'Adm_First_Pres_Hosp_DateTime']
    print("Weekend dates found:")
    print(weekend_dates)

Some dates fall on a weekend.
Weekend dates found:
3       2016-09-04
14      2016-08-20
15      2016-09-18
19      2016-09-17
23      2016-10-01
           ...    
37305   2023-12-23
37307   2023-12-30
37310   2023-12-24
37312   2023-12-17
37315   2023-12-31
Name: Adm_First_Pres_Hosp_DateTime, Length: 10202, dtype: datetime64[ns]


In [38]:
df_ihfd.rename(columns={"New Health Regions": "region"}, inplace=True)


In [39]:
df_ihfd.rename(columns={"NOCA_Gender": "sex"}, inplace=True)

In [40]:
df_ihfd.head()

Unnamed: 0,region,NOCA_TraumaPeriodDay,Adm_First_Pres_Hosp_DateTime,NOCA_FirstPresPeriodDay,NOCA_AgeRange,sex,LOS,Adm_Trauma_TYPE,Adm_Ward_Type,Adm_Pre_Frac_Indoor,...,Adm_Operation,Adm_Asa_Grade,Adm_Anaesthesia,Adm_Surgery_Delay_Reason,Adm_Mobilised,Adm_Pressure_Ulcers,Adm_Spec_Falls_Assess,Adm_Bone_Protect_Med,Adm_Multi_Rehab_Assess,date
0,HSE Mid West,AM,2016-09-06,PM,70-74,Female,12,2.0,1.0,2.0,...,8.0,3.0,5.0,7.0,1.0,2.0,1.0,1.0,1.0,2016-09-06
1,HSE Mid West,AM,2016-09-15,Night,90-94,Female,4,2.0,1.0,2.0,...,1.0,2.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0,2016-09-15
2,HSE Mid West,AM,2016-09-14,AM,75-79,Male,6,2.0,1.0,2.0,...,8.0,2.0,5.0,0.0,1.0,2.0,1.0,5.0,1.0,2016-09-14
3,HSE Mid West,AM,2016-09-04,AM,90-94,Female,16,2.0,1.0,2.0,...,1.0,3.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0,2016-09-04
4,HSE Mid West,AM,2016-08-23,Night,65-69,Male,29,2.0,1.0,2.0,...,8.0,3.0,5.0,2.0,1.0,2.0,1.0,1.0,1.0,2016-08-23


In [41]:
df_ihfd_filtered = df_ihfd[
    (df_ihfd['date'] >= '2016-01-01') & 
    (df_ihfd['date'] <= '2022-12-31')
]

In [42]:
##hip fracture counts
df_hip_daily = (
    df_ihfd_filtered
    .groupby(["region"," date"], as_index=False)
    .size()
    .rename(columns={"size": "fracture_count"})
)

KeyError: ' date'

In [None]:
df_merged_all = pd.merge(
    df_hip_daily,             # (region, date, sex, fracture_count)
    df_weather_filtered,               # (region, date, weather columns)
    on=["region", "date"],
    how="left"
)

In [None]:
df_merged_all.describe()

In [None]:
df_merged_all['year'] = df_merged_all['date'].dt.year

In [None]:
df_merged_all = pd.merge(
    df_merged_all,
    df_region_sex_year,  # your population DataFrame
    on=["region", "sex", "year"],  # common keys
    how="left"  # Use a left merge so all daily rows remain even if population data is missing
)


In [None]:
df_merged_all.head(10)

In [None]:
df_merged_all.drop(columns=["Issue Time", "Warning Text"], inplace=True)

In [None]:
##check wetaher data 
# Check how many rows have a non-null weather_flag
num_with_weather = df_merged_all['weather_flag'].notnull().sum()
total_rows = len(df_merged_all)
print(f"Rows with weather data: {num_with_weather} / {total_rows}")

# Inspect a few rows that have weather data
df_weather_present = df_merged_all[df_merged_all['weather_flag'].notnull()]
print(df_weather_present.head())


In [None]:
df_merged_all["Warning Colour"] = df_merged_all["Warning Colour"].fillna("unknown")
df_merged_all["Warning Element"] = df_merged_all["Warning Element"].fillna("unknown")

In [None]:
df_merged_all["weather_flag"] = df_merged_all["weather_flag"].fillna(0)
df_merged_all["Duration_hours"] = df_merged_all["Duration_hours"].fillna(0)

In [None]:
df_merged_all["population_60plus"] = df_merged_all["population_60plus"].fillna(df_merged_all["population_60plus"].median())

In [None]:
missing_percentage = df_merged_all.isnull().mean() * 100

# Print the result
print(missing_percentage)

In [None]:
df_merged_all.to_csv('/mnt/hgfs/shared/project_data/processed/WEATHERED_met-eirenn_ihfd_merged.csv', index=False)