In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# For inline plots in Jupyter
%matplotlib inline
import pandas as pd
import matplotlib as plt
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv
import openpyxl

In [2]:
data_directory_weather = "/home/paulharford/college/project/project_data/processed/WEATHERED_warnings_2014-2023_cleaned.csv"
data_directory_ihfd = "/home/paulharford/college/project/project_data/processed/ihfd_clean.csv"
data_directory_census = "/home/paulharford/college/project/project_data/processed/census_estimated_per_region_2014_2023.csv"
full_path_ihfd = os.path.abspath(data_directory_ihfd)
full_path_weather = os.path.abspath(data_directory_weather)
full_path_census = os.path.abspath(data_directory_census)

In [3]:
df_ihfd = pd.read_csv(full_path_ihfd)
df_weather = pd.read_csv(full_path_weather)
df_census = pd.read_csv(full_path_census)

In [4]:
df_weather["Valid From"] = pd.to_datetime(df_weather["Valid From"])
df_weather["date"] = df_weather["Valid From"].dt.date
df_weather["date"] = pd.to_datetime(df_weather["date"])
df_weather["Valid To"] = pd.to_datetime(df_weather["Valid To"])


In [5]:
#add column for lag
df_weather['lag_valid_to'] = df_weather['Valid To'] + pd.Timedelta(hours=12)

In [6]:
weather_region_cols = [
    "HSE Dublin and North East",
    "HSE Dublin and Midlands",
    "HSE Dublin and South East",
    "HSE Mid West",
    "HSE South West",
    "HSE West and North West",
    # any others you have
]

df_weather_full = df_weather.melt(
    id_vars=["date", "Issue Time", "Valid From", "Valid To", "lag_valid_to", "Warning Colour", 
             "Warning Element", "Warning Text", "Duration_hours"],
    value_vars=weather_region_cols,
    var_name="region",
    value_name="weather_flag"
)


In [7]:
df_weather_full.head(10)

Unnamed: 0,date,Issue Time,Valid From,Valid To,lag_valid_to,Warning Colour,Warning Element,Warning Text,Duration_hours,region,weather_flag
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and North East,0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and North East,1
2,2014-01-03,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,2014-01-04 07:00:00+00:00,Yellow,Wind,Blustery for the rest of the afternoon with so...,5.0,HSE Dublin and North East,1
3,2014-01-04,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,2014-01-05 02:00:00+00:00,Yellow,Snow/Ice,Scattered outbreaks of rain and sleet spreadin...,11.0,HSE Dublin and North East,1
4,2014-01-05,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,2014-01-06 08:00:00+00:00,Yellow,Wind,Southeast winds mean speeds of 50 to 65 km/h w...,12.0,HSE Dublin and North East,1
5,2014-01-05,2014-01-04 16:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 18:00:00+00:00,2014-01-06 06:00:00+00:00,Yellow,Wind,Becoming windy again on Sunday morning with st...,10.0,HSE Dublin and North East,1
6,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UpdateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1
7,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UndateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1
8,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",25.0,HSE Dublin and North East,0
9,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 12:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",24.0,HSE Dublin and North East,0


In [8]:
fill_values = {
    'Issue Time': 0,
    'Valid From': 0,
    'Valid To': 0,
    'lag_valid_to': 0,
    'Warning Colour': 0,
    'Warning Element': 0,
    # For text fields you want a specific string, e.g. 'noevent':
    'Warning Text': 'noevent',
    'Duration_hours': 0,
    'region': 'noevent',
    'weather_flag': 2.0
}

# Fill NaN values for these columns using fillna().
df_weather_full.fillna(value=fill_values, inplace=True)



In [9]:
df_weather_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16398 entries, 0 to 16397
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   date             16398 non-null  datetime64[ns]     
 1   Issue Time       16398 non-null  object             
 2   Valid From       16398 non-null  datetime64[ns, UTC]
 3   Valid To         16398 non-null  object             
 4   lag_valid_to     16398 non-null  object             
 8   Duration_hours   16398 non-null  float64            
 9   region           16398 non-null  object             
 10  weather_flag     16398 non-null  int64              
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(1), int64(1), object(7)
memory usage: 1.4+ MB


In [10]:
df_weather_full.head(15)

Unnamed: 0,date,Issue Time,Valid From,Valid To,lag_valid_to,Warning Colour,Warning Element,Warning Text,Duration_hours,region,weather_flag
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and North East,0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and North East,1
2,2014-01-03,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,2014-01-04 07:00:00+00:00,Yellow,Wind,Blustery for the rest of the afternoon with so...,5.0,HSE Dublin and North East,1
3,2014-01-04,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,2014-01-05 02:00:00+00:00,Yellow,Snow/Ice,Scattered outbreaks of rain and sleet spreadin...,11.0,HSE Dublin and North East,1
4,2014-01-05,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,2014-01-06 08:00:00+00:00,Yellow,Wind,Southeast winds mean speeds of 50 to 65 km/h w...,12.0,HSE Dublin and North East,1
5,2014-01-05,2014-01-04 16:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 18:00:00+00:00,2014-01-06 06:00:00+00:00,Yellow,Wind,Becoming windy again on Sunday morning with st...,10.0,HSE Dublin and North East,1
6,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UpdateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1
7,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UndateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1
8,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",25.0,HSE Dublin and North East,0
9,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 12:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",24.0,HSE Dublin and North East,0


## Hip Data

In [11]:
df_ihfd.rename(columns={"New Health Regions": "region"}, inplace=True)


In [12]:
df_ihfd["Adm_First_Pres_Hosp_DateTime"] = pd.to_datetime(df_ihfd["Adm_First_Pres_Hosp_DateTime"])
df_ihfd["date"] = df_ihfd["Adm_First_Pres_Hosp_DateTime"].dt.date
# convert to a pandas datetime if you prefer consistency
df_ihfd["date"] = pd.to_datetime(df_ihfd["date"])

In [13]:
df_ihfd.sort_values('date', ascending=True, inplace=True)


In [14]:
df_ihfd.head()

Unnamed: 0,region,NOCA_TraumaPeriodDay,Adm_First_Pres_Hosp_DateTime,NOCA_FirstPresPeriodDay,NOCA_AgeRange,NOCA_Gender,LOS,Adm_Trauma_TYPE,Adm_Ward_Type,Adm_Pre_Frac_Indoor,...,Adm_Operation,Adm_Asa_Grade,Adm_Anaesthesia,Adm_Surgery_Delay_Reason,Adm_Mobilised,Adm_Pressure_Ulcers,Adm_Spec_Falls_Assess,Adm_Bone_Protect_Med,Adm_Multi_Rehab_Assess,date
20304,HSE Mid West,AM,2007-05-04,PM,85-89,Female,60,2.0,1.0,2.0,...,3.0,2.0,5.0,0.0,2.0,2.0,1.0,1.0,1.0,2007-05-04
13462,HSE West and North West,AM,2007-09-04,PM,75-79,Female,8,2.0,1.0,2.0,...,9.0,2.0,1.0,0.0,1.0,2.0,0.0,1.0,1.0,2007-09-04
8392,HSE Dublin and North East,AM,2012-09-24,AM,85-89,Male,11,2.0,1.0,0.0,...,0.0,3.0,5.0,0.0,0.0,2.0,0.0,0.0,1.0,2012-09-24
9817,HSE Dublin and North East,AM,2012-12-30,AM,85-89,Female,34,2.0,1.0,0.0,...,7.0,4.0,5.0,0.0,0.0,2.0,1.0,0.0,1.0,2012-12-30
11672,HSE Dublin and North East,PM,2013-01-01,PM,85-89,Female,22,2.0,1.0,0.0,...,1.0,3.0,1.0,2.0,0.0,2.0,1.0,0.0,1.0,2013-01-01


In [15]:
df_ihfd_filtered = df_ihfd[
    (df_ihfd['date'] >= '2014-01-01') & 
    (df_ihfd['date'] <= '2023-12-31').copy()
]

In [16]:
##check ihfd data has all dates 
df_ihfd['Adm_First_Pres_Hosp_DateTime'] = pd.to_datetime(df_ihfd['Adm_First_Pres_Hosp_DateTime']).dt.normalize()

# Generate the full calendar date range for the period 2014 to 2023
full_range = pd.date_range(start='2014-01-01', end='2023-12-31', freq='D')

# Get the unique dates from your DataFrame
unique_dates = pd.to_datetime(df_ihfd['Adm_First_Pres_Hosp_DateTime'].unique()).normalize()

# Find dates in full_range that are not present in unique_dates
missing_dates = full_range.difference(unique_dates)

# Check if any dates are missing
if missing_dates.empty:
    print("All calendar days are present in the dataset.")
else:
    print("The following dates are missing from the dataset:")
    print(missing_dates)

All calendar days are present in the dataset.


In [17]:
df_ihfd_season = df_ihfd_filtered.copy()

In [18]:
def month_to_season(month):
    # Convert month to an integer if it isn't already.
    month = int(month)
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Autumn'
    else:
        return 'unknown'




In [19]:
df_ihfd_season['month'] = df_ihfd_season['date'].dt.month

In [20]:
df_weather_full['month'] = df_weather_full['date'].dt.month

In [21]:
df_weather_full['season'] = df_weather_full['month'].apply(month_to_season)

In [22]:
df_ihfd_season['season'] = df_ihfd_season['month'].apply(month_to_season)

In [23]:
print(df_ihfd_season['season'].unique())


['Winter' 'Spring' 'Summer' 'Autumn']


In [24]:
print(df_weather_full['season'].unique())

['Winter' 'Spring' 'Summer' 'Autumn']


In [25]:
print(df_weather_full['season'].value_counts())


season
Winter    6666
Autumn    4740
Summer    2664
Spring    2328
Name: count, dtype: int64


In [26]:
print(df_ihfd_season['season'].value_counts())


season
Winter    9251
Spring    8766
Summer    8604
Autumn    8600
Name: count, dtype: int64


In [27]:
nan_counts = df_ihfd_season.isna().sum()
print(nan_counts)

region                          0
NOCA_TraumaPeriodDay            0
Adm_First_Pres_Hosp_DateTime    0
NOCA_FirstPresPeriodDay         0
NOCA_AgeRange                   0
NOCA_Gender                     0
LOS                             0
Adm_Trauma_TYPE                 0
Adm_Ward_Type                   0
Adm_Pre_Frac_Indoor             0
Adm_PRE_Frac_Outdoor            0
Adm_Pre_Frac_Shop               0
Adm_Pre_Frac_Number             0
Adm_Fracture_Type               0
Adm_Pathological                0
Adm_Fragility                   0
Adm_Pre_OP_Med_Assess           0
Adm_Ger_Acute_Assess            0
Adm_Operation                   0
Adm_Asa_Grade                   0
Adm_Anaesthesia                 0
Adm_Surgery_Delay_Reason        0
Adm_Mobilised                   0
Adm_Pressure_Ulcers             0
Adm_Spec_Falls_Assess           0
Adm_Bone_Protect_Med            0
Adm_Multi_Rehab_Assess          0
date                            0
month                           0
season        

## merging

In [28]:
hip_agg = (
    df_ihfd_season
    .groupby(['region', 'date','season'])
    .agg(hip_fracture_count=('date', 'count'))
    .reset_index()
)


In [29]:
hip_agg.head()


Unnamed: 0,region,date,season,hip_fracture_count
0,HSE Dublin and Midlands,2014-01-02,Winter,1
1,HSE Dublin and Midlands,2014-01-04,Winter,5
2,HSE Dublin and Midlands,2014-01-06,Winter,1
3,HSE Dublin and Midlands,2014-01-07,Winter,1
4,HSE Dublin and Midlands,2014-01-08,Winter,2


In [30]:
##add full date calandar to the aggregation 
# 1a. Identify all unique regions
regions = hip_agg['region'].unique()

# 1b. Identify the full date range
min_date = hip_agg['date'].min()
max_date = hip_agg['date'].max()

In [31]:
all_dates = pd.date_range(start=min_date, end=max_date, freq='D')

multi_index = pd.MultiIndex.from_product([regions, all_dates], names=['region', 'date'])
df_region_date = pd.DataFrame(index=multi_index).reset_index()

In [32]:
df_region_date.head(15)

Unnamed: 0,region,date
0,HSE Dublin and Midlands,2014-01-01
1,HSE Dublin and Midlands,2014-01-02
2,HSE Dublin and Midlands,2014-01-03
3,HSE Dublin and Midlands,2014-01-04
4,HSE Dublin and Midlands,2014-01-05
5,HSE Dublin and Midlands,2014-01-06
6,HSE Dublin and Midlands,2014-01-07
7,HSE Dublin and Midlands,2014-01-08
8,HSE Dublin and Midlands,2014-01-09
9,HSE Dublin and Midlands,2014-01-10


In [33]:
df_hip_final = pd.merge(
    df_region_date,          # all region-date combos
    hip_agg,               # your aggregated counts
    on=['region', 'date'],   # merge keys
    how='left'               # left-join so we keep all rows from df_region_date
)

In [34]:
df_hip_final['month'] = df_hip_final['date'].dt.month

In [35]:
df_hip_final['season'] = df_hip_final['month'].apply(month_to_season)

In [36]:
df_hip_final['hip_fracture_count'] = df_hip_final['hip_fracture_count'].fillna(0)


In [37]:
df_hip_final.head(15)

Unnamed: 0,region,date,season,hip_fracture_count,month
0,HSE Dublin and Midlands,2014-01-01,Winter,0.0,1
1,HSE Dublin and Midlands,2014-01-02,Winter,1.0,1
2,HSE Dublin and Midlands,2014-01-03,Winter,0.0,1
3,HSE Dublin and Midlands,2014-01-04,Winter,5.0,1
4,HSE Dublin and Midlands,2014-01-05,Winter,0.0,1
5,HSE Dublin and Midlands,2014-01-06,Winter,1.0,1
6,HSE Dublin and Midlands,2014-01-07,Winter,1.0,1
7,HSE Dublin and Midlands,2014-01-08,Winter,2.0,1
8,HSE Dublin and Midlands,2014-01-09,Winter,2.0,1
9,HSE Dublin and Midlands,2014-01-10,Winter,1.0,1


In [38]:
df_hip_final['year'] = df_hip_final['date'].dt.year

In [39]:
severity_map = {
    'Yellow': 1,
    'Orange': 2,
    'Red': 3
}

df_weather_full['severity_num'] = df_weather_full['Warning Colour'].map(severity_map)


In [40]:
df_weather_full['severity_num'] = df_weather_full['severity_num'].fillna(0)

In [41]:
df_weather_full.head(15)

Unnamed: 0,date,Issue Time,Valid From,Valid To,lag_valid_to,Warning Colour,Warning Element,Warning Text,Duration_hours,region,weather_flag,month,season,severity_num
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and North East,0,1,Winter,2.0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and North East,1,1,Winter,1.0
2,2014-01-03,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,2014-01-04 07:00:00+00:00,Yellow,Wind,Blustery for the rest of the afternoon with so...,5.0,HSE Dublin and North East,1,1,Winter,1.0
3,2014-01-04,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,2014-01-05 02:00:00+00:00,Yellow,Snow/Ice,Scattered outbreaks of rain and sleet spreadin...,11.0,HSE Dublin and North East,1,1,Winter,1.0
4,2014-01-05,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,2014-01-06 08:00:00+00:00,Yellow,Wind,Southeast winds mean speeds of 50 to 65 km/h w...,12.0,HSE Dublin and North East,1,1,Winter,1.0
5,2014-01-05,2014-01-04 16:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 18:00:00+00:00,2014-01-06 06:00:00+00:00,Yellow,Wind,Becoming windy again on Sunday morning with st...,10.0,HSE Dublin and North East,1,1,Winter,1.0
6,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UpdateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1,1,Winter,1.0
7,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UndateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1,1,Winter,1.0
8,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",25.0,HSE Dublin and North East,0,1,Winter,2.0
9,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 12:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",24.0,HSE Dublin and North East,0,1,Winter,2.0


In [42]:
df_weather_full.query(
    "date == '2014-01-02'"
)

  df_weather_full.query(


Unnamed: 0,date,Issue Time,Valid From,Valid To,lag_valid_to,Warning Colour,Warning Element,Warning Text,Duration_hours,region,weather_flag,month,season,severity_num
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and North East,0,1,Winter,2.0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and North East,1,1,Winter,1.0
2733,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and Midlands,0,1,Winter,2.0
2734,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and Midlands,1,1,Winter,1.0
5466,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and South East,0,1,Winter,2.0
5467,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and South East,1,1,Winter,1.0
8199,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Mid West,1,1,Winter,2.0
8200,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Mid West,1,1,Winter,1.0
10932,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE South West,1,1,Winter,2.0
10933,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE South West,0,1,Winter,1.0


In [43]:
df_weather_final = (
    df_weather_full
    .groupby(['region', 'date', 'severity_num'], as_index=False)
    .agg({
        'weather_flag': 'max',  # any day with a warning => 1
        'Warning Element': lambda s: list(s.unique())
    })
)



In [44]:
df_weather_final.head(15)

Unnamed: 0,region,date,severity_num,weather_flag,Warning Element
0,HSE Dublin and Midlands,2014-01-02,1.0,1,[Wind]
1,HSE Dublin and Midlands,2014-01-02,2.0,0,[Wind]
2,HSE Dublin and Midlands,2014-01-03,1.0,1,[Wind]
3,HSE Dublin and Midlands,2014-01-04,1.0,1,[Snow/Ice]
4,HSE Dublin and Midlands,2014-01-05,1.0,1,[Wind]
5,HSE Dublin and Midlands,2014-01-05,2.0,0,[Wind]
6,HSE Dublin and Midlands,2014-01-06,1.0,1,[Wind]
7,HSE Dublin and Midlands,2014-01-06,2.0,1,[Wind]
8,HSE Dublin and Midlands,2014-01-12,1.0,1,"[Rain, Wind]"
9,HSE Dublin and Midlands,2014-01-16,2.0,1,[Fog (or freezing fog)]


In [45]:
merged = pd.merge(hip_agg, df_weather_final , on=['region', 'date'], how='left')



In [46]:
# Fill missing weather_flag values with 0 (indicating no adverse weather event)
merged['weather_flag'] = merged['weather_flag'].fillna(0)
merged['severity_num'] = merged['severity_num'].fillna(0)
merged['Warning Element'] = merged['Warning Element'].fillna('Unknown')



merged['hip_adv_event'] = ((merged['hip_fracture_count'] > 0) & (merged['weather_flag'] == 1)).astype(int)


In [47]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17725 entries, 0 to 17724
Data columns (total 7 columns):
 #   Column              Non-Null Count  Dtype         
---  ------              --------------  -----         
 0   region              17725 non-null  object        
 1   date                17725 non-null  datetime64[ns]
 2   season              17725 non-null  object        
 3   hip_fracture_count  17725 non-null  int64         
 4   severity_num        17725 non-null  float64       
 5   weather_flag        17725 non-null  float64       
dtypes: datetime64[ns](1), float64(2), int64(1), object(3)
memory usage: 969.5+ KB


In [48]:
merged.isna().sum()


region                0
date                  0
season                0
hip_fracture_count    0
severity_num          0
weather_flag          0
dtype: int64

In [49]:
print(merged['season'].value_counts())

season
Winter    4739
Autumn    4351
Spring    4347
Summer    4288
Name: count, dtype: int64


In [50]:
merged.head(10)

Unnamed: 0,region,date,season,hip_fracture_count,severity_num,weather_flag,Warning Element
0,HSE Dublin and Midlands,2014-01-02,Winter,1,1.0,1.0,[Wind]
1,HSE Dublin and Midlands,2014-01-02,Winter,1,2.0,0.0,[Wind]
2,HSE Dublin and Midlands,2014-01-04,Winter,5,1.0,1.0,[Snow/Ice]
3,HSE Dublin and Midlands,2014-01-06,Winter,1,1.0,1.0,[Wind]
4,HSE Dublin and Midlands,2014-01-06,Winter,1,2.0,1.0,[Wind]
5,HSE Dublin and Midlands,2014-01-07,Winter,1,0.0,0.0,Unknown
6,HSE Dublin and Midlands,2014-01-08,Winter,2,0.0,0.0,Unknown
7,HSE Dublin and Midlands,2014-01-09,Winter,2,0.0,0.0,Unknown
8,HSE Dublin and Midlands,2014-01-10,Winter,1,0.0,0.0,Unknown
9,HSE Dublin and Midlands,2014-01-11,Winter,2,0.0,0.0,Unknown


In [52]:
merged['year'] = merged['date'].dt.year

In [53]:
# Create a subset DataFrame containing only rows for the Mid West
df_mid_west = merged[merged['region'] == 'HSE Mid West']


df_mid_west.isna().any(axis=1).sum()


0

In [54]:
df_census.rename(columns={'HSE Regions': 'region'}, inplace=True)


In [55]:
df_census.rename(columns={'Year': 'year'}, inplace=True)

In [56]:
df_census.rename(columns={'Population (Linear Est)': 'population'}, inplace=True)

In [57]:
df_census['region'] = df_census['region'].replace('HSE Midwest', 'HSE Mid West')

In [58]:
df_with_pop = pd.merge(
    merged,     # your merged daily data (weather + hip fractures)
    df_census,     # population per region-year
    how='left', # keep all rows from merged even if no match found
    on=['region','year']
)

In [59]:
df_with_pop['log_population'] = np.log(df_with_pop['population'])

In [68]:
df_with_pop[df_with_pop['region'] == 'HSE Mid West'].head(15)


Unnamed: 0,region,date,season,hip_fracture_count,severity_num,weather_flag,Warning Element,year,population,log_population
9701,HSE Mid West,2014-01-03,Winter,2,1.0,1.0,[Wind],2014,152114,11.932386
9702,HSE Mid West,2014-01-05,Winter,3,1.0,1.0,[Wind],2014,152114,11.932386
9703,HSE Mid West,2014-01-05,Winter,3,2.0,1.0,[Wind],2014,152114,11.932386
9704,HSE Mid West,2014-01-06,Winter,2,1.0,1.0,[Wind],2014,152114,11.932386
9705,HSE Mid West,2014-01-06,Winter,2,2.0,1.0,[Wind],2014,152114,11.932386
9706,HSE Mid West,2014-01-08,Winter,2,0.0,0.0,Unknown,2014,152114,11.932386
9707,HSE Mid West,2014-01-09,Winter,1,0.0,0.0,Unknown,2014,152114,11.932386
9708,HSE Mid West,2014-01-11,Winter,1,0.0,0.0,Unknown,2014,152114,11.932386
9709,HSE Mid West,2014-01-12,Winter,1,1.0,1.0,"[Rain, Wind]",2014,152114,11.932386
9710,HSE Mid West,2014-01-13,Winter,1,0.0,0.0,Unknown,2014,152114,11.932386


In [69]:
df_with_pop[df_with_pop['region'] == 'HSE Dublin and Midlands'].head(15)

Unnamed: 0,region,date,season,hip_fracture_count,severity_num,weather_flag,Warning Element,year,population,log_population
0,HSE Dublin and Midlands,2014-01-02,Winter,1,1.0,1.0,[Wind],2014,317830,12.669272
1,HSE Dublin and Midlands,2014-01-02,Winter,1,2.0,0.0,[Wind],2014,317830,12.669272
2,HSE Dublin and Midlands,2014-01-04,Winter,5,1.0,1.0,[Snow/Ice],2014,317830,12.669272
3,HSE Dublin and Midlands,2014-01-06,Winter,1,1.0,1.0,[Wind],2014,317830,12.669272
4,HSE Dublin and Midlands,2014-01-06,Winter,1,2.0,1.0,[Wind],2014,317830,12.669272
5,HSE Dublin and Midlands,2014-01-07,Winter,1,0.0,0.0,Unknown,2014,317830,12.669272
6,HSE Dublin and Midlands,2014-01-08,Winter,2,0.0,0.0,Unknown,2014,317830,12.669272
7,HSE Dublin and Midlands,2014-01-09,Winter,2,0.0,0.0,Unknown,2014,317830,12.669272
8,HSE Dublin and Midlands,2014-01-10,Winter,1,0.0,0.0,Unknown,2014,317830,12.669272
9,HSE Dublin and Midlands,2014-01-11,Winter,2,0.0,0.0,Unknown,2014,317830,12.669272


In [60]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [61]:
print(merged['severity_num'].apply(type).value_counts())


severity_num
<class 'float'>    17725
Name: count, dtype: int64


In [62]:
import statsmodels.formula.api as smf

# Expanded formula with additional predictors and an interaction term.
formula = "hip_fracture_count ~ weather_flag * + C(season) +C(region)"

# Fit the Poisson regression model (without an offset for this example)
poisson_model = smf.glm(formula=formula, data=df_with_pop, family=sm.families.Poisson(),offset=df_with_pop['log_population']).fit()


# Print the summary of the model
print("Poisson Regression Results:")
print(poisson_model.summary())

# Optionally, check for overdispersion:
print("Residual Deviance:", poisson_model.deviance)
print("Degrees of Freedom:", poisson_model.df_resid)

Poisson Regression Results:
                 Generalized Linear Model Regression Results                  
Dep. Variable:     hip_fracture_count   No. Observations:                17725
Model:                            GLM   Df Residuals:                    17712
Model Family:                 Poisson   Df Model:                           12
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -27483.
Date:                Wed, 12 Feb 2025   Deviance:                       10092.
Time:                        07:47:13   Pearson chi2:                 1.10e+04
No. Iterations:                     4   Pseudo R-squ. (CS):            0.02426
Covariance Type:            nonrobust                                         
                                             coef    std err          z      P>|z|      [0.025      0.975]
---------------------------------------------------------------------------

In [70]:
import statsmodels.formula.api as smf

# Define the model formula
formula = "hip_fracture_count ~ weather_flag * + C(season) + C(region)"

# Fit the Negative Binomial model
nb_model = smf.glm(formula=formula, data=df_with_pop,
                   family=sm.families.NegativeBinomial()).fit()

# Print the summary of the model
print(nb_model.summary())


                 Generalized Linear Model Regression Results                  
Dep. Variable:     hip_fracture_count   No. Observations:                17725
Model:                            GLM   Df Residuals:                    17712
Model Family:        NegativeBinomial   Df Model:                           12
Link Function:                    Log   Scale:                          1.0000
Method:                          IRLS   Log-Likelihood:                -34379.
Date:                Wed, 12 Feb 2025   Deviance:                       3236.0
Time:                        10:47:52   Pearson chi2:                 3.57e+03
No. Iterations:                     5   Pseudo R-squ. (CS):            0.01384
Covariance Type:            nonrobust                                         
                                             coef    std err          z      P>|z|      [0.025      0.975]
-------------------------------------------------------------------------------------------------------

