In [1]:
# Import necessary libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import statsmodels.formula.api as smf

# For inline plots in Jupyter
%matplotlib inline
import pandas as pd
import matplotlib as plt
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv
import openpyxl

In [2]:
data_directory_weather = "/home/paulharford/college/project/project_data/processed/WEATHERED_warnings_2014-2023_cleaned.csv"
data_directory_nas = "/home/paulharford/college/project/project_data/nas/nas_clean.csv"
data_directory_census = "/home/paulharford/college/project/project_data/processed/census_estimated_per_region_2014_2023.csv"
full_path_nas = os.path.abspath(data_directory_nas)
full_path_weather = os.path.abspath(data_directory_weather)
full_path_census = os.path.abspath(data_directory_census)

In [3]:
df_nas = pd.read_csv(full_path_nas)
df_weather = pd.read_csv(full_path_weather)
df_census = pd.read_csv(full_path_census)

In [4]:
df_weather["Valid From"] = pd.to_datetime(df_weather["Valid From"])
df_weather["date"] = df_weather["Valid From"].dt.date
df_weather["date"] = pd.to_datetime(df_weather["date"])
df_weather["Valid To"] = pd.to_datetime(df_weather["Valid To"])


In [5]:
#add column for lag
df_weather['lag_valid_to'] = df_weather['Valid To'] + pd.Timedelta(hours=12)

In [6]:
weather_region_cols = [
    "HSE Dublin and North East",
    "HSE Dublin and Midlands",
    "HSE Dublin and South East",
    "HSE Mid West",
    "HSE South West",
    "HSE West and North West",
    # any others you have
]

df_weather_full = df_weather.melt(
    id_vars=["date", "Issue Time", "Valid From", "Valid To", "lag_valid_to", "Warning Colour", 
             "Warning Element", "Warning Text", "Duration_hours"],
    value_vars=weather_region_cols,
    var_name="region",
    value_name="weather_flag"
)


In [7]:
df_weather_full.head(10)

Unnamed: 0,date,Issue Time,Valid From,Valid To,lag_valid_to,Warning Colour,Warning Element,Warning Text,Duration_hours,region,weather_flag
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and North East,0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and North East,1
2,2014-01-03,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,2014-01-04 07:00:00+00:00,Yellow,Wind,Blustery for the rest of the afternoon with so...,5.0,HSE Dublin and North East,1
3,2014-01-04,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,2014-01-05 02:00:00+00:00,Yellow,Snow/Ice,Scattered outbreaks of rain and sleet spreadin...,11.0,HSE Dublin and North East,1
4,2014-01-05,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,2014-01-06 08:00:00+00:00,Yellow,Wind,Southeast winds mean speeds of 50 to 65 km/h w...,12.0,HSE Dublin and North East,1
5,2014-01-05,2014-01-04 16:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 18:00:00+00:00,2014-01-06 06:00:00+00:00,Yellow,Wind,Becoming windy again on Sunday morning with st...,10.0,HSE Dublin and North East,1
6,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UpdateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1
7,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UndateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1
8,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",25.0,HSE Dublin and North East,0
9,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 12:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",24.0,HSE Dublin and North East,0


In [8]:
fill_values = {
    'Issue Time': 0,
    'Valid From': 0,
    'Valid To': 0,
    'lag_valid_to': 0,
    'Warning Colour': 0,
    'Warning Element': 0,
    # For text fields you want a specific string, e.g. 'noevent':
    'Warning Text': 'noevent',
    'Duration_hours': 0,
    'region': 'noevent',
    'weather_flag': 2.0
}

# Fill NaN values for these columns using fillna().
df_weather_full.fillna(value=fill_values, inplace=True)



In [9]:
df_weather_full.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16398 entries, 0 to 16397
Data columns (total 11 columns):
 #   Column           Non-Null Count  Dtype              
---  ------           --------------  -----              
 0   date             16398 non-null  datetime64[ns]     
 1   Issue Time       16398 non-null  object             
 2   Valid From       16398 non-null  datetime64[ns, UTC]
 3   Valid To         16398 non-null  object             
 4   lag_valid_to     16398 non-null  object             
 8   Duration_hours   16398 non-null  float64            
 9   region           16398 non-null  object             
 10  weather_flag     16398 non-null  int64              
dtypes: datetime64[ns, UTC](1), datetime64[ns](1), float64(1), int64(1), object(7)
memory usage: 1.4+ MB


In [10]:
df_weather_full.head(15)

Unnamed: 0,date,Issue Time,Valid From,Valid To,lag_valid_to,Warning Colour,Warning Element,Warning Text,Duration_hours,region,weather_flag
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and North East,0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and North East,1
2,2014-01-03,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,2014-01-04 07:00:00+00:00,Yellow,Wind,Blustery for the rest of the afternoon with so...,5.0,HSE Dublin and North East,1
3,2014-01-04,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,2014-01-05 02:00:00+00:00,Yellow,Snow/Ice,Scattered outbreaks of rain and sleet spreadin...,11.0,HSE Dublin and North East,1
4,2014-01-05,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,2014-01-06 08:00:00+00:00,Yellow,Wind,Southeast winds mean speeds of 50 to 65 km/h w...,12.0,HSE Dublin and North East,1
5,2014-01-05,2014-01-04 16:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 18:00:00+00:00,2014-01-06 06:00:00+00:00,Yellow,Wind,Becoming windy again on Sunday morning with st...,10.0,HSE Dublin and North East,1
6,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UpdateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1
7,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UndateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1
8,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",25.0,HSE Dublin and North East,0
9,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 12:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",24.0,HSE Dublin and North East,0


In [11]:
df_weather_filtered = df_weather_full[
    (df_weather_full['date'] >= '2016-01-01') & 
    (df_weather_full['date'] <= '2023-12-31').copy()
]

## NAS Data

In [12]:
# convert to a pandas datetime if you prefer consistency
df_nas["date"] = pd.to_datetime(df_nas["date"])

In [13]:
df_nas.head()

Unnamed: 0,date,Time of Day,PickupTown,DespatchCode,DespCodeDescription,Hospital Attended,region
0,2016-01-01,Night,DUNGLOE,17B01P,Falls - Possibly Dangerous body Area - Public ...,LETTERKENNY GENERAL HOSPITAL,HSE West and North West
1,2016-01-01,Night,BEAUMONT DUBLIN 9,17B01G,Falls - Possibly Dangerous body Area - On the ...,BEAUMONT HOSPITAL,HSE Dublin and North East
2,2016-01-01,Night,DROGHEDA,17B01,Falls - Possibly Dangerous body Area -,LOURDES HOSPITAL DROGHEDA,HSE Dublin and North East
3,2016-01-01,Night,BORRISOKANE,17B01P,Falls - Possibly Dangerous body Area - Public ...,PORTIUNCULA GENERAL HOSPITAL,HSE West and North West
4,2016-01-01,Night,WEXFORD,17B01G,Falls - Possibly Dangerous body Area - On the ...,Unknown,Unknown


In [14]:
df_nas_filtered = df_nas[
    (df_nas['date'] >= '2016-01-01') & 
    (df_nas['date'] <= '2023-12-31').copy()
]

In [15]:
df_nas_season = df_nas_filtered.copy()

In [16]:
def month_to_season(month):
    # Convert month to an integer if it isn't already.
    month = int(month)
    if month in [12, 1, 2]:
        return 'Winter'
    elif month in [3, 4, 5]:
        return 'Spring'
    elif month in [6, 7, 8]:
        return 'Summer'
    elif month in [9, 10, 11]:
        return 'Autumn'
    else:
        return 'unknown'




In [17]:
df_nas_season['month'] = df_nas_season['date'].dt.month

In [18]:
df_weather_full['month'] = df_weather_full['date'].dt.month

In [19]:
df_weather_full['season'] = df_weather_full['month'].apply(month_to_season)

In [20]:
df_nas_season['season'] = df_nas_season['month'].apply(month_to_season)

In [21]:
print(df_nas_season['season'].unique())


['Winter' 'Spring' 'Summer' 'Autumn']


In [22]:
print(df_weather_full['season'].unique())

['Winter' 'Spring' 'Summer' 'Autumn']


In [23]:
print(df_nas_season['season'].value_counts())


season
Summer    39667
Autumn    39217
Winter    39196
Spring    37126
Name: count, dtype: int64


In [25]:
nan_counts = df_nas_season.isna().sum()
print(nan_counts)

date                   0
Time of Day            0
PickupTown             0
DespatchCode           0
DespCodeDescription    0
Hospital Attended      0
region                 0
month                  0
season                 0
dtype: int64


## merging

In [26]:
nas_agg = (
    df_nas_season
    .groupby(['region', 'date','season'])
    .agg(hip_fracture_callout=('date', 'count'))
    .reset_index()
)


In [27]:
nas_agg.head()


Unnamed: 0,region,date,season,hip_fracture_callout
0,HSE Dublin and Midlands,2016-01-01,Winter,8
1,HSE Dublin and Midlands,2016-01-02,Winter,5
2,HSE Dublin and Midlands,2016-01-03,Winter,2
3,HSE Dublin and Midlands,2016-01-04,Winter,5
4,HSE Dublin and Midlands,2016-01-05,Winter,2


In [28]:
##add full date calandar to the aggregation 
# 1a. Identify all unique regions
regions = nas_agg['region'].unique()

# 1b. Identify the full date range
min_date = nas_agg['date'].min()
max_date = nas_agg['date'].max()

In [29]:
all_dates = pd.date_range(start=min_date, end=max_date, freq='D')

multi_index = pd.MultiIndex.from_product([regions, all_dates], names=['region', 'date'])
df_region_date = pd.DataFrame(index=multi_index).reset_index()

In [30]:
df_region_date.head(15)

Unnamed: 0,region,date
0,HSE Dublin and Midlands,2016-01-01
1,HSE Dublin and Midlands,2016-01-02
2,HSE Dublin and Midlands,2016-01-03
3,HSE Dublin and Midlands,2016-01-04
4,HSE Dublin and Midlands,2016-01-05
5,HSE Dublin and Midlands,2016-01-06
6,HSE Dublin and Midlands,2016-01-07
7,HSE Dublin and Midlands,2016-01-08
8,HSE Dublin and Midlands,2016-01-09
9,HSE Dublin and Midlands,2016-01-10


In [31]:
df_nas_final = pd.merge(
    df_region_date,          # all region-date combos
    nas_agg,               # your aggregated counts
    on=['region', 'date'],   # merge keys
    how='left'               # left-join so we keep all rows from df_region_date
)

In [32]:
df_nas_final['month'] = df_nas_final['date'].dt.month

In [33]:
df_nas_final.head(10)

Unnamed: 0,region,date,season,hip_fracture_callout,month
0,HSE Dublin and Midlands,2016-01-01,Winter,8.0,1
1,HSE Dublin and Midlands,2016-01-02,Winter,5.0,1
2,HSE Dublin and Midlands,2016-01-03,Winter,2.0,1
3,HSE Dublin and Midlands,2016-01-04,Winter,5.0,1
4,HSE Dublin and Midlands,2016-01-05,Winter,2.0,1
5,HSE Dublin and Midlands,2016-01-06,Winter,6.0,1
6,HSE Dublin and Midlands,2016-01-07,Winter,4.0,1
7,HSE Dublin and Midlands,2016-01-08,Winter,8.0,1
8,HSE Dublin and Midlands,2016-01-09,Winter,4.0,1
9,HSE Dublin and Midlands,2016-01-10,Winter,3.0,1


In [34]:
df_nas_final['year'] = df_nas_final['date'].dt.year

In [35]:
severity_map = {
    'Yellow': 1,
    'Orange': 2,
    'Red': 3
}

df_weather_full['severity_num'] = df_weather_full['Warning Colour'].map(severity_map)


In [36]:
df_weather_full['severity_num'] = df_weather_full['severity_num'].fillna(0)

In [37]:
df_weather_full.head(15)

Unnamed: 0,date,Issue Time,Valid From,Valid To,lag_valid_to,Warning Colour,Warning Element,Warning Text,Duration_hours,region,weather_flag,month,season,severity_num
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and North East,0,1,Winter,2.0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and North East,1,1,Winter,1.0
2,2014-01-03,2014-01-02 09:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-03 19:00:00+00:00,2014-01-04 07:00:00+00:00,Yellow,Wind,Blustery for the rest of the afternoon with so...,5.0,HSE Dublin and North East,1,1,Winter,1.0
3,2014-01-04,2014-01-04 03:00:00+00:00,2014-01-04 03:00:00+00:00,2014-01-04 14:00:00+00:00,2014-01-05 02:00:00+00:00,Yellow,Snow/Ice,Scattered outbreaks of rain and sleet spreadin...,11.0,HSE Dublin and North East,1,1,Winter,1.0
4,2014-01-05,2014-01-04 10:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 20:00:00+00:00,2014-01-06 08:00:00+00:00,Yellow,Wind,Southeast winds mean speeds of 50 to 65 km/h w...,12.0,HSE Dublin and North East,1,1,Winter,1.0
5,2014-01-05,2014-01-04 16:00:00+00:00,2014-01-05 08:00:00+00:00,2014-01-05 18:00:00+00:00,2014-01-06 06:00:00+00:00,Yellow,Wind,Becoming windy again on Sunday morning with st...,10.0,HSE Dublin and North East,1,1,Winter,1.0
6,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UpdateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1,1,Winter,1.0
7,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Yellow,Wind,"UndateStrong south to southeast winds, with gu...",25.0,HSE Dublin and North East,1,1,Winter,1.0
8,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 11:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",25.0,HSE Dublin and North East,0,1,Winter,2.0
9,2014-01-05,2014-01-05 11:00:00+00:00,2014-01-05 12:00:00+00:00,2014-01-06 12:00:00+00:00,2014-01-07 00:00:00+00:00,Orange,Wind,"Update Strong south to southeast winds, with g...",24.0,HSE Dublin and North East,0,1,Winter,2.0


In [38]:
df_weather_full.query(
    "date == '2014-01-02'"
)

  df_weather_full.query(


Unnamed: 0,date,Issue Time,Valid From,Valid To,lag_valid_to,Warning Colour,Warning Element,Warning Text,Duration_hours,region,weather_flag,month,season,severity_num
0,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and North East,0,1,Winter,2.0
1,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and North East,1,1,Winter,1.0
2733,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and Midlands,0,1,Winter,2.0
2734,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and Midlands,1,1,Winter,1.0
5466,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Dublin and South East,0,1,Winter,2.0
5467,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Dublin and South East,1,1,Winter,1.0
8199,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE Mid West,1,1,Winter,2.0
8200,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE Mid West,1,1,Winter,1.0
10932,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Orange,Wind,Becoming stormy this evening and tonight and c...,21.0,HSE South West,1,1,Winter,2.0
10933,2014-01-02,2014-01-02 09:00:00+00:00,2014-01-02 17:00:00+00:00,2014-01-03 14:00:00+00:00,2014-01-04 02:00:00+00:00,Yellow,Wind,Becoming extremely windy or stormy this evenin...,21.0,HSE South West,0,1,Winter,1.0


In [39]:
df_weather_final = (
    df_weather_full
    .groupby(['region', 'date', 'severity_num'], as_index=False)
    .agg({
        'weather_flag': 'max',  # any day with a warning => 1
        'Warning Element': lambda s: list(s.unique())
    })
)



In [40]:
df_weather_final.head(15)

Unnamed: 0,region,date,severity_num,weather_flag,Warning Element
0,HSE Dublin and Midlands,2014-01-02,1.0,1,[Wind]
1,HSE Dublin and Midlands,2014-01-02,2.0,0,[Wind]
2,HSE Dublin and Midlands,2014-01-03,1.0,1,[Wind]
3,HSE Dublin and Midlands,2014-01-04,1.0,1,[Snow/Ice]
4,HSE Dublin and Midlands,2014-01-05,1.0,1,[Wind]
5,HSE Dublin and Midlands,2014-01-05,2.0,0,[Wind]
6,HSE Dublin and Midlands,2014-01-06,1.0,1,[Wind]
7,HSE Dublin and Midlands,2014-01-06,2.0,1,[Wind]
8,HSE Dublin and Midlands,2014-01-12,1.0,1,"[Rain, Wind]"
9,HSE Dublin and Midlands,2014-01-16,2.0,1,[Fog (or freezing fog)]


In [41]:
merged = pd.merge(df_nas_final, df_weather_final , on=['region', 'date'], how='left')



In [42]:
# Fill missing weather_flag values with 0 (indicating no adverse weather event)
merged['weather_flag'] = merged['weather_flag'].fillna(0)
merged['severity_num'] = merged['severity_num'].fillna(0)
merged['Warning Element'] = merged['Warning Element'].fillna('Unknown')



merged['hip_adv_event'] = ((merged['hip_fracture_count'] > 0) & (merged['weather_flag'] == 1)).astype(int)


In [43]:
merged.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24051 entries, 0 to 24050
Data columns (total 9 columns):
 #   Column                Non-Null Count  Dtype         
---  ------                --------------  -----         
 0   region                24051 non-null  object        
 1   date                  24051 non-null  datetime64[ns]
 2   season                21076 non-null  object        
 3   hip_fracture_callout  21076 non-null  float64       
 4   month                 24051 non-null  int32         
 5   year                  24051 non-null  int32         
 6   severity_num          24051 non-null  float64       
 7   weather_flag          24051 non-null  float64       
dtypes: datetime64[ns](1), float64(3), int32(2), object(3)
memory usage: 1.5+ MB


In [44]:
merged.isna().sum()


region                     0
date                       0
season                  2975
hip_fracture_callout    2975
month                      0
year                       0
severity_num               0
weather_flag               0
dtype: int64

In [45]:
merged['hip_fracture_callout'] = merged['hip_fracture_callout'].fillna(0)

In [46]:
merged['season'] = merged['month'].apply(month_to_season)

In [47]:
print(merged['season'].value_counts())

season
Winter    6076
Autumn    5994
Spring    5993
Summer    5988
Name: count, dtype: int64


In [48]:
merged.head(10)

Unnamed: 0,region,date,season,hip_fracture_callout,month,year,severity_num,weather_flag,Warning Element
0,HSE Dublin and Midlands,2016-01-01,Winter,8.0,1,2016,1.0,1.0,[Rain]
1,HSE Dublin and Midlands,2016-01-02,Winter,5.0,1,2016,0.0,0.0,Unknown
2,HSE Dublin and Midlands,2016-01-03,Winter,2.0,1,2016,1.0,0.0,[Rain]
3,HSE Dublin and Midlands,2016-01-04,Winter,5.0,1,2016,0.0,0.0,Unknown
4,HSE Dublin and Midlands,2016-01-05,Winter,2.0,1,2016,0.0,0.0,Unknown
5,HSE Dublin and Midlands,2016-01-06,Winter,6.0,1,2016,1.0,1.0,[Rain]
6,HSE Dublin and Midlands,2016-01-07,Winter,4.0,1,2016,0.0,0.0,Unknown
7,HSE Dublin and Midlands,2016-01-08,Winter,8.0,1,2016,0.0,0.0,Unknown
8,HSE Dublin and Midlands,2016-01-09,Winter,4.0,1,2016,0.0,0.0,Unknown
9,HSE Dublin and Midlands,2016-01-10,Winter,3.0,1,2016,0.0,0.0,Unknown


In [49]:
merged['year'] = merged['date'].dt.year

In [69]:
unique_regions_census = df_census['region'].unique()
print("Unique regions in df_census:", unique_regions_census)

Unique regions in df_census: ['HSE Dublin and Midlands' 'HSE Dublin and North East'
 'HSE Dublin and South East' 'HSE Midwest' 'HSE South West'
 'HSE West and North West']


In [70]:
unique_regions_census = merged['region'].unique()
print("Unique regions in df_census:", unique_regions_census)

Unique regions in df_census: ['HSE Dublin and Midlands' 'HSE Dublin and North East'
 'HSE Dublin and South East' 'HSE Mid West' 'HSE South East'
 'HSE West and North West' 'Hospital Unknown' 'Unknown']


In [71]:
##naming corrections 
df_census['region'] = df_census['region'].replace({'HSE Midwest': 'HSE Mid West'})

In [72]:
merged['region'] = merged['region'].replace({'HSE South East': 'HSE South West'})

In [73]:
df_with_pop = pd.merge(
    merged,     # your merged daily data (weather + hip fractures)
    df_census,     # population per region-year
    how='left', # keep all rows from merged even if no match found
    on=['region','year']
)

In [74]:
df_with_pop.head(15)

Unnamed: 0,region,date,season,hip_fracture_callout,month,year,severity_num,weather_flag,Warning Element,population
0,HSE Dublin and Midlands,2016-01-01,Winter,8.0,1,2016,1.0,1.0,[Rain],327547.0
1,HSE Dublin and Midlands,2016-01-02,Winter,5.0,1,2016,0.0,0.0,Unknown,327547.0
2,HSE Dublin and Midlands,2016-01-03,Winter,2.0,1,2016,1.0,0.0,[Rain],327547.0
3,HSE Dublin and Midlands,2016-01-04,Winter,5.0,1,2016,0.0,0.0,Unknown,327547.0
4,HSE Dublin and Midlands,2016-01-05,Winter,2.0,1,2016,0.0,0.0,Unknown,327547.0
5,HSE Dublin and Midlands,2016-01-06,Winter,6.0,1,2016,1.0,1.0,[Rain],327547.0
6,HSE Dublin and Midlands,2016-01-07,Winter,4.0,1,2016,0.0,0.0,Unknown,327547.0
7,HSE Dublin and Midlands,2016-01-08,Winter,8.0,1,2016,0.0,0.0,Unknown,327547.0
8,HSE Dublin and Midlands,2016-01-09,Winter,4.0,1,2016,0.0,0.0,Unknown,327547.0
9,HSE Dublin and Midlands,2016-01-10,Winter,3.0,1,2016,0.0,0.0,Unknown,327547.0


In [75]:
# Create a dictionary from df_census that maps (region, year) to population
pop_dict = df_census.set_index(['region', 'year'])['population'].to_dict()

def update_population(row):
    # Build a key from the current row
    key = (row['region'], row['year'])
    # If population is missing and the key exists in pop_dict, update it; otherwise, return current value.
    return pop_dict.get(key, row['population'])





In [76]:
df_census['year'] = df_census['year'].astype(int)
df_with_pop['year'] = df_with_pop['year'].astype(int)
df_census['region'] = df_census['region'].astype(str)
df_with_pop['region'] = df_with_pop['region'].astype(str)

In [77]:
df_with_pop['population'] = df_with_pop.apply(update_population, axis=1)

In [79]:
# Count entries with either 'Hospital Unknown' or 'Unknown'
count = df_with_pop[df_with_pop['region'].isin(['Hospital Unknown', 'Unknown'])].shape[0]
print("Count of entries with 'Hospital Unknown' or 'Unknown':", count)



Count of entries with 'Hospital Unknown' or 'Unknown': 5844


In [82]:
# Filter out rows where region is 'Hospital Unknown' or 'Unknown'
df_with_pop = df_with_pop[~df_with_pop['region'].isin(['Hospital Unknown', 'Unknown'])]

# Verify by checking unique values
print(df_with_pop['region'].unique())


['HSE Dublin and Midlands' 'HSE Dublin and North East'
 'HSE Dublin and South East' 'HSE Mid West' 'HSE South West'
 'HSE West and North West']


In [83]:
nan_rows = df_with_pop[df_with_pop['log_population'].isna()]
print(nan_rows.head())

Empty DataFrame
Index: []


In [84]:
df_with_pop['log_population'] = np.log(df_with_pop['population'])

In [85]:
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf


In [86]:
print(merged['severity_num'].apply(type).value_counts())


severity_num
<class 'float'>    24051
Name: count, dtype: int64


In [87]:
import statsmodels.formula.api as smf

# Expanded formula with additional predictors and an interaction term.
formula = "hip_fracture_callout ~ weather_flag * +C(region)"

# Fit the Poisson regression model (without an offset for this example)
poisson_model = smf.glm(formula=formula, data=df_with_pop, family=sm.families.Poisson(),offset=df_with_pop['log_population']).fit()


# Print the summary of the model
print("Poisson Regression Results:")
print(poisson_model.summary())

# Optionally, check for overdispersion:
print("Residual Deviance:", poisson_model.deviance)
print("Degrees of Freedom:", poisson_model.df_resid)

Poisson Regression Results:
                  Generalized Linear Model Regression Results                   
Dep. Variable:     hip_fracture_callout   No. Observations:                18207
Model:                              GLM   Df Residuals:                    18196
Model Family:                   Poisson   Df Model:                           10
Link Function:                      Log   Scale:                          1.0000
Method:                            IRLS   Log-Likelihood:                -46423.
Date:                  Sun, 16 Feb 2025   Deviance:                       25594.
Time:                          18:18:04   Pearson chi2:                 2.50e+04
No. Iterations:                       5   Pseudo R-squ. (CS):             0.3680
Covariance Type:              nonrobust                                         
                                                          coef    std err          z      P>|z|      [0.025      0.975]
------------------------------------------

In [88]:
import statsmodels.formula.api as smf

# Define the model formula
formula = "hip_fracture_callout ~ weather_flag * + C(season) + C(region)"

# Fit the Negative Binomial model
nb_model = smf.glm(formula=formula, data=df_with_pop,
                   family=sm.families.NegativeBinomial()).fit()

# Print the summary of the model
print(nb_model.summary())


                  Generalized Linear Model Regression Results                   
Dep. Variable:     hip_fracture_callout   No. Observations:                18207
Model:                              GLM   Df Residuals:                    18194
Model Family:          NegativeBinomial   Df Model:                           12
Link Function:                      Log   Scale:                          1.0000
Method:                            IRLS   Log-Likelihood:                -55093.
Date:                  Sun, 16 Feb 2025   Deviance:                       3905.6
Time:                          18:18:07   Pearson chi2:                 3.34e+03
No. Iterations:                       5   Pseudo R-squ. (CS):            0.08971
Covariance Type:              nonrobust                                         
                                             coef    std err          z      P>|z|      [0.025      0.975]
-----------------------------------------------------------------------------------



In [92]:
import statsmodels.api as sm
from statsmodels.discrete.discrete_model import Poisson
from patsy import dmatrices

# Define your formula
formula = "hip_fracture_callout ~ weather_flag * + C(season) + C(region)"

# Create design matrices using patsy
y, X = dmatrices(formula, data=df_with_pop, return_type='dataframe')

# Extract the offset (for example, log_population)
offset = df_with_pop['log_population']

# Fit the discrete Poisson model
poisson_discrete = Poisson(y, X, offset=offset)
result = poisson_discrete.fit()

# Print the summary of the results
print(result.summary())



Optimization terminated successfully.
         Current function value: 2.545209
         Iterations 5
                           Poisson Regression Results                           
Dep. Variable:     hip_fracture_callout   No. Observations:                18207
Model:                          Poisson   Df Residuals:                    18194
Method:                             MLE   Df Model:                           12
Date:                  Sun, 16 Feb 2025   Pseudo R-squ.:                 0.08419
Time:                          18:22:57   Log-Likelihood:                -46341.
converged:                         True   LL-Null:                       -50601.
Covariance Type:              nonrobust   LLR p-value:                     0.000
                                             coef    std err          z      P>|z|      [0.025      0.975]
----------------------------------------------------------------------------------------------------------
Intercept                           

In [64]:
#print(df_with_pop['log_population'].describe())
print(df_with_pop['log_population'].isna().sum())


11823
