In [1]:
##Set styling for plotting
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##SETTING PARAMS FOR MATPLOTLIB FIGURES
plt.rcParams.update({"figure.figsize": (6, 6),
                 "axes.facecolor": "white",
                 "axes.edgecolor": "black"})
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##set font size
font = {'family': 'sans-serif',
       'weight': 'normal',
       'size': 14}
plt.rc('font', **font)
# ##PANDAS PLOTTING
pd.plotting.register_matplotlib_converters()

##set file path outside git=hub repo (data must not be stpored in github)
file_path='/mnt/hgfs/shared/ihfdv2.xlsx'

##Import modules
import numpy as np
!pip install openpyxl
%matplotlib inline
##save juypter env file
!conda env export > ihfd_environment2.yml



In [2]:
## Load the Excel workbook at `file_path` into a pandas DataFrame,
## using openpyxl as the parsing engine.
ihfd_df = pd.read_excel(io=file_path, engine='openpyxl')

In [3]:
### The presentation date-time is needed to join with the weather data.
### Missing Adm_First_Pres_Hosp_DateTime values are populated with a random time
### 12 to 48 hours before Adm_Primary_Surgery_DateTime.
IMPUTATION_SEED = 42  # fixed seed so a fresh "Restart & Run All" imputes the same times
_imputation_rng = np.random.default_rng(IMPUTATION_SEED)

# One uniform offset (in hours) per row, drawn in a single vectorised call
# instead of a Python-level lambda per row.
_offset_hours = _imputation_rng.uniform(12, 48, size=len(ihfd_df))
_offsets = pd.Series(pd.to_timedelta(_offset_hours, unit='h'), index=ihfd_df.index)

# Rows without a surgery time produce NaT here, so their gap stays unfilled.
_estimated_presentation = pd.to_datetime(ihfd_df['Adm_Primary_Surgery_DateTime']) - _offsets

# Only missing presentation times are replaced; recorded values are kept as-is.
ihfd_df['Adm_First_Pres_Hosp_DateTime'] = (
    ihfd_df['Adm_First_Pres_Hosp_DateTime'].fillna(_estimated_presentation)
)

In [4]:
# Columns excluded from the working frame
columns_to_drop = [
    'Adm_RE_OP_30_DAYS',
    'Adm_Discharged_To',
    'Adm_Nerve_Block',
    'Adm_Nut_Risk',
    'Adm_Ass_Anp',
    'Adm_AMB_Number_ACU_DIS',
    'Adm_Trauma_DateTime',
    'Adm_Hospital_Fall',
    'Adm_Mobilised_No_Opt',
    'Adm_FRACTURE_Type_Other2',
    'Adm_Primary_Surgery_DateTime',
    'Index'
]

# Drop the columns into a new working frame
ihfd_df_2= ihfd_df.drop(columns=columns_to_drop)

# Separate numeric and categorical columns
numeric_cols = ihfd_df_2.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = ihfd_df_2.select_dtypes(include=['object', 'category']).columns

# Fill numeric columns with the column median.
# NOTE(review): several of these numeric columns look like category codes; a
# median can land between two codes (e.g. 1.5) — confirm this is acceptable.
ihfd_df_2[numeric_cols] = ihfd_df_2[numeric_cols].fillna(ihfd_df_2[numeric_cols].median())

# Fill categorical columns with mode (most frequent value)
for col in categorical_cols:
    col_modes = ihfd_df_2[col].mode()
    if col_modes.empty:
        # An entirely missing column has no mode; leave it untouched rather
        # than raising IndexError on `.iloc[0]`.
        continue
    ihfd_df_2[col] = ihfd_df_2[col].fillna(col_modes.iloc[0])

print(numeric_cols)
print(categorical_cols)

Index(['LOS', 'Adm_Trauma_TYPE', 'Adm_Ward_Type', 'Adm_Pre_Frac_Indoor',
       'Adm_PRE_Frac_Outdoor', 'Adm_Pre_Frac_Shop', 'Adm_Pre_Frac_Number',
       'Adm_Fracture_Type', 'Adm_Pathological', 'Adm_Fragility',
       'Adm_Pre_OP_Med_Assess', 'Adm_Ger_Acute_Assess', 'Adm_Operation',
       'Adm_Asa_Grade', 'Adm_Anaesthesia', 'Adm_Surgery_Delay_Reason',
       'Adm_Mobilised', 'Adm_Pressure_Ulcers', 'Adm_Spec_Falls_Assess',
       'Adm_Bone_Protect_Med', 'Adm_Multi_Rehab_Assess'],
      dtype='object')
Index(['New Health Regions', 'NOCA_TraumaPeriodDay', 'NOCA_FirstPresPeriodDay',
       'NOCA_AgeRange'],
      dtype='object')


In [6]:
# Preview the first 10 rows of the working frame after dropping and filling.
# NOTE(review): the rendered output shows record-level rows — confirm it is
# safe to keep in a notebook that is shared or committed.
ihfd_df_2.head(10)

Unnamed: 0,New Health Regions,NOCA_TraumaPeriodDay,Adm_First_Pres_Hosp_DateTime,NOCA_FirstPresPeriodDay,NOCA_AgeRange,LOS,Adm_Trauma_TYPE,Adm_Ward_Type,Adm_Pre_Frac_Indoor,Adm_PRE_Frac_Outdoor,...,Adm_Ger_Acute_Assess,Adm_Operation,Adm_Asa_Grade,Adm_Anaesthesia,Adm_Surgery_Delay_Reason,Adm_Mobilised,Adm_Pressure_Ulcers,Adm_Spec_Falls_Assess,Adm_Bone_Protect_Med,Adm_Multi_Rehab_Assess
0,HSE Dublin and South East,AM,2015-09-08,PM,75-79,16,1.0,1.0,2.0,2.0,...,2.0,10.0,2.0,5.0,0.0,1.0,2.0,0.0,0.0,1.0
1,HSE Mid West,AM,2015-10-24,AM,80-84,60,2.0,1.0,2.0,2.0,...,1.0,1.0,4.0,5.0,2.0,1.0,2.0,1.0,1.0,1.0
2,HSE Dublin and South East,AM,2015-03-31,PM,95+,49,2.0,1.0,2.0,2.0,...,1.0,3.0,3.0,3.0,0.0,1.0,2.0,1.0,5.0,1.0
3,HSE Dublin and South East,AM,2015-01-08,AM,75-79,12,2.0,1.0,2.0,2.0,...,1.0,10.0,2.0,5.0,2.0,2.0,2.0,1.0,1.0,1.0
4,HSE South West,AM,2015-01-10,AM,65-69,12,2.0,1.0,2.0,2.0,...,1.0,5.0,2.0,1.0,0.0,1.0,2.0,1.0,0.0,1.0
5,HSE Mid West,AM,2015-01-27,AM,90-94,15,2.0,1.0,2.0,2.0,...,1.0,7.0,2.0,1.0,0.0,1.0,2.0,0.0,4.0,1.0
6,HSE Dublin and South East,AM,2015-04-19,PM,75-79,31,2.0,2.0,2.0,2.0,...,1.0,0.0,3.0,5.0,0.0,1.0,2.0,1.0,5.0,1.0
7,HSE Dublin and South East,AM,2015-04-24,PM,80-84,16,2.0,1.0,2.0,2.0,...,2.0,10.0,3.0,3.0,0.0,2.0,2.0,0.0,0.0,1.0
8,HSE South West,AM,2015-11-21,AM,80-84,19,2.0,1.0,2.0,2.0,...,1.0,7.0,3.0,5.0,0.0,1.0,2.0,1.0,1.0,1.0
9,HSE Mid West,AM,2015-08-18,AM,65-69,2,2.0,1.0,2.0,2.0,...,2.0,13.0,2.0,5.0,0.0,1.0,2.0,0.0,2.0,1.0


In [5]:
## Percentage of missing values per column, sorted ascending
_missing_counts = ihfd_df_2.isna().sum()
_row_count = len(ihfd_df_2)
_missing_counts.div(_row_count).mul(100).sort_values()

New Health Regions              0.00000
Adm_Spec_Falls_Assess           0.00000
Adm_Pressure_Ulcers             0.00000
Adm_Mobilised                   0.00000
Adm_Surgery_Delay_Reason        0.00000
Adm_Anaesthesia                 0.00000
Adm_Asa_Grade                   0.00000
Adm_Operation                   0.00000
Adm_Ger_Acute_Assess            0.00000
Adm_Pre_OP_Med_Assess           0.00000
Adm_Fragility                   0.00000
Adm_Pathological                0.00000
Adm_Fracture_Type               0.00000
Adm_Pre_Frac_Number             0.00000
Adm_Pre_Frac_Shop               0.00000
Adm_PRE_Frac_Outdoor            0.00000
Adm_Pre_Frac_Indoor             0.00000
Adm_Ward_Type                   0.00000
Adm_Trauma_TYPE                 0.00000
LOS                             0.00000
NOCA_AgeRange                   0.00000
NOCA_FirstPresPeriodDay         0.00000
NOCA_TraumaPeriodDay            0.00000
Adm_Bone_Protect_Med            0.00000
Adm_Multi_Rehab_Assess          0.00000


In [15]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import manifold 
from sklearn import cluster  
from sklearn.preprocessing import OneHotEncoder, MinMaxScaler, StandardScaler, RobustScaler, FunctionTransformer
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn import set_config
import seaborn as sns
#from feature_engine import CyclicalFeatures
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import OrdinalEncoder, QuantileTransformer, LabelEncoder
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer

In [16]:
# Ensure datetime format.
# The column is read from ihfd_df_2 itself (not the parent frame ihfd_df) so the
# cell does not depend on the two frames keeping identical row indexes.
ihfd_df_2['Adm_First_Pres_Hosp_DateTime'] = pd.to_datetime(ihfd_df_2['Adm_First_Pres_Hosp_DateTime'])

# Extract calendar components of the first-presentation timestamp
_first_pres = ihfd_df_2['Adm_First_Pres_Hosp_DateTime'].dt
ihfd_df_2['Year'] = _first_pres.year
ihfd_df_2['Month'] = _first_pres.month
ihfd_df_2['Day'] = _first_pres.day
ihfd_df_2['DayOfWeek'] = _first_pres.dayofweek  # 0=Monday
#ihfd_df_2['Hour'] = ihfd_df_2['Adm_First_Pres_Hosp_DateTime'].dt.hour

In [17]:
## Column groups by variable type; each group is routed to its own transformer.

# Numerical variables
numerical_vars = ['LOS', 'Adm_Pre_Frac_Number']

# Categorical nominal variables (no inherent order)
categorical_nominal_vars = [
    'Adm_Trauma_TYPE', 'Adm_Ward_Type', 'Adm_Fracture_Type',
    'Adm_Pre_OP_Med_Assess', 'Adm_Operation', 'Adm_Anaesthesia',
    'Adm_Surgery_Delay_Reason', 'New Health Regions',
    'NOCA_TraumaPeriodDay', 'NOCA_FirstPresPeriodDay',
    'Adm_Pathological', 'Adm_Fragility', 'Adm_Ger_Acute_Assess',
]

# Categorical ordinal variables.
# The position of each name matters: the ordinal encoder's per-column
# category lists are given in this same order.
categorical_ordinal_vars = [
    'Adm_Pre_Frac_Indoor', 'Adm_PRE_Frac_Outdoor', 'Adm_Pre_Frac_Shop',
    'Adm_Asa_Grade', 'Adm_Spec_Falls_Assess', 'Adm_Bone_Protect_Med',
    'NOCA_AgeRange',
]

# Binary variables
binary_vars = ['Adm_Mobilised', 'Adm_Pressure_Ulcers', 'Adm_Multi_Rehab_Assess']

In [18]:
# 1. Numerical Transformer: mean-impute, then standardise
numerical_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),
    ('scaler', StandardScaler()),
])

# 2. Categorical Nominal Transformer: mode-impute, then one-hot encode.
# Categories unseen during fit are ignored at transform time.
categorical_nominal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('onehot', OneHotEncoder(handle_unknown='ignore')),
])

# 3. Categorical Ordinal Transformer

# Ordered levels for each ordinal variable, one list per column, in the same
# order as the ordinal column list.
# NOTE(review): 9 is ranked as the top Adm_Asa_Grade level; if 9 is a
# "not documented" code it may need to be handled as missing — confirm.
ordinal_categories = [
    [0, 1, 2, 3],             # Adm_Pre_Frac_Indoor
    [0, 1, 2, 3],             # Adm_PRE_Frac_Outdoor
    [0, 1, 2, 3],             # Adm_Pre_Frac_Shop
    [1, 2, 3, 4, 5, 9],       # Adm_Asa_Grade
    [0, 1, 2, 3],             # Adm_Spec_Falls_Assess
    [0, 1, 2, 3, 4, 5, 6],    # Adm_Bone_Protect_Med
    ['60-64', '65-69', '70-74', '75-79',
     '80-84', '85-89', '90-94', '95+'],  # NOCA_AgeRange
]

categorical_ordinal_transformer = Pipeline([
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('ordinal', OrdinalEncoder(categories=ordinal_categories)),
])

# 4. Binary Transformer
# Recoding helper for binary variables
def binary_map(series):
    """Recode a single column of binary codes.

    Parameters
    ----------
    series : pandas.Series
        Column holding the raw codes.

    Returns
    -------
    pandas.Series
        1 -> 1, 2 -> 0, 9 -> NaN; any value not in the mapping also becomes NaN.
    """
    # For example, if your data uses 1 for 'Yes', 2 for 'No', and 9 for 'Not documented':
    recode = {1: 1, 2: 0, 9: np.nan}
    return series.map(recode)

def _map_binary_columns(X):
    """Apply ``binary_map`` to every column of a 2-D input.

    ``binary_map`` works on one Series at a time, while a transformer inside a
    pipeline receives the whole column block (a DataFrame or a NumPy array).
    Wrapping the input in a DataFrame and applying column-wise bridges the two.
    """
    return pd.DataFrame(X).apply(binary_map)

# Recode first, THEN impute: the recoding turns 9 (and any unmapped code) into
# NaN, so the imputer must run afterwards to leave no missing values behind.
binary_transformer = Pipeline(steps=[
    ('mapper', FunctionTransformer(_map_binary_columns, validate=False)),
    ('imputer', SimpleImputer(strategy='most_frequent'))
])

# ==========================
# Combine Preprocessing Steps
# ==========================
# Each entry is (name, transformer, columns it is applied to).
_column_routes = [
    ('num', numerical_transformer, numerical_vars),
    ('cat_nom', categorical_nominal_transformer, categorical_nominal_vars),
    ('cat_ord', categorical_ordinal_transformer, categorical_ordinal_vars),
    ('bin', binary_transformer, binary_vars),
]
preprocessor = ColumnTransformer(transformers=_column_routes)

# Full pipeline: currently preprocessing only; a model step can be appended
# after 'preprocessor'.
pipeline = Pipeline([('preprocessor', preprocessor)])


In [19]:
# Show the assembled pipeline object as this cell's output.
pipeline