#### Set styling for plotting

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
from matplotlib.pyplot import tight_layout

# Use seaborn's colour-blind-friendly palette as the default palette.
sns.set_palette('colorblind')

# Matplotlib figure defaults: (6, 6) figure size, white axes background,
# black axes border, and the same colour-blind palette for the colour cycle.
plt.rc('figure', figsize=(6, 6))
plt.rc(
    'axes',
    facecolor="white",
    edgecolor="black",
    prop_cycle=plt.cycler(color=sns.color_palette('colorblind')),
)

# Default font for all figure text.
font = dict(family='sans-serif', weight='normal', size=14)
plt.rc('font', **font)

# Register pandas' converters with matplotlib for plotting.
pd.plotting.register_matplotlib_converters()

#### Step 1: save environment file

In [2]:
# Shell command: write the output of `conda env export` (the active environment's
# specification) to ihfd_environment.yml in the current working directory.
!conda env export > ihfd_environment.yml

#### Step 2: import modules

In [3]:
import glob
import os
import xml.etree.ElementTree as ET
from datetime import datetime, timezone
from tqdm import tqdm
import codecs
import csv
import numpy as np
!pip install openpyxl
%matplotlib inline



## INFORMATION

$$
\begin{array}{|r|l|r|l|l|}
\hline
\text{No} & \text{Variable} & \text{Count} & \text{Label} & \text{Description} \\
\hline\hline
0 & \text{Index} & 37310 & \text{N/A} & \text{Index number for each entry} \\
\hline
1 & \text{Adm\_Trauma\_DateTime} & 12913 & \text{1. Date and time of trauma} & \text{Date and time of trauma causing hip fracture (Format: DD/MM/YYYY, HH:MM)} \\
\hline
2 & \text{NOCA\_AgeRange} & 37310 & \text{Age range} & \text{Age range of patient} \\
\hline
3 & \text{Adm\_Trauma\_TYPE} & 37078 & \text{2. Type of trauma} & \begin{array}{l} 
    1 \text{ High energy trauma} \\ 
    2 \text{ Low energy trauma} \\ 
    8 \text{ Unknown} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
4 & \text{Adm\_Hospital\_Fall} & 2535 & \text{4H. Hospital fall} & \begin{array}{l}
    1 \text{ Yes} \\
    2 \text{ No} \end{array} \\
\hline
5 & \text{Adm\_Ward\_Type} & 37299 & \text{5. Ward type} & \begin{array}{l} 
    1 \text{ Orthopaedic Ward} \\ 
    2 \text{ Never Admitted to Orthopaedic Ward} \\ 
    9 \text{ Not Documented} \end{array} \\
\hline
6 & \text{Adm\_Pre\_Frac\_Indoor} & 29378 & \text{6A. Indoor mobility} & \begin{array}{l} 
    0 \text{ Unable} \\ 
    1 \text{ Assistance of one person} \\ 
    2 \text{ With an aid} \\ 
    3 \text{ Independent} \end{array} \\
\hline
7 & \text{Adm\_PRE\_Frac\_Outdoor} & 29141 & \text{6B. Outdoor mobility} & \begin{array}{l} 
    0 \text{ Unable} \\ 
    1 \text{ Assistance of one person} \\ 
    2 \text{ With an aid} \\ 
    3 \text{ Independent} \end{array} \\
\hline
8 & \text{Adm\_Pre\_Frac\_Shop} & 29037 & \text{6C. Shopping capability} & \begin{array}{l} 
    0 \text{ Unable} \\ 
    1 \text{ Assistance of one person} \\ 
    2 \text{ With an aid} \\ 
    3 \text{ Independent} \end{array} \\
\hline
9 & \text{Adm\_Pre\_Frac\_Number} & 29117 & \text{6D. Mobility Score} & \text{Pre-fracture New Mobility Score (Sum A+B+C)} \\
\hline
10 & \text{Adm\_Fracture\_Type} & 37123 & \text{8A. Fracture type} & \begin{array}{l} 
    1 \text{ Intracapsular - displaced} \\ 
    2 \text{ Intracapsular - undisplaced} \\ 
    3 \text{ Intertrochanteric} \\ 
    4 \text{ Subtrochanteric} \\ 
    5 \text{ Periprosthetic} \\ 
    8 \text{ Other} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
11 & \text{Adm\_FRACTURE\_Type\_Other2} & 2 & \text{8B. Fracture type other} & \text{Type of fracture (Other, please specify)} \\
\hline
12 & \text{Adm\_Pathological} & 36419 & \text{9. Pathological fracture} & \begin{array}{l} 
    1 \text{ Atypical} \\ 
    2 \text{ Malignancy} \\ 
    3 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
13 & \text{Adm\_Fragility} & 36692 & \text{10. Fragility fractures} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
14 & \text{Adm\_Pre\_OP\_Med\_Assess} & 37157 & \text{11. Pre-op medical assessment} & \begin{array}{l} 
    1 \text{ Routine by geriatrician} \\ 
    2 \text{ Routine by medical physician} \\ 
    6 \text{ None} \\ 
    7 \text{ Ger review following request} \\ 
    8 \text{ Med physician review following request} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
15 & \text{Adm\_Ger\_Acute\_Assess} & 35313 & \text{11A. Geriatric assessment} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    3 \text{ Not applicable} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
16 & \text{Adm\_Operation} & 37222 & \text{14. Operation} & \begin{array}{l} 
    00 \text{ No operation performed} \\ 
    01 \text{ Internal fixation DHS} \\ 
    02 \text{ Internal fixation Screws} \\ 
    03 \text{ IM nail long} \\ 
    04 \text{ IM nail short} \\ 
    05 \text{ Art hemi uncemented} \\ 
    06 \text{ Art hemi cemented} \\ 
    07 \text{ Art total hip replacement} \\ 
    99 \text{ Not documented} \end{array} \\
\hline
17 & \text{Adm\_Asa\_Grade} & 35451 & \text{14A. ASA Grade} & \begin{array}{l} 
    1 \text{ Normal healthy individual} \\ 
    2 \text{ Mild systemic disease} \\ 
    3 \text{ Severe systemic disease} \\ 
    4 \text{ Incapacitating disease} \\ 
    5 \text{ Moribund, not expected to survive} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
18 & \text{Adm\_Anaesthesia} & 35382 & \text{14B. Anaesthesia type} & \begin{array}{l} 
    1 \text{ GA only} \\ 
    2 \text{ GA + nerve block} \\ 
    3 \text{ GA + spinal anaesthesia} \\ 
    4 \text{ GA + epidural anaesthesia} \\ 
    5 \text{ SA only} \\ 
    6 \text{ SA + nerve block} \\ 
    7 \text{ SA + epidural (CSE)} \\ 
    8 \text{ Other} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
19 & \text{Adm\_Primary\_Surgery\_DateTime} & 35575 & \text{14E. Surgery Date/Time} & \text{Date and time of primary surgery (Format: DD/MM/YYYY, HH:MM)} \\
\hline
20 & \text{Adm\_Surgery\_Delay\_Reason} & 35372 & \text{14H. Surgery delay reason} & \begin{array}{l} 
    0 \text{ No delay - surgery < 48 hours} \\ 
    1 \text{ Awaiting orthopaedic diagnosis} \\ 
    2 \text{ Awaiting medical review or stabilisation} \\ 
    3 \text{ Awaiting inpatient or high dependency bed} \\ 
    4 \text{ Awaiting space on theatre list} \\ 
    5 \text{ Problem with theatre/equipment} \\ 
    6 \text{ Problem with surgical staff cover} \\ 
    7 \text{ Cancelled due to list over-run} \\ 
    8 \text{ Other} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
21 & \text{Adm\_Mobilised} & 33741 & \text{14J. Mobilised post-surgery} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
22 & \text{Adm\_RE\_OP\_30\_DAYS} & 33094 & \text{14M. Re-operation within 30 days} & \begin{array}{l} 
    0 \text{ None} \\ 
    1 \text{ Reduction of dislocated prosthesis} \\ 
    2 \text{ Washout or debridement} \\ 
    3 \text{ Implant removal} \\ 
    4 \text{ Revision of internal fixation} \\ 
    5 \text{ Conversion to Hemiarthroplasty} \\ 
    6 \text{ Conversion to THR} \\ 
    7 \text{ Girdlestone/excision arthroplasty} \\ 
    8 \text{ Surgery for periprosthetic fracture} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
23 & \text{Adm\_Pressure\_Ulcers} & 36904 & \text{16. Pressure ulcers} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
24 & \text{Adm\_Spec\_Falls\_Assess} & 36886 & \text{17. Falls assessment} & \begin{array}{l} 
    0 \text{ No} \\ 
    1 \text{ Yes - performed on this admission} \\ 
    2 \text{ Yes - awaits out-patient assessment} \\ 
    3 \text{ Not applicable} \end{array} \\
\hline
25 & \text{Adm\_Bone\_Protect\_Med} & 36614 & \text{18. Bone protection medication} & \begin{array}{l} 
    0 \text{ No assessment} \\ 
    1 \text{ Started on this admission} \\ 
    2 \text{ Continued from pre-admission} \\ 
    3 \text{ Awaits DXA scan} \\ 
    4 \text{ Awaits out-patient assessment} \\ 
    5 \text{ No medication needed} \\ 
    6 \text{ Not applicable} \end{array} \\
\hline
26 & \text{Adm\_Multi\_Rehab\_Assess} & 36796 & \text{19. Multidisciplinary rehab} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
27 & \text{Adm\_AMB\_Number\_ACU\_DIS} & 13956 & \text{20. Ambulatory Score acute discharge} & \text{Cumulative Ambulatory Score – acute discharge (0 - 6)} \\
\hline
28 & \text{Adm\_Discharged\_To} & 25994 & \text{21. Discharge destination} & \begin{array}{l} 
    1 \text{ Home} \\ 
    2 \text{ On-site rehab unit} \\ 
    3 \text{ Off-site rehab unit} \\ 
    4 \text{ Convalescence care} \\ 
    5 \text{ New adm to nursing home} \\ 
    6 \text{ Return adm to nursing home} \\ 
    7 \text{ Died} \\ 
    8 \text{ Other} \end{array} \\
\hline
29 & \text{Adm\_Nut\_Risk} & 18788 & \text{12. Nutritional risk assessment} & \begin{array}{l} 
    0 \text{ No} \\ 
    1 \text{ Indicates malnourished} \\ 
    2 \text{ Indicates risk of malnutrition} \\ 
    3 \text{ Indicates normal} \end{array} \\
\hline
30 & \text{Adm\_Nerve\_Block} & 19231 & \text{13. Nerve block administered} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
31 & \text{Adm\_Ass\_Anp} & 15730 & \text{11E. ANP/cANP assessment} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    3 \text{ Not applicable} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
32 & \text{Adm\_Mobilised\_No\_Opt} & 2156 & \text{14J3. Reason for not mobilised} & \begin{array}{l} 
    1 \text{ Pain} \\ 
    2 \text{ Confusion/agitation/delirium} \\ 
    3 \text{ Patient declined} \\ 
    4 \text{ Medically not fit} \\ 
    5 \text{ Not mobile pre-fracture} \\ 
    6 \text{ Physio staffing issues} \\ 
    7 \text{ Other staffing} \\ 
    8 \text{ Other} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
\end{array}
$$

 

#### Step 3: import data files

In [4]:
# Location of the Excel workbook to load. The original hard-coded path is kept as
# the default; set the IHFD_DATA_PATH environment variable to point at the file on
# another machine without editing the notebook.
data_directory_xl = os.environ.get(
    "IHFD_DATA_PATH",
    "/home/paulharford/college/project/project_data/ihfdv3.xlsx",
)
# Expand a leading "~" (if any) and normalise to an absolute path.
full_path_xl = os.path.abspath(os.path.expanduser(data_directory_xl))


In [5]:
# Load the Excel workbook at full_path_xl into a pandas DataFrame.
ihfd_df = pd.read_excel(io=full_path_xl)

FileNotFoundError: [Errno 2] No such file or directory: '/home/paulharford/college/project/project_data/ihfdv3.xlsx'

#### Step 4: drop columns that have too much missing data

In [None]:
# Percentage of missing values per column, sorted ascending
# (columns with the fewest gaps come first).
ihfd_df.isna().sum().div(ihfd_df.shape[0]).mul(100).sort_values()

In [None]:
# The first-presentation date-time is needed for use with the weather data, so
# missing values of Adm_First_Pres_Hosp_DateTime are filled with the primary
# surgery date-time minus a random offset of 12 to 48 hours.
# A seeded generator keeps the imputed times identical on every Restart & Run All;
# an unseeded draw would change the cleaned dataset on each run.
IMPUTATION_SEED = 42
imputation_rng = np.random.default_rng(IMPUTATION_SEED)

# Rows with no surgery date-time cannot be estimated and stay missing (NaT, the
# missing-value marker for datetimes).
# NOTE(review): assumes Adm_Primary_Surgery_DateTime holds datetime values - confirm.
estimated_first_pres = ihfd_df['Adm_Primary_Surgery_DateTime'].apply(
    lambda surgery_time: (
        surgery_time - pd.Timedelta(hours=float(imputation_rng.uniform(12, 48)))
        if pd.notna(surgery_time)
        else pd.NaT
    )
)

# fillna aligns on the index, so each gap is filled from its own row's estimate.
ihfd_df['Adm_First_Pres_Hosp_DateTime'] = (
    ihfd_df['Adm_First_Pres_Hosp_DateTime'].fillna(estimated_first_pres)
)

In [None]:
# Columns excluded from the reduced dataset.
columns_to_drop = [
    'Adm_RE_OP_30_DAYS', 'Adm_Discharged_To', 'Adm_Nerve_Block',
    'Adm_Nut_Risk', 'Adm_Ass_Anp', 'Adm_AMB_Number_ACU_DIS',
    'Adm_Trauma_DateTime', 'Adm_Hospital_Fall', 'Adm_Mobilised_No_Opt',
    'Adm_FRACTURE_Type_Other2', 'Adm_Primary_Surgery_DateTime', 'Index',
]

# Build the reduced frame; ihfd_df itself is left unchanged.
ihfd_red_df = ihfd_df.drop(labels=columns_to_drop, axis='columns')

# Column labels grouped by dtype, used by the missing-data filling step:
# int64/float64 columns are treated as numeric, object/category as categorical.
# Columns of any other dtype (e.g. datetime) fall into neither group.
numeric_cols = ihfd_red_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = ihfd_red_df.select_dtypes(include=['object', 'category']).columns


In [None]:
# Re-check the percentage of missing values per column after the column drop,
# sorted ascending.
ihfd_red_df.isna().sum().truediv(len(ihfd_red_df)).mul(100).sort_values()

#### Step 5: missing data filling

In [None]:
# Fill numeric columns with the column median.
# NOTE(review): several variables in the information table are coded categories;
# if they are stored as numbers, confirm that a median is the intended fill value.
ihfd_red_df[numeric_cols] = ihfd_red_df[numeric_cols].fillna(ihfd_red_df[numeric_cols].median())

# Fill categorical columns with the mode (most frequent value).
for col in categorical_cols:
    col_modes = ihfd_red_df[col].mode()
    # mode() returns an empty Series when every value in the column is missing;
    # skip such a column instead of failing with IndexError on .iloc[0].
    if col_modes.empty:
        continue
    ihfd_red_df[col] = ihfd_red_df[col].fillna(col_modes.iloc[0])

# Show which columns were handled by each rule.
print(numeric_cols)
print(categorical_cols)

In [None]:
# Percentage of missing values per column after filling, sorted ascending.
ihfd_red_df.isna().sum().div(len(ihfd_red_df.index)).mul(100).sort_values()

In [None]:
# Destination for the cleaned dataset. The original hard-coded path is kept as the
# default; set the IHFD_CLEAN_CSV_PATH environment variable to write elsewhere.
clean_csv_path = os.environ.get(
    "IHFD_CLEAN_CSV_PATH",
    "/home/paulharford/college/project/processed/ihfd_clean.csv",
)
# Create the output folder if it does not exist so to_csv does not fail on a
# missing directory.
os.makedirs(os.path.dirname(os.path.abspath(clean_csv_path)), exist_ok=True)
# index=False: do not write the DataFrame index as a column.
ihfd_red_df.to_csv(clean_csv_path, index=False)