In [None]:
##Set styling for plotting
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns
sns.set_palette('colorblind')
from matplotlib.pyplot import tight_layout
# ##MATPLOTLIB FIGURE DEFAULTS
# Square figures, white plotting area, black axes border.
plt.rcParams.update({
    "figure.figsize": (6, 6),
    "axes.facecolor": "white",
    "axes.edgecolor": "black",
})
# Default colour cycle taken from seaborn's 'colorblind' palette.
plt.rcParams['axes.prop_cycle'] = plt.cycler(color=sns.color_palette('colorblind'))
##font used for all figure text
font = dict(family='sans-serif', weight='normal', size=14)
plt.rc('font', **font)
# ##PANDAS PLOTTING
# Register the pandas date/time converters with matplotlib.
pd.plotting.register_matplotlib_converters()


In [None]:
##set file path outside git=hub repo (data must not be stpored in github)
file_path='/mnt/hgfs/shared/ihfd.xlsx'

##Import modules
import numpy as np
!pip install openpyxl
%matplotlib inline
##save juypter env file
!conda env export > ihfd_environment.yml

In [None]:
##Import data from Excel into a pandas DataFrame (read with the openpyxl engine)
ihfd_df = pd.read_excel(file_path, engine='openpyxl')

In [None]:
##check the first 5 rows
ihfd_df.head(5)

In [None]:
##check the last 5 rows
ihfd_df.tail(5)

In [None]:
##DataFrame info - column data types and non-null counts
ihfd_df.info()

## Initial EDA
The table below lists all the variables we initially received from NOCA. For each variable it gives the count of values available in the dataset, the label and the description.

$$
\begin{array}{|r|l|r|l|l|}
\hline
\text{No} & \text{Variable} & \text{Count} & \text{Label} & \text{Description} \\
\hline\hline
0 & \text{Index} & 37310 & \text{N/A} & \text{Index number for each entry} \\
\hline
1 & \text{Adm\_Trauma\_DateTime} & 12913 & \text{1. Date and time of trauma} & \text{Date and time of trauma causing hip fracture (Format: DD/MM/YYYY, HH:MM)} \\
\hline
2 & \text{NOCA\_AgeRange} & 37310 & \text{Age range} & \text{Age range of patient} \\
\hline
3 & \text{Adm\_Trauma\_TYPE} & 37078 & \text{2. Type of trauma} & \begin{array}{l} 
    1 \text{ High energy trauma} \\ 
    2 \text{ Low energy trauma} \\ 
    8 \text{ Unknown} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
4 & \text{Adm\_Hospital\_Fall} & 2535 & \text{4H. Hospital fall} & \begin{array}{l}
    1 \text{ Yes} \\
    2 \text{ No} \end{array} \\
\hline
5 & \text{Adm\_Ward\_Type} & 37299 & \text{5. Ward type} & \begin{array}{l} 
    1 \text{ Orthopaedic Ward} \\ 
    2 \text{ Never Admitted to Orthopaedic Ward} \\ 
    9 \text{ Not Documented} \end{array} \\
\hline
6 & \text{Adm\_Pre\_Frac\_Indoor} & 29378 & \text{6A. Indoor mobility} & \begin{array}{l} 
    0 \text{ Unable} \\ 
    1 \text{ Assistance of one person} \\ 
    2 \text{ With an aid} \\ 
    3 \text{ Independent} \end{array} \\
\hline
7 & \text{Adm\_PRE\_Frac\_Outdoor} & 29141 & \text{6B. Outdoor mobility} & \begin{array}{l} 
    0 \text{ Unable} \\ 
    1 \text{ Assistance of one person} \\ 
    2 \text{ With an aid} \\ 
    3 \text{ Independent} \end{array} \\
\hline
8 & \text{Adm\_Pre\_Frac\_Shop} & 29037 & \text{6C. Shopping capability} & \begin{array}{l} 
    0 \text{ Unable} \\ 
    1 \text{ Assistance of one person} \\ 
    2 \text{ With an aid} \\ 
    3 \text{ Independent} \end{array} \\
\hline
9 & \text{Adm\_Pre\_Frac\_Number} & 29117 & \text{6D. Mobility Score} & \text{Pre-fracture New Mobility Score (Sum A+B+C)} \\
\hline
10 & \text{Adm\_Fracture\_Type} & 37123 & \text{8A. Fracture type} & \begin{array}{l} 
    1 \text{ Intracapsular - displaced} \\ 
    2 \text{ Intracapsular - undisplaced} \\ 
    3 \text{ Intertrochanteric} \\ 
    4 \text{ Subtrochanteric} \\ 
    5 \text{ Periprosthetic} \\ 
    8 \text{ Other} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
11 & \text{Adm\_FRACTURE\_Type\_Other2} & 2 & \text{8B. Fracture type other} & \text{Type of fracture (Other, please specify)} \\
\hline
12 & \text{Adm\_Pathological} & 36419 & \text{9. Pathological fracture} & \begin{array}{l} 
    1 \text{ Atypical} \\ 
    2 \text{ Malignancy} \\ 
    3 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
13 & \text{Adm\_Fragility} & 36692 & \text{10. Fragility fractures} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
14 & \text{Adm\_Pre\_OP\_Med\_Assess} & 37157 & \text{11. Pre-op medical assessment} & \begin{array}{l} 
    1 \text{ Routine by geriatrician} \\ 
    2 \text{ Routine by medical physician} \\ 
    6 \text{ None} \\ 
    7 \text{ Ger review following request} \\ 
    8 \text{ Med physician review following request} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
15 & \text{Adm\_Ger\_Acute\_Assess} & 35313 & \text{11A. Geriatric assessment} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    3 \text{ Not applicable} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
16 & \text{Adm\_Operation} & 37222 & \text{14. Operation} & \begin{array}{l} 
    00 \text{ No operation performed} \\ 
    01 \text{ Internal fixation DHS} \\ 
    02 \text{ Internal fixation Screws} \\ 
    03 \text{ IM nail long} \\ 
    04 \text{ IM nail short} \\ 
    05 \text{ Art hemi uncemented} \\ 
    06 \text{ Art hemi cemented} \\ 
    07 \text{ Art total hip replacement} \\ 
    99 \text{ Not documented} \end{array} \\
\hline
17 & \text{Adm\_Asa\_Grade} & 35451 & \text{14A. ASA Grade} & \begin{array}{l} 
    1 \text{ Normal healthy individual} \\ 
    2 \text{ Mild systemic disease} \\ 
    3 \text{ Severe systemic disease} \\ 
    4 \text{ Incapacitating disease} \\ 
    5 \text{ Moribund, not expected to survive} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
18 & \text{Adm\_Anaesthesia} & 35382 & \text{14B. Anaesthesia type} & \begin{array}{l} 
    1 \text{ GA only} \\ 
    2 \text{ GA + nerve block} \\ 
    3 \text{ GA + spinal anaesthesia} \\ 
    4 \text{ GA + epidural anaesthesia} \\ 
    5 \text{ SA only} \\ 
    6 \text{ SA + nerve block} \\ 
    7 \text{ SA + epidural (CSE)} \\ 
    8 \text{ Other} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
19 & \text{Adm\_Primary\_Surgery\_DateTime} & 35575 & \text{14E. Surgery Date/Time} & \text{Date and time of primary surgery (Format: DD/MM/YYYY, HH:MM)} \\
\hline
20 & \text{Adm\_Surgery\_Delay\_Reason} & 35372 & \text{14H. Surgery delay reason} & \begin{array}{l} 
    0 \text{ No delay - surgery < 48 hours} \\ 
    1 \text{ Awaiting orthopaedic diagnosis} \\ 
    2 \text{ Awaiting medical review or stabilisation} \\ 
    3 \text{ Awaiting inpatient or high dependency bed} \\ 
    4 \text{ Awaiting space on theatre list} \\ 
    5 \text{ Problem with theatre/equipment} \\ 
    6 \text{ Problem with surgical staff cover} \\ 
    7 \text{ Cancelled due to list over-run} \\ 
    8 \text{ Other} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
21 & \text{Adm\_Mobilised} & 33741 & \text{14J. Mobilised post-surgery} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
22 & \text{Adm\_RE\_OP\_30\_DAYS} & 33094 & \text{14M. Re-operation within 30 days} & \begin{array}{l} 
    0 \text{ None} \\ 
    1 \text{ Reduction of dislocated prosthesis} \\ 
    2 \text{ Washout or debridement} \\ 
    3 \text{ Implant removal} \\ 
    4 \text{ Revision of internal fixation} \\ 
    5 \text{ Conversion to Hemiarthroplasty} \\ 
    6 \text{ Conversion to THR} \\ 
    7 \text{ Girdlestone/excision arthroplasty} \\ 
    8 \text{ Surgery for periprosthetic fracture} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
23 & \text{Adm\_Pressure\_Ulcers} & 36904 & \text{16. Pressure ulcers} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
24 & \text{Adm\_Spec\_Falls\_Assess} & 36886 & \text{17. Falls assessment} & \begin{array}{l} 
    0 \text{ No} \\ 
    1 \text{ Yes - performed on this admission} \\ 
    2 \text{ Yes - awaits out-patient assessment} \\ 
    3 \text{ Not applicable} \end{array} \\
\hline
25 & \text{Adm\_Bone\_Protect\_Med} & 36614 & \text{18. Bone protection medication} & \begin{array}{l} 
    0 \text{ No assessment} \\ 
    1 \text{ Started on this admission} \\ 
    2 \text{ Continued from pre-admission} \\ 
    3 \text{ Awaits DXA scan} \\ 
    4 \text{ Awaits out-patient assessment} \\ 
    5 \text{ No medication needed} \\ 
    6 \text{ Not applicable} \end{array} \\
\hline
26 & \text{Adm\_Multi\_Rehab\_Assess} & 36796 & \text{19. Multidisciplinary rehab} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
27 & \text{Adm\_AMB\_Number\_ACU\_DIS} & 13956 & \text{20. Ambulatory Score acute discharge} & \text{Cumulative Ambulatory Score – acute discharge (0 - 6)} \\
\hline
28 & \text{Adm\_Discharged\_To} & 25994 & \text{21. Discharge destination} & \begin{array}{l} 
    1 \text{ Home} \\ 
    2 \text{ On-site rehab unit} \\ 
    3 \text{ Off-site rehab unit} \\ 
    4 \text{ Convalescence care} \\ 
    5 \text{ New adm to nursing home} \\ 
    6 \text{ Return adm to nursing home} \\ 
    7 \text{ Died} \\ 
    8 \text{ Other} \end{array} \\
\hline
29 & \text{Adm\_Nut\_Risk} & 18788 & \text{12. Nutritional risk assessment} & \begin{array}{l} 
    0 \text{ No} \\ 
    1 \text{ Indicates malnourished} \\ 
    2 \text{ Indicates risk of malnutrition} \\ 
    3 \text{ Indicates normal} \end{array} \\
\hline
30 & \text{Adm\_Nerve\_Block} & 19231 & \text{13. Nerve block administered} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
31 & \text{Adm\_Ass\_Anp} & 15730 & \text{11E. ANP/cANP assessment} & \begin{array}{l} 
    1 \text{ Yes} \\ 
    2 \text{ No} \\ 
    3 \text{ Not applicable} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
32 & \text{Adm\_Mobilised\_No\_Opt} & 2156 & \text{14J3. Reason for not mobilised} & \begin{array}{l} 
    1 \text{ Pain} \\ 
    2 \text{ Confusion/agitation/delirium} \\ 
    3 \text{ Patient declined} \\ 
    4 \text{ Medically not fit} \\ 
    5 \text{ Not mobile pre-fracture} \\ 
    6 \text{ Physio staffing issues} \\ 
    7 \text{ Other staffing} \\ 
    8 \text{ Other} \\ 
    9 \text{ Not documented} \end{array} \\
\hline
\end{array}
$$


 

$$
\begin{array}{|l|p{7cm}|p{5cm}|}
\hline
\textbf{Type of Hip Fracture} & \textbf{Description} & \textbf{Severity} \\
\hline
\text{Intracapsular - Displaced} & \text{Intracapsular fractures occur within the capsule of the hip joint, affecting the femoral neck, the narrow section of the femur just below the ball of the hip joint.} & \text{Displaced intracapsular fractures are severe as the broken bone ends have moved out of alignment, disrupting blood flow to the femoral head. This often requires hip replacement or fixation surgery due to the risk of avascular necrosis.} \\
\hline
\text{Intracapsular - Undisplaced} & \text{An undisplaced intracapsular fracture occurs in the femoral neck, but the bone has not shifted, reducing the risk of compromised blood supply.} & \text{These fractures have a better prognosis and may be treated with internal fixation (e.g., screws) rather than hip replacement, especially in younger patients.} \\
\hline
\text{Intertrochanteric} & \text{Fractures occurring between the greater and lesser trochanters, the bony prominences below the femoral neck, located outside the joint capsule.} & \text{Generally treated with fixation devices, like plates and screws. These fractures maintain blood flow to the femoral head but still present significant pain and mobility issues, especially in older adults.} \\
\hline
\text{Subtrochanteric} & \text{Occurs just below the lesser trochanter, in the upper part of the femoral shaft, often extending down the femur.} & \text{Requires surgical intervention (e.g., intramedullary nails or rods) due to high forces in this area. Healing may be prolonged, particularly in older patients or those with weaker bones.} \\
\hline
\text{Periprosthetic} & \text{Fracture occurs around a hip prosthesis (artificial joint from prior replacement), either above, below, or around the prosthesis area.} & \text{Treatment complexity varies based on fracture and implant stability. May require revision surgery, posing challenges for older or less healthy individuals.} \\
\hline
\end{array}
$$


In [None]:
##summary statistics (describe() defaults to the numeric columns when any are present)
ihfd_df.describe()

In [None]:
##check missing data: number of missing values per column
ihfd_df.isna().sum()

In [None]:
##percentage of missing values per column, sorted from lowest to highest
(ihfd_df.isna().sum()/ihfd_df.shape[0] * 100).sort_values()

##a lot of missing data for some variables - TODO: review the data dictionary to see what those variables are for

In [None]:
##EDA only: remove the columns with the highest share of missing values
##(the free-text fracture type, the date/time columns and the row index are removed too)

columns_to_drop = [
    'Adm_RE_OP_30_DAYS',
    'Adm_Pre_Frac_Indoor',
    'Adm_PRE_Frac_Outdoor',
    'Adm_Pre_Frac_Number',
    'Adm_Pre_Frac_Shop',
    'Adm_Discharged_To',
    'Adm_Nerve_Block',
    'Adm_Nut_Risk',
    'Adm_Ass_Anp',
    'Adm_AMB_Number_ACU_DIS',
    'Adm_Trauma_DateTime',
    'Adm_Hospital_Fall',
    'Adm_Mobilised_No_Opt',
    'Adm_FRACTURE_Type_Other2',
    'Adm_Primary_Surgery_DateTime',
    'Index',
]

# Reduced frame: a new DataFrame, ihfd_df itself is left untouched
ihfd_red_df = ihfd_df.drop(columns_to_drop, axis=1)

# Column groups by dtype: numeric (int64/float64) versus categorical (object/category)
numeric_cols = ihfd_red_df.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = ihfd_red_df.select_dtypes(include=['object', 'category']).columns

# Numeric columns: replace missing values with the column median
# NOTE(review): per the data dictionary most numeric columns hold category codes,
# so a median fill is only a rough EDA placeholder - confirm before modelling
numeric_medians = ihfd_red_df[numeric_cols].median()
ihfd_red_df[numeric_cols] = ihfd_red_df[numeric_cols].fillna(numeric_medians)

# Categorical columns: replace missing values with the most frequent value (mode)
for cat_col in categorical_cols:
    most_frequent = ihfd_red_df[cat_col].mode().iloc[0]
    ihfd_red_df[cat_col] = ihfd_red_df[cat_col].fillna(most_frequent)

# Check that nothing is left missing
print("\nRemaining NaN values per column:")
print(ihfd_red_df.isna().sum())



In [None]:
##list the distinct fracture type codes present in the reduced frame
print("Unique values in Fracture Type:")
print(sorted(ihfd_red_df['Adm_Fracture_Type'].unique()))

In [None]:
##column data types and non-null counts of the reduced frame
ihfd_red_df.info()

In [None]:
# Set the style for all plots
# NOTE: this replaces the 'colorblind' palette chosen in the first cell with 'husl' for every later plot
sns.set_style("whitegrid")
sns.set_palette("husl")

# Function to create appropriate plot based on data type
def plot_column(data, column, ax=None):
    """Draw the value counts of one column as an annotated bar chart.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame containing ``column``.
    column : str
        Name of the column to plot. ``'NOCA_AgeRange'`` is drawn as a count
        plot with a fixed age-band order; other numeric columns are drawn as
        a discrete histogram and everything else as a count plot.
    ax : matplotlib.axes.Axes, optional
        Axes to draw on. Defaults to the current pyplot axes, so existing
        ``plt.subplot(...); plot_column(df, col)`` calls keep working.

    Returns
    -------
    None
    """
    if ax is None:
        ax = plt.gca()
    values = data[column]

    if column == 'NOCA_AgeRange':
        age_order = ['60-64', '65-69', '70-74', '75-79', '80-84', '85-89', '90-94', '95+']
        sns.countplot(data=data, x=column, order=age_order, ax=ax)
    # Decide on the dtype rather than on the smallest unique value: sorting the
    # unique values raises TypeError when strings and NaN are mixed (e.g. in
    # columns produced with .map) and indexing fails on an empty column.
    elif pd.api.types.is_numeric_dtype(values) and not pd.api.types.is_bool_dtype(values):
        sns.histplot(data=data, x=column, discrete=True, ax=ax)
    else:
        sns.countplot(data=data, x=column, ax=ax)

    # Rotate labels if needed (heuristic: first value longer than 3 characters)
    if len(values) > 0 and len(str(values.iloc[0])) > 3:
        ax.tick_params(axis='x', rotation=45)

    # Add value labels above each bar
    for p in ax.patches:
        height = p.get_height()
        if np.isnan(height):
            # nothing to label; int(nan) would raise
            continue
        ax.annotate(f'{int(height):,}',
                    (p.get_x() + p.get_width()/2., height),
                    ha='center', va='bottom')

    # Format y-axis with thousands separators
    ax.yaxis.set_major_formatter(plt.FuncFormatter(lambda x, p: format(int(x), ',')))

    ax.set_title(column)

# Grid layout of every figure: 4 rows x 3 columns, i.e. 12 plots per figure
n_cols = 3
n_rows = 4
plots_per_figure = n_rows * n_cols

# Create one figure for each batch of columns
for i in range(0, len(ihfd_red_df.columns), plots_per_figure):
    columns_subset = ihfd_red_df.columns[i:i + plots_per_figure]

    # Create figure
    fig = plt.figure(figsize=(20, 20))

    # One subplot per column; plot_column draws on the current axes
    for idx, col in enumerate(columns_subset, 1):
        plt.subplot(n_rows, n_cols, idx)
        plot_column(ihfd_red_df, col)

    plt.tight_layout(pad=1.0)
    plt.show()

In [None]:
 ihfd_red_df['NOCA_AgeRange'].value_counts()  ##number of records in each age range

In [None]:
from matplotlib.colors import LogNorm  # no longer used in this cell; import kept in case other cells rely on it

##Pearson correlation between the numeric columns of the reduced frame.
##Correlations lie in [-1, 1], so a linear colour scale fixed to that range is used:
##a logarithmic norm is only defined for positive values and cannot show zero or
##negative correlations.
##NOTE(review): per the data dictionary most numeric columns are category codes, so
##these coefficients should be interpreted with care.
plt.figure(figsize=(12, 12))
sns.heatmap(ihfd_red_df.corr(method='pearson', numeric_only=True),
            annot=True,
            cmap='coolwarm',
            fmt='.2f',
            vmin=-1,
            vmax=1)
plt.title('Pearson correlation between numeric variables')
plt.show()

In [None]:

##variables to plot, one histogram each
columns = [
    'NOCA_AgeRange',
    'Adm_Operation',
    'Adm_Pre_OP_Med_Assess',
    'Adm_Fracture_Type',
    'Adm_Trauma_TYPE',
    'Adm_Pressure_Ulcers',
    'Adm_Spec_Falls_Assess',
    'Adm_Multi_Rehab_Assess',
    'Adm_Fragility',
    'Adm_Bone_Protect_Med',
    'Adm_Pathological',
    'Adm_Asa_Grade',
    'Adm_Anaesthesia',
    'Adm_Surgery_Delay_Reason',
    'Adm_Ger_Acute_Assess',
    'Adm_Mobilised',
]
import numpy as np
n_cols = 3  # We'll use 3 columns to fit all plots better
n_rows = int(np.ceil(len(columns) / n_cols))  # number of rows needed to fit every variable

# Create figure and subplots; squeeze=False always returns a 2-D array of axes,
# so flattening also works for a single row or a single plot
fig, axs = plt.subplots(n_rows, n_cols, figsize=(20, 5*n_rows), squeeze=False)
axs = axs.reshape(-1)

for i, col in enumerate(columns):
    sns.histplot(ihfd_red_df[col], bins="rice", ax=axs[i])
    # Use the column name verbatim: str.capitalize() lower-cases everything after
    # the first character (e.g. 'NOCA_AgeRange' -> 'Noca_agerange')
    axs[i].set_title(f'{col} Distribution')
    axs[i].set_xlabel(col)
    axs[i].set_ylabel('Count')

# Hide empty subplots if any
for i in range(len(columns), len(axs)):
    axs[i].set_visible(False)

plt.tight_layout()
plt.show()

In [None]:
## Fracture type counts within each age range
# Fracture type code -> descriptive label
fracture_type_map = {
    1: 'Intracapsular - displaced',
    2: 'Intracapsular - undisplaced',
    3: 'Intertrochanteric',
    4: 'Subtrochanteric',
    5: 'Periprosthetic',
    8: 'Other',
    9: 'Not documented',
}

# Descriptive label for every row (codes missing from the map become NaN)
ihfd_red_df['Fracture_Type_Name'] = ihfd_red_df['Adm_Fracture_Type'].map(fracture_type_map)

# Bars follow the sorted age ranges; hue follows the fracture type code order
age_order = sorted(ihfd_red_df['NOCA_AgeRange'].unique())
fracture_labels = [label for _, label in sorted(fracture_type_map.items())]

fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=ihfd_red_df,
              x='NOCA_AgeRange',
              hue='Fracture_Type_Name',
              hue_order=fracture_labels,
              order=age_order,
              ax=ax)

plt.setp(ax.get_xticklabels(), rotation=45)
ax.set_title('Distribution of Fracture Types by Age Range')
ax.set_xlabel('Age Range')
ax.set_ylabel('Count')
# Legend placed outside the plotting area, to the right
ax.legend(title='Fracture Type',
          bbox_to_anchor=(1.05, 1),
          loc='upper left')
fig.tight_layout()
plt.show()

print("\nDistribution of Fracture Types:")
print(ihfd_red_df['Fracture_Type_Name'].value_counts())

In [None]:
##one subplot of fracture type counts for each age range
age_ranges = sorted(ihfd_red_df['NOCA_AgeRange'].unique())

# Size the grid from the number of age ranges instead of assuming at most 9
n_grid_cols = 3
n_grid_rows = max(1, (len(age_ranges) + n_grid_cols - 1) // n_grid_cols)

# squeeze=False always returns a 2-D array of axes, so ravel() works for any grid size
fig, axes = plt.subplots(n_grid_rows, n_grid_cols, figsize=(20, 4 * n_grid_rows), squeeze=False)
axes = axes.ravel()

for idx, age_range in enumerate(age_ranges):
    age_data = ihfd_red_df[ihfd_red_df['NOCA_AgeRange'] == age_range]
    sns.countplot(data=age_data, x='Adm_Fracture_Type', ax=axes[idx])
    axes[idx].set_title(f'Age Range: {age_range}')
    axes[idx].tick_params(axis='x', rotation=45)
    axes[idx].set_xlabel('')

# Hide grid cells that have no age range to show
for unused_ax in axes[len(age_ranges):]:
    unused_ax.set_visible(False)

plt.suptitle('Distribution of Fracture Types for Each Age Range', y=1.02, size=16)
plt.tight_layout()
plt.show()

# Same information as a table: age range (rows) x fracture type code (columns)
summary_df = pd.crosstab(ihfd_red_df['NOCA_AgeRange'], 
                        ihfd_red_df['Adm_Fracture_Type'])
print("\nNumerical Summary:")
print(summary_df)

In [None]:
##trauma type and age: age range (rows) x trauma type code (columns)
summary_df1 = pd.crosstab(ihfd_red_df['NOCA_AgeRange'], 
                        ihfd_red_df['Adm_Trauma_TYPE'])
print("\nNumerical Summary:")
# legend for the trauma type codes shown as column headers
print("\nTrauma types: 1.High Energy Trauma, 2.Low Energy Trauma, 8.Unknown, 9.Not Documented")
print(summary_df1)

In [None]:
##plot the trauma type by age group
# Trauma type code -> descriptive label (same labels as the trauma/fracture plot further down)
trauma_type_map = {
    1: 'High Energy Trauma',
    2: 'Low Energy Trauma',
    8: 'Unknown',
    9: 'Not documented'
}

# column with mapped values (codes missing from the map become NaN)
ihfd_red_df['Trauma_Type_Name'] = ihfd_red_df['Adm_Trauma_TYPE'].map(trauma_type_map)

# Create the plot with the descriptive names
plt.figure(figsize=(15, 8))
sns.countplot(data=ihfd_red_df, 
              x='NOCA_AgeRange', 
              hue='Trauma_Type_Name',
              hue_order=[trauma_type_map[i] for i in sorted(trauma_type_map.keys())],  
              order=sorted(ihfd_red_df['NOCA_AgeRange'].unique()))

plt.xticks(rotation=45)
plt.title('Distribution of Trauma Types by Age Range')
plt.xlabel('Age Range')
plt.ylabel('Count')
# legend placed outside the plotting area, to the right
plt.legend(title='Trauma Type', 
          bbox_to_anchor=(1.05, 1), 
          loc='upper left')
plt.tight_layout()
plt.show()

print("\nDistribution of Trauma Types:")
print(ihfd_red_df['Trauma_Type_Name'].value_counts())

In [None]:
##trauma type and fracture type: fracture type code (rows) x trauma type code (columns)
# Fracture type code -> descriptive label, printed as a legend for the row codes
fracture_type_map = {
    1: 'Intracapsular - displaced',
    2: 'Intracapsular - undisplaced',
    3: 'Intertrochanteric',
    4: 'Subtrochanteric',
    5: 'Periprosthetic',
    8: 'Other',
    9: 'Not documented'
}
summary_df2 = pd.crosstab(ihfd_red_df['Adm_Fracture_Type'], 
                        ihfd_red_df['Adm_Trauma_TYPE'])
print("\nNumerical Summary:")
# legend for the trauma type codes shown as column headers
print("\nTrauma types: 1.High Energy Trauma, 2.Low Energy Trauma, 8.Unknown, 9.Not Documented")
print(fracture_type_map)
print(summary_df2)

In [None]:
##trauma type counts within each fracture type
# Code -> descriptive label lookups
trauma_type_map = {
    1: 'High Energy Trauma',
    2: 'Low Energy Trauma',
    8: 'Unknown',
    9: 'Not documented',
}

fracture_type_map = {
    1: 'Intracapsular - displaced',
    2: 'Intracapsular - undisplaced',
    3: 'Intertrochanteric',
    4: 'Subtrochanteric',
    5: 'Periprosthetic',
    8: 'Other',
    9: 'Not documented',
}

# Descriptive label columns for both variables (codes missing from a map become NaN)
ihfd_red_df['Fracture_Type_Name'] = ihfd_red_df['Adm_Fracture_Type'].map(fracture_type_map)
ihfd_red_df['Trauma_Type_Name'] = ihfd_red_df['Adm_Trauma_TYPE'].map(trauma_type_map)

# Bar and hue order both follow the numeric code order
fracture_labels = [label for _, label in sorted(fracture_type_map.items())]
trauma_labels = [label for _, label in sorted(trauma_type_map.items())]

fig, ax = plt.subplots(figsize=(15, 8))
sns.countplot(data=ihfd_red_df,
              x='Fracture_Type_Name',
              hue='Trauma_Type_Name',
              hue_order=trauma_labels,
              order=fracture_labels,
              ax=ax)

plt.setp(ax.get_xticklabels(), rotation=45, ha='right')
ax.set_title('Distribution of Trauma Types and Fracture Types')
ax.set_xlabel('Fracture Type')
ax.set_ylabel('Count')
# Legend placed outside the plotting area, to the right
ax.legend(title='Trauma Type',
          bbox_to_anchor=(1.05, 1),
          loc='upper left')
fig.tight_layout()
plt.show()

print("\nDistribution of Trauma Types:")
print(ihfd_red_df['Trauma_Type_Name'].value_counts())

## Define seasons

    Spring: March, April, May
    Summer: June, July, August
    Autumn: September, October, November
    Winter: December, January, February

Look at incidents seasonally (group the adverse weather data by season and type, e.g. winter: snow/ice, etc.).
Bring the date columns back in and use them to divide the data into seasons.


**Note:** Looking at the data, the ED admission date is not there, so for the analysis below (just as a first look) I used the surgery date and time (rows with a missing value were dropped). We might have to request the ED admission time, reduced to morning, afternoon and evening, like the NAS data.

In [None]:
##columns with a high share of missing values, dropped for the seasonal EDA.
##Adm_Primary_Surgery_DateTime is kept here because it is used to derive the season.
columns_to_drop = [
    'Adm_RE_OP_30_DAYS',
    'Adm_Pre_Frac_Indoor',
    'Adm_PRE_Frac_Outdoor',
    'Adm_Pre_Frac_Number',
    'Adm_Pre_Frac_Shop',
    'Adm_Discharged_To',
    'Adm_Nerve_Block',
    'Adm_Nut_Risk',
    'Adm_Ass_Anp',
    'Adm_AMB_Number_ACU_DIS',
    'Adm_Trauma_DateTime',
    'Adm_Hospital_Fall',
    'Adm_Mobilised_No_Opt',
    'Adm_FRACTURE_Type_Other2',
    'Index'
]

# Drop the columns (returns a new frame; ihfd_df is left untouched)
ihfd_date = ihfd_df.drop(columns=columns_to_drop)

# Parse Adm_Primary_Surgery_DateTime BEFORE selecting columns by dtype, so the date
# column can never be picked up as a categorical column and have its missing values
# replaced by the most frequent date
ihfd_date['Adm_Primary_Surgery_DateTime'] = pd.to_datetime(ihfd_date['Adm_Primary_Surgery_DateTime'], format='%d/%m/%Y %H:%M')

# Separate numeric and categorical columns
numeric_cols = ihfd_date.select_dtypes(include=['int64', 'float64']).columns
categorical_cols = ihfd_date.select_dtypes(include=['object', 'category']).columns

# Fill numeric columns with the column median
ihfd_date[numeric_cols] = ihfd_date[numeric_cols].fillna(ihfd_date[numeric_cols].median())

# Fill categorical columns with mode (most frequent value), read from ihfd_date
# itself so this cell does not depend on the separately built ihfd_red_df
for col in categorical_cols:
    modes = ihfd_date[col].mode()
    if modes.empty:
        # column has no non-missing values, so there is nothing to fill with
        continue
    ihfd_date[col] = ihfd_date[col].fillna(modes.iloc[0])

# Verify results (missing surgery dates are expected to remain)
print("\nRemaining NaN values per column:")
print(ihfd_date.isnull().sum())

In [None]:
##temporarily drop the rows whose surgery date/time is missing
ihfd_date_red = ihfd_date.dropna(subset=['Adm_Primary_Surgery_DateTime'])

In [None]:
##missing values per column after dropping the rows without a surgery date
print(ihfd_date_red.isnull().sum())

In [None]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

df = ihfd_date_red.copy()

# Month -> season, following the season definition stated in this notebook:
# Spring = Mar-May, Summer = Jun-Aug, Autumn = Sep-Nov, Winter = Dec-Feb.
# (Calendar quarters do not match this: Q1 contains March and Q4 contains October/November.)
month_to_season = {
    12: 'Winter', 1: 'Winter', 2: 'Winter',
    3: 'Spring', 4: 'Spring', 5: 'Spring',
    6: 'Summer', 7: 'Summer', 8: 'Summer',
    9: 'Autumn', 10: 'Autumn', 11: 'Autumn',
}

# create the new columns
df['season'] = df['Adm_Primary_Surgery_DateTime'].dt.month.map(month_to_season)
# Calendar year of surgery: the 'Winter' count of a year therefore combines its
# January, February and December
df['year'] = df['Adm_Primary_Surgery_DateTime'].dt.year

seasons = ['Winter', 'Spring', 'Summer', 'Autumn']

# Fixed age-range order and one colour per age range, shared by all four panels
age_ranges = sorted(df['NOCA_AgeRange'].unique())
colors = sns.color_palette("Set3", n_colors=len(age_ranges))

# One stacked bar chart per season
fig, axes = plt.subplots(2, 2, figsize=(15, 12))
axes = axes.ravel()

for idx, season in enumerate(seasons):
    season_data = df[df['season'] == season]

    # Create pivot table - counting occurrences by year
    pivot_data = pd.pivot_table(
        season_data,
        values='Adm_Primary_Surgery_DateTime',
        index='year',
        columns='NOCA_AgeRange',
        aggfunc='count',
        fill_value=0
    )
    # Same columns in every panel, so each age range keeps the same colour even
    # when it has no records in this season
    pivot_data = pivot_data.reindex(columns=age_ranges, fill_value=0)

    if pivot_data.empty:
        # no records for this season: leave an empty, titled panel instead of failing
        axes[idx].set_title(f'{season} Incidents by Age Group')
        continue

    pivot_data.plot(
        kind='bar',
        stacked=True,
        ax=axes[idx],
        color=colors,
        title=f'{season} Incidents by Age Group'
    )

    axes[idx].set_xlabel('Year')
    axes[idx].set_ylabel('Number of Incidents')
    axes[idx].legend(title='Age Range', bbox_to_anchor=(1.05, 1))
    axes[idx].tick_params(axis='x', rotation=45)

plt.tight_layout()
plt.show()

# summary statistics (rows/columns ordered Winter, Spring, Summer, Autumn)
print("\nDetailed Summary of Incidents by Season and Age Group:")
seasonal_summary = (
    df.groupby(['season', 'NOCA_AgeRange']).size()
    .unstack(fill_value=0)
    .reindex(index=seasons, columns=age_ranges, fill_value=0)
)
print("\nTotal incidents per season and age group:")
print(seasonal_summary)

print("\nYearly breakdown by season:")
yearly_seasonal = (
    df.groupby(['year', 'season']).size()
    .unstack(fill_value=0)
    .reindex(columns=seasons, fill_value=0)
)
print(yearly_seasonal)

# Calculate percentages for each age group within seasons (each row sums to 100)
print("\nPercentage distribution of age groups within each season:")
seasonal_percentages = seasonal_summary.div(seasonal_summary.sum(axis=1), axis=0) * 100
print(seasonal_percentages.round(1))

I was expecting to see more incidents in winter. The majority seem to be in the 80 to 89 age groups. Again, the surgery time is used for this, so it might not be accurate.