In [30]:
import pandas as pd

# Categorical features of interest. Kept for reference; the whitelist filter
# `df = df[columns_to_keep]` is currently not applied.
columns_to_keep = [
    'Area_of_Origin',
    'Building_Status',
    'Business_Impact',
    'Extent_Of_Fire',
    'Final_Incident_Type',
    'Fire_Alarm_System_Impact_on_Evacuation',
    'Fire_Alarm_System_Operation',
    'Fire_Alarm_System_Presence',
    'Ignition_Source',
    'Initial_CAD_Event_Type',
    'Material_First_Ignited',
    'Method_Of_Fire_Control',
    'Possible_Cause',
    'Property_Use',
    'Smoke_Alarm_at_Fire_Origin',
    'Smoke_Alarm_at_Fire_Origin_Alarm_Failure',
    'Smoke_Alarm_at_Fire_Origin_Alarm_Type',
    'Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation',
    'Smoke_Spread',
    'Sprinkler_System_Operation',
    'Sprinkler_System_Presence',
]

# Columns excluded from the analysis (timestamps, identifiers, location fields).
columns_to_remove = [
    'Ext_agent_app_or_defer_time',
    'Fire_Under_Control_Time',
    'Incident_Number',
    'Incident_Station_Area',
    'Intersection',
    'Last_TFS_Unit_Clear_Time',
    'Latitude',
    'Longitude',
    'TFS_Alarm_Time',
    'TFS_Arrival_Time',
]

# Load the encoded dataset and discard the excluded columns in one step.
df = (
    pd.read_csv('../../data/processed/numerical_encoded_Fire_Incidents_Data.csv')
    .drop(columns=columns_to_remove)
)

df

Unnamed: 0,Area_of_Origin,Building_Status,Business_Impact,Civilian_Casualties,Count_of_Persons_Rescued,Estimated_Dollar_Loss,Estimated_Number_Of_Persons_Displaced,Exposures,Extent_Of_Fire,Final_Incident_Type,...,Property_Use,Smoke_Alarm_at_Fire_Origin,Smoke_Alarm_at_Fire_Origin_Alarm_Failure,Smoke_Alarm_at_Fire_Origin_Alarm_Type,Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation,Smoke_Spread,Sprinkler_System_Operation,Sprinkler_System_Presence,Status_of_Fire_On_Arrival,TFS_Firefighter_Casualties
0,56,0,0,0.0,0,0,0,0.0,0,0,...,138,8,9,5,5,9,5,3,2,0
1,6,0,0,0.0,0,2000,0,0.4,3,0,...,71,0,9,4,4,1,5,2,1,0
2,36,0,1,0.0,0,100000,0,1.0,3,0,...,80,8,9,4,1,6,5,2,1,0
3,12,0,0,0.0,0,5000,0,0.2,0,0,...,42,2,9,1,0,3,2,0,3,0
4,8,0,5,0.0,0,500,0,0.0,0,0,...,74,2,9,0,0,1,5,2,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15623,45,0,5,1.0,0,200,0,0.0,10,0,...,81,3,9,3,2,8,5,2,2,0
15624,21,0,0,1.0,0,0,0,0.0,0,0,...,33,9,9,5,5,1,2,3,0,0
15625,23,0,0,1.0,0,0,0,0.0,0,0,...,81,3,9,3,2,2,2,0,1,0
15626,25,0,5,1.0,0,2000,0,0.0,3,0,...,81,3,9,3,5,3,1,0,1,0


### Apply Spearman's rank correlation between categorical values

In [None]:
# NOTE: left disabled. Spearman's rank correlation assumes ordinal data, and the
# discussion that follows this cell concludes these encoded features should not
# be treated as ordinal.
# Compute Spearman's rank correlation
#spearman_corr = df.corr(method='spearman')

# Display correlation matrix
#print(spearman_corr)

### Discussion

Interesting, but Spearman's rank correlation is meant for ordinal values, and these encoded categories should not be treated as ordinal.

Instead, another measure, Cramér's V, will be applied.

Cramér's V is a measure of association between two nominal variables. 

It is based on the chi-square statistic and ranges from 0 to 1, where 0 indicates no association and 1 indicates complete association.

(https://www.statology.org/interpret-cramers-v/)

In [34]:
import numpy as np
from scipy.stats import chi2_contingency
from scipy.stats import chi2

def cramers_v_p_value(x, y):
    """Return the bias-corrected Cramér's V for two variables and the chi-square p-value.

    The two inputs are cross-tabulated with ``pd.crosstab`` and passed to
    ``scipy.stats.chi2_contingency``. Cramér's V is then derived from the
    chi-square statistic using the small-sample correction already used by this
    notebook (``phi2corr`` / ``rcorr`` / ``kcorr``).

    Parameters
    ----------
    x, y : array-like (e.g. pandas Series)
        Paired observations of the two variables.

    Returns
    -------
    tuple of (float, float)
        ``(cramers_v, p_value)``. ``cramers_v`` is 0.0 when the statistic is
        undefined: fewer than two observations, a variable with a single
        category, or a table with as many categories as observations (the
        corrected denominator is then zero).
    """
    contingency = pd.crosstab(x, y)

    # chi2_contingency already returns the upper-tail p-value. Recomputing it as
    # `1 - chi2.cdf(...)` cancels to exactly 0.0 for large statistics and yields
    # NaN when the table has zero degrees of freedom.
    chi2_stat, p_value, _, _ = chi2_contingency(contingency)
    p_value = float(p_value)

    n = contingency.sum().sum()
    r, k = contingency.shape

    # A constant variable (or a single observation) carries no association
    # information; the formulas below would divide by zero.
    if n < 2 or min(r, k) < 2:
        return 0.0, p_value

    phi2 = chi2_stat / n
    phi2corr = max(0.0, phi2 - ((k - 1) * (r - 1)) / (n - 1))
    rcorr = r - ((r - 1) ** 2) / (n - 1)
    kcorr = k - ((k - 1) ** 2) / (n - 1)

    # The corrected denominator reaches zero when a variable has as many
    # categories as there are observations.
    denominator = min(kcorr - 1, rcorr - 1)
    if denominator <= 0:
        return 0.0, p_value

    return float(np.sqrt(phi2corr / denominator)), p_value

'''
List of features we can examine.

['Unnamed: 0', '_id', 'Area_of_Origin', 'Building_Status',
       'Business_Impact', 'Civilian_Casualties', 'Count_of_Persons_Rescued',
       'Estimated_Dollar_Loss', 'Estimated_Number_Of_Persons_Displaced',
       'Exposures', 'Ext_agent_app_or_defer_time', 'Extent_Of_Fire',
       'Final_Incident_Type', 'Fire_Alarm_System_Impact_on_Evacuation',
       'Fire_Alarm_System_Operation', 'Fire_Alarm_System_Presence',
       'Fire_Under_Control_Time', 'Ignition_Source', 'Incident_Number',
       'Incident_Station_Area', 'Incident_Ward', 'Initial_CAD_Event_Type',
       'Intersection', 'Last_TFS_Unit_Clear_Time', 'Latitude',
       'Level_Of_Origin', 'Longitude', 'Material_First_Ignited',
       'Method_Of_Fire_Control', 'Number_of_responding_apparatus',
       'Number_of_responding_personnel', 'Possible_Cause', 'Property_Use',
       'Smoke_Alarm_at_Fire_Origin',
       'Smoke_Alarm_at_Fire_Origin_Alarm_Failure',
       'Smoke_Alarm_at_Fire_Origin_Alarm_Type',
       'Smoke_Alarm_Impact_on_Persons_Evacuating_Impact_on_Evacuation',
       'Smoke_Spread', 'Sprinkler_System_Operation',
       'Sprinkler_System_Presence', 'Status_of_Fire_On_Arrival',
       'TFS_Alarm_Time', 'TFS_Arrival_Time', 'TFS_Firefighter_Casualties'],
'''


# Select the two columns whose association is being measured.
column1, column2 = (
    df[feature_name]
    for feature_name in ('Count_of_Persons_Rescued', 'Number_of_responding_apparatus')
)

# Cramér's V and the accompanying p-value for the selected pair.
cramers_v_score, p_value = cramers_v_p_value(column1, column2)

print("Cramér's V Score:", cramers_v_score)
print("P-value:", p_value)

Cramér's V Score: 0.21001908318167137
P-value: 0.0


### Results
It seems like there is a small correlation between different categorical features.

### Comparing correlation between categorical and numerical features
We will use the point-biserial correlation coefficient

In [48]:
from scipy.stats import pointbiserialr

# Columns passed to the point-biserial test, in (x, y) order.
# NOTE(review): scipy documents the first argument as a binary variable;
# confirm 'Estimated_Dollar_Loss' is appropriate in that role.
categorical_feature, numerical_feature = (
    'Estimated_Dollar_Loss',
    'Number_of_responding_personnel',
)

# pointbiserialr yields the correlation coefficient followed by its p-value.
correlation_coefficient, p_value = pointbiserialr(
    df[categorical_feature],
    df[numerical_feature],
)

print("Point-biserial correlation coefficient:", correlation_coefficient)
print("P-value:", p_value)

Point-biserial correlation coefficient: 0.04160467098029353
P-value: 1.9618162178670904e-07


#### Discussion
It seems like the correlation coefficient is about 0.04, which points towards a very weak correlation.

We cannot evaluate and test a hypothesis using this method, since the number of responding personnel is not nominal, nor is it dichotomous.

### Kruskal-Wallis Test:
The Kruskal-Wallis test is a non-parametric test used to compare the medians of a continuous variable across different groups defined by a nominal variable. It is suitable when the assumptions of normality and homogeneity of variances required by ANOVA are not met, for example when the data are skewed or contain outliers.

(https://www.statology.org/kruskal-wallis-test/)

In [53]:
from scipy.stats import kruskal

# Kruskal-Wallis H-test: does the distribution of `value_feature` differ between
# the groups formed by each distinct value of `grouping_feature`?
# The previous variables (`nominal_feature`, `continous_feature`) were never
# used and were labelled the opposite way round from how the columns are used
# in the test, so the roles are named explicitly here.
grouping_feature = 'Number_of_responding_personnel'
value_feature = 'Estimated_Dollar_Loss'

# One sample of `value_feature` per distinct value of `grouping_feature`.
samples = [values for _, values in df.groupby(grouping_feature)[value_feature]]

# Perform Kruskal-Wallis test
h_statistic, p_value = kruskal(*samples)

print("H-statistic:", h_statistic)
print("P-value:", p_value)


H-statistic: 5894.492720028623
P-value: 0.0
