In [146]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats

In [3]:
df = pd.read_csv("Incidents_Responded_to_by_Fire_Companies.csv", parse_dates = ["INCIDENT_DATE_TIME", "ARRIVAL_DATE_TIME", "LAST_UNIT_CLEARED_DATE_TIME"], low_memory = False)

In [24]:
df.head()

Unnamed: 0,IM_INCIDENT_KEY,FIRE_BOX,INCIDENT_TYPE_DESC,INCIDENT_DATE_TIME,ARRIVAL_DATE_TIME,UNITS_ONSCENE,LAST_UNIT_CLEARED_DATE_TIME,HIGHEST_LEVEL_DESC,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,...,ZIP_CODE,BOROUGH_DESC,FLOOR,CO_DETECTOR_PRESENT_DESC,FIRE_ORIGIN_BELOW_GRADE_FLAG,STORY_FIRE_ORIGIN_COUNT,FIRE_SPREAD_DESC,DETECTOR_PRESENCE_DESC,AES_PRESENCE_DESC,STANDPIPE_SYS_PRESENT_FLAG
0,55672688,2147,"300 - Rescue, EMS incident, other",2013-01-01 00:00:20,2013-01-01 00:14:23,1.0,2013-01-01 00:20:06,"1 - More than initial alarm, less than Signal 7-5",1186.0,"00 - Action taken, other",...,10454,2 - Bronx,,,,,,,,
1,55672692,818,735A - Unwarranted alarm/defective condition o...,2013-01-01 00:00:37,2013-01-01 00:09:03,3.0,2013-01-01 00:30:06,"1 - More than initial alarm, less than Signal 7-5",1769.0,86 - Investigate,...,10036,1 - Manhattan,,,,,,,,
2,55672693,9656,"300 - Rescue, EMS incident, other",2013-01-01 00:01:17,2013-01-01 00:04:55,1.0,2013-01-01 00:15:18,"1 - More than initial alarm, less than Signal 7-5",841.0,"00 - Action taken, other",...,11418,5 - Queens,,,,,,,,
3,55672695,7412,412 - Gas leak (natural gas or LPG),2013-01-01 00:02:32,2013-01-01 00:07:48,4.0,2013-01-01 00:40:11,"1 - More than initial alarm, less than Signal 7-5",2259.0,44 - Hazardous materials leak control & contai...,...,11103,5 - Queens,1.0,,,,,,,
4,55672697,4019,735A - Unwarranted alarm/defective condition o...,2013-01-01 00:01:49,2013-01-01 00:06:27,6.0,2013-01-01 00:24:56,"1 - More than initial alarm, less than Signal 7-5",1387.0,86 - Investigate,...,11385,5 - Queens,,,,,,,,


##### 1. What proportion of FDNY responses in this dataset correspond to the most common type of incident?

In [25]:
# count the number of different types and convert to a dataframe
df["INCIDENT_TYPE_DESC"].value_counts().to_frame(name = "counts")

Unnamed: 0,counts
"300 - Rescue, EMS incident, other",906466
"651 - Smoke scare, odor of smoke",161886
353 - Removal of victim(s) from stalled elevator,131509
"710 - Malicious, mischievous false call, other",129466
522 - Water or steam leak,122716
...,...
632 - Prescribed fire,3
135 - Aircraft fire,3
136 - Self-propelled motor home or recreational vehicle,2
173 - Cultivated trees or nursery stock fire,2


In [26]:
# calculate the fraction of the most common type
fraction = df["INCIDENT_TYPE_DESC"].value_counts()[0]/len(df)

In [27]:
print("{:.10f} of calls are of the most common type of incident".format(fraction))

0.3598861026 of calls are of the most common type of incident


##### 2. What is the ratio of the average number of units that arrive to a scene of an incident classified as '111 - Building fire' to the number that arrive for '651 - Smoke scare, odor of smoke'?

In [28]:
# Calculate the average number of units that arrive to a scene of an incident classified as '111 - Building fire'
unit_111 = df[df["INCIDENT_TYPE_DESC"] == "111 - Building fire"]["UNITS_ONSCENE"].mean()

In [29]:
# Calculate the average number of units that arrive to a scene of an incident classified as '651 - Smoke scare, odor of smoke'
unit_651 = df[df["INCIDENT_TYPE_DESC"] == "651 - Smoke scare, odor of smoke"]["UNITS_ONSCENE"].mean()

In [30]:
# Calculate the ratio of the two avearge number of units
unit_ratio = unit_111/unit_651

In [31]:
print("the ratio of the two types of incidents is {:.10f}".format(unit_ratio))

the ratio of the two types of incidents is 2.8011572258


##### 3. How many times more likely is an incident in Staten Island a false call compared to in Manhattan? The answer should be the ratio of Staten Island false call rate to Manhattan false call rate. A false call is an incident for which 'INCIDENT_TYPE_DESC' is '710 - Malicious, mischievous false call, other'.


In [32]:
# Calculate the false call rate of Staten Island
rate_staten = (df[df["BOROUGH_DESC"] == "3 - Staten Island"]["INCIDENT_TYPE_DESC"] == "710 - Malicious, mischievous false call, other").mean()

In [33]:
# Calculate the false call rate of Manhattan
rate_manhattan = (df[df["BOROUGH_DESC"] == "1 - Manhattan"]["INCIDENT_TYPE_DESC"] == "710 - Malicious, mischievous false call, other").mean()

In [34]:
# Calculate the ratio of Staten Island false call rate to Manhattan false call rate
falsecall_ratio = rate_staten/rate_manhattan

In [35]:
print("{:.10f} times more likely is an incident in Staten Island a false call compared to in Manhattan".format(falsecall_ratio))

1.6030401197 times more likely is an incident in Staten Island a false call compared to in Manhattan


##### 4. Check the distribution of the number of minutes it takes between the time a '111 - Building fire' incident has been logged into the Computer Aided Dispatch system and the time at which the first unit arrives on scene. What is the third quartile of that distribution. Note: the number of minutes can be fractional (ie, do not round).

In [36]:
# Calculate the response time between incident logged into computer and time of arrival
response_time = (df["ARRIVAL_DATE_TIME"] - df["INCIDENT_DATE_TIME"]).dt.seconds/60

In [37]:
# Calculate the third quartile of the distribution
time_quartile = response_time.quantile(q = 0.75)

In [38]:
print("The third quartile of the distribution is {:.10f}".format(time_quartile))

The third quartile of the distribution is 5.7833333333


##### 5. We can use the FDNY dataset to investigate at what time of the day people cook most. Compute what proportion of all incidents are cooking fires for every hour of the day by normalizing the number of cooking fires in a given hour by the total number of incidents that occured in that hour. Find the hour of the day that has the highest proportion of cooking fires and submit that proportion of cooking fires. A cooking fire is an incident for which 'INCIDENT_TYPE_DESC' is '113 - Cooking fire, confined to container'. Note: round incident times down. For example, if an incident occured at 22:55 it occured in hour 22.

In [39]:
# Calculate the proportion of cook fire to all incidents in each hour 
proportion = df.groupby(df["INCIDENT_DATE_TIME"].dt.hour)["INCIDENT_TYPE_DESC"].agg(lambda x: np.mean(x == "113 - Cooking fire, confined to container"))

In [193]:
# Find the hour of day with the highest proportion
cook_proportion = proportion.sort_values(ascending = False).iloc[0]

In [42]:
print("The hour 18 has the highest proportion of cooking fires with the proportion of {:.10f}".format(cook_proportion))

The hour 18 has the highest proportion of cooking fires with the proportion of 0.0510933146


#####  6. What is the coefficient of determination (R squared) between the number of residents at each ZIP code and the number of inicidents whose type is classified as '111 - Building fire' at each of those zip codes. Note: the population for each ZIP code in New York state can be found here. Ignore ZIP codes that do not appear on the website.

In [87]:
# read the file for population for each ZIP in New York
df_zip = pd.read_csv("NYC_zip.csv")

In [88]:
# change zip column to float data type
df_zip["Zip Code"] = df_zip["Zip Code"].astype(float)

In [52]:
# change the zip column to the same data type
df["ZIP_CODE_new"] = df["ZIP_CODE"].str.replace('-\d+', '')
df["ZIP_CODE_new"] = df["ZIP_CODE_new"].astype(float)

In [78]:
# subset the original incident datasets to only keep type and zipcode
df_reg = df[["INCIDENT_TYPE_DESC", "ZIP_CODE_new"]]

In [104]:
# get the data only for building fire
df_bf = df_reg[df_reg["INCIDENT_TYPE_DESC"] == "111 - Building fire"]

In [106]:
# use groupby to find the count of incidents for each zip code
df_group = df_bf.groupby("ZIP_CODE_new")["INCIDENT_TYPE_DESC"].count().to_frame().reset_index()

In [91]:
# concatnate the two dataframes based on zip code
df_regression = df_group.merge(df_zip, left_on = "ZIP_CODE_new", right_on = "Zip Code", how = "left")

In [111]:
# change the column name to count
df_regression.rename(columns = {"INCIDENT_TYPE_DESC": "Count"}, inplace = True)

In [114]:
# remove nan
df_regression.dropna(inplace = True)

In [115]:
# calculate the coefficient of OLS using np.polyfit
coefficient, intercept = np.polyfit(df_regression["Population"], df_regression["Count"], deg = 1)

In [116]:
print("the coefficient between he number of residents at each ZIP code and the number of inicident is {:.10f}".format(coefficient))

the coefficient between he number of residents at each ZIP code and the number of inicident is 0.0015251086


##### 7. Calculate the chi-square test statistic for testing whether an incident is more likely to last longer than 60 minutes when CO detector is not present. Again only consider incidents that have information about whether a CO detector was present or not.

In [134]:
# subset the data for only the incidents with CO detector information
df_CO = df[df["CO_DETECTOR_PRESENT_DESC"].notnull()]

In [137]:
# calculate the time duration for each instance
df_CO["duration"] = (df_CO["LAST_UNIT_CLEARED_DATE_TIME"] - df_CO["ARRIVAL_DATE_TIME"]).dt.seconds

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


In [140]:
# only keep the duration and CO detector information
df_CO = df_CO[["duration" , "CO_DETECTOR_PRESENT_DESC"]]

In [176]:
# calculate the frequency of each case
# with CO detector and less than 60 minute
chi_11 = df_CO[(df_CO["duration"] <= 3600) & (df_CO["CO_DETECTOR_PRESENT_DESC"] == "Yes")].shape[0]

In [177]:
# with CO detector and longer than 60 minute
chi_12 = df_CO[(df_CO["duration"] > 3600) & (df_CO["CO_DETECTOR_PRESENT_DESC"] == "Yes")].shape[0]

In [178]:
# without CO detector and less than 60 minute
chi_21 = df_CO[(df_CO["duration"] <= 3600) & (df_CO["CO_DETECTOR_PRESENT_DESC"] == "No")].shape[0]

In [179]:
# without CO detector and longer than 60 minute
chi_22 = df_CO[(df_CO["duration"] > 3600) & (df_CO["CO_DETECTOR_PRESENT_DESC"] == "No")].shape[0]

In [186]:
# run the chi-square test
chi2, p, dof, ex = stats.chi2_contingency(np.array([[25498, 738],[5287,829]]), correction=False)

In [187]:
print("the test statistic of chi-squre for CO detector and response duration is {:.10f}".format(chi2))

the test statistic of chi-squre for CO detector and response duration is 1241.6599284647
