In [1]:
#Import Pandas
import pandas as pd

In [2]:
#Read CSV
df = pd.read_csv('data.csv', parse_dates=True)

In [3]:
#Filter Columns
df = df[['IM_INCIDENT_KEY', 'INCIDENT_DATE_TIME', 'ARRIVAL_DATE_TIME', 'TOTAL_INCIDENT_DURATION', 'ACTION_TAKEN1_DESC', 'ACTION_TAKEN2_DESC', 'ZIP_CODE', 'BOROUGH_DESC']]

In [4]:
#Get All columns name
#df.columns

In [5]:
#Check memory Usage (6.1 MB)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119324 entries, 0 to 119323
Data columns (total 8 columns):
IM_INCIDENT_KEY            119324 non-null int64
INCIDENT_DATE_TIME         119324 non-null object
ARRIVAL_DATE_TIME          118636 non-null object
TOTAL_INCIDENT_DURATION    119324 non-null int64
ACTION_TAKEN1_DESC         119324 non-null object
ACTION_TAKEN2_DESC         115132 non-null object
ZIP_CODE                   119322 non-null float64
BOROUGH_DESC               119324 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 7.3+ MB


In [6]:
#Reduce memory Usage (4.9 MB)
df['ACTION_TAKEN1_DESC'] = df['ACTION_TAKEN1_DESC'].astype('category')
df['ACTION_TAKEN2_DESC'] = df['ACTION_TAKEN1_DESC'].astype('category')
df['BOROUGH_DESC'] = df['BOROUGH_DESC'].astype('category')


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119324 entries, 0 to 119323
Data columns (total 8 columns):
IM_INCIDENT_KEY            119324 non-null int64
INCIDENT_DATE_TIME         119324 non-null object
ARRIVAL_DATE_TIME          118636 non-null object
TOTAL_INCIDENT_DURATION    119324 non-null int64
ACTION_TAKEN1_DESC         119324 non-null category
ACTION_TAKEN2_DESC         119324 non-null category
ZIP_CODE                   119322 non-null float64
BOROUGH_DESC               119324 non-null category
dtypes: category(3), float64(1), int64(2), object(2)
memory usage: 4.9+ MB


In [7]:
# Drop NA Valus
df = df.dropna()
df

Unnamed: 0,IM_INCIDENT_KEY,INCIDENT_DATE_TIME,ARRIVAL_DATE_TIME,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC
1,63476693,06/30/2018 11:35:27 PM,06/30/2018 11:40:19 PM,918,216 - Checked exposures,216 - Checked exposures,10462.0,2 - Bronx
2,63476614,06/30/2018 11:22:51 PM,06/30/2018 11:27:18 PM,995,86 - Investigate,86 - Investigate,10472.0,2 - Bronx
3,63476562,06/30/2018 11:05:55 PM,06/30/2018 11:11:39 PM,671,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10452.0,2 - Bronx
4,63476510,06/30/2018 10:53:15 PM,06/30/2018 10:58:49 PM,1384,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10463.0,2 - Bronx
5,63476319,06/30/2018 10:14:27 PM,06/30/2018 10:18:58 PM,878,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11207.0,4 - Brooklyn
6,63476256,06/30/2018 10:06:52 PM,06/30/2018 10:12:14 PM,2074,45 - Remove hazard,45 - Remove hazard,10459.0,2 - Bronx
7,63476121,06/30/2018 09:45:33 PM,06/30/2018 09:50:59 PM,1196,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10472.0,2 - Bronx
8,63476049,06/30/2018 09:27:53 PM,06/30/2018 09:32:23 PM,1288,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10452.0,2 - Bronx
9,63475827,06/30/2018 08:47:44 PM,06/30/2018 08:52:23 PM,759,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11207.0,4 - Brooklyn
10,63475721,06/30/2018 08:24:39 PM,06/30/2018 08:28:37 PM,645,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11213.0,4 - Brooklyn


In [8]:
# Get Columns Types
df.dtypes

IM_INCIDENT_KEY               int64
INCIDENT_DATE_TIME           object
ARRIVAL_DATE_TIME            object
TOTAL_INCIDENT_DURATION       int64
ACTION_TAKEN1_DESC         category
ACTION_TAKEN2_DESC         category
ZIP_CODE                    float64
BOROUGH_DESC               category
dtype: object

In [9]:
# Split Date/Time and export to a separate dataframe
INCIDENT_DATE_TIME = df["INCIDENT_DATE_TIME"].str.split(" ", n = 1, expand = True) 
ARRIVAL_DATE_TIME = df["ARRIVAL_DATE_TIME"].str.split(" ", n = 1, expand = True) 

In [10]:
#Delete column in the original DF with the date and time combined (we will merge the new df)
del df['INCIDENT_DATE_TIME']
del df['ARRIVAL_DATE_TIME']

In [11]:
#Reset Indexes (they now start at two)

#First DF
df.reset_index(inplace=True, drop=True)

#Second DF
INCIDENT_DATE_TIME.reset_index(inplace=True, drop=True)

#Third DF
ARRIVAL_DATE_TIME.reset_index(inplace=True, drop=True)

In [12]:
#Rename new DF columns names

#Incident Date/time
INCIDENT_DATE_TIME.rename(columns={0: 'Incident Date', 1: 'Incident Time'}, inplace=True)

#Arrival Date/Time
ARRIVAL_DATE_TIME.rename(columns={0: 'Arrival Date', 1: 'Arrival Time'}, inplace=True)

In [13]:
# Join the two DFs
df = df.join(INCIDENT_DATE_TIME, how='outer')
df = df.join(ARRIVAL_DATE_TIME, how='outer')

In [14]:
df

Unnamed: 0,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident Date,Incident Time,Arrival Date,Arrival Time
0,63476693,918,216 - Checked exposures,216 - Checked exposures,10462.0,2 - Bronx,06/30/2018,11:35:27 PM,06/30/2018,11:40:19 PM
1,63476614,995,86 - Investigate,86 - Investigate,10472.0,2 - Bronx,06/30/2018,11:22:51 PM,06/30/2018,11:27:18 PM
2,63476562,671,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10452.0,2 - Bronx,06/30/2018,11:05:55 PM,06/30/2018,11:11:39 PM
3,63476510,1384,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10463.0,2 - Bronx,06/30/2018,10:53:15 PM,06/30/2018,10:58:49 PM
4,63476319,878,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11207.0,4 - Brooklyn,06/30/2018,10:14:27 PM,06/30/2018,10:18:58 PM
5,63476256,2074,45 - Remove hazard,45 - Remove hazard,10459.0,2 - Bronx,06/30/2018,10:06:52 PM,06/30/2018,10:12:14 PM
6,63476121,1196,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10472.0,2 - Bronx,06/30/2018,09:45:33 PM,06/30/2018,09:50:59 PM
7,63476049,1288,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10452.0,2 - Bronx,06/30/2018,09:27:53 PM,06/30/2018,09:32:23 PM
8,63475827,759,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11207.0,4 - Brooklyn,06/30/2018,08:47:44 PM,06/30/2018,08:52:23 PM
9,63475721,645,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11213.0,4 - Brooklyn,06/30/2018,08:24:39 PM,06/30/2018,08:28:37 PM


In [15]:
#Import weather data frame and parse dates
df_weather = pd.read_csv("nycweatherdata.csv", parse_dates=True)
df_weather.head()


Unnamed: 0,STATION,DATE,TMAX,TMIN
0,USW00094728,1/1/13,40,26
1,USW00094728,1/2/13,33,22
2,USW00094728,1/3/13,32,24
3,USW00094728,1/4/13,37,30
4,USW00094728,1/5/13,42,32


In [16]:
# Rename column name/ we need to match the two columns name in the weather and our original df to join on
df_weather.rename(columns={'DATE': 'Incident Date'}, inplace=True)
df_weather.head()

Unnamed: 0,STATION,Incident Date,TMAX,TMIN
0,USW00094728,1/1/13,40,26
1,USW00094728,1/2/13,33,22
2,USW00094728,1/3/13,32,24
3,USW00094728,1/4/13,37,30
4,USW00094728,1/5/13,42,32


In [17]:
# Convert the dates columns to date_time object to join on
df['Incident Date'] = pd.to_datetime(df['Incident Date'])
df_weather['Incident Date'] = pd.to_datetime(df_weather['Incident Date'])

In [18]:
# Join weather and our DF
final_df = df_weather.merge(df, on='Incident Date')

In [19]:
# Delete Station column (we don't need it)
del final_df['STATION']

In [26]:
# verify join worked fine
final_df.head()

Unnamed: 0,Incident Date,TMAX,TMIN,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident Time,Arrival Date,Arrival Time
0,2013-01-01,40,26,55675645,1108,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10455,2 - Bronx,11:30:10 PM,01/01/2013,11:34:39 PM
1,2013-01-01,40,26,55675621,461,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10027,1 - Manhattan,11:14:24 PM,01/01/2013,11:19:08 PM
2,2013-01-01,40,26,55675611,829,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11207,4 - Brooklyn,11:08:08 PM,01/01/2013,11:10:30 PM
3,2013-01-01,40,26,55675547,1025,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11373,5 - Queens,10:26:05 PM,01/01/2013,10:29:29 PM
4,2013-01-01,40,26,55675480,1054,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11360,5 - Queens,09:33:56 PM,01/01/2013,09:39:20 PM


In [24]:
# Get rid of the .0 that is in the zip code column
final_df['ZIP_CODE'] = final_df['ZIP_CODE'].astype(int)

In [28]:
# Check zipcode formatting
final_df.head()

Unnamed: 0,Incident Date,TMAX,TMIN,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident Time,Arrival Date,Arrival Time
0,2013-01-01,40,26,55675645,1108,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10455,2 - Bronx,11:30:10 PM,01/01/2013,11:34:39 PM
1,2013-01-01,40,26,55675621,461,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,10027,1 - Manhattan,11:14:24 PM,01/01/2013,11:19:08 PM
2,2013-01-01,40,26,55675611,829,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11207,4 - Brooklyn,11:08:08 PM,01/01/2013,11:10:30 PM
3,2013-01-01,40,26,55675547,1025,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11373,5 - Queens,10:26:05 PM,01/01/2013,10:29:29 PM
4,2013-01-01,40,26,55675480,1054,44 - Hazardous materials leak control & contai...,44 - Hazardous materials leak control & contai...,11360,5 - Queens,09:33:56 PM,01/01/2013,09:39:20 PM
