In [1]:
#Import Pandas
import pandas as pd
from sqlalchemy import create_engine

In [2]:
#Read CSV
df = pd.read_csv('data.csv', parse_dates=True)

In [3]:
#Filter Columns
df = df[['IM_INCIDENT_KEY', 'INCIDENT_DATE_TIME', 'ARRIVAL_DATE_TIME', 'TOTAL_INCIDENT_DURATION', 'ACTION_TAKEN1_DESC', 'ACTION_TAKEN2_DESC', 'ZIP_CODE', 'BOROUGH_DESC']]

In [4]:
#Get All columns name
#df.columns

In [5]:
#Check memory Usage (6.1 MB)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119324 entries, 0 to 119323
Data columns (total 8 columns):
IM_INCIDENT_KEY            119324 non-null int64
INCIDENT_DATE_TIME         119324 non-null object
ARRIVAL_DATE_TIME          118636 non-null object
TOTAL_INCIDENT_DURATION    119324 non-null int64
ACTION_TAKEN1_DESC         119324 non-null object
ACTION_TAKEN2_DESC         115132 non-null object
ZIP_CODE                   119322 non-null float64
BOROUGH_DESC               119324 non-null object
dtypes: float64(1), int64(2), object(5)
memory usage: 7.3+ MB


In [6]:
#Reduce memory Usage (4.9 MB)
df['ACTION_TAKEN1_DESC'] = df['ACTION_TAKEN1_DESC'].astype('category')
df['ACTION_TAKEN2_DESC'] = df['ACTION_TAKEN2_DESC'].astype('category')
df['BOROUGH_DESC'] = df['BOROUGH_DESC'].astype('category')


df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119324 entries, 0 to 119323
Data columns (total 8 columns):
IM_INCIDENT_KEY            119324 non-null int64
INCIDENT_DATE_TIME         119324 non-null object
ARRIVAL_DATE_TIME          118636 non-null object
TOTAL_INCIDENT_DURATION    119324 non-null int64
ACTION_TAKEN1_DESC         119324 non-null category
ACTION_TAKEN2_DESC         115132 non-null category
ZIP_CODE                   119322 non-null float64
BOROUGH_DESC               119324 non-null category
dtypes: category(3), float64(1), int64(2), object(2)
memory usage: 4.9+ MB


In [7]:
# Drop NA Valus
df = df.dropna()
df.head(3)

Unnamed: 0,IM_INCIDENT_KEY,INCIDENT_DATE_TIME,ARRIVAL_DATE_TIME,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC
2,63476614,06/30/2018 11:22:51 PM,06/30/2018 11:27:18 PM,995,86 - Investigate,45 - Remove hazard,10472.0,2 - Bronx
3,63476562,06/30/2018 11:05:55 PM,06/30/2018 11:11:39 PM,671,44 - Hazardous materials leak control & contai...,64 - Shut down system,10452.0,2 - Bronx
4,63476510,06/30/2018 10:53:15 PM,06/30/2018 10:58:49 PM,1384,44 - Hazardous materials leak control & contai...,64 - Shut down system,10463.0,2 - Bronx


In [8]:
# Get Columns Types
df.dtypes

IM_INCIDENT_KEY               int64
INCIDENT_DATE_TIME           object
ARRIVAL_DATE_TIME            object
TOTAL_INCIDENT_DURATION       int64
ACTION_TAKEN1_DESC         category
ACTION_TAKEN2_DESC         category
ZIP_CODE                    float64
BOROUGH_DESC               category
dtype: object

In [9]:
# Split Date/Time and export to a separate dataframe
INCIDENT_DATE_TIME = df["INCIDENT_DATE_TIME"].str.split(" ", n = 1, expand = True) 
ARRIVAL_DATE_TIME = df["ARRIVAL_DATE_TIME"].str.split(" ", n = 1, expand = True) 

In [10]:
#Delete column in the original DF with the date and time combined (we will merge the new df)
del df['INCIDENT_DATE_TIME']
del df['ARRIVAL_DATE_TIME']

In [11]:
#Reset Indexes (they now start at two)

#First DF
df.reset_index(inplace=True, drop=True)

#Second DF
INCIDENT_DATE_TIME.reset_index(inplace=True, drop=True)

#Third DF
ARRIVAL_DATE_TIME.reset_index(inplace=True, drop=True)

In [12]:
#Rename new DF columns names

#Incident Date/time
INCIDENT_DATE_TIME.rename(columns={0: 'Incident Date', 1: 'Incident Time'}, inplace=True)

#Arrival Date/Time
ARRIVAL_DATE_TIME.rename(columns={0: 'Arrival Date', 1: 'Arrival Time'}, inplace=True)

In [13]:
# Join the two DFs
df = df.join(INCIDENT_DATE_TIME, how='outer')
df = df.join(ARRIVAL_DATE_TIME, how='outer')

In [14]:
df.head(3)

Unnamed: 0,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident Date,Incident Time,Arrival Date,Arrival Time
0,63476614,995,86 - Investigate,45 - Remove hazard,10472.0,2 - Bronx,06/30/2018,11:22:51 PM,06/30/2018,11:27:18 PM
1,63476562,671,44 - Hazardous materials leak control & contai...,64 - Shut down system,10452.0,2 - Bronx,06/30/2018,11:05:55 PM,06/30/2018,11:11:39 PM
2,63476510,1384,44 - Hazardous materials leak control & contai...,64 - Shut down system,10463.0,2 - Bronx,06/30/2018,10:53:15 PM,06/30/2018,10:58:49 PM


In [15]:
#Import weather data frame and parse dates
df_weather = pd.read_csv("nycweatherdata.csv", parse_dates=True)
df_weather.head(3)


Unnamed: 0,STATION,DATE,TMAX,TMIN
0,USW00094728,1/1/13,40,26
1,USW00094728,1/2/13,33,22
2,USW00094728,1/3/13,32,24


In [16]:
# Rename column name/ we need to match the two columns name in the weather and our original df to join on
df_weather.rename(columns={'DATE': 'Incident Date'}, inplace=True)
df_weather.head(3)

Unnamed: 0,STATION,Incident Date,TMAX,TMIN
0,USW00094728,1/1/13,40,26
1,USW00094728,1/2/13,33,22
2,USW00094728,1/3/13,32,24


In [17]:
# Convert the dates columns to date_time object to join on
df['Incident Date'] = pd.to_datetime(df['Incident Date'])
df_weather['Incident Date'] = pd.to_datetime(df_weather['Incident Date'])

In [18]:
# Join weather and our DF
final_df = df_weather.merge(df, on='Incident Date')

In [19]:
# Delete Station column (we don't need it)
del final_df['STATION']

In [20]:
# verify join worked fine
final_df.head(3)

Unnamed: 0,Incident Date,TMAX,TMIN,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident Time,Arrival Date,Arrival Time
0,2013-01-01,40,26,55675645,1108,44 - Hazardous materials leak control & contai...,64 - Shut down system,10455.0,2 - Bronx,11:30:10 PM,01/01/2013,11:34:39 PM
1,2013-01-01,40,26,55675621,461,44 - Hazardous materials leak control & contai...,64 - Shut down system,10027.0,1 - Manhattan,11:14:24 PM,01/01/2013,11:19:08 PM
2,2013-01-01,40,26,55675611,829,44 - Hazardous materials leak control & contai...,64 - Shut down system,11207.0,4 - Brooklyn,11:08:08 PM,01/01/2013,11:10:30 PM


In [21]:
# Get rid of the .0 that is in the zip code column
final_df['ZIP_CODE'] = final_df['ZIP_CODE'].astype(int)

In [22]:
# Check zipcode formatting
final_df.head(3)

Unnamed: 0,Incident Date,TMAX,TMIN,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident Time,Arrival Date,Arrival Time
0,2013-01-01,40,26,55675645,1108,44 - Hazardous materials leak control & contai...,64 - Shut down system,10455,2 - Bronx,11:30:10 PM,01/01/2013,11:34:39 PM
1,2013-01-01,40,26,55675621,461,44 - Hazardous materials leak control & contai...,64 - Shut down system,10027,1 - Manhattan,11:14:24 PM,01/01/2013,11:19:08 PM
2,2013-01-01,40,26,55675611,829,44 - Hazardous materials leak control & contai...,64 - Shut down system,11207,4 - Brooklyn,11:08:08 PM,01/01/2013,11:10:30 PM


In [23]:
#Remove numbers that are nex to the Bourough (Maps need only the name with no nums)
final_df['BOROUGH_DESC'] = final_df['BOROUGH_DESC'].map(lambda x: str(x)[4:])

In [24]:
final_df.head(3)

Unnamed: 0,Incident Date,TMAX,TMIN,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident Time,Arrival Date,Arrival Time
0,2013-01-01,40,26,55675645,1108,44 - Hazardous materials leak control & contai...,64 - Shut down system,10455,Bronx,11:30:10 PM,01/01/2013,11:34:39 PM
1,2013-01-01,40,26,55675621,461,44 - Hazardous materials leak control & contai...,64 - Shut down system,10027,Manhattan,11:14:24 PM,01/01/2013,11:19:08 PM
2,2013-01-01,40,26,55675611,829,44 - Hazardous materials leak control & contai...,64 - Shut down system,11207,Brooklyn,11:08:08 PM,01/01/2013,11:10:30 PM


In [25]:
final_df = final_df.rename(columns = {"Incident Date": "Date",
                                 "Arrival Date":"Arrival_Date",
                                 "Arrival Time": "Arrival_Time",
                                    "Incident Time":"Incident_Time"})

In [26]:
final_df.head()

Unnamed: 0,Date,TMAX,TMIN,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident_Time,Arrival_Date,Arrival_Time
0,2013-01-01,40,26,55675645,1108,44 - Hazardous materials leak control & contai...,64 - Shut down system,10455,Bronx,11:30:10 PM,01/01/2013,11:34:39 PM
1,2013-01-01,40,26,55675621,461,44 - Hazardous materials leak control & contai...,64 - Shut down system,10027,Manhattan,11:14:24 PM,01/01/2013,11:19:08 PM
2,2013-01-01,40,26,55675611,829,44 - Hazardous materials leak control & contai...,64 - Shut down system,11207,Brooklyn,11:08:08 PM,01/01/2013,11:10:30 PM
3,2013-01-01,40,26,55675547,1025,44 - Hazardous materials leak control & contai...,64 - Shut down system,11373,Queens,10:26:05 PM,01/01/2013,10:29:29 PM
4,2013-01-01,40,26,55675480,1054,44 - Hazardous materials leak control & contai...,64 - Shut down system,11360,Queens,09:33:56 PM,01/01/2013,09:39:20 PM


In [27]:
engine = create_engine('sqlite:///coned.db', echo=True)

In [28]:
final_df.to_sql('ConEdDB', con=engine, if_exists='replace')

2019-08-24 00:26:07,308 INFO sqlalchemy.engine.base.Engine SELECT CAST('test plain returns' AS VARCHAR(60)) AS anon_1
2019-08-24 00:26:07,309 INFO sqlalchemy.engine.base.Engine ()
2019-08-24 00:26:07,312 INFO sqlalchemy.engine.base.Engine SELECT CAST('test unicode returns' AS VARCHAR(60)) AS anon_1
2019-08-24 00:26:07,312 INFO sqlalchemy.engine.base.Engine ()
2019-08-24 00:26:07,313 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("ConEdDB")
2019-08-24 00:26:07,314 INFO sqlalchemy.engine.base.Engine ()
2019-08-24 00:26:07,316 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("ConEdDB")
2019-08-24 00:26:07,316 INFO sqlalchemy.engine.base.Engine ()
2019-08-24 00:26:07,319 INFO sqlalchemy.engine.base.Engine SELECT name FROM sqlite_master WHERE type='table' ORDER BY name
2019-08-24 00:26:07,320 INFO sqlalchemy.engine.base.Engine ()
2019-08-24 00:26:07,322 INFO sqlalchemy.engine.base.Engine PRAGMA table_info("ConEdDB")
2019-08-24 00:26:07,323 INFO sqlalchemy.engine.base.Engine ()
201

In [29]:
yr_2013 = final_df[final_df['Date'].dt.year == 2013]
yr_2013

Unnamed: 0,Date,TMAX,TMIN,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident_Time,Arrival_Date,Arrival_Time
0,2013-01-01,40,26,55675645,1108,44 - Hazardous materials leak control & contai...,64 - Shut down system,10455,Bronx,11:30:10 PM,01/01/2013,11:34:39 PM
1,2013-01-01,40,26,55675621,461,44 - Hazardous materials leak control & contai...,64 - Shut down system,10027,Manhattan,11:14:24 PM,01/01/2013,11:19:08 PM
2,2013-01-01,40,26,55675611,829,44 - Hazardous materials leak control & contai...,64 - Shut down system,11207,Brooklyn,11:08:08 PM,01/01/2013,11:10:30 PM
3,2013-01-01,40,26,55675547,1025,44 - Hazardous materials leak control & contai...,64 - Shut down system,11373,Queens,10:26:05 PM,01/01/2013,10:29:29 PM
4,2013-01-01,40,26,55675480,1054,44 - Hazardous materials leak control & contai...,64 - Shut down system,11360,Queens,09:33:56 PM,01/01/2013,09:39:20 PM
5,2013-01-01,40,26,55675481,1515,44 - Hazardous materials leak control & contai...,64 - Shut down system,10314,Staten Island,09:33:49 PM,01/01/2013,09:38:23 PM
6,2013-01-01,40,26,55675429,2707,44 - Hazardous materials leak control & contai...,64 - Shut down system,10019,Manhattan,09:10:36 PM,01/01/2013,09:14:47 PM
7,2013-01-01,40,26,55675384,899,44 - Hazardous materials leak control & contai...,64 - Shut down system,11221,Brooklyn,08:40:27 PM,01/01/2013,08:42:33 PM
8,2013-01-01,40,26,55675098,961,44 - Hazardous materials leak control & contai...,64 - Shut down system,10456,Bronx,06:50:03 PM,01/01/2013,06:54:08 PM
9,2013-01-01,40,26,55675057,1610,44 - Hazardous materials leak control & contai...,64 - Shut down system,10035,Manhattan,06:30:09 PM,01/01/2013,06:33:29 PM


In [30]:
yr_2013.sort_values(by='Date')

Unnamed: 0,Date,TMAX,TMIN,IM_INCIDENT_KEY,TOTAL_INCIDENT_DURATION,ACTION_TAKEN1_DESC,ACTION_TAKEN2_DESC,ZIP_CODE,BOROUGH_DESC,Incident_Time,Arrival_Date,Arrival_Time
0,2013-01-01,40,26,55675645,1108,44 - Hazardous materials leak control & contai...,64 - Shut down system,10455,Bronx,11:30:10 PM,01/01/2013,11:34:39 PM
20,2013-01-01,40,26,55674643,1099,44 - Hazardous materials leak control & contai...,64 - Shut down system,10467,Bronx,02:49:34 PM,01/01/2013,02:55:55 PM
21,2013-01-01,40,26,55674634,896,44 - Hazardous materials leak control & contai...,64 - Shut down system,10467,Bronx,02:46:45 PM,01/01/2013,02:52:32 PM
22,2013-01-01,40,26,55674617,660,44 - Hazardous materials leak control & contai...,64 - Shut down system,10467,Bronx,02:36:26 PM,01/01/2013,02:41:51 PM
23,2013-01-01,40,26,55674542,2663,44 - Hazardous materials leak control & contai...,64 - Shut down system,10458,Bronx,01:54:11 PM,01/01/2013,02:00:07 PM
24,2013-01-01,40,26,55674500,2987,44 - Hazardous materials leak control & contai...,64 - Shut down system,10016,Manhattan,01:23:13 PM,01/01/2013,01:27:02 PM
25,2013-01-01,40,26,55674496,802,44 - Hazardous materials leak control & contai...,64 - Shut down system,10470,Bronx,01:21:14 PM,01/01/2013,01:27:05 PM
19,2013-01-01,40,26,55674670,1602,44 - Hazardous materials leak control & contai...,64 - Shut down system,11231,Brooklyn,03:05:15 PM,01/01/2013,03:08:12 PM
26,2013-01-01,40,26,55674439,817,44 - Hazardous materials leak control & contai...,64 - Shut down system,10014,Manhattan,12:52:36 PM,01/01/2013,12:56:13 PM
29,2013-01-01,40,26,55674138,1652,44 - Hazardous materials leak control & contai...,64 - Shut down system,11234,Brooklyn,09:15:08 AM,01/01/2013,09:18:59 AM
