In [2]:
# Import pandas (for making data tables)
import pandas as pd
import numpy as np
from datetime import datetime

# Graphing modules
import matplotlib.pyplot as plt
from matplotlib import cm as cm
from matplotlib import mlab as ml
import seaborn as sns

# Makes plots in notebook
%matplotlib inline

In [3]:
# Load the Seattle real-time fire 911 call log (data from data.seattle.gov).
# low_memory=False has pandas infer each column's dtype from the whole file
# instead of chunk by chunk.
df = pd.read_csv(
    '../data/Seattle_Real_Time_Fire_911_Calls.csv',
    low_memory=False,
)

In [4]:
# Column dtypes and non-null counts of the freshly loaded data
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 565472 entries, 0 to 565471
Data columns (total 7 columns):
Address            564567 non-null object
Type               565472 non-null object
Datetime           565472 non-null object
Latitude           564130 non-null object
Longitude          564106 non-null float64
Report Location    525792 non-null object
Incident Number    536439 non-null object
dtypes: float64(1), object(6)
memory usage: 30.2+ MB


In [5]:
# (rows, columns) of the freshly loaded data
df.shape

(565472, 7)

In [6]:
# Keep only the records that have a timestamp and both coordinates
df = df.dropna(subset=['Datetime', 'Longitude', 'Latitude'])
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 564106 entries, 1 to 565471
Data columns (total 7 columns):
Address            564097 non-null object
Type               564106 non-null object
Datetime           564106 non-null object
Latitude           564106 non-null object
Longitude          564106 non-null float64
Report Location    524433 non-null object
Incident Number    535074 non-null object
dtypes: float64(1), object(6)
memory usage: 34.4+ MB


In [7]:
# Preview the first rows that remain after the null filtering
df.head()

Unnamed: 0,Address,Type,Datetime,Latitude,Longitude,Report Location,Incident Number
1,6900 37th Av S,Medic Response,11/09/2011 11:33:00 PM +0000,47.540683,-122.286131,"(47.540683, -122.286131)",F110104166
2,N 50th St / Stone Way N,Aid Response,11/09/2011 11:32:00 PM +0000,47.665034,-122.340207,"(47.665034, -122.340207)",F110104164
3,E John St / E Olive Way,Aid Response,11/09/2011 11:32:00 PM +0000,47.619575,-122.324257,"(47.619575, -122.324257)",F110104165
4,611 12th Av S,Aid Response,11/09/2011 11:29:00 PM +0000,47.597406,-122.317228,"(47.597406, -122.317228)",F110104162
5,4545 42nd Av Sw,Automatic Medical Alarm,11/09/2011 11:25:00 PM +0000,47.562472,-122.385455,"(47.562472, -122.385455)",F110104161


In [8]:
# df = df.ix[1:]
# df.head()

In [9]:
# Number of 911 calls recorded for each incident Type
df.groupby('Type').size()

Type
1RED 1 Unit                      5910
3RED - 1 +1 + 1                   326
4RED - 2 + 1 + 1                 1888
AFA4 - Auto Alarm 2 + 1 + 1      1104
AFAH - Auto Alarm Hazmat            1
ANTIB - Antibiotic Delivery         4
Activated CO Detector            1818
Aid Resp Infectious                48
Aid Response                   304802
Aid Response Freeway              715
Aid Response Yellow             11730
Aid Service                       120
Aircraft Crash                      3
Aircraft Standby                    5
Alarm Bell                       3054
Assault w/Weap 7 per Rule        1224
Assault w/Weapons 14               46
Assault w/Weapons, Aid             33
Assault w/Weapons- Aid            228
Auto Fire Alarm                 30029
Automatic Aid Dist 11               1
Automatic Fire Alarm False       2086
Automatic Fire Alarm Resd        8049
Automatic Fire Dist 11              3
Automatic Medical Alarm          7637
Bark Fire                        1172
Boat Fi

In [10]:
# Latitude was read as text: cast it to float, turning entries that cannot
# be parsed as numbers (e.g. blank strings standing in for missing data) into NaN
df['Latitude'] = pd.to_numeric(df['Latitude'], errors='coerce')
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 564106 entries, 1 to 565471
Data columns (total 7 columns):
Address            564097 non-null object
Type               564106 non-null object
Datetime           564106 non-null object
Latitude           564106 non-null float64
Longitude          564106 non-null float64
Report Location    524433 non-null object
Incident Number    535074 non-null object
dtypes: float64(2), object(5)
memory usage: 34.4+ MB


In [11]:
# Collect the distinct string lengths found in the Datetime column, in order
# of first appearance, to see how many timestamp layouts the data contains
length = []
seen_lengths = set()
for stamp in df.Datetime:
    n_chars = len(stamp)
    if n_chars in seen_lengths:
        continue
    seen_lengths.add(n_chars)
    length.append(n_chars)
length

[28, 25]

In [12]:
# Remove the trailing "+0000" offset from every timestamp.
# Two layouts are handled:
#   * values containing a 'T' separator: drop the last 5 characters, then
#     trim surrounding whitespace
#   * values carrying an AM/PM marker: drop the last 6 characters (" +0000")
# The branches are mutually exclusive (if/elif/else) so that exactly one entry
# is produced per row; otherwise dt_list could fall out of step with df's rows.
dt_list = []
for dt in df.Datetime:
    if 'T' in dt:
        dt_list.append(dt[:-5].strip())
    elif 'AM' in dt or 'PM' in dt:
        dt_list.append(dt[:-6])
    else:
        # Fail loudly instead of silently skipping the row
        raise ValueError('Unrecognized Datetime format: %r' % (dt,))

In [13]:
# Attach the cropped timestamps as a new column (dt_list holds one entry per
# row, built in row order) and preview the result
df['dt_crop'] =  dt_list
df.head()

Unnamed: 0,Address,Type,Datetime,Latitude,Longitude,Report Location,Incident Number,dt_crop
1,6900 37th Av S,Medic Response,11/09/2011 11:33:00 PM +0000,47.540683,-122.286131,"(47.540683, -122.286131)",F110104166,11/09/2011 11:33:00 PM
2,N 50th St / Stone Way N,Aid Response,11/09/2011 11:32:00 PM +0000,47.665034,-122.340207,"(47.665034, -122.340207)",F110104164,11/09/2011 11:32:00 PM
3,E John St / E Olive Way,Aid Response,11/09/2011 11:32:00 PM +0000,47.619575,-122.324257,"(47.619575, -122.324257)",F110104165,11/09/2011 11:32:00 PM
4,611 12th Av S,Aid Response,11/09/2011 11:29:00 PM +0000,47.597406,-122.317228,"(47.597406, -122.317228)",F110104162,11/09/2011 11:29:00 PM
5,4545 42nd Av Sw,Automatic Medical Alarm,11/09/2011 11:25:00 PM +0000,47.562472,-122.385455,"(47.562472, -122.385455)",F110104161,11/09/2011 11:25:00 PM


In [14]:
# Normalize the date part of every cropped timestamp to "YYYY-MM-DD":
#   * 'T'-separated values already start with that form, so take the first 10 characters
#   * the remaining values start with "MM/DD/YYYY", so reorder the pieces
date_list = [
    stamp[:10] if 'T' in stamp
    else stamp[6:10] + "-" + stamp[:2] + '-' + stamp[3:5]
    for stamp in df.dt_crop
]

In [15]:
# Store the normalized date strings as a new column and preview the result
df['date'] = date_list
df.head()

Unnamed: 0,Address,Type,Datetime,Latitude,Longitude,Report Location,Incident Number,dt_crop,date
1,6900 37th Av S,Medic Response,11/09/2011 11:33:00 PM +0000,47.540683,-122.286131,"(47.540683, -122.286131)",F110104166,11/09/2011 11:33:00 PM,2011-11-09
2,N 50th St / Stone Way N,Aid Response,11/09/2011 11:32:00 PM +0000,47.665034,-122.340207,"(47.665034, -122.340207)",F110104164,11/09/2011 11:32:00 PM,2011-11-09
3,E John St / E Olive Way,Aid Response,11/09/2011 11:32:00 PM +0000,47.619575,-122.324257,"(47.619575, -122.324257)",F110104165,11/09/2011 11:32:00 PM,2011-11-09
4,611 12th Av S,Aid Response,11/09/2011 11:29:00 PM +0000,47.597406,-122.317228,"(47.597406, -122.317228)",F110104162,11/09/2011 11:29:00 PM,2011-11-09
5,4545 42nd Av Sw,Automatic Medical Alarm,11/09/2011 11:25:00 PM +0000,47.562472,-122.385455,"(47.562472, -122.385455)",F110104161,11/09/2011 11:25:00 PM,2011-11-09


In [16]:
# Sanity check: one date string per row of df
len(date_list)

564106

In [17]:
# Inspect the clock part (last 8 characters) of the 'T'-separated timestamps
# among the first 50,000 rows.
# .iloc makes the positional slice explicit, and the print() call form runs
# on both Python 2 and Python 3.
for time in df.dt_crop.iloc[:50000]:
    if 'T' in time:
        print(time[-8:])

24:59:00
24:24:00
24:12:00
24:00:00
24:59:00
24:43:00
24:43:00
24:26:00
24:13:00
24:12:00
24:46:00
24:38:00
24:24:00
24:01:00
24:50:00
24:44:00
24:20:00
24:00:00
24:56:00
24:13:00
24:57:00
24:32:00
24:28:00
24:07:00
24:05:00
24:55:00
24:46:00
24:42:00
24:24:00
24:12:00
24:24:00
24:08:00
24:43:00
24:24:00
24:10:00
24:45:00
24:33:00
24:32:00
24:17:00
24:14:00
24:05:00
24:00:00
24:52:00
24:43:00
24:12:00
24:01:00
24:55:00
24:46:00
24:23:00
24:07:00
24:42:00
24:42:00
24:18:00
24:08:00
24:38:00
24:27:00
24:22:00
24:13:00
24:04:00
24:51:00
24:27:00
24:24:00
24:39:00
24:38:00
24:37:00
24:15:00
24:07:00
24:07:00
24:06:00
24:36:00
24:21:00
24:13:00
24:51:00
24:32:00
24:25:00
24:35:00
24:04:00
24:23:00
24:16:00
24:05:00
24:28:00
24:53:00
24:38:00
24:34:00
24:27:00
24:23:00
24:49:00
24:25:00
24:14:00
24:05:00
24:01:00
24:34:00
24:30:00
24:18:00
24:01:00
24:57:00
24:53:00
24:33:00
24:10:00
24:04:00
24:44:00
24:15:00
24:13:00
24:12:00
24:05:00
24:46:00
24:43:00
24:33:00
24:22:00
24:18:00
24:17:00
2

In [18]:
# Build a 24-hour "HH:MM:SS" string for every cropped timestamp.
#   * AM/PM values end in "HH:MM:SS AM" or "HH:MM:SS PM"; the hour is converted
#     from the 12-hour clock
#   * 'T'-separated values already end in a 24-hour "HH:MM:SS"
time_list = []
for time in df.dt_crop:
    if 'A' in time or 'P' in time:
        # On a 12-hour clock, 12 AM is hour 0 and 12 PM is hour 12, so reduce
        # the hour modulo 12 before adding the afternoon offset. (Adding 12
        # blindly turned 12 PM into hour 24 and left 12 AM at noon.)
        hr = int(time[-11:-9]) % 12
        if 'A' not in time:
            hr += 12
        time_list.append('%02d' % hr + time[-9:-3])
    elif 'T' in time:
        time_list.append(time[-8:])
    else:
        # Fail loudly rather than skip the row and misalign time_list with df
        raise ValueError('Unrecognized timestamp format: %r' % (time,))

In [19]:
# Sanity check: one time string per row of df
len(time_list)

564106

In [20]:
# Store the 24-hour time strings as a new column and preview the result
df['time'] = time_list
df.head()

Unnamed: 0,Address,Type,Datetime,Latitude,Longitude,Report Location,Incident Number,dt_crop,date,time
1,6900 37th Av S,Medic Response,11/09/2011 11:33:00 PM +0000,47.540683,-122.286131,"(47.540683, -122.286131)",F110104166,11/09/2011 11:33:00 PM,2011-11-09,23:33:00
2,N 50th St / Stone Way N,Aid Response,11/09/2011 11:32:00 PM +0000,47.665034,-122.340207,"(47.665034, -122.340207)",F110104164,11/09/2011 11:32:00 PM,2011-11-09,23:32:00
3,E John St / E Olive Way,Aid Response,11/09/2011 11:32:00 PM +0000,47.619575,-122.324257,"(47.619575, -122.324257)",F110104165,11/09/2011 11:32:00 PM,2011-11-09,23:32:00
4,611 12th Av S,Aid Response,11/09/2011 11:29:00 PM +0000,47.597406,-122.317228,"(47.597406, -122.317228)",F110104162,11/09/2011 11:29:00 PM,2011-11-09,23:29:00
5,4545 42nd Av Sw,Automatic Medical Alarm,11/09/2011 11:25:00 PM +0000,47.562472,-122.385455,"(47.562472, -122.385455)",F110104161,11/09/2011 11:25:00 PM,2011-11-09,23:25:00


In [21]:
# Combine the calendar date with the clock time. The time is parsed as a
# timedelta, so an hour value of 24 is added as a full day and ends up as
# hour 0 of the following date.
# NOTE(review): confirm the source data means "next day" by hour 24 rather
# than 00:xx of the same date.
df['date2'] = pd.to_datetime(df['date'])
df['time2'] = pd.to_timedelta(df['time'])
# Overwrite the raw Datetime strings with the combined timestamps
df['Datetime'] = df['date2'] + df['time2']
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 564106 entries, 1 to 565471
Data columns (total 12 columns):
Address            564097 non-null object
Type               564106 non-null object
Datetime           564106 non-null datetime64[ns]
Latitude           564106 non-null float64
Longitude          564106 non-null float64
Report Location    524433 non-null object
Incident Number    535074 non-null object
dt_crop            564106 non-null object
date               564106 non-null object
time               564106 non-null object
date2              564106 non-null datetime64[ns]
time2              564106 non-null timedelta64[ns]
dtypes: datetime64[ns](2), float64(2), object(7), timedelta64[ns](1)
memory usage: 55.9+ MB


In [22]:
# YES WE HAVE DATETIME!!!!

In [23]:
# Re-derive the calendar date from the combined timestamp, so rows whose time
# rolled past midnight carry the adjusted date
df['date'] = df['Datetime'].dt.date

In [24]:
# Drop rows that are exact duplicates across every column; df itself is left
# unchanged and the result is kept separately
df_no_dup = df.drop_duplicates()

In [25]:
# Column dtypes and non-null counts after de-duplication
df_no_dup.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 551095 entries, 1 to 565471
Data columns (total 12 columns):
Address            551086 non-null object
Type               551095 non-null object
Datetime           551095 non-null datetime64[ns]
Latitude           551095 non-null float64
Longitude          551095 non-null float64
Report Location    520631 non-null object
Incident Number    522111 non-null object
dt_crop            551095 non-null object
date               551095 non-null object
time               551095 non-null object
date2              551095 non-null datetime64[ns]
time2              551095 non-null timedelta64[ns]
dtypes: datetime64[ns](2), float64(2), object(7), timedelta64[ns](1)
memory usage: 54.7+ MB


In [26]:
# Save the cleaned data, duplicates included
df.to_csv("../data/clean_seattle_911.csv")

In [27]:
# Save the cleaned data with duplicate rows removed
df_no_dup.to_csv("../data/clean_seattle_911_no_dup.csv")