In [256]:
import pandas as pd 
import numpy as np

In [333]:
# Columns to load from the EMS CSV; any other column in the file is not read.
fields = [
    'CALL_NO',
    'UNIT_DISPATCH_DATE',
    'Time of dispatch',
    'CALL_TYPE_FINAL',
    'CALL_TYPE_FINAL_D',
    'XCOORD',
    'YCOORD',
]

In [371]:
# Read only the columns listed in `fields` from the EMS CSV.
# NOTE(review): this is an absolute, machine-specific path; consider reading it
# from a configurable data directory so the notebook runs elsewhere.
data = pd.read_csv(
    '/home/march/Downloads/EMSDataFixed.csv',
    usecols=fields,
)

  interactivity=interactivity, compiler=compiler, result=result)


In [372]:
# Number of missing values in each column (sum runs down the rows by default).
data.isna().sum()

CALL_NO                   0
UNIT_DISPATCH_DATE        0
Time of dispatch      18317
CALL_TYPE_FINAL           0
CALL_TYPE_FINAL_D         0
XCOORD                  619
YCOORD                  619
dtype: int64

In [373]:
# Give the dispatch-time column the name DISPATCH_TIME, matching the style of
# the other column names.
data = data.rename(columns={'Time of dispatch': 'DISPATCH_TIME'})

The 18,317 rows (out of 1,018,169) that are missing a time of dispatch are the final 18,317 rows of the file. They correspond to dates from 11/01/2016 to 12/31/2016. For point-process purposes, we remove them from the data.

Run this for proof of that fact: 

time_isnull = data['DISPATCH_TIME'].isnull()
lastRead = False
false_true_flip = []
for i in range(0, len(time_isnull)):
    if lastRead != time_isnull[i]:
        false_true_flip.append(i)
    lastRead = time_isnull[i]
print(false_true_flip) # index of flip from False to True = [999852]

In [374]:
# Remove the rows that have no dispatch time. These are the final 18317 rows of
# the file (row positions 999852 onward), so selecting them through the null
# mask drops the same rows as slicing from that position, without hard-coding
# the offset or building the index list one element at a time.
missing_dispatch_time = data['DISPATCH_TIME'].isnull()
data.drop(data.index[missing_dispatch_time], inplace=True)

# Recount missing values per column to confirm DISPATCH_TIME is now complete.
data.isnull().sum(axis=0)

CALL_NO                 0
UNIT_DISPATCH_DATE      0
DISPATCH_TIME           0
CALL_TYPE_FINAL         0
CALL_TYPE_FINAL_D       0
XCOORD                611
YCOORD                611
dtype: int64

In [375]:
import matplotlib.pyplot as plt

# Use a large default canvas for every figure drawn after this point.
plt.rcParams['figure.figsize'] = [20, 10]

# Boolean mask that is True for rows with no XCOORD value.
loc_null = data['XCOORD'].isna()
# plt.plot(loc_null, '.')
# The distribution of lost data is *fairly* regular

There are 611 points without any coordinates, which comes out to 100.0 × 611 / 999852 ≈ 0.0611% of the rows. As a first pass, we delete these rows from the dataframe, since their number (and their distribution across the data) seems small enough not to greatly affect the models.

In [376]:
# Missing values per column before the coordinate-less rows are dropped.
data.isna().sum()

CALL_NO                 0
UNIT_DISPATCH_DATE      0
DISPATCH_TIME           0
CALL_TYPE_FINAL         0
CALL_TYPE_FINAL_D       0
XCOORD                611
YCOORD                611
dtype: int64

In [377]:
# Discard every row that has a missing value in any column
# (dropna works on rows with how='any' by default), then recount.
data = data.dropna()
data.isna().sum()

CALL_NO               0
UNIT_DISPATCH_DATE    0
DISPATCH_TIME         0
CALL_TYPE_FINAL       0
CALL_TYPE_FINAL_D     0
XCOORD                0
YCOORD                0
dtype: int64

We also need to remove GPS points that are obviously wrong. We keep only rows whose XCOORD value (the latitude in this file) is greater than 38, because anything farther south would be in Kentucky, not Indiana.

In [378]:
# Keep only rows whose XCOORD exceeds 38, then renumber the rows from 0 so the
# index has no gaps left by the rows removed so far.
data = data[data['XCOORD'] > 38].reset_index(drop=True)

In [385]:
# Cast dispatch times to str so they can be reshaped into time text; this cell
# and the ones that follow build the datetime values.
data['DISPATCH_TIME'] = data['DISPATCH_TIME'].astype(str)

In [453]:
# Normalise every dispatch time to colon-separated text.
# Values shorter than 8 characters are presumably bare HHMMSS digits that lost
# their leading zeros: they are left-padded to 6 characters and a ':' is placed
# between each pair of characters. Values of 8 or more characters are kept as-is.
# Iterating over the column values directly avoids a label lookup per row and
# does not depend on the frame having a 0..n-1 index.
fixed_times = []
for time in data['DISPATCH_TIME']:
    if len(time) < 8:
        digits = iter(time.zfill(6))
        # zip(digits, digits) pulls from the same iterator twice, so it yields
        # consecutive two-character pairs.
        time = ':'.join(a + b for a, b in zip(digits, digits))
    fixed_times.append(time)

In [454]:
# Cast dispatch dates to str so they can be concatenated with the time text.
data['UNIT_DISPATCH_DATE'] = data['UNIT_DISPATCH_DATE'].astype(str)

In [455]:
from datetime import datetime

# Combine each row's date text with its normalised time text and parse the pair
# into a datetime object. The two strings are concatenated with no separator,
# which is why the format has nothing between '%d' and '%H'.
if len(fixed_times) != len(data):
    # A fixed_times list left over from an earlier run would otherwise be
    # paired with the wrong rows without any error.
    raise ValueError(
        'fixed_times and data have different lengths; '
        're-run the cell that builds fixed_times'
    )

# Walk the column values and the time list in step rather than looking each
# row up by label.
datetimes = [
    datetime.strptime(date_text + time_text, '%Y-%m-%d%H:%M:%S')
    for date_text, time_text in zip(data['UNIT_DISPATCH_DATE'], fixed_times)
]

In [460]:
# Spot-check: show the parsed datetime at the position of the last row of `data`.
datetimes[len(data)-1]

datetime.datetime(2016, 10, 31, 23, 58, 3)

In [461]:
# Attach the parsed datetimes to the frame as a new column.
# NOTE(review): the column name ends with a space ('Date_Time '), which looks
# like a typo; confirm that nothing later relies on it before renaming.
data['Date_Time '] = datetimes

In [462]:
# Preview the first rows instead of displaying the whole frame.
# NOTE(review): the saved output includes a DATE_TIME column that no cell shown
# here creates; presumably it is left over from earlier kernel state. Confirm
# with a fresh Restart & Run All.
data.head(10)

Unnamed: 0,CALL_NO,UNIT_DISPATCH_DATE,DISPATCH_TIME,CALL_TYPE_FINAL,CALL_TYPE_FINAL_D,XCOORD,YCOORD,DATE_TIME,Date_Time
0,61209515,2006-12-28,044548,773A,UNCONSCIOUS PERS,39.824332,-86.236119,2006-12-28 04:48:53,2006-12-28 04:45:48
1,61209516,2006-12-28,044853,774A,ASSAULT/TRAUMA,39.781627,-86.120826,2006-12-28 04:48:53,2006-12-28 04:48:53
2,61209517,2006-12-28,045219,768A,SEIZURE,39.756840,-85.998714,2006-12-28 04:48:53,2006-12-28 04:52:19
3,61209519,2006-12-28,051057,773A,UNCONSCIOUS PERS,39.782064,-86.044592,2006-12-28 04:48:53,2006-12-28 05:10:57
4,61209520,2006-12-28,051756,790A,*MEDICAL ALARM,39.742415,-86.111559,2006-12-28 04:48:53,2006-12-28 05:17:56
5,61209521,2006-12-28,051906,765A,CHEST PAIN/HRT,39.807587,-86.188334,2006-12-28 04:48:53,2006-12-28 05:19:06
6,61209522,2006-12-28,053302,773A,UNCONSCIOUS PERS,39.714985,-86.139617,2006-12-28 04:48:53,2006-12-28 05:33:02
7,61209523,2006-12-28,053916,768A,SEIZURE,39.640828,-86.052848,2006-12-28 04:48:53,2006-12-28 05:39:16
8,61209524,2006-12-28,054924,770A,ABDOMIN/BACK PN,39.746238,-86.126386,2006-12-28 04:48:53,2006-12-28 05:49:24
9,61209525,2006-12-28,060030,790A,SICK PERSON,39.806765,-86.178582,2006-12-28 04:48:53,2006-12-28 06:00:30
