# Weather and Motor Vehicle Collisions

In [5]:
import pandas as pd
from datetime import date
from dateutil.rrule import rrule, DAILY
from __future__ import division

pd.set_option('display.max_columns', None)

## Download weather data

In [None]:
start_date = date(2012, 7, 1)
end_date = date(2016, 2, 29)

# data = pd.DataFrame()
frames = []
url_template = 'https://www.wunderground.com/history/airport/KNYC/%s/%s/%s/DailyHistory.html?req_city=New+York&req_state=NY&req_statename=New+York&reqdb.zip=10001&reqdb.magic=4&reqdb.wmo=99999&format=1.csv'

month = ""

for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (month != dt.strftime("%m")):
        month = dt.strftime("%m")
        print 'Downloading to memory: ' + dt.strftime("%Y-%m")    
    frames.append(pd.read_csv(url_template % (dt.strftime("%Y"),dt.strftime("%m"), dt.strftime("%d"))))

print "Saving data to csv..."
data = pd.concat(frames)
data.to_csv('weather_data_nyc.csv', sep=',')

## Cleaning the weather dataset
### Convert weather DateUTC to local time

In [2]:
from datetime import datetime
from dateutil import tz

weather = pd.read_csv('datasets/weather_data_nyc_clean.csv')

def UTCtoActual(utcDate):
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/New_York')
    
    utc = datetime.strptime(utcDate.DateUTC, '%m/%d/%Y %H:%M:%S')\
                  .replace(tzinfo=from_zone)\
                  .astimezone(to_zone)
    s = pd.Series([utc.year, utc.month, utc.day, utc.hour])
    s.columns = ['Year', 'Month', 'Day', 'Hour']
    return s
    
#weather['DateActual'] = weather.DateUTC.map()

In [17]:
weather[['Year', 'Month', 'Day', 'Hour']] = weather.apply(UTCtoActual, axis=1)
weather.to_csv('datasets/weather_data_nyc_clean2.csv')

### Merge weather and NYPD MVC datasets

In [3]:
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions.csv')
weather = pd.read_csv('datasets/weather_data_nyc_clean2.csv')
weather.head(1)

Unnamed: 0.1,Unnamed: 0,Conditions,DateUTC,Dew PointC,Events,Gust SpeedKm/h,Humidity,Precipitationmm,Sea Level PressurehPa,TemperatureC,TimeEDT,TimeEST,VisibilityKm,Wind Direction,Wind SpeedKm/h,WindDirDegrees,Year,Month,Day,Hour
0,0,Clear,7/1/2012 4:51:00,17.2,,,58,,1008.6,26.1,12:51 AM,,16.1,West,13,280,2012,7,1,0


In [None]:
weather[(weather.Year == 2015) & (weather.Month == 11) & (weather.Day == 27)]

In [86]:
features0 = ['Conditions', 'TemperatureC']
features = ['Conditions', 'Dew PointC', 'Events', \
            'Gust SpeedKm/h', 'Precipitationmm', \
            'TemperatureC', 'VisibilityKm', \
            'Wind SpeedKm/h', 'WindDirDegrees']

def lookup_weather2(year, month, day, hour):
    w = weather[(weather.Year == year) & (weather.Month == month) & (weather.Day == day) & (weather.Hour == hour)]
    return w

def lookup_weather(date, time):
    month = int(date.split('/')[0])
    day = int(date.split('/')[1])
    year = int(date.split('/')[2])
    hour = int(time.split(':')[0])
    d = lookup_weather2(year, month, day, hour).head(1)
    if (d.empty):
        dt_back = datetime.datetime(year, month, day, hour) - datetime.timedelta(hours=1)
        dt_forward = datetime.datetime(year, month, day, hour) + datetime.timedelta(hours=1)
        
        d_back = lookup_weather2(dt_back.year, dt_back.month, dt_back.day, dt_back.hour)
        if (not d_back.empty): return d_back
        
        d_forward = lookup_weather2(dt_forward.year, dt_forward.month, dt_forward.day, dt_forward.hour)
        if (not d_forward.empty): return d_forward
    return d



def merge_weather(incident):
    date = incident.DATE
    time = incident.TIME
    
    w = lookup_weather(date, time)
    #[unnamed, condition, dateUTC, Dew, Events, Gust, Humidity,Precipitationmm,Sea_Level_PressurehPa, TemperatureC] = w.values[0]
    
    
    try:
        #print w
        #s = pd.Series([w['Conditions'].iloc[0], w['Dew PointC'].iloc[0], w['Events'].iloc[0], \
        #           w['Gust SpeedKm/h'].iloc[0], w['Precipitationmm'].iloc[0], w['TemperatureC'].iloc[0], \
        #           w['TemperatureC'].iloc[0], w['VisibilityKm'].iloc[0], w['Wind SpeedKm/h'].iloc[0], w['WindDirDegrees'].iloc[0]])
        s = pd.Series([w.values[0]])
        #s.columns = features
        return s
    except:
        print date
        print time
        #s = pd.Series([None,None,None,None,None,None,None,None,None,None])
        s = pd.Series([])
        #s.columns = features
        return s
    
    
    

#lookup_weather2(2016, 2, 14, 7)
#lookup_weather('03/14/2016', '3:27').values[0]
#[unnamed, condition, dateUTC, Dew, Events, Gust, Humidity,Precipitationmm,Sea_Level_PressurehPa, TemperatureC] = lookup_weather('01/27/2016', '3:27').values[0]

In [87]:
incidents[features] = incidents[incidents.DATE.str.split('/').str.get(2) != '2016'].apply(merge_weather, axis=1)
incidents.head()
incidents.to_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather.csv', sep=',')

07/26/2014
11:15
07/26/2014
11:30
07/26/2014
11:40
07/26/2014
11:45
07/26/2014
11:45
07/26/2014
11:45
07/26/2014
12:03
07/26/2014
12:11
07/26/2014
12:15
07/26/2014
12:15
07/26/2014
12:30
07/26/2014
12:30
07/26/2014
11:25
07/26/2014
11:25
07/26/2014
11:30
07/26/2014
11:30
07/26/2014
12:15
07/26/2014
12:16
07/26/2014
12:20
07/26/2014
12:30
07/26/2014
12:30
07/26/2014
11:00
07/26/2014
11:00
07/26/2014
11:55
07/26/2014
11:00
07/26/2014
11:05
07/26/2014
11:10
07/26/2014
11:10
07/26/2014
11:15
07/26/2014
11:15
07/26/2014
11:18
07/26/2014
11:47
07/26/2014
11:50
07/26/2014
11:50
07/26/2014
11:55
07/26/2014
12:00
07/26/2014
12:30
07/26/2014
12:35
07/26/2014
12:40
07/26/2014
12:45
07/26/2014
11:00
07/26/2014
11:00
07/26/2014
12:15
07/26/2014
12:48
05/06/2014
5:50
05/06/2014
5:40
05/06/2014
5:40
05/06/2014
5:00
05/06/2014
5:20
05/06/2014
5:44
05/06/2014
5:55
09/18/2012
2:20
09/18/2012
2:30
09/18/2012
3:20
09/18/2012
2:55
09/18/2012
2:22
09/18/2012
1:45
09/18/2012
1:27


ValueError: Columns must be same length as key

In [57]:
 incidents[incidents.DATE.str.split('/').str.get(2) != '2016'].head(10).apply(merge_weather, axis=1)

       Unnamed: 0        Conditions              DateUTC  Dew PointC Events  \
37513       37513  Scattered Clouds  12/31/2015 10:14:00           5    NaN   

       Gust SpeedKm/h  Humidity  Precipitationmm  Sea Level PressurehPa  \
37513             NaN        86              NaN                 1017.2   

       TemperatureC TimeEDT  TimeEST  VisibilityKm Wind Direction  \
37513           7.2     NaN  5:14 AM          12.9           Calm   

      Wind SpeedKm/h  WindDirDegrees  Year  Month  Day  Hour  
37513           Calm               0  2015     12   31     5  
       Unnamed: 0        Conditions              DateUTC  Dew PointC Events  \
37513       37513  Scattered Clouds  12/31/2015 10:14:00           5    NaN   

       Gust SpeedKm/h  Humidity  Precipitationmm  Sea Level PressurehPa  \
37513             NaN        86              NaN                 1017.2   

       TemperatureC TimeEDT  TimeEST  VisibilityKm Wind Direction  \
37513           7.2     NaN  5:14 AM          

Unnamed: 0,0
41375,"[37513, Scattered Clouds, 12/31/2015 10:14:00,..."
41376,"[37513, Scattered Clouds, 12/31/2015 10:14:00,..."
41377,"[37513, Scattered Clouds, 12/31/2015 10:14:00,..."
41378,"[37513, Scattered Clouds, 12/31/2015 10:14:00,..."
41379,"[37515, Clear, 12/31/2015 11:51:00, 5.6, nan, ..."
41380,"[37516, Mostly Cloudy, 12/31/2015 12:14:00, 5...."
41381,"[37516, Mostly Cloudy, 12/31/2015 12:14:00, 5...."
41382,"[37516, Mostly Cloudy, 12/31/2015 12:14:00, 5...."
41383,"[37519, Clear, 12/31/2015 13:51:00, 5.0, nan, ..."
41384,"[37519, Clear, 12/31/2015 13:51:00, 5.0, nan, ..."


In [None]:
incidents.head()

## Make some nice data analysis

In [6]:
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions.csv')
weather = pd.read_csv('datasets/weather_data_nyc_clean2.csv')

In [12]:
lookup_weather('11/27/2015', '3:27')

Unnamed: 0.1,Unnamed: 0,Conditions,DateUTC,Dew PointC,Events,Gust SpeedKm/h,Humidity,Precipitationmm,Sea Level PressurehPa,TemperatureC,TimeEDT,TimeEST,VisibilityKm,Wind Direction,Wind SpeedKm/h,WindDirDegrees,Year,Month,Day,Hour
36462,36462,Clear,11/27/2015 8:51:00,8.9,,,86.0,,1035.1,11.1,,3:51 AM,9.7,WNW,7.4,290,2015,11,27,3


In [17]:
lookup_weather('02/20/2014', '20')

Unnamed: 0.1,Unnamed: 0,Conditions,DateUTC,Dew PointC,Events,Gust SpeedKm/h,Humidity,Precipitationmm,Sea Level PressurehPa,TemperatureC,TimeEDT,TimeEST,VisibilityKm,Wind Direction,Wind SpeedKm/h,WindDirDegrees,Year,Month,Day,Hour
17467,17467,Overcast,2/21/2014 1:51:00,2.2,,,86.0,0.0,1019.8,4.4,,8:51 PM,4.8,NNE,5.6,30,2014,2,20,20


In [78]:
import datetime
from datetime import date
def lookup_weather2(year, month, day, hour, forward=False):
    w = weather[(weather.Year == year) & (weather.Month == month) & (weather.Day == day) & (weather.Hour == hour)]
    
    if (w.empty):
        print "was empty: %s/%s/%s %s" % (year, month, day, hour)
        dt_back = datetime.datetime(year, month, day, hour) - datetime.timedelta(hours=1)
        dt_forward = datetime.datetime(year, month, day, hour) + datetime.timedelta(hours=1)
        w_new = pd.DataFrame
        if (!forward):
            w_new = lookup_weather2(dt_back.year, dt_back.month, dt_back.day, dt_back.hour, forward=True)
        else:
            w_new = lookup_weather2(dt_forward.year, dt_forward.month, dt_forward.day, dt_forward.hour)
        
        return w_back

    return w

def nearestDate(dates, pivot):
    return min(dates, key=lambda x: abs(x - pivot))

SyntaxError: invalid syntax (<ipython-input-78-aa43ca78e5a3>, line 11)

In [85]:
# lookup_weather2(2014, 2, 20, 18)
lookup_weather('02/20/2014', '18:30')

# dt2 = datetime.datetime(2014, 2, 20, 18)
# dates = [datetime.datetime(2014, 2, 20, 20), datetime.datetime(2014, 2, 20, 17)]

# nearestDate(dates, dt2)


Unnamed: 0.1,Unnamed: 0,Conditions,DateUTC,Dew PointC,Events,Gust SpeedKm/h,Humidity,Precipitationmm,Sea Level PressurehPa,TemperatureC,TimeEDT,TimeEST,VisibilityKm,Wind Direction,Wind SpeedKm/h,WindDirDegrees,Year,Month,Day,Hour
17466,17466,Overcast,2/20/2014 22:51:00,-3.3,,,55.0,0.0,1020.6,5.0,,5:51 PM,16.1,Variable,7.4,0,2014,2,20,17


In [None]:
print 'eh'