# Weather and Motor Vehicle Collisions

In [1]:
import pandas as pd
from datetime import date
from dateutil.rrule import rrule, DAILY
from __future__ import division

pd.set_option('display.max_columns', None)

## Download weather data

In [None]:
start_date = date(2012, 7, 1)
end_date = date(2016, 2, 29)

# data = pd.DataFrame()
frames = []
url_template = 'https://www.wunderground.com/history/airport/KNYC/%s/%s/%s/DailyHistory.html?req_city=New+York&req_state=NY&req_statename=New+York&reqdb.zip=10001&reqdb.magic=4&reqdb.wmo=99999&format=1.csv'

month = ""

for dt in rrule(DAILY, dtstart=start_date, until=end_date):
    if (month != dt.strftime("%m")):
        month = dt.strftime("%m")
        print 'Downloading to memory: ' + dt.strftime("%Y-%m")    
    frames.append(pd.read_csv(url_template % (dt.strftime("%Y"),dt.strftime("%m"), dt.strftime("%d"))))

print "Saving data to csv..."
data = pd.concat(frames)
data.to_csv('weather_data_nyc.csv', sep=',')

## Cleaning the weather dataset
### Convert weather DateUTC to local time

In [2]:
from datetime import datetime
from dateutil import tz

weather = pd.read_csv('datasets/weather_data_nyc_clean.csv')

def UTCtoActual(utcDate):
    from_zone = tz.gettz('UTC')
    to_zone = tz.gettz('America/New_York')
    
    utc = datetime.strptime(utcDate.DateUTC, '%m/%d/%Y %H:%M:%S')\
                  .replace(tzinfo=from_zone)\
                  .astimezone(to_zone)
    s = pd.Series([utc.year, utc.month, utc.day, utc.hour])
    s.columns = ['Year', 'Month', 'Day', 'Hour']
    return s
    
#weather['DateActual'] = weather.DateUTC.map()

In [17]:
weather[['Year', 'Month', 'Day', 'Hour']] = weather.apply(UTCtoActual, axis=1)
weather.to_csv('datasets/weather_data_nyc_clean2.csv')

### Merge weather and NYPD MVC datasets

In [3]:
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions.csv')
weather = pd.read_csv('datasets/weather_data_nyc_clean2.csv')
weather.head(1)

Unnamed: 0.1,Unnamed: 0,Conditions,DateUTC,Dew PointC,Events,Gust SpeedKm/h,Humidity,Precipitationmm,Sea Level PressurehPa,TemperatureC,TimeEDT,TimeEST,VisibilityKm,Wind Direction,Wind SpeedKm/h,WindDirDegrees,Year,Month,Day,Hour
0,0,Clear,7/1/2012 4:51:00,17.2,,,58,,1008.6,26.1,12:51 AM,,16.1,West,13,280,2012,7,1,0


In [None]:
weather[(weather.Year == 2015) & (weather.Month == 11) & (weather.Day == 27)]

In [7]:
features0 = ['Conditions', 'TemperatureC']
features = ['Conditions', 'Dew PointC', 'Events', \
            'Gust SpeedKm/h', 'Precipitationmm', \
            'TemperatureC', 'VisibilityKm', \
            'Wind SpeedKm/h', 'WindDirDegrees']

def lookup_weather2(year, month, day, hour):
    w = weather[(weather.Year == year) & (weather.Month == month) & (weather.Day == day) & (weather.Hour == hour)]
    return w

def lookup_weather(date, time):
    month = date.split('/')[0]
    day = date.split('/')[1]
    year = date.split('/')[2]
    hour = time.split(':')[0]
    return lookup_weather2(int(year), int(month), int(day), int(hour)).head(1)

def merge_weather(incident):
    date = incident.DATE
    time = incident.TIME
    
    w = lookup_weather(date, time)
    #[unnamed, condition, dateUTC, Dew, Events, Gust, Humidity,Precipitationmm,Sea_Level_PressurehPa, TemperatureC] = w.values[0]
    
    
    try:
        #print w
        #s = pd.Series([w['Conditions'].iloc[0], w['Dew PointC'].iloc[0], w['Events'].iloc[0], \
        #           w['Gust SpeedKm/h'].iloc[0], w['Precipitationmm'].iloc[0], w['TemperatureC'].iloc[0], \
        #           w['TemperatureC'].iloc[0], w['VisibilityKm'].iloc[0], w['Wind SpeedKm/h'].iloc[0], w['WindDirDegrees'].iloc[0]])
        s = pd.Series([w.values[0]])
        #s.columns = features
        return s
    except:
        print date
        print time
        #s = pd.Series([None,None,None,None,None,None,None,None,None,None])
        s = pd.Series([])
        #s.columns = features
        return s
    
    
    

#lookup_weather2(2016, 2, 14, 7)
#lookup_weather('03/14/2016', '3:27').values[0]
#[unnamed, condition, dateUTC, Dew, Events, Gust, Humidity,Precipitationmm,Sea_Level_PressurehPa, TemperatureC] = lookup_weather('01/27/2016', '3:27').values[0]

In [None]:
incidents[features] = incidents[incidents.DATE.str.split('/').str.get(2) != '2016'].apply(merge_weather, axis=1)
incidents.head()
incidents.to_csv('datasets/NYPD_Motor_Vehicle_Collisions_weather.csv', sep=',')

11/29/2015
22:20
11/29/2015
22:31
11/29/2015
22:50
11/29/2015
22:06
11/29/2015
22:45
11/29/2015
22:40
11/29/2015
22:30
11/29/2015
22:30
11/29/2015
22:30
11/29/2015
22:30
11/29/2015
22:25
11/29/2015
22:20
11/29/2015
22:00
11/29/2015
22:00
11/29/2015
22:00
11/29/2015
22:20
11/29/2015
22:20
11/28/2015
13:01
11/28/2015
21:00
11/28/2015
21:45
11/28/2015
13:00
11/28/2015
13:50
11/28/2015
5:15
11/28/2015
5:27
11/28/2015
13:05
11/28/2015
21:45
11/28/2015
13:30
11/28/2015
13:27
11/28/2015
13:49
11/28/2015
13:50
11/28/2015
13:50
11/28/2015
21:43
11/28/2015
21:04
11/28/2015
21:20
11/28/2015
21:10
11/28/2015
21:00
11/28/2015
13:55
11/28/2015
13:45
11/28/2015
13:35
11/28/2015
13:33
11/28/2015
13:20
11/28/2015
13:15
11/28/2015
13:15
11/28/2015
13:00
11/28/2015
13:00
11/28/2015
13:00
11/28/2015
13:06
11/28/2015
13:20
11/28/2015
13:20
11/28/2015
13:25
11/28/2015
13:30
11/28/2015
13:30
11/28/2015
13:30
11/28/2015
13:33
11/28/2015
21:00
11/28/2015
21:00
11/28/2015
21:00
11/28/2015
21:00
11/28/2015
21:10

In [57]:
 incidents[incidents.DATE.str.split('/').str.get(2) != '2016'].head(10).apply(merge_weather, axis=1)

       Unnamed: 0        Conditions              DateUTC  Dew PointC Events  \
37513       37513  Scattered Clouds  12/31/2015 10:14:00           5    NaN   

       Gust SpeedKm/h  Humidity  Precipitationmm  Sea Level PressurehPa  \
37513             NaN        86              NaN                 1017.2   

       TemperatureC TimeEDT  TimeEST  VisibilityKm Wind Direction  \
37513           7.2     NaN  5:14 AM          12.9           Calm   

      Wind SpeedKm/h  WindDirDegrees  Year  Month  Day  Hour  
37513           Calm               0  2015     12   31     5  
       Unnamed: 0        Conditions              DateUTC  Dew PointC Events  \
37513       37513  Scattered Clouds  12/31/2015 10:14:00           5    NaN   

       Gust SpeedKm/h  Humidity  Precipitationmm  Sea Level PressurehPa  \
37513             NaN        86              NaN                 1017.2   

       TemperatureC TimeEDT  TimeEST  VisibilityKm Wind Direction  \
37513           7.2     NaN  5:14 AM          

Unnamed: 0,0
41375,"[37513, Scattered Clouds, 12/31/2015 10:14:00,..."
41376,"[37513, Scattered Clouds, 12/31/2015 10:14:00,..."
41377,"[37513, Scattered Clouds, 12/31/2015 10:14:00,..."
41378,"[37513, Scattered Clouds, 12/31/2015 10:14:00,..."
41379,"[37515, Clear, 12/31/2015 11:51:00, 5.6, nan, ..."
41380,"[37516, Mostly Cloudy, 12/31/2015 12:14:00, 5...."
41381,"[37516, Mostly Cloudy, 12/31/2015 12:14:00, 5...."
41382,"[37516, Mostly Cloudy, 12/31/2015 12:14:00, 5...."
41383,"[37519, Clear, 12/31/2015 13:51:00, 5.0, nan, ..."
41384,"[37519, Clear, 12/31/2015 13:51:00, 5.0, nan, ..."


## Make some nice data analysis

In [None]:
incidents = pd.read_csv('datasets/NYPD_Motor_Vehicle_Collisions.csv')
weather = pd.read_csv('datasets/weather_data_nyc_clean2.csv')

In [None]:
weather.Conditions.unique()