By Sophia and Mac-I. This notebook (mostly a copy of model iteration 2) develops the code that lets us integrate weather data into our project. (More explanation of the non-weather-related code can be found in this notebook.)

In [7]:
%matplotlib inline

import pandas as pd
import numpy as np
import itertools
import re
from datetime import datetime

import matplotlib.pyplot as plt

In [8]:
def recodeData(df, isTrain = False):
    '''Run the full recoding pipeline on the SF crime dataframe.

    The date features are added first (recodeDates) and the weather columns
    are merged in afterwards (addWeather), because the weather step relies
    on the 'DateTime' column created by the date step.

    Parameters:
        df      - dataframe loaded from the SF crime data.
        isTrain - accepted for interface compatibility; this function does
                  not currently read it.

    Returns a 3-tuple:
        the recoded dataframe,
        the list of every column name added (date columns, then weather),
        an empty list (third slot; nothing is produced for it here).
    '''
    df, dateColumns = recodeDates(df)
    df, weatherColumns = addWeather(df)

    #Date columns first, weather columns second - same order they are added.
    return df, list(dateColumns) + list(weatherColumns), []

In [9]:
def recodeDates(df):
    '''Parse the 'Dates' column and derive calendar features from it.

    Adds a 'DateTime' column (the parsed timestamps) plus the integer
    columns 'Year', 'Month', 'Day', 'Hour', 'Minute' and 'DayOfWeekRecode'
    (Monday = 0 ... Sunday = 6). The columns are written onto the dataframe
    that was passed in, and that same dataframe is returned.

    Parameters:
        df - dataframe with a 'Dates' column of strings formatted as
             '%Y-%m-%d %H:%M:%S'.

    Returns:
        (df, columns) where columns lists the feature columns added.
        'DateTime' is a helper column and is not part of that list.

    Raises:
        KeyError if 'Dates' is missing; ValueError if a value does not
        match the expected format.
    '''
    #Parse the whole column in one vectorized call instead of calling
    #strptime once per row - this matters on the ~880k row training set.
    df['DateTime'] = pd.to_datetime(df['Dates'], format='%Y-%m-%d %H:%M:%S')

    #The .dt accessors may come back as int32 depending on the pandas
    #version; cast so the columns keep the int64 dtype that the row-wise
    #version produced.
    stamps = df['DateTime'].dt
    df['Year'] = stamps.year.astype('int64')
    df['Month'] = stamps.month.astype('int64')
    df['Day'] = stamps.day.astype('int64')
    df['Hour'] = stamps.hour.astype('int64')
    df['Minute'] = stamps.minute.astype('int64')
    #dayofweek uses the same Monday = 0 convention as datetime.weekday()
    df['DayOfWeekRecode'] = stamps.dayofweek.astype('int64')

    return df, ['Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode']

This is the function we added to merge in the weather data.

In [12]:
def addWeather(df, weatherPath='./../weather1.csv'):
    '''Attach daily weather observations to each row of df.

    An integer 'DATE' key (YYYYMMDD) is built from the 'DateTime' column
    and written onto df, then the 'PRCP', 'TMAX' and 'TMIN' columns of the
    weather CSV are joined on that key.

    Parameters:
        df          - dataframe with a 'DateTime' column of timestamps
                      (as produced by recodeDates).
        weatherPath - path of the weather CSV; it must contain the columns
                      'DATE', 'PRCP', 'TMAX' and 'TMIN'. Defaults to the
                      location this notebook has always read from.

    Returns:
        (merged, columns) where merged is a new dataframe with one row per
        row of df, and columns lists the weather columns added. Rows whose
        date has no weather record keep NaN in those columns.

    Raises:
        KeyError if a required column is missing; pandas MergeError if the
        weather file has more than one row for the same DATE.
    '''
    #Create an integer YYYYMMDD column we can use to match weather data
    stamps = pd.to_datetime(df['DateTime']).dt
    df['DATE'] = (stamps.year.astype('int64') * 10000
                  + stamps.month.astype('int64') * 100
                  + stamps.day.astype('int64'))

    #Read in weather data, treating the -9999 sentinel as missing. Doing it
    #at parse time catches the value whether the column is read as numbers
    #or text; replacing the string '-9999' afterwards misses numeric columns.
    weatherData = pd.read_csv(weatherPath, na_values=[-9999])
    #Get the date, precipitation, high, and low temperature
    weatherData = weatherData[['DATE','PRCP','TMAX','TMIN']]

    #Left join so no crime record is dropped when a date has no weather
    #row; validate guards against duplicate weather dates silently
    #multiplying crime rows.
    df = pd.merge(df, weatherData, on='DATE', how='left',
                  validate='many_to_one')

    return df, ['PRCP','TMAX','TMIN']

In [13]:
#Load the raw training data, then run the full recoding pipeline on it.
#crimeData is rebound to the dataframe returned by recodeData, so after this
#cell it holds the recoded (date + weather) frame rather than the raw CSV.
crimeData = pd.read_csv('./../train.csv')
#addedColumns collects the names of the new date and weather columns.
#streetColumns receives recodeData's third return value, which is always an
#empty list in this notebook. isTrain is passed through but recodeData does
#not read it.
crimeData, addedColumns, streetColumns = recodeData(
    crimeData, isTrain = True)
#Summary statistics of the numeric columns, shown as the cell output.
crimeData.describe()

Unnamed: 0,X,Y,Year,Month,Day,Hour,Minute,DayOfWeekRecode,DATE,PRCP,TMAX,TMIN
count,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0
mean,-122.422616,37.77102,2008.712046,6.436509,15.570623,13.412655,20.155026,2.992691,20087779.678321,13.759622,179.030606,107.848622
std,0.030354,0.456893,3.631194,3.428972,8.783005,6.549573,18.594915,1.972023,36296.947681,52.905671,41.371282,24.864211
min,-122.513642,37.707879,2003.0,1.0,1.0,0.0,0.0,0.0,20030106.0,0.0,72.0,22.0
25%,-122.432952,37.752427,2006.0,3.0,8.0,9.0,0.0,1.0,20060111.0,0.0,150.0,89.0
50%,-122.41642,37.775421,2009.0,6.0,16.0,14.0,19.0,3.0,20090307.0,0.0,178.0,111.0
75%,-122.406959,37.784369,2012.0,9.0,23.0,19.0,33.0,5.0,20120611.0,0.0,200.0,122.0
max,-120.5,90.0,2015.0,12.0,31.0,23.0,59.0,6.0,20150513.0,864.0,367.0,200.0


Great! We can read in the weather data!