# Model Iteration 2
Here, I want to implement a very simple model. To keep it simple, I'm not going to create many new features; I'll mostly just recode the categorical variables, such as the police district.

## Importing Everything!

In [1]:
                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                                      % matplotlib inline
import pandas as pd
import numpy as np
import itertools
import re

import logging

import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib import cm
from datetime import datetime
from ipywidgets import widgets  
from IPython.display import display

import sklearn
from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.metrics import log_loss

# Emit timestamped INFO-level log records for the rest of the notebook.
logging.basicConfig(level=logging.INFO, format='%(asctime)s %(message)s')

# Record the scikit-learn version the results below were produced with.
# Written as a function call to match the other print(...) calls in this
# notebook; the output is identical for a single argument.
print(sklearn.__version__)

0.17


## Loading in the data
Now, I need to load in the data. We also define the list of categories in a fixed order, so that the probability columns in our submission file come out in the correct order (getting this wrong was the reason that our first few submissions did so poorly).

In [3]:
# Load the training set; the path is relative to the notebook's working directory.
crimeData = pd.read_csv('./../train.csv')
# Crime categories in a fixed order. The position of a name in this list is
# the integer label written to 'CategoryRecode' by recodeCategories, and
# make_submission uses the same positions to name the probability columns,
# so the order must not be changed.
Categories = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
              'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
              'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
              'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
              'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
              'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
              'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
              'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
              'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
              'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
              'WARRANTS', 'WEAPON LAWS']

## Helper functions for recoding data
Here are the helper functions for recoding the data. We'll add more as we create new features.

In [4]:
def recodeData(df, isTrain = False):
    '''Recode the SF crime dataframe and report which feature columns exist.

    The helper functions modify ``df`` in place: outlier coordinates are
    replaced, and date, police-district and address features are added.
    When ``isTrain`` is true the crime category is also recoded to an
    integer label and the 'Descript' and 'Resolution' columns are dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame as loaded from the train or test csv file.
    isTrain : bool, optional
        Set to True for the training data (which carries 'Category').

    Returns
    -------
    (df, addedColumns)
        The same (mutated) dataframe, and the list of column names reported
        by the helpers in the order: dates, districts, addresses, lat/lon
        and, for training data, the recoded category.
    '''
    # The modifications are done in place; the dataframe is still returned
    # for convenience, together with the list of feature columns.
    newLatLon = removeOutlierLatLon(df)
    newDate = recodeDates(df)
    newDistrict = recodePoliceDistricts(df)
    newAddress = recodeAddresses(df)

    addedColumns = newDate + newDistrict + newAddress + newLatLon

    if isTrain:
        addedColumns += recodeCategories(df)

        # Drop only the columns that are actually present instead of
        # wrapping the drop in a bare except, which would also hide
        # unrelated errors (and KeyboardInterrupt).
        columnsToDrop = ['Descript', 'Resolution']
        presentColumns = [name for name in columnsToDrop if name in df.columns]
        if len(presentColumns) < len(columnsToDrop):
            print("already recoded or using test data")
        if presentColumns:
            df.drop(presentColumns, axis=1, inplace=True)

    return df, addedColumns

In [5]:
def recodeDates(df):
    '''Add date/time features derived from the 'Dates' column, in place.

    'Dates' is parsed with the format '%Y-%m-%d %H:%M:%S' into a new
    'DateTime' column, from which the following integer columns are
    derived: 'Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode'
    (Monday = 0 ... Sunday = 6) and 'MinuteOfWeek' (minutes elapsed since
    Monday 00:00).

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Dates' column; it is modified in place.

    Returns
    -------
    list of str
        Names of the derived feature columns ('DateTime' is not included).
    '''
    # Parse the whole column at once rather than calling strptime per row,
    # and read the components through the vectorized .dt accessor.
    timestamps = pd.to_datetime(df['Dates'], format='%Y-%m-%d %H:%M:%S')
    df['DateTime'] = timestamps

    parts = timestamps.dt
    df['Year'] = parts.year
    df['Month'] = parts.month
    df['Day'] = parts.day
    df['Hour'] = parts.hour
    df['Minute'] = parts.minute
    df['DayOfWeekRecode'] = parts.weekday
    df['MinuteOfWeek'] = parts.weekday * 24 * 60 + parts.hour * 60 + parts.minute

    return ['Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode','MinuteOfWeek']

In [6]:
def recodePoliceDistricts(df):
    '''One-hot encode the 'PdDistrict' column, in place.

    For every distinct value found in df['PdDistrict'] a 0/1 integer column
    named 'District' + <value> is added to ``df``. Columns are created in
    the order in which the values first appear.

    Returns the list of the indicator column names that were added.
    '''
    indicatorColumns = []
    for name in df['PdDistrict'].unique().tolist():
        column = 'District' + name
        indicatorColumns.append(column)
        # 1 where the row belongs to this district, 0 otherwise.
        df[column] = (df['PdDistrict'] == name).astype(int)

    return indicatorColumns

In [7]:
def recodeAddresses(df):
    '''Derive address-based columns from df['Address'], in place.

    Columns written:
      * 'Address1'  - text before the first " / ", with a leading
                      "<digits> Block of " prefix removed
      * 'Address2'  - text between the first and second " / ", or None when
                      the address contains no " / "
      * 'BlockNumber' - the digits the address starts with, or -1 if it
                      does not start with digits
      * 'StreetCornerFlag' - True when the address contains " / "

    Returns the names of the two columns meant to be used as features.
    '''
    separator = " / "

    def firstStreet(address):
        return re.sub(r'^\d+ Block of ', '', address.split(separator)[0])

    def secondStreet(address):
        pieces = address.split(separator)
        if len(pieces) > 1:
            return pieces[1]
        return None

    def leadingNumber(address):
        digits = re.findall(r'^\d+', address)
        if len(digits) > 0:
            return int(digits[0])
        return None

    def isCorner(address):
        return len(address.split(separator)) > 1

    addresses = df['Address']

    # If there are two streets, split them into separate fields.
    df['Address1'] = addresses.apply(firstStreet)
    df['Address2'] = addresses.apply(secondStreet)

    # Extract the block number; addresses without one get -1.
    df['BlockNumber'] = addresses.apply(leadingNumber)
    df['BlockNumber'] = df['BlockNumber'].fillna(-1)

    # Did the crime occur on a street corner?
    df['StreetCornerFlag'] = addresses.apply(isCorner)

    return ['StreetCornerFlag', 'BlockNumber']

In [8]:
def removeOutlierLatLon(df):
    '''Replace outlier longitudes/latitudes with the district median, in place.

    A row is treated as an outlier when X > -121 or Y > 38 (presumably
    placeholder coordinates far outside the city - confirm against the
    data). Each outlier value is replaced by the median of that column over
    all rows sharing the row's 'PdDistrict'. The median is taken over the
    district's original values, outliers included.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'X', 'Y' and 'PdDistrict' columns; modified in place.

    Returns
    -------
    list of str
        ['X', 'Y'].
    '''
    for column, upperBound in (('X', -121), ('Y', 38)):
        isOutlier = df[column] > upperBound
        if not isOutlier.any():
            # Nothing to fix; also avoids assigning from an empty selection.
            continue

        # One median per district, computed once, instead of rescanning the
        # whole frame for every outlier row.
        districtMedians = df.groupby('PdDistrict')[column].median()
        replacements = df.loc[isOutlier, 'PdDistrict'].map(districtMedians)
        # .values: assign positionally so a non-unique index cannot misalign.
        df.loc[isOutlier, column] = replacements.values

    return ['X', 'Y']

In [9]:
def recodeCategories(df):
    '''Recode the 'Category' column to integer labels, in place.

    Each category name is replaced by its position in the module-level
    ``Categories`` list and stored in a new 'CategoryRecode' column.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a 'Category' column; modified in place.

    Returns
    -------
    list of str
        ['CategoryRecode'].

    Raises
    ------
    ValueError
        If a value in 'Category' is not listed in ``Categories``.
    '''
    # Build the name -> position table once instead of scanning the list
    # for every row.
    categoryCodes = dict((name, code) for code, name in enumerate(Categories))
    recoded = df['Category'].map(categoryCodes)

    # .map yields NaN for names that are not in the table; fail loudly, as
    # list.index would have done.
    unknown = recoded.isnull()
    if unknown.any():
        raise ValueError('unknown crime categories: {}'.format(
            df.loc[unknown, 'Category'].unique().tolist()))

    df['CategoryRecode'] = recoded.astype(int)

    return ['CategoryRecode']

## Recoding Columns
Here, we want to do some recoding of the columns. To do this, we're going to use our handy-dandy helper functions.  

In [10]:
# Recode the training frame (mutated in place) and keep the list of feature
# columns that the helpers created.
crimeData, addedColumns = recodeData(crimeData, isTrain=True)

# Summary statistics as a quick sanity check of the recoded columns.
crimeData.describe()

Unnamed: 0,X,Y,Year,Month,Day,Hour,Minute,DayOfWeekRecode,MinuteOfWeek,DistrictNORTHERN,...,DistrictBAYVIEW,DistrictRICHMOND,DistrictCENTRAL,DistrictTARAVAL,DistrictTENDERLOIN,DistrictMISSION,DistrictSOUTHERN,BlockNumber,StreetCornerFlag,CategoryRecode
count,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,...,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049,878049.0
mean,-122.422763,37.767035,2008.712046,6.436509,15.570623,13.412655,20.155026,2.992691,5134.388787,0.11992,...,0.101852,0.051488,0.097329,0.074707,0.093171,0.136562,0.179013,704.776747,0.297038,19.338687
std,0.025284,0.024165,3.631194,3.428972,8.783005,6.549573,18.594915,1.972023,2858.378361,0.324869,...,0.302454,0.220991,0.296406,0.262917,0.290673,0.343384,0.383363,1001.193208,0.456954,10.688637
min,-122.513642,37.707879,2003.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,False,0.0
25%,-122.432952,37.752427,2006.0,3.0,8.0,9.0,0.0,1.0,2600.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,16.0
50%,-122.41642,37.775421,2009.0,6.0,16.0,14.0,19.0,3.0,5220.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,0,20.0
75%,-122.406959,37.784368,2012.0,9.0,23.0,19.0,33.0,5.0,7606.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1,25.0
max,-122.364937,37.819975,2015.0,12.0,31.0,23.0,59.0,6.0,10079.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8300.0,True,38.0


## Model Iteration
Now that we've done some recoding, we're going to create our model: a random forest classifier. The goal of this model is really just to give us a baseline submission.

In [12]:
# Features fed to the model: a hand-picked subset of the columns produced by
# recodeData (the complete list is in addedColumns). 'StreetCornerFlag' can be
# appended to this list to try the address feature as well.
columnsToUse = ['X','Y', 'Year', 'Month', 'Day','Hour', 'Minute',
       'DayOfWeekRecode', 'DistrictNORTHERN', 'DistrictPARK',
       'DistrictINGLESIDE', 'DistrictBAYVIEW', 'DistrictRICHMOND',
       'DistrictCENTRAL', 'DistrictTARAVAL', 'DistrictTENDERLOIN',
       'DistrictMISSION', 'DistrictSOUTHERN']

X = crimeData[columnsToUse]
y = crimeData['CategoryRecode']

clf = RandomForestClassifier(n_estimators=30, max_depth = 20, random_state=1, n_jobs = 1, verbose=2)

# Three stratified 50/50 train/test splits (sklearn 0.17 cross_validation API:
# the labels are passed to the constructor and the object is iterated directly).
k_folds = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)

scores = []

for train, test in k_folds:
    # train / test are positional row indices, hence .iloc.
    clf.fit(X.iloc[train], y.iloc[train])
    probs = clf.predict_proba(X.iloc[test])
    # Multi-class log loss on the held-out half.
    score = log_loss(y.iloc[test].values, probs)
    print(score)
    scores.append(score)

print(scores)
print("Average: " + str(np.average(scores)))

building tree 1 of 30
building tree 2 of 30
building tree 3 of 30
building tree 4 of 30
building tree 5 of 30
building tree 6 of 30
building tree 7 of 30
building tree 8 of 30
building tree 9 of 30
building tree 10 of 30
building tree 11 of 30
building tree 12 of 30
building tree 13 of 30
building tree 14 of 30
building tree 15 of 30
building tree 16 of 30
building tree 17 of 30
building tree 18 of 30
building tree 19 of 30
building tree 20 of 30
building tree 21 of 30
building tree 22 of 30
building tree 23 of 30
building tree 24 of 30
building tree 25 of 30
building tree 26 of 30
building tree 27 of 30
building tree 28 of 30
building tree 29 of 30
building tree 30 of 30

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   31.5s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    9.5s finished



2.61100651506
building tree 1 of 30
building tree 2 of 30
building tree 3 of 30
building tree 4 of 30
building tree 5 of 30
building tree 6 of 30
building tree 7 of 30
building tree 8 of 30
building tree 9 of 30
building tree 10 of 30
building tree 11 of 30
building tree 12 of 30
building tree 13 of 30
building tree 14 of 30
building tree 15 of 30
building tree 16 of 30
building tree 17 of 30
building tree 18 of 30
building tree 19 of 30
building tree 20 of 30
building tree 21 of 30
building tree 22 of 30
building tree 23 of 30
building tree 24 of 30
building tree 25 of 30
building tree 26 of 30
building tree 27 of 30
building tree 28 of 30
building tree 29 of 30
building tree 30 of 30

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   30.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    9.3s finished



2.59563500804
building tree 1 of 30
building tree 2 of 30
building tree 3 of 30
building tree 4 of 30
building tree 5 of 30
building tree 6 of 30
building tree 7 of 30
building tree 8 of 30
building tree 9 of 30
building tree 10 of 30
building tree 11 of 30
building tree 12 of 30
building tree 13 of 30
building tree 14 of 30
building tree 15 of 30
building tree 16 of 30
building tree 17 of 30
building tree 18 of 30
building tree 19 of 30
building tree 20 of 30
building tree 21 of 30
building tree 22 of 30
building tree 23 of 30
building tree 24 of 30
building tree 25 of 30
building tree 26 of 30
building tree 27 of 30
building tree 28 of 30
building tree 29 of 30
building tree 30 of 30

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   31.4s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   11.6s finished



2.60444391898
[2.6110065150577122, 2.5956350080376058, 2.6044439189789244]
Average: 2.60369514736


To make a submission file, we modified the submission function from [this script](https://www.kaggle.com/shifanmao/sf-crime/random-forest-2/code). 

In [13]:
def make_submission(clf, predictors, path='my_submission.csv', test_path='test.csv'):
    '''Write a submission file with class probabilities for the test set.

    Parameters
    ----------
    clf : fitted classifier
        Must provide ``predict_proba`` and ``classes_``; its classes are
        expected to be the integer labels stored in 'CategoryRecode'.
    predictors : list of str
        Feature columns to feed to the model; they must be the columns the
        model was fitted on.
    path : str, optional
        Where to write the submission csv.
    test_path : str, optional
        Location of the test csv. Note that the training data in this
        notebook is read from './../train.csv', so this may need to be set
        to the matching location.

    The output has an 'Id' column followed by one probability column per
    entry of ``Categories``, in that order.
    '''
    test_data = pd.read_csv(test_path)

    # Apply the same recoding as for the training data (isTrain stays False
    # because the test set carries no labels).
    test_data, _ = recodeData(test_data)

    predictions = clf.predict_proba(test_data[predictors])

    # The columns of predict_proba follow clf.classes_, so name each column
    # after the class it really belongs to rather than assuming that
    # column i is category i.
    predictedNames = [Categories[int(code)] for code in clf.classes_]
    submission = pd.DataFrame(predictions, columns=predictedNames,
                              index=test_data.index)

    # Categories the model never saw get probability 0, and the columns are
    # put in the canonical order.
    submission = submission.reindex(columns=Categories, fill_value=0.0)
    submission.insert(0, 'Id', test_data['Id'])

    submission.to_csv(path, index=False)

    print(" -- Wrote submission to file {}.".format(path))


Now we fit the model with all the data and make a submission file. 

In [None]:
# Refit on the complete training set (the cross-validation loop only ever
# trained on half of it) before predicting on the test data.
clf.fit(X, y)
print("model fitted with all data")

In [None]:
# Predict with the model fitted above, using the same feature columns it was
# trained on, and write the file to the default path 'my_submission.csv'.
make_submission(clf, columnsToUse)

This model received a score of 2.491 with proper parameter tuning. This isn't bad, but let's see if we can't get better. 