# Model Iteration - For Super Computer
By Sophia and Mac I
This notebook is meant to be run on deepthought. (As a result, we won't execute all the cells.) What we've done in this model is just increase the number/depth of trees that we are making with the random forest. 

## Importing Everything!

In [None]:
import pandas as pd
import numpy as np
import itertools
import re

import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib import cm
from datetime import datetime
from ipywidgets import widgets  
from IPython.display import display


from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.metrics import log_loss

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

print "imports done"

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint


## Loading in the data
Now, I need to load in the data.

In [None]:
# Load the SF crime training data; expects 'train.csv' in the working directory.
readData = pd.read_csv('train.csv')

print("dataRead")

# Crime category names. recodeCategories() encodes each row's Category as its
# position in this list and make_submission() uses the same list for the
# probability column names, so the order of the entries must not change.
Categories = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
              'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
              'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
              'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
              'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
              'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
              'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
              'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
              'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
              'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
              'WARRANTS', 'WEAPON LAWS']

## Helper functions for recoding data
Here are the helper functions for recoding data. We'll add more as we create some new features

In [None]:
def recodeData(df, isTrain = False):
    '''Recode a raw SF crime dataframe into model-ready features.

    Runs the outlier, date, police-district and address helpers in turn.
    When isTrain is True it additionally encodes the crime category and
    drops the 'Descript' and 'Resolution' columns.

    Parameters:
        df: dataframe as loaded from the crime CSV files.
        isTrain: True when df is the training data (it must then contain a
            'Category' column).

    Returns:
        (df, addedColumns, streetColumns): the recoded dataframe, the names
        of the feature columns added by the helpers, and the names of the
        street dummy columns created by recodeAddresses.
    '''
    # Some helpers modify df in place, but recodeAddresses builds a new
    # frame, so always carry on with the dataframe each helper returns.
    df, newLatLon = removeOutlierLatLon(df)
    df, newDate = recodeDates(df)
    df, newDistrict = recodePoliceDistricts(df)
    print("recoding addresses")
    df, newAddress, streetColumns = recodeAddresses(df)

    addedColumns = newDate + newDistrict + newLatLon + newAddress

    if isTrain:
        # recodeCategories returns (df, columns); unpack both so that only
        # the column names (not the dataframe itself) are appended.
        df, newCategory = recodeCategories(df)
        addedColumns += newCategory
        try:
            df.drop(['Descript', 'Resolution'], axis=1, inplace=True)
        except (KeyError, ValueError):
            # pandas raises KeyError (ValueError in older releases) when the
            # columns are missing, e.g. because they were already removed.
            print("already recoded or using test data")

    return df, addedColumns, streetColumns

In [None]:
def recodeDates(df):
    '''Split the 'Dates' timestamp column into numeric calendar features.

    Adds a parsed 'DateTime' column plus 'Year', 'Month', 'Day', 'Hour',
    'Minute' and 'DayOfWeekRecode' (Monday = 0 ... Sunday = 6) to df in
    place.

    Parameters:
        df: dataframe with a 'Dates' column formatted 'YYYY-MM-DD HH:MM:SS'.

    Returns:
        (df, columns): the same dataframe and the names of the six numeric
        columns that were added ('DateTime' itself is not listed).
    '''
    # Parse the whole column in one vectorized call instead of running
    # datetime.strptime in a Python lambda for every row.
    df['DateTime'] = pd.to_datetime(df['Dates'], format='%Y-%m-%d %H:%M:%S')

    timestamps = df['DateTime'].dt
    df['Year'] = timestamps.year
    df['Month'] = timestamps.month
    df['Day'] = timestamps.day
    df['Hour'] = timestamps.hour
    df['Minute'] = timestamps.minute
    df['DayOfWeekRecode'] = timestamps.weekday

    return df, ['Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode']

In [None]:
def recodePoliceDistricts(df):
    '''One-hot encode the 'PdDistrict' column.

    For every distinct district value, a 0/1 indicator column named
    'District<value>' is added to df in place.

    Returns:
        (df, newColumns): the same dataframe and the names of the indicator
        columns, one per distinct district.
    '''
    districtSeries = df['PdDistrict']
    newColumns = []
    for name in districtSeries.unique().tolist():
        columnName = 'District' + name
        # 1 where the row belongs to this district, 0 otherwise
        df[columnName] = (districtSeries == name).astype(int)
        newColumns.append(columnName)

    return df, newColumns

In [None]:
def recodeAddresses(df):
    '''This function will attempt to create some features related to the address field in the database. To do this, 
    first, we need to split up the address field into two different address fields'''
    
    #If there are two addresss, split fields. Also extract the block number
    df['Address1'] = df['Address'].apply(lambda x: re.sub(r'^\d+ Block of ','',x.split(" / ")[0]))
    df['Address2'] = df['Address'].apply(lambda x: (x.split(" / ")[1]) if (len(x.split(" / ")) > 1) else '')
    
    streets = set(df['Address1'].unique().tolist() + df['Address2'].unique().tolist())
    
    streetColumns = []
    i = 0
    
    print "starting address dummy creation"
    
    #address1Dummy = pd.get_dummies(df['Address1']).rename(columns=lambda x: str(x))
    address1Dummy = pd.get_dummies(df['Address1'])
    address1Dummy = address1Dummy.replace(0, np.nan)
    
    print "completed address 1 dummy creation"
    address2Dummy = pd.get_dummies(df['Address2'])
    address2Dummy = address2Dummy.replace(0, np.nan)
    
    print "completed address 2 dummy creation"
    
#     print address1Dummy
#     print address2Dummy.info()
    
    mergedAddressDummy = address1Dummy.combine_first(address2Dummy)
    
    print "completed address dummy DataFrames merge"
    mergedAddressDummy = mergedAddressDummy.fillna(0)
    print "completed fillna on mergedAddressDummy"
    
    streetColumns = list(mergedAddressDummy.columns.values)
    
    df = pd.concat([df, mergedAddressDummy], axis=1)
    print "completed merge of original df and new dummy variable df"
    
    print "Dummy creation finished"
    
#     for street in streets:
#         df['OnStreet' + street] = df.apply(lambda x: (x['Address1'] == street or x['Address2'] == street), axis=1)
#         streetColumns.append('OnStreet' + street)
#         i += 1
#         print i
        

    
    df['BlockNumber'] = df['Address'].apply(lambda x: int(re.findall(r'^\d+',x)[0]) if (len(re.findall(r'^\d+',x)) > 0) else None )
    df['BlockNumber'] = df['BlockNumber'].fillna(-1)
    
    #Also add the "did the crime occur on a street corner field?"
    df['StreetCornerFlag'] = df['Address'].apply(lambda x: len(x.split(" / ")) > 1)
    
    return df, ['StreetCornerFlag', 'BlockNumber'], streetColumns

In [None]:
def removeOutlierLatLon(df):
    '''Replace outlier coordinates with the median of their police district.

    A value in column 'X' greater than -121, or in column 'Y' greater than
    38, is treated as an outlier and overwritten in place with the median of
    that column over all rows sharing the row's 'PdDistrict' (the median is
    taken over the original values, outliers included).

    Returns:
        (df, columns): the same dataframe and the list ['X', 'Y'].
    '''
    for column, upperLimit in (('X', -121), ('Y', 38)):
        outliers = df[column] > upperLimit
        if not outliers.any():
            continue
        # One grouped pass gives every row its district median, instead of
        # re-filtering the whole frame once per outlier row.
        districtMedian = df.groupby('PdDistrict')[column].transform('median')
        df.loc[outliers, column] = districtMedian[outliers].values

    return df, ['X', 'Y']

In [None]:
def recodeCategories(df):
    '''Encode the 'Category' column as integer class labels.

    Adds a 'CategoryRecode' column to df in place holding, for each row, the
    position of its category name in the module-level Categories list.

    Returns:
        (df, columns): the same dataframe and the list ['CategoryRecode'].

    Raises:
        ValueError: if a category in df is not present in Categories.
    '''
    # Build the name -> position table once so each row is a dictionary
    # lookup rather than a linear scan of Categories.
    categoryCodes = dict((name, code) for code, name in enumerate(Categories))

    unknown = [name for name in df['Category'].unique()
               if name not in categoryCodes]
    if unknown:
        raise ValueError('categories not in Categories: %r' % (unknown,))

    df['CategoryRecode'] = df['Category'].map(categoryCodes)

    return df, ['CategoryRecode']

## Recoding Columns
Here, we want to do some recoding of the columns. To do this, we're going to use our handy-dandy helper functions.  

In [None]:
# Recode the training frame. isTrain=True makes recodeData also call
# recodeCategories and drop the 'Descript' / 'Resolution' columns.
crimeData, addedColumns, streetColumns = recodeData(
    readData, isTrain = True)
# Summary statistics of the recoded frame, shown as the cell output.
crimeData.describe()


In [None]:
# Display the street dummy column names returned by recodeData.
streetColumns

In [None]:
# Preview the first five rows of the recoded training frame.
crimeData.head(5)

This is a list of the common streets corresponding to each category of crime (in the same order as the list above), generated using the same method as in the exploring street data. We'll use those to learn just the one-hot encoding of those streets. 

In [None]:
# Hard-coded street names (values as they appear in the street dummy column
# names). NOTE(review): this list is not referenced by any other cell in
# this part of the notebook - confirm whether it is still needed.
commonStreets = ['FOLSOM ST',
 '16TH ST',
 'JONES ST',
 'TAYLOR ST',
 'ARMSTRONG AV',
 'EDDY ST',
 'LARKIN ST',
 'CASTRO ST',
 '10TH AV',
 '5TH ST',
 'HAIGHT ST',
 'OFARRELL ST',
 '11TH AV',
 'PAGE ST',
 'FITCH ST',
 'CAPP ST',
 '13TH ST',
 '24TH AV',
 '17TH ST',
 '18TH ST',
 '19TH ST',
 'GENEVA AV',
 'GEARY BL',
 'BRYANT ST',
 'HYDE ST',
 '4TH ST',
 'FULTON ST',
 'LEAVENWORTH ST',
 'COLE ST',
 'ALEMANY BL',
 'PHELPS ST',
 'MISSION ST',
 '6TH ST',
 '12TH AV',
 'SHOTWELL ST',
 'TREAT AV',
 '7TH ST',
 'JEFFERSON ST',
 'QUESADA AV',
 'TURK ST',
 '2ND ST',
 'MARKET ST',
 'GGBRIDGE HY',
 '24TH ST',
 'CAPITOL AV',
 'KEARNY ST',
 'HARRISON ST',
 'LYON ST',
 'BUSH ST',
 'POLK ST',
 '3RD ST',
 'ELLIS ST',
 'SOUTH VAN NESS AV',
 'POTRERO AV',
 '20TH ST',
 'POWELL ST']

## Model Iteration 1
Now that I've done some recoding, I'm going to create my model. To do this, I'm going to do a random forest classifier. 

In [None]:
# Predictors for this iteration, listed explicitly rather than taken from
# addedColumns. 'Day' and the street dummy columns are not included.
columnsToUse = ['X','Y', 'Year', 'Month', 'Hour', 'Minute',
       'DayOfWeekRecode', 'DistrictNORTHERN', 'DistrictPARK',
       'DistrictINGLESIDE', 'DistrictBAYVIEW', 'DistrictRICHMOND',
       'DistrictCENTRAL', 'DistrictTARAVAL', 'DistrictTENDERLOIN',
       'DistrictMISSION', 'DistrictSOUTHERN', 'StreetCornerFlag', 'BlockNumber']

# Feature matrix and integer-encoded target taken from the recoded frame.
X = crimeData[columnsToUse]
y = crimeData['CategoryRecode']

# 400 trees, each capped at depth 20; random_state fixes the seed.
clf = RandomForestClassifier(n_estimators=400, max_depth = 20, random_state=1, n_jobs = 1, verbose=2)

In [None]:


# Three stratified shuffle splits, each holding out half of the rows.
k_folds = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)

scores = []
print("starting kfold")

# Fit on each training half and score the held-out half with log loss.
for train, test in k_folds:
    clf.fit(X.iloc[train], y.iloc[train])
    probs = clf.predict_proba(X.iloc[test])
    score = log_loss(y.iloc[test].values, probs)
    print(score)
    scores.append(score)

print(scores)
print("Average: " + str(np.average(scores)))

To make a submission file, we modified the submission function from [this script](https://www.kaggle.com/shifanmao/sf-crime/random-forest-2/code). 

In [None]:
def make_submission(clf, Xtrain, ytrain, predictors, path='my_submission.csv'):
    '''Fit clf on the training data and write a submission file for test.csv.

    Reads and recodes 'test.csv', keeps the predictors that exist in the
    recoded test data, refits clf on those columns of Xtrain, and writes the
    predicted class probabilities (one column per category, plus 'Id') to
    path as CSV.

    Parameters:
        clf: classifier providing fit, predict_proba and classes_; it is
            refitted here, so any previous fit is discarded.
        Xtrain: training dataframe containing the predictor columns.
        ytrain: training labels, encoded as positions in Categories.
        predictors: names of the columns to use as features.
        path: output CSV path.

    Returns:
        The list of predictors actually used, in the order given by the
        caller, without duplicates.
    '''
    test_data = pd.read_csv('test.csv')
    print("test data loaded")

    test_data, _, _ = recodeData(test_data)
    print("test data recoded")

    # Keep only predictors present in the recoded test data. The caller's
    # order is preserved; a set intersection would order the features
    # arbitrarily and so make the fitted model differ between runs.
    testDataColumns = set(test_data.columns.values)
    existingPredictors = []
    for name in predictors:
        if name in testDataColumns and name not in existingPredictors:
            existingPredictors.append(name)

    clf.fit(Xtrain[existingPredictors], ytrain)
    print("model fitted with all data")

    predictions = clf.predict_proba(test_data[existingPredictors])

    submission = pd.DataFrame({
        'Id': test_data.Id
    })

    # predict_proba columns follow clf.classes_, so label each column via
    # the class it belongs to rather than assuming column i is category i
    # (the two only coincide when every category occurs in ytrain).
    for column, label in enumerate(clf.classes_):
        submission[Categories[int(label)]] = predictions[:, column]
    submission.to_csv(path, index=False)

    print(" -- Wrote submission to file {}.".format(path))
    return existingPredictors


This model performed better than previous models, scoring 2.29871! Adding street information appears to have helped! Yay!

In [None]:
# Refit clf on the full training set, write the submission file, and keep
# the list of predictors that were actually used.
predictorsUsed = make_submission(clf, X, y, columnsToUse)

In [None]:
# Hard-coded predictor list: street dummy columns mixed with the date,
# district, coordinate and address features. It is used to build the feature
# matrix for the randomized search further down.
# NOTE(review): presumably saved from an earlier make_submission run - confirm.
predictorsUsedSaved = ['FOLSOM ST',
 '16TH ST',
 '13TH ST',
 'TAYLOR ST',
 'ARMSTRONG AV',
 'EDDY ST',
 'TREAT AV',
 'LARKIN ST',
 'CASTRO ST',
 'StreetCornerFlag',
 '10TH AV',
 '5TH ST',
 'HAIGHT ST',
 'DistrictNORTHERN',
 'OFARRELL ST',
 'QUESADA AV',
 '11TH AV',
 'DistrictPARK',
 'PAGE ST',
 'FITCH ST',
 'DistrictTARAVAL',
 'JONES ST',
 'GEARY BL',
 '17TH ST',
 '18TH ST',
 '19TH ST',
 'BRYANT ST',
 'HYDE ST',
 '4TH ST',
 'GENEVA AV',
 'Minute',
 'COLE ST',
 '7TH ST',
 'DistrictBAYVIEW',
 'ALEMANY BL',
 'PHELPS ST',
 'MISSION ST',
 'Hour',
 '6TH ST',
 'DistrictSOUTHERN',
 'CAPP ST',
 'SHOTWELL ST',
 '12TH AV',
 'DistrictTENDERLOIN',
 'DayOfWeekRecode',
 'JEFFERSON ST',
 'Month',
 'Y',
 'X',
 'TURK ST',
 '2ND ST',
 'MARKET ST',
 'GGBRIDGE HY',
 'DistrictCENTRAL',
 'DistrictINGLESIDE',
 '24TH AV',
 '24TH ST',
 'CAPITOL AV',
 'KEARNY ST',
 'HARRISON ST',
 'FULTON ST',
 'BUSH ST',
 'POLK ST',
 'DistrictRICHMOND',
 'BlockNumber',
 '3RD ST',
 'ELLIS ST',
 'SOUTH VAN NESS AV',
 'LEAVENWORTH ST',
 'POTRERO AV',
 'Year',
 '20TH ST',
 'DistrictMISSION',
 'POWELL ST',
 'LYON ST']


Best Score: 	
2.44156

The code below is us experimenting with grid search. We didn't find this particularly helpful, but it was fun to try out. 

In [None]:
# Print a summary of the best-scoring parameter settings of a search
def report(grid_scores, n_top=3):
    '''Print the n_top entries of grid_scores with the highest mean score.

    Each entry is expected to expose its mean validation score at position 1
    and the attributes mean_validation_score, cv_validation_scores and
    parameters. Nothing is returned.
    '''
    ranked = sorted(grid_scores, key=itemgetter(1), reverse=True)
    for rank, entry in enumerate(ranked[:n_top], 1):
        spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              entry.mean_validation_score, spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")

# Base estimator for the search; the tree settings that were fixed in the
# earlier model (n_estimators, max_depth, ...) are supplied by param_dist.
clfOpt = RandomForestClassifier(verbose=3)

# 4 * 4 * 3 * 2 = 96 possible parameter combinations
param_dist = {"max_depth": [5, 10, 20 , None],
              "n_estimators": [50, 200, 400, 800],
              "min_samples_split": [2, 10, 100],
              "max_features": ['auto', None]}

# run randomized search over 48 sampled parameter settings
n_iter_search = 48
random_search = RandomizedSearchCV(clfOpt, param_distributions=param_dist,
                                   n_iter=n_iter_search, scoring="log_loss", verbose=3, n_jobs = 12)

# Select from crimeData rather than X: X only holds the columnsToUse
# columns, which include none of the street dummy columns listed in
# predictorsUsedSaved, so X[predictorsUsedSaved] would raise a KeyError.
finalX = crimeData[predictorsUsedSaved].values.astype(np.float32)
finaly = y.values.astype(int)
print("pre Random search finished")

In [None]:
print("starting Randomized Search")
# Time the whole search, then print the top-ranked parameter settings.
start = time()
random_search.fit(finalX, finaly)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

In [None]:
# finalX was already cast to float32 when it was built, so this yields
# another float32 array with the same values.
finalXfloat = finalX.astype(np.float32)

In [None]:
# Show the element dtype of the feature matrix.
finalXfloat.dtype

In [None]:
# Show the size of the feature matrix in bytes.
finalXfloat.nbytes

We didn't really get much of an improvement in score using this, but maybe there are other data sets that perform very well with a grid-search approach. 