Sophia and Mac-I

This notebook uses parameters tuned to run on deepthought. Unsurprisingly, a boost in computational power also leads to a boost in model performance.

## Importing everything

In [5]:
import pandas as pd
import numpy as np
import itertools
import re

# import matplotlib.pyplot as plt
# from matplotlib.patches import Polygon
# from matplotlib.collections import PatchCollection
# from matplotlib import cm
from datetime import datetime
from ipywidgets import widgets  
from IPython.display import display


from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.metrics import log_loss

from time import time
from operator import itemgetter
from scipy.stats import randint as sp_randint

from sklearn.grid_search import GridSearchCV, RandomizedSearchCV

import xgboost as xgb

print "imports imported"

imports imported


## Reading in and Recoding Data

In [6]:
# Load the raw training data; recodeData() turns it into model features.
crimeData = pd.read_csv('train.csv')
# Crime category labels. A category's position in this list is its integer
# class code (see recodeCategories) and also selects the matching column name
# in the submission file (see make_submission), so the order must not change.
Categories = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
              'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
              'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
              'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
              'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
              'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
              'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
              'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
              'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
              'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
              'WARRANTS', 'WEAPON LAWS']

In [7]:
def recodeData(df, isTrain = False):
    '''Recode a raw SF crime dataframe into a frame of model-ready features.

    Runs every recoding step (lat/lon outliers, dates, police districts,
    weather, addresses) and, for training data, the category recoding.

    Parameters:
        df: dataframe as loaded from train.csv or test.csv.
        isTrain: when True, also add the 'CategoryRecode' target column and
            drop the training-only 'Descript' and 'Resolution' columns.

    Returns:
        (df, addedColumns, streetColumns) where df is the recoded dataframe,
        addedColumns lists the names of the feature columns that were added
        and streetColumns lists the per-street dummy column names.
    '''

    #Some steps (the weather merge, the address dummy concat) build a new
    #dataframe, so the frame returned by each helper must be carried forward.
    #Each helper also returns the list of columns it added.
    df, newLatLon = removeOutlierLatLon(df)
    df, newDate = recodeDates(df)
    df, newDistrict = recodePoliceDistricts(df)
    #addWeather needs the 'DateTime' column created by recodeDates
    df, newWeather = addWeather(df)
    print("recoding addresses")
    df, newAddress, streetColumns = recodeAddresses(df)

    addedColumns = newDate + newDistrict + newLatLon + newWeather + newAddress

    if (isTrain):
        #recodeCategories returns (df, columns); unpack it so that only the
        #column names (not the dataframe itself) end up in addedColumns.
        df, newCategory = recodeCategories(df)
        addedColumns += newCategory

        #Only drop the columns that are actually present, instead of hiding
        #every possible error behind a bare except.
        columnsToDrop = [column for column in ['Descript', 'Resolution']
                         if column in df.columns]
        if columnsToDrop:
            df.drop(columnsToDrop, axis=1, inplace=True)
        else:
            print("already recoded or using test data")

    return df, addedColumns, streetColumns

In [8]:
def recodeDates(df):
    '''Split the 'Dates' timestamp strings into numeric date/time features.

    Adds a parsed 'DateTime' column plus 'Year', 'Month', 'Day', 'Hour',
    'Minute', 'DayOfWeekRecode' (Monday = 0 ... Sunday = 6) and
    'MinuteOfWeek' (minutes elapsed since Monday 00:00) to df in place.

    Returns the dataframe and the list of added feature column names
    ('DateTime' itself is a helper column and is not listed).
    '''
    #Parse the whole column at once; this is much faster than calling
    #datetime.strptime row by row on a large frame.
    df['DateTime'] = pd.to_datetime(df['Dates'], format='%Y-%m-%d %H:%M:%S')
    timestamps = df['DateTime'].dt

    df['Year'] = timestamps.year
    df['Month'] = timestamps.month
    df['Day'] = timestamps.day
    df['Hour'] = timestamps.hour
    df['Minute'] = timestamps.minute
    df['DayOfWeekRecode'] = timestamps.dayofweek
    df['MinuteOfWeek'] = (timestamps.dayofweek * 24 * 60
                          + timestamps.hour * 60
                          + timestamps.minute)

    return df, ['Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode','MinuteOfWeek']

In [9]:
def recodePoliceDistricts(df):
    '''One-hot encode the 'PdDistrict' column.

    For every distinct district value a 0/1 column named
    'District<NAME>' is added to df in place. Returns the dataframe and the
    list of new column names, in order of first appearance of each district.
    '''
    newColumns = []
    for district in df['PdDistrict'].unique().tolist():
        columnName = 'District' + district
        inDistrict = df['PdDistrict'] == district
        df[columnName] = inDistrict.astype('int64')
        newColumns.append(columnName)

    return df, newColumns

In [10]:
def _extractBlockNumber(address):
    '''Return the leading block number of an address as an int, or None when
    the address does not start with digits.'''
    match = re.match(r'\d+', address)
    return int(match.group(0)) if match else None


def recodeAddresses(df):
    '''Create features from the free-text 'Address' column.

    An address is either "<N> Block of <STREET>" or an intersection written
    as "<STREET A> / <STREET B>". This function adds:
      * 'Address1' / 'Address2': the first and (if any, else '') second street,
      * one dummy column per street name, set to 1 when the crime happened on
        that street (as either the first or the second street),
      * 'BlockNumber': the leading block number, or -1 when there is none,
      * 'StreetCornerFlag': True when the address is an intersection.

    Returns (df, ['StreetCornerFlag', 'BlockNumber'], streetColumns) where df
    is a new dataframe including the dummy columns and streetColumns lists
    the dummy column names.
    '''

    #If there are two addresss, split fields. Also strip the block number prefix
    df['Address1'] = df['Address'].apply(lambda x: re.sub(r'^\d+ Block of ','',x.split(" / ")[0]))
    df['Address2'] = df['Address'].apply(lambda x: (x.split(" / ")[1]) if (len(x.split(" / ")) > 1) else '')

    print("starting address dummy creation")

    #Zeros are turned into NaN so that combine_first below can fill the gaps
    #of the first-street dummies with the second-street dummies.
    address1Dummy = pd.get_dummies(df['Address1'])
    address1Dummy = address1Dummy.replace(0, np.nan)

    print("completed address 1 dummy creation")
    #Note: rows without a second street have Address2 == '', so this frame
    #(and therefore streetColumns) also contains a column named ''.
    address2Dummy = pd.get_dummies(df['Address2'])
    address2Dummy = address2Dummy.replace(0, np.nan)

    print("completed address 2 dummy creation")

    mergedAddressDummy = address1Dummy.combine_first(address2Dummy)

    print("completed address dummy DataFrames merge")
    mergedAddressDummy = mergedAddressDummy.fillna(0)
    print("completed fillna on mergedAddressDummy")

    streetColumns = list(mergedAddressDummy.columns.values)

    df = pd.concat([df, mergedAddressDummy], axis=1)
    print("completed merge of original df and new dummy variable df")

    print("Dummy creation finished")

    #Addresses without a leading number (e.g. intersections) get -1
    df['BlockNumber'] = df['Address'].apply(_extractBlockNumber)
    df['BlockNumber'] = df['BlockNumber'].fillna(-1)

    #Also add the "did the crime occur on a street corner field?"
    df['StreetCornerFlag'] = df['Address'].apply(lambda x: len(x.split(" / ")) > 1)

    return df, ['StreetCornerFlag', 'BlockNumber'], streetColumns

In [11]:
def removeOutlierLatLon(df):
    '''Replace outlier coordinates with the median of their police district.

    A longitude 'X' greater than -121 or a latitude 'Y' greater than 38 is
    treated as an outlier and overwritten in place with the median X (or Y)
    of all rows sharing the same 'PdDistrict'. Frames without outliers are
    returned unchanged.

    Returns the dataframe and the list of affected columns, ['X', 'Y'].
    '''
    for column, upperBound in (('X', -121), ('Y', 38)):
        isOutlier = df[column] > upperBound
        if not isOutlier.any():
            #Nothing to fix; avoids assigning from an empty selection
            continue
        #One median per district, computed once instead of once per outlier row
        districtMedians = df.groupby('PdDistrict')[column].median()
        df.loc[isOutlier, column] = df.loc[isOutlier, 'PdDistrict'].map(districtMedians)

    return df, ['X', 'Y']

In [12]:
def addWeather(df, weatherPath='weather1.csv'):
    '''Join daily weather observations onto the crime data.

    Adds an integer 'DATE' key (YYYYMMDD, derived from the 'DateTime' column
    created by recodeDates) and merges the 'PRCP', 'TMAX' and 'TMIN' columns
    of the weather file on that key.

    Parameters:
        df: dataframe containing a 'DateTime' column.
        weatherPath: CSV file with 'DATE', 'PRCP', 'TMAX' and 'TMIN' columns.

    Returns:
        (df, ['PRCP', 'TMAX', 'TMIN']) where df is a new, merged dataframe.
        Rows are matched with a left join, so crimes on a date without a
        weather record are kept (with NaN weather values) instead of being
        silently dropped.
    '''
    df['DATE'] = df['DateTime'].apply(lambda x: x.year * 10000 + x.month * 100 + x.day)

    weatherColumns = ['PRCP','TMAX','TMIN']
    weatherData = pd.read_csv(weatherPath)
    weatherData = weatherData[['DATE'] + weatherColumns]
    #-9999 marks a missing observation; it may have been parsed as a number
    #or left as a string, so replace both forms.
    weatherData = weatherData.replace(['-9999', -9999], np.nan)

    df = pd.merge(df, weatherData, on='DATE', how='left')

    return df, weatherColumns

In [13]:
def recodeCategories(df):
    '''Encode each row's crime category as an integer class label.

    The label is the position of the row's 'Category' value in the
    module-level Categories list and is stored in a new 'CategoryRecode'
    column (added to df in place). A category missing from Categories raises
    ValueError.

    Returns the dataframe and the list of added columns, ['CategoryRecode'].
    '''
    categoryCodes = df['Category'].apply(Categories.index)
    df['CategoryRecode'] = categoryCodes

    return df, ['CategoryRecode']

In [14]:
# Recode the training data. isTrain = True additionally requests the
# category recoding and the removal of the Descript/Resolution columns.
crimeData, addedColumns, streetColumns = recodeData(
    crimeData, isTrain = True)
# Summary statistics of the recoded frame, shown as the cell output.
crimeData.describe()

recoding addresses
starting address dummy creation
completed address 1 dummy creation
completed address 2 dummy creation
completed address dummy DataFrames merge
completed fillna on mergedAddressDummy
completed merge of original df and new dummy variable df
Dummy creation finished


Unnamed: 0,X,Y,Year,Month,Day,Hour,Minute,DayOfWeekRecode,MinuteOfWeek,DistrictNORTHERN,...,YOUNG CT,YUKON ST,ZAMPA LN,ZENO PL,ZIRCON PL,ZOE ST,ZOO RD,BlockNumber,StreetCornerFlag,CategoryRecode
count,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,...,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049,878049.0
mean,-122.422763,37.767035,2008.712046,6.436509,15.570623,13.412655,20.155026,2.992691,5134.388787,0.11992,...,2.6e-05,2.6e-05,1.8e-05,5e-06,1.5e-05,9.1e-05,0.000114,704.776747,0.2970381,19.338687
std,0.025284,0.024165,3.631194,3.428972,8.783005,6.549573,18.594915,1.972023,2858.378361,0.324869,...,0.005118,0.005118,0.004269,0.002134,0.003848,0.009545,0.010671,1001.193208,0.4569537,10.688637
min,-122.513642,37.707879,2003.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,-1.0,False,0.0
25%,-122.432952,37.752427,2006.0,3.0,8.0,9.0,0.0,1.0,2600.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0,16.0
50%,-122.41642,37.775421,2009.0,6.0,16.0,14.0,19.0,3.0,5220.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,300.0,0,20.0
75%,-122.406959,37.784368,2012.0,9.0,23.0,19.0,33.0,5.0,7606.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1000.0,1,25.0
max,-122.364937,37.819975,2015.0,12.0,31.0,23.0,59.0,6.0,10079.0,1.0,...,1.0,1.0,1.0,1.0,1.0,1.0,1.0,8300.0,True,38.0


## Learn a Model

In [15]:
# NOTE: this first assignment is overridden by the explicit columnsToUse list
# further down in this cell.
columnsToUse = addedColumns

# Hand-picked list of street names.
# NOTE(review): not referenced anywhere else in the visible part of the notebook.
commonStreets = ['FOLSOM ST','16TH ST','JONES ST','TAYLOR ST',
                 'ARMSTRONG AV','EDDY ST','LARKIN ST','CASTRO ST',
                 '10TH AV','5TH ST','HAIGHT ST','OFARRELL ST',
                 '11TH AV','PAGE ST','FITCH ST','CAPP ST','13TH ST',
                 '24TH AV','17TH ST','18TH ST','19TH ST','GENEVA AV',
                 'GEARY BL','BRYANT ST','HYDE ST','4TH ST','FULTON ST',
                 'LEAVENWORTH ST','COLE ST','ALEMANY BL','PHELPS ST',
                 'MISSION ST','6TH ST','12TH AV','SHOTWELL ST',
                 'TREAT AV','7TH ST','JEFFERSON ST','QUESADA AV',
                 'TURK ST','2ND ST','MARKET ST','GGBRIDGE HY',
                 '24TH ST','CAPITOL AV','KEARNY ST','HARRISON ST',
                 'LYON ST','BUSH ST','POLK ST','3RD ST','ELLIS ST',
                 'SOUTH VAN NESS AV','POTRERO AV','20TH ST','POWELL ST']

# Features fed to the model: coordinates, date/time parts, the one-hot police
# district columns, the street-corner flag and the daily weather columns.
columnsToUse = ['X','Y', 'Year', 'Month', 'Day','Hour', 'Minute',
       'DayOfWeekRecode', 'DistrictNORTHERN', 'DistrictPARK',
       'DistrictINGLESIDE', 'DistrictBAYVIEW', 'DistrictRICHMOND',
       'DistrictCENTRAL', 'DistrictTARAVAL', 'DistrictTENDERLOIN',
       'DistrictMISSION', 'DistrictSOUTHERN', 'StreetCornerFlag', 'PRCP','TMAX','TMIN']

# columnsToUse = ['X','Y', 'Year', 'Month', 'Day','Hour', 'Minute',
#        'DayOfWeekRecode', 'StreetCornerFlag']

# Feature matrix and integer class labels; both are read as globals by
# runOnTrainData and passed to make_submission.
X = crimeData[columnsToUse]
y = crimeData['CategoryRecode']

print "prepping for training complete"

prepping for training complete


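Here, we've created a function that takes in a classifier and the number of splits to run. For each stratified shuffle split of the training data, it fits the classifier on one half and prints the log loss on the held-out half, followed by the average over all splits.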

In [None]:
def runOnTrainData(clf, numSplits=1):
    '''Evaluate clf on stratified shuffle splits of the global X and y.

    For each of the numSplits splits (half train, half test, fixed random
    state) the classifier is fitted on the training half and the multi-class
    log loss of its predicted probabilities on the test half is printed,
    together with timing information. Finally the list of scores and their
    average are printed. Nothing is returned.
    '''
    splitter = StratifiedShuffleSplit(y, numSplits, test_size=0.5, random_state=0)

    foldScores = []
    print("starting kfold testing")

    for trainRows, testRows in splitter:
        print("starting fit")
        fitStart = time()
        clf.fit(X.iloc[trainRows], y.iloc[trainRows], eval_metric = 'mlogloss', verbose=3)
        print("fit complete, time: " + str((time() - fitStart)))

        predictStart = time()
        heldOutProbs = clf.predict_proba(X.iloc[testRows])
        print("predict complete, time: " + str((time() - predictStart)))

        foldScore = log_loss(y.iloc[testRows].values, heldOutProbs)
        print(foldScore)
        #total time covers both the fit and the predict of this split
        print("total time: " + str((time() - fitStart)))
        foldScores.append(foldScore)

    print(foldScores)
    print("Average: " + str(np.average(foldScores)))

In [16]:
# Classifier with the tuned hyperparameters (see the notebook introduction),
# evaluated on two stratified shuffle splits of the training data.
clf = xgb.XGBClassifier(max_depth=8, n_estimators=800, objective='multi:softprob', max_delta_step = .8, learning_rate = .0375, nthread = -1)
runOnTrainData(clf, numSplits=2)

In [None]:
# Predictor columns used to build finalX for the randomized search. This is
# the columnsToUse list without the weather columns (PRCP, TMAX, TMIN).
# The name is reassigned to make_submission's return value when that cell runs.
predictorsUsed = ['DistrictBAYVIEW',
 'Y',
 'Hour',
 'DistrictSOUTHERN',
 'DistrictTARAVAL',
 'DistrictRICHMOND',
 'Month',
 'DistrictTENDERLOIN',
 'DistrictNORTHERN',
 'DayOfWeekRecode',
 'Minute',
 'Year',
 'StreetCornerFlag',
 'X',
 'DistrictPARK',
 'DistrictMISSION',
 'Day',
 'DistrictCENTRAL',
 'DistrictINGLESIDE']

To make a submission file, we modified the submission function from [this script](https://www.kaggle.com/shifanmao/sf-crime/random-forest-2/code). 

In [17]:
def make_submission(clf, Xtrain, ytrain, predictors, path='my_submissionXGBoostWithWeather.csv'):
    '''Refit clf on the full training data and write a submission file.

    The test set is read from 'test.csv' and recoded with recodeData. Only
    the predictors that also exist in the recoded test data are used; they
    keep the order given in predictors so that repeated runs are
    reproducible, and any predictor that had to be skipped is reported.

    Parameters:
        clf: classifier exposing fit and predict_proba; it is (re)fitted here.
        Xtrain: training dataframe containing the predictor columns.
        ytrain: integer class labels (positions in the Categories list).
        predictors: names of the columns to train and predict on.
        path: where the submission CSV is written.

    Returns:
        The list of predictors that were actually used.
    '''

    test_data = pd.read_csv('test.csv')
    print("test data loaded")

    test_data, newColumns, streetColumns = recodeData(test_data)
    print("test data recoded")

    testDataColumns = set(test_data.columns.values)

    #Keep the caller's order (a set intersection would make it arbitrary)
    existingPredictors = []
    missingPredictors = []
    for predictor in predictors:
        if predictor not in testDataColumns:
            missingPredictors.append(predictor)
        elif predictor not in existingPredictors:
            existingPredictors.append(predictor)
    if missingPredictors:
        print("predictors missing from test data (skipped): {}".format(missingPredictors))

    clf.fit(Xtrain[existingPredictors], ytrain, eval_metric = 'mlogloss')
    print("model fitted with all data")

    predictions = clf.predict_proba(test_data[existingPredictors])

    submission = pd.DataFrame({
        'Id': test_data.Id
    })

    #Label each probability column by the class it belongs to rather than by
    #its position, so a class absent from ytrain cannot shift the columns.
    classLabels = getattr(clf, 'classes_', None)
    if classLabels is None:
        classLabels = range(predictions.shape[1])
    for columnIndex, label in enumerate(classLabels):
        submission[Categories[int(label)]] = predictions[:, columnIndex]

    #Every category needs a column; classes the model never saw get 0.
    for category in Categories:
        if category not in submission.columns:
            submission[category] = 0.0
    submission = submission[['Id'] + Categories]

    submission.to_csv(path, index=False)

    print(" -- Wrote submission to file {}.".format(path))
    return existingPredictors

In [None]:
predictorsUsed = make_submission(clf, X, y, columnsToUse)

Here, we've created a function to report the best scores from a parameter search, and set up a randomized search over the parameters of our classifier.

In [None]:
# Utility function to report best scores
def report(grid_scores, n_top=10):
    '''Print the n_top best-scoring entries of a parameter search.

    grid_scores is a sequence of score tuples whose second element is the
    mean validation score; each entry must also expose the attributes
    mean_validation_score, cv_validation_scores and parameters. Entries are
    ranked by descending mean validation score.
    '''
    ranked = sorted(grid_scores, key=lambda entry: entry[1], reverse=True)
    for rank, entry in enumerate(ranked[:n_top], 1):
        spread = np.std(entry.cv_validation_scores)
        print("Model with rank: {0}".format(rank))
        print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
              entry.mean_validation_score, spread))
        print("Parameters: {0}".format(entry.parameters))
        print("")

# Base classifier for the parameter search; the searched parameters below
# override the defaults of each candidate.
clfOpt = xgb.XGBClassifier(objective='multi:softprob', nthread = 12, max_delta_step = 1)
# previously tried: max_depth=7, n_estimators=100, objective='multi:softprob', max_delta_step = 1, learning_rate = .33

param_dist = {"max_depth": [5, 7, 9],
              "n_estimators": [500, 1000],
              "learning_rate": [0.033, .1]}

finalX = X[predictorsUsed].values.astype(np.float32)
finaly = y.values.astype(int)

# run randomized search
# Every parameter is a plain list, so candidates are drawn without
# replacement; asking for more iterations than there are combinations is
# rejected, hence the cap at the size of the grid.
n_candidates = int(np.prod([len(values) for values in param_dist.values()]))
n_iter_search = min(20, n_candidates)

# An integer cv must be at least 2. To validate each candidate on a single
# split (as cv=1 was meant to), pass an explicit one-iteration stratified
# shuffle split, set up the same way as in runOnTrainData.
search_cv = StratifiedShuffleSplit(finaly, 1, test_size=0.5, random_state=0)
random_search = RandomizedSearchCV(clfOpt, param_distributions=param_dist,
                                   n_iter=n_iter_search, scoring="log_loss", verbose=3, cv=search_cv)

print("pre Random search finished")

In [None]:
# Run the randomized parameter search on the prepared arrays, time it, and
# print the best candidates with report().
print "starting Randomized Search"
start = time()
random_search.fit(finalX, finaly)
print("RandomizedSearchCV took %.2f seconds for %d candidates"
      " parameter settings." % ((time() - start), n_iter_search))
report(random_search.grid_scores_)

We explored using a (randomized) grid search to find a good set of parameters for our model. We found this useful, and found that the number of estimators should be about 30 divided by the learning rate (for example, 800 estimators at a learning rate of 0.0375).