# Model Iteration 1
Here, I want to implement a very simple model. To do this, I'm not going to create very many features, just recode some of the variables that are categorical like district. 

## Importing Everything!

In [None]:
import shapefile
import pandas as pd
import numpy as np
import itertools
import re

import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib import cm
from datetime import datetime
from ipywidgets import widgets  
from IPython.display import display


from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.metrics import log_loss


## Loading in the data
Now, I need to load in the data

In [None]:
# Raw SF crime training data; the recoding helpers below operate on this frame.
readData = pd.read_csv('train.csv')

# Every crime category label. A label's position in this list is the integer
# code that recodeCategories stores in 'CategoryRecode'.
Categories = [
    'ARSON',
    'ASSAULT',
    'BAD CHECKS',
    'BRIBERY',
    'BURGLARY',
    'DISORDERLY CONDUCT',
    'DRIVING UNDER THE INFLUENCE',
    'DRUG/NARCOTIC',
    'DRUNKENNESS',
    'EMBEZZLEMENT',
    'EXTORTION',
    'FAMILY OFFENSES',
    'FORGERY/COUNTERFEITING',
    'FRAUD',
    'GAMBLING',
    'KIDNAPPING',
    'LARCENY/THEFT',
    'LIQUOR LAWS',
    'LOITERING',
    'MISSING PERSON',
    'NON-CRIMINAL',
    'OTHER OFFENSES',
    'PORNOGRAPHY/OBSCENE MAT',
    'PROSTITUTION',
    'RECOVERED VEHICLE',
    'ROBBERY',
    'RUNAWAY',
    'SECONDARY CODES',
    'SEX OFFENSES FORCIBLE',
    'SEX OFFENSES NON FORCIBLE',
    'STOLEN PROPERTY',
    'SUICIDE',
    'SUSPICIOUS OCC',
    'TREA',
    'TRESPASS',
    'VANDALISM',
    'VEHICLE THEFT',
    'WARRANTS',
    'WEAPON LAWS',
]

## Helper functions for recoding data
Here are the helper functions for recoding data. We'll add more as we create some new features

In [None]:
def recodeData(df, isTrain = False):
    '''Recode the SF crime dataframe in place and list the feature columns.

    Runs every recoding helper (lat/lon clean-up, dates, police districts,
    addresses) against df. The helpers modify df in place, so the returned
    dataframe is the same object that was passed in.

    Parameters:
        df: dataframe loaded from the SF crime train or test CSV.
        isTrain: when True, also add the 'CategoryRecode' target column and
            drop the train-only 'Descript' and 'Resolution' columns.

    Returns:
        (df, addedColumns): the mutated dataframe and the list of feature
        column names produced by the helpers.
    '''
    # Each helper mutates df and returns the names of the columns it produced.
    newLatLon = removeOutlierLatLon(df)
    newDate = recodeDates(df)
    newDistrict = recodePoliceDistricts(df)
    newAddress = recodeAddresses(df)

    addedColumns = newDate + newDistrict + newAddress + newLatLon

    if isTrain:
        # The recoded target is added to df but kept out of addedColumns so
        # that it is never used as a predictor.
        recodeCategories(df)

        # On a second run over the same frame the columns are already gone;
        # report that instead of hiding every possible error behind a bare except.
        columnsToDrop = ['Descript', 'Resolution']
        if all(column in df.columns for column in columnsToDrop):
            df.drop(columnsToDrop, axis=1, inplace=True)
        else:
            print("already recoded")

    return df, addedColumns

In [None]:
def recodeDates(df):
    '''Split the 'Dates' timestamp strings into numeric date-part columns.

    Adds a parsed 'DateTime' column plus 'Year', 'Month', 'Day', 'Hour',
    'Minute' and 'DayOfWeekRecode' (Monday = 0) to df in place.

    Parameters:
        df: dataframe with a 'Dates' column formatted as
            '%Y-%m-%d %H:%M:%S'.

    Returns:
        The list of feature column names that were added. 'DateTime' is a
        helper column and is not part of that list.
    '''
    print("Recoding Dates")
    # Parse once and use the vectorised .dt accessor rather than running a
    # Python-level strptime/lambda for every row and every derived column.
    parsed = pd.to_datetime(df['Dates'], format='%Y-%m-%d %H:%M:%S')
    df['DateTime'] = parsed

    df['Year'] = parsed.dt.year
    df['Month'] = parsed.dt.month
    df['Day'] = parsed.dt.day
    df['Hour'] = parsed.dt.hour
    df['Minute'] = parsed.dt.minute
    df['DayOfWeekRecode'] = parsed.dt.dayofweek

    return ['Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode']

In [None]:
def recodePoliceDistricts(df):
    '''One-hot encode the 'PdDistrict' column.

    Adds one boolean 'District<NAME>' column per distinct district to df in
    place.

    Parameters:
        df: dataframe with a 'PdDistrict' column of district names.

    Returns:
        The list of added column names, sorted by district name.
    '''
    print ("Recoding Districts")
    # unique() yields districts in order of first appearance, which differs
    # between dataframes. Sorting keeps the feature order identical for the
    # train and test frames. Missing districts are skipped because they cannot
    # be turned into a column name.
    districts = sorted(df['PdDistrict'].dropna().unique().tolist())
    newColumns = []
    for district in districts:
        columnName = 'District' + district
        df[columnName] = df['PdDistrict'] == district
        newColumns.append(columnName)

    return newColumns

In [None]:
def recodeAddresses(df):
    '''Derive street-level features from the 'Address' column, in place.

    An address is either a block ("<number> Block of <street>") or a street
    corner ("<street> / <street>"). The function adds:
      * 'Address1' / 'Address2': the first and (if any) second street name,
        with the "<number> Block of " prefix stripped from the first;
      * 'CommonStreet<category>': True when either street is in the hand-picked
        list of popular streets for that crime category;
      * 'BlockNumber': the leading number of the address, or -1 if absent;
      * 'StreetCornerFlag': True when the address names two streets.

    Returns the list of feature column names ('Address1' and 'Address2' are
    helper columns and are not included).
    '''
    print ("Recoding Addresses")

    # Split every address once; all of the derived columns reuse the pieces.
    blockPrefix = re.compile(r'^\d+ Block of ')
    streetParts = df['Address'].apply(lambda address: address.split(" / "))
    df['Address1'] = streetParts.apply(lambda parts: blockPrefix.sub('', parts[0]))
    df['Address2'] = streetParts.apply(lambda parts: parts[1] if len(parts) > 1 else '')

    # Hand-picked streets on which each crime category is most common.
    popularStreets = {
        'ARSON': ['FITCH ST', 'FOLSOM ST', 'MISSION ST', 'BRYANT ST', 'POLK ST', 'MARKET ST'],
        'ASSAULT': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
        'BAD CHECKS': ['BRYANT ST', '2ND ST', 'MARKET ST', 'MISSION ST', '16TH ST'],
        'BRIBERY': ['6TH ST', 'BRYANT ST', 'MISSION ST', 'EDDY ST'],
        'BURGLARY': ['MISSION ST', 'FULTON ST', 'HARRISON ST', 'MARKET ST', 'POLK ST', 'BRYANT ST'],
        'DISORDERLY CONDUCT': ['CAPP ST', '16TH ST', '17TH ST', 'SHOTWELL ST', 'ELLIS ST', 'HAIGHT ST'],
        'DRIVING UNDER THE INFLUENCE': ['MARKET ST', 'FOLSOM ST', 'GEARY BL', 'MISSION ST'],
        'DRUG/NARCOTIC': ['MISSION ST', 'JONES ST', 'TAYLOR ST', 'TURK ST', 'MARKET ST', 'LEAVENWORTH ST'],
        'DRUNKENNESS': ['COLE ST', 'MARKET ST', 'MISSION ST', 'HAIGHT ST'],
        'EMBEZZLEMENT': ['BRYANT ST', 'OFARRELL ST', '4TH ST', 'MARKET ST', 'MISSION ST'],
        'EXTORTION': ['MISSION ST', 'LYON ST', 'GENEVA AV', 'BRYANT ST', 'OFARRELL ST', 'GEARY BL'],
        'FAMILY OFFENSES': ['MARKET ST', 'JONES ST', 'EDDY ST', 'MISSION ST', 'POTRERO AV'],
        'FORGERY/COUNTERFEITING': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
        'FRAUD': ['MARKET ST', 'BRYANT ST', 'MISSION ST', 'POWELL ST'],
        'GAMBLING': ['MISSION ST', 'KEARNY ST', '3RD ST', 'JEFFERSON ST', 'QUESADA AV', 'POWELL ST'],
        'KIDNAPPING': ['MARKET ST', 'TURK ST', 'BRYANT ST', 'MISSION ST', '16TH ST'],
        'LARCENY/THEFT': ['HARRISON ST', 'BRYANT ST', 'MARKET ST', 'MISSION ST'],
        'LIQUOR LAWS': ['3RD ST', '24TH ST', 'MISSION ST', 'COLE ST'],
        'LOITERING': ['HARRISON ST', 'SOUTH VAN NESS AV', 'MISSION ST', '13TH ST'],
        'MISSING PERSON': ['PHELPS ST', 'MISSION ST', 'HARRISON ST', 'MARKET ST', 'POTRERO AV', 'BRYANT ST'],
        'NON-CRIMINAL': ['MARKET ST', 'BRYANT ST', 'MISSION ST', 'POWELL ST'],
        'OTHER OFFENSES': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
        'PORNOGRAPHY/OBSCENE MAT': ['16TH ST', '18TH ST', 'LARKIN ST', 'TREAT AV', '10TH AV', '5TH ST'],
        'PROSTITUTION': ['17TH ST', 'LARKIN ST', '19TH ST', 'HYDE ST', 'SHOTWELL ST'],
        'RECOVERED VEHICLE': ['HARRISON ST', 'ELLIS ST', 'ALEMANY BL', 'LARKIN ST', 'MISSION ST'],
        'ROBBERY': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
        'RUNAWAY': ['MISSION ST', '12TH AV', 'CAPITOL AV', 'POTRERO AV', 'MARKET ST', 'PAGE ST'],
        'SECONDARY CODES': ['3RD ST', 'MARKET ST', 'BRYANT ST', 'MISSION ST'],
        'SEX OFFENSES FORCIBLE': ['MARKET ST', 'POLK ST', 'BRYANT ST', 'MISSION ST'],
        'SEX OFFENSES NON FORCIBLE': ['18TH ST', '3RD ST', 'BRYANT ST', '17TH ST', 'POTRERO AV', '11TH AV'],
        'STOLEN PROPERTY': ['BRYANT ST', 'POLK ST', 'MARKET ST', 'MISSION ST'],
        'SUICIDE': ['BRYANT ST', 'BUSH ST', '7TH ST', 'POTRERO AV', '24TH AV', 'GGBRIDGE HY'],
        'SUSPICIOUS OCC': ['MARKET ST', 'BRYANT ST', 'MISSION ST', 'TAYLOR ST'],
        'TREA': ['20TH ST', 'ARMSTRONG AV', 'CASTRO ST'],
        'TRESPASS': ['HARRISON ST', 'BRYANT ST', 'MARKET ST', 'MISSION ST', 'POTRERO AV'],
        'VANDALISM': ['HARRISON ST', 'MARKET ST', 'FOLSOM ST', 'BRYANT ST', 'MISSION ST'],
        'VEHICLE THEFT': ['3RD ST', 'HARRISON ST', 'FOLSOM ST', 'MISSION ST'],
        'WARRANTS': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
        'WEAPON LAWS': ['TURK ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
    }

    addedCols = ['StreetCornerFlag', 'BlockNumber']

    # One boolean flag per crime category: is either street on that
    # category's popular-street list?
    for category in Categories:
        flagName = 'CommonStreet' + category
        streets = popularStreets[category]
        df[flagName] = df['Address1'].isin(streets) | df['Address2'].isin(streets)
        addedCols.append(flagName)

    # Ideas not implemented yet: one 'OnStreet<street>' indicator per distinct
    # street, or plain dummy columns for the addresses.

    def leadingNumber(address):
        # Block addresses start with the block number; corners have none.
        found = re.match(r'\d+', address)
        return int(found.group()) if found else None

    df['BlockNumber'] = df['Address'].apply(leadingNumber).fillna(-1)

    # A street corner is an address that names two streets.
    df['StreetCornerFlag'] = streetParts.apply(lambda parts: len(parts) > 1)
    return addedCols

In [None]:
def removeOutlierLatLon(df, maxLongitude=-121, maxLatitude=38):
    '''Replace out-of-range coordinates with the median of their police district.

    Any 'X' (longitude) above maxLongitude, and any 'Y' (latitude) above
    maxLatitude, is overwritten in place with the median of that column over
    all rows sharing the same 'PdDistrict'. The median is taken over every row
    of the district, outliers included.

    Parameters:
        df: dataframe with 'X', 'Y' and 'PdDistrict' columns.
        maxLongitude: largest longitude still considered valid.
        maxLatitude: largest latitude still considered valid.

    Returns:
        ['X', 'Y'], the names of the cleaned coordinate columns.
    '''
    print("Removing LatLong Outliers and Binning Lat and Long")
    for column, limit in (('X', maxLongitude), ('Y', maxLatitude)):
        outliers = df[column] > limit
        # Skip the assignment entirely when nothing is out of range.
        if outliers.any():
            # One grouped median per district instead of rescanning the whole
            # frame for every outlier row.
            districtMedians = df.groupby('PdDistrict')[column].transform('median')
            df.loc[outliers, column] = districtMedians[outliers]

    # Binning X/Y into a grid was tried here and is currently disabled, so the
    # raw (cleaned) coordinates are the features.
    return ['X', 'Y']

In [None]:
def recodeCategories(df):
    '''Add 'CategoryRecode': each row's crime category as an integer code.

    The code is the position of the row's 'Category' value in the
    module-level Categories list. Returns the name of the added column in a
    one-element list.
    '''
    print('Recoding categories')
    codes = df['Category'].map(Categories.index)
    df['CategoryRecode'] = codes
    return ['CategoryRecode']

## Recoding Columns
Here, we want to do some recoding of the columns. To do this, we're going to use our handy-dandy helper functions.  

In [None]:
# Recode the training frame (in place) and keep the list of feature columns.
crimeData, addedColumns = recodeData(readData, isTrain=True)

# Summary statistics double as a quick sanity check of the recoded columns.
crimeData.describe()

## Model Iteration 1
Now that I've done some recoding, I'm going to create my model. For this first iteration, I'll use a random forest classifier.

In [None]:
# Train on every feature column produced by recodeData.
columnsToUse = addedColumns

# Smaller hand-picked feature set, kept for reference:
# columnsToUse = ['X','Y', 'Year', 'Month', 'Hour', 'Minute',
#        'DayOfWeekRecode', 'DistrictNORTHERN', 'DistrictPARK',
#        'DistrictINGLESIDE', 'DistrictBAYVIEW', 'DistrictRICHMOND',
#        'DistrictCENTRAL', 'DistrictTARAVAL', 'DistrictTENDERLOIN',
#        'DistrictMISSION', 'DistrictSOUTHERN', 'StreetCornerFlag', 'BlockNumber']

X = crimeData[columnsToUse]
y = crimeData['CategoryRecode']

clf = RandomForestClassifier(n_estimators=30, max_depth = 8, max_leaf_nodes = 100, random_state=1, verbose=2)

# Three stratified 50/50 train/test splits; the seed keeps them reproducible.
k_folds = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)

# Multi-class log loss of each split.
scores = []

for train, test in k_folds:
    clf.fit(X.iloc[train], y.iloc[train])
    probs = clf.predict_proba(X.iloc[test])
    score = log_loss(y.iloc[test], probs)
    print(score)
    scores.append(score)

print(scores)
print("Average: " + str(np.average(scores)))

In [None]:
# Display the feature columns returned by recodeData.
addedColumns

In [None]:
def make_submission(clf, predictors, path='my_submission.csv'):
    '''Write a submission file of per-category probabilities for test.csv.

    Parameters:
        clf: an already fitted classifier exposing predict_proba and classes_,
            trained on 'CategoryRecode' codes.
        predictors: the feature column names the classifier was trained on,
            in training order.
        path: where to write the submission CSV.

    The file holds the test 'Id' column followed by one probability column per
    entry of Categories.
    '''
    test_data = pd.read_csv('test.csv')

    # Recoding happens in place; the column list it reports is ignored because
    # the model must see exactly the columns (and order) it was trained on.
    test_data, _ = recodeData(test_data)

    predictions = clf.predict_proba(test_data[predictors])

    submission = pd.DataFrame({
        'Id': test_data.Id
    })

    # predict_proba columns follow clf.classes_, so map each class code to its
    # column rather than assuming column i belongs to Categories[i].
    columnOfClass = dict((int(label), position)
                         for position, label in enumerate(clf.classes_))
    for code, category in enumerate(Categories):
        if code in columnOfClass:
            submission[category] = predictions[:, columnOfClass[code]]
        else:
            # Category never seen during training: the model gives it no mass.
            submission[category] = 0.0
    submission.to_csv(path, index=False)

    print(" -- Wrote submission to file {}.".format(path))


In [None]:
# Refit on the complete training set before predicting on the test data.
clf.fit(X, y)
print("model fitted with all data")

In [None]:
# Score test.csv with the fitted model and write the submission file to the default path.
make_submission(clf, columnsToUse)

Best Score: 	
2.44156

Okay, so what I'm going to do here is go category by category and make human decisions about which streets count as top contenders for each crime category. Then, I'll create a dictionary of what I think are the top streets for each category.

In [None]:
mostPopStreetsNum = 3   # how many citywide top streets to exclude
howManyTopStreets = 5   # how many top streets to keep per category

# Row counts per (category, street); after count() every remaining column
# holds the group's count, and 'Dates' is the one used below.
countsByStreet = crimeData.groupby(['Category','Address1']).count().reset_index()
countsByStreet2 = crimeData.groupby(['Category','Address2']).count().reset_index()

# Get the most popular streets for crime overall, for each address slot.
streetCounts = crimeData.groupby(['Address1']).count().reset_index().sort_values(by='Dates', ascending=False)
popStreets = streetCounts.head(mostPopStreetsNum)['Address1'].tolist()

# An empty Address2 means "not a street corner", so it is not a street.
streetCounts2 = crimeData[crimeData.Address2 != ''].groupby(['Address2']).count().reset_index().sort_values(by='Dates', ascending=False)
popStreets2 = streetCounts2.head(mostPopStreetsNum)['Address2'].tolist()

popStreets = list(set(popStreets + popStreets2))

# For each category: its top streets, minus the streets that are popular for
# every kind of crime and therefore say little about the category.
popularCategories = {}
for category in Categories:
    categoryCounts = countsByStreet[countsByStreet.Category == category]
    categoryCounts2 = countsByStreet2[countsByStreet2.Category == category]
    categoryCounts2 = categoryCounts2[categoryCounts2.Address2 != '']

    maxCountsByStreet = categoryCounts.sort_values(by='Dates', ascending=False).head(howManyTopStreets)
    maxCountsByStreet2 = categoryCounts2.sort_values(by='Dates', ascending=False).head(howManyTopStreets)

    # Both address slots are filtered the same way: citywide-popular streets
    # are dropped.
    address1Streets = [street for street in maxCountsByStreet['Address1'].tolist() if street not in popStreets]
    address2Streets = [street for street in maxCountsByStreet2['Address2'].tolist() if street not in popStreets]

    popularCategories[category] = list(set(address1Streets + address2Streets))


In [None]:
# Display the per-category street lists built in the previous cell.
popularCategories

In [None]:
# Display the citywide most popular streets that were excluded from the per-category lists.
popStreets

It might be interesting to divide the latitude/longitude range into a grid and use the grid cell as a feature. That way, the model may need fewer splits on the raw coordinates.

In [None]:
# Number of grid cells along each axis.
xbins = 2
ybins = 2

# Observed coordinate range as [min, max].
xs = [crimeData['X'].min(), crimeData['X'].max()]
ys = [crimeData['Y'].min(), crimeData['Y'].max()]

# Evenly spaced bin edges spanning the observed range (n cells -> n + 1 edges).
xBins = np.linspace(xs[0], xs[1], num=xbins + 1)
yBins = np.linspace(ys[0], ys[1], num=ybins + 1)

xs, xBins, ys, yBins



In [None]:
# Label each row with its 1-based grid cell along X and Y. The first bin edge
# is the column minimum and bins are right-closed, so include_lowest=True is
# needed to keep the minimum row from falling outside every bin (NaN).
crimeData['XBins'] = pd.cut(crimeData['X'], xBins, labels=range(1,len(xBins)), include_lowest=True)
crimeData['YBins'] = pd.cut(crimeData['Y'], yBins, labels=range(1,len(yBins)), include_lowest=True)