# Model iteration - Street Data (Sophia and Mac-I)

Here, we're going to build on our model, by including the address data into what we're taking into account 

## Importing Everything!

In [1]:
import shapefile
import pandas as pd
import numpy as np
import itertools
import re

import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib import cm
from datetime import datetime
from ipywidgets import widgets  
from IPython.display import display


from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import StratifiedKFold
from sklearn.cross_validation import StratifiedShuffleSplit
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.metrics import log_loss


## Loading in the data
Now, I need to load in the data

In [3]:
readData = pd.read_csv('./../train.csv')
Categories = ['ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
              'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE',
              'DRUG/NARCOTIC', 'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION',
              'FAMILY OFFENSES', 'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING',
              'KIDNAPPING', 'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING',
              'MISSING PERSON', 'NON-CRIMINAL', 'OTHER OFFENSES',
              'PORNOGRAPHY/OBSCENE MAT', 'PROSTITUTION', 'RECOVERED VEHICLE',
              'ROBBERY', 'RUNAWAY', 'SECONDARY CODES', 'SEX OFFENSES FORCIBLE',
              'SEX OFFENSES NON FORCIBLE', 'STOLEN PROPERTY', 'SUICIDE',
              'SUSPICIOUS OCC', 'TREA', 'TRESPASS', 'VANDALISM', 'VEHICLE THEFT',
              'WARRANTS', 'WEAPON LAWS']

## Helper functions for recoding data
Here are the helper functions for recoding data. We'll add more as we create some new features

In [4]:
def recodeData(df, isTrain = False):
    '''This function takes in the dataframe that we get from loading in the 
    SF crime data and returns a re-coded dataframe that has all the 
    additional features we want to add and the categorical features recoded 
    and cleaned.
    '''

    #since the modifications are done in-place we don't return the dataframe. 
    #we do, however, return the list of all the columns we added.
    newLatLon = removeOutlierLatLon(df)
    newDate = recodeDates(df)
    newDistrict = recodePoliceDistricts(df)
    newAddress = recodeAddresses(df)

    
    addedColumns = [] 
    addedColumns += newDate
    addedColumns += newDistrict 
    addedColumns += newAddress
    addedColumns += newLatLon
    
    
   

    if (isTrain):
        newCategory = recodeCategories(df)
#         addedColumns += newCategory
        try: #prevents error if the coumns have already been removed or we are processing test data
            columnsToDrop = ['Descript', 'Resolution']
            df.drop(columnsToDrop, axis=1, inplace=True)
        except:
            print "already recoded"
         

    return df, addedColumns

In [5]:
def recodeDates(df):
    '''This function takes in a dataframe and recodes the date field into 
    useable values. Here, we also recode the day of week.'''
    print("Recoding Dates")
    #Recode the dates column to year, month, day and hour columns
    df['DateTime'] = df['Dates'].apply(
        lambda x: datetime.strptime(x, '%Y-%m-%d %H:%M:%S'))

    df['Year'] = df['DateTime'].apply(lambda x: x.year)
    df['Month'] = df['DateTime'].apply(lambda x: x.month)
    df['Day'] = df['DateTime'].apply(lambda x: x.day)
    df['Hour'] = df['DateTime'].apply(lambda x: x.hour)
    df['Minute'] = df['DateTime'].apply(lambda x: x.minute)
    df['DayOfWeekRecode'] = df['DateTime'].apply(lambda x: x.weekday())
    #df['MinuteOfWeek'] = df['DateTime'].apply(lambda x: x.weekday()*24*60 + x.hour*60 + x.minute)

    return ['Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode']

In [6]:
def recodePoliceDistricts(df):
    '''This function recodes the police district to a one-hot encoding 
    scheme.'''
    print ("Recoding Districts")
    districts = df['PdDistrict'].unique().tolist()
    newColumns = []
    for district in districts:
        newColumns.append('District' + district)
        df['District' + district] = df['PdDistrict'] == district

    return newColumns

In [8]:
def recodeAddresses(df):
    '''This function will attempt to create some features related to the address field in the database. To do this, 
    first, we need to split up the address field into two different address fields'''
    
    print ("Recoding Addresses")
    #If there are two addresss, split fields. Also extract the block number
    df['Address1'] = df['Address'].apply(lambda x: re.sub(r'^\d+ Block of ','',x.split(" / ")[0]))
    df['Address2'] = df['Address'].apply(lambda x: (x.split(" / ")[1]) if (len(x.split(" / ")) > 1) else '')
    
    #More on how we generated this dictionary below
    popularStreets = {'ARSON': ['FITCH ST', 'FOLSOM ST', 'MISSION ST', 'BRYANT ST', 'POLK ST', 'MARKET ST'],
                         'ASSAULT': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
                         'BAD CHECKS': ['BRYANT ST', '2ND ST', 'MARKET ST', 'MISSION ST', '16TH ST'],
                         'BRIBERY': ['6TH ST', 'BRYANT ST', 'MISSION ST', 'EDDY ST'],
                         'BURGLARY': ['MISSION ST', 'FULTON ST', 'HARRISON ST', 'MARKET ST', 'POLK ST', 'BRYANT ST'],
                         'DISORDERLY CONDUCT': ['CAPP ST', '16TH ST', '17TH ST', 'SHOTWELL ST', 'ELLIS ST', 'HAIGHT ST'],
                         'DRIVING UNDER THE INFLUENCE': ['MARKET ST', 'FOLSOM ST', 'GEARY BL', 'MISSION ST'],
                         'DRUG/NARCOTIC': ['MISSION ST', 'JONES ST', 'TAYLOR ST', 'TURK ST', 'MARKET ST', 'LEAVENWORTH ST'],
                         'DRUNKENNESS': ['COLE ST', 'MARKET ST', 'MISSION ST', 'HAIGHT ST'],
                         'EMBEZZLEMENT': ['BRYANT ST', 'OFARRELL ST', '4TH ST', 'MARKET ST', 'MISSION ST'],
                         'EXTORTION': ['MISSION ST', 'LYON ST', 'GENEVA AV', 'BRYANT ST', 'OFARRELL ST', 'GEARY BL'],
                         'FAMILY OFFENSES': ['MARKET ST', 'JONES ST', 'EDDY ST', 'MISSION ST', 'POTRERO AV'],
                         'FORGERY/COUNTERFEITING': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
                         'FRAUD': ['MARKET ST', 'BRYANT ST', 'MISSION ST', 'POWELL ST'],
                         'GAMBLING': ['MISSION ST', 'KEARNY ST', '3RD ST', 'JEFFERSON ST', 'QUESADA AV', 'POWELL ST'],
                         'KIDNAPPING': ['MARKET ST', 'TURK ST', 'BRYANT ST', 'MISSION ST', '16TH ST'],
                         'LARCENY/THEFT': ['HARRISON ST', 'BRYANT ST', 'MARKET ST', 'MISSION ST'],
                         'LIQUOR LAWS': ['3RD ST', '24TH ST', 'MISSION ST', 'COLE ST'],
                         'LOITERING': ['HARRISON ST', 'SOUTH VAN NESS AV', 'MISSION ST', '13TH ST'],
                         'MISSING PERSON': ['PHELPS ST', 'MISSION ST', 'HARRISON ST', 'MARKET ST', 'POTRERO AV', 'BRYANT ST'],
                         'NON-CRIMINAL': ['MARKET ST', 'BRYANT ST', 'MISSION ST', 'POWELL ST'],
                         'OTHER OFFENSES': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
                         'PORNOGRAPHY/OBSCENE MAT': ['16TH ST', '18TH ST', 'LARKIN ST', 'TREAT AV', '10TH AV', '5TH ST'],
                         'PROSTITUTION': ['17TH ST', 'LARKIN ST', '19TH ST', 'HYDE ST', 'SHOTWELL ST'],
                         'RECOVERED VEHICLE': ['HARRISON ST', 'ELLIS ST', 'ALEMANY BL', 'LARKIN ST', 'MISSION ST'],
                         'ROBBERY': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
                         'RUNAWAY': ['MISSION ST', '12TH AV', 'CAPITOL AV', 'POTRERO AV', 'MARKET ST', 'PAGE ST'],
                         'SECONDARY CODES': ['3RD ST', 'MARKET ST', 'BRYANT ST', 'MISSION ST'],
                         'SEX OFFENSES FORCIBLE': ['MARKET ST', 'POLK ST', 'BRYANT ST', 'MISSION ST'],
                         'SEX OFFENSES NON FORCIBLE': ['18TH ST', '3RD ST', 'BRYANT ST', '17TH ST', 'POTRERO AV', '11TH AV'],
                         'STOLEN PROPERTY': ['BRYANT ST', 'POLK ST', 'MARKET ST', 'MISSION ST'],
                         'SUICIDE': ['BRYANT ST', 'BUSH ST', '7TH ST', 'POTRERO AV', '24TH AV', 'GGBRIDGE HY'],
                         'SUSPICIOUS OCC': ['MARKET ST', 'BRYANT ST', 'MISSION ST', 'TAYLOR ST'],
                         'TREA': ['20TH ST', 'ARMSTRONG AV', 'CASTRO ST'],
                         'TRESPASS': ['HARRISON ST', 'BRYANT ST', 'MARKET ST', 'MISSION ST', 'POTRERO AV'],
                         'VANDALISM': ['HARRISON ST', 'MARKET ST', 'FOLSOM ST', 'BRYANT ST', 'MISSION ST'],
                         'VEHICLE THEFT': ['3RD ST', 'HARRISON ST', 'FOLSOM ST', 'MISSION ST'],
                         'WARRANTS': ['BRYANT ST', 'MARKET ST', 'MISSION ST', 'JONES ST'],
                         'WEAPON LAWS': ['TURK ST', 'MARKET ST', 'MISSION ST', 'JONES ST']}

    
    addedCols = ['StreetCornerFlag', 'BlockNumber']
    
    for category in Categories:
        addedCols.append('CommonStreet' + category)
        #df['CommonStreet' + category] = df.apply(lambda x: (x['Address1'] in popularStreets[category] or x['Address2'] in popularStreets[category]), axis=1)
        df['CommonStreet' + category] = (df['Address1'].isin(popularStreets[category])) | (df['Address2'].isin(popularStreets[category]))
    
    #     streets = set(df['Address1'].unique().tolist() + df['Address2'].unique().tolist())
    #     for street in streets:
    #         df['OnStreet' + street] = df.apply(lambda x: (x['Address1'] == street or x['Address2'] == street), axis=1)

    #Also, try getting dummies for the addresses?
    
    
    df['BlockNumber'] = df['Address'].apply(lambda x: int(re.findall(r'^\d+',x)[0]) if (len(re.findall(r'^\d+',x)) > 0) else None )
    df['BlockNumber'] = df['BlockNumber'].fillna(-1)
    
    #Also add the "did the crime occur on a street corner field?"
    df['StreetCornerFlag'] = df['Address'].apply(lambda x: len(x.split(" / ")) > 1)
    return addedCols

In [15]:
def removeOutlierLatLon(df):
    '''This function will attempt remove outlier Latitudes and Longitudes'''
    print("Removing LatLong Outliers and Binning Lat and Long")
    df.loc[df.X > -121, 'X'] = df.loc[(df.X > -121)].apply(lambda row: df.X[df["PdDistrict"] == row['PdDistrict']].median(), axis=1)
    df.loc[df.Y > 38, 'Y'] = df.loc[(df.Y > 38)].apply(lambda row: df.Y[df["PdDistrict"] == row['PdDistrict']].median(), axis=1)
    
    return ['X', 'Y']

In [10]:
def recodeCategories(df):
    '''This function will recode the crime category to the appropriate number'''
    print('Recoding categories')
    #if 'Category' in df.columns:
    df['CategoryRecode'] = df.Category.apply(lambda x: Categories.index(x))
        
    return ['CategoryRecode']

## Recoding Columns
Here, we want to do some recoding of the columns. To do this, we're going to use our handy-dandy helper functions.  

In [12]:
crimeData, addedColumns = recodeData(
    readData, isTrain = True)
crimeData.describe()
# crimeData[crimeData[['XBins', 'YBins']].isnull().any(axis=1)]

Removing LatLong Outliers and Binning Lat and Long
Recoding Dates
Recoding Districts
Recoding Addresses
Recoding categories


Unnamed: 0,X,Y,Year,Month,Day,Hour,Minute,DayOfWeekRecode,DistrictNORTHERN,DistrictPARK,...,CommonStreetSUSPICIOUS OCC,CommonStreetTREA,CommonStreetTRESPASS,CommonStreetVANDALISM,CommonStreetVEHICLE THEFT,CommonStreetWARRANTS,CommonStreetWEAPON LAWS,BlockNumber,StreetCornerFlag,CategoryRecode
count,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049.0,878049,878049,...,878049,878049,878049,878049,878049,878049,878049,878049.0,878049,878049.0
mean,-122.422763,37.767035,2008.712046,6.436509,15.570623,13.412655,20.155026,2.992691,0.11992,0.056162,...,0.150365,0.00946986,0.160315,0.164519,0.0959138,0.153397,0.137031,704.776747,0.297038,19.338687
std,0.025284,0.024165,3.631194,3.428972,8.783005,6.549573,18.594915,1.972023,0.324869,0.230234,...,0.357429,0.0968514,0.366898,0.370746,0.294473,0.36037,0.34388,1001.193208,0.456954,10.688637
min,-122.513642,37.707879,2003.0,1.0,1.0,0.0,0.0,0.0,False,False,...,False,False,False,False,False,False,False,-1.0,False,0.0
25%,-122.432952,37.752427,2006.0,3.0,8.0,9.0,0.0,1.0,0,0,...,0,0,0,0,0,0,0,0.0,0,16.0
50%,-122.41642,37.775421,2009.0,6.0,16.0,14.0,19.0,3.0,0,0,...,0,0,0,0,0,0,0,300.0,0,20.0
75%,-122.406959,37.784368,2012.0,9.0,23.0,19.0,33.0,5.0,0,0,...,0,0,0,0,0,0,0,1000.0,1,25.0
max,-122.364937,37.819975,2015.0,12.0,31.0,23.0,59.0,6.0,True,True,...,True,True,True,True,True,True,True,8300.0,True,38.0


## Model
Now that I've done some recoding, I'm going to create my model. To do this, I'm going to do a random forest classifier. 

In [13]:
columnsToUse = addedColumns

X = crimeData[columnsToUse]
y = crimeData['CategoryRecode']

clf = RandomForestClassifier(n_estimators=30, max_depth = 8, max_leaf_nodes = 100, random_state=1, verbose=2)

k_folds = StratifiedShuffleSplit(y, 3, test_size=0.5, random_state=0)

scores = []

for k, (train, test) in enumerate(k_folds):
    clf.fit(X.iloc[train], y.iloc[train])
    probs = clf.predict_proba(X.iloc[test])
    score = log_loss(y.iloc[test], probs)
    print score
    scores.append(score)
    
print(scores)
print("Average: " + str(np.average(scores)))

building tree 1 of 30
building tree 2 of 30
building tree 3 of 30
building tree 4 of 30
building tree 5 of 30
building tree 6 of 30
building tree 7 of 30
building tree 8 of 30
building tree 9 of 30
building tree 10 of 30
building tree 11 of 30
building tree 12 of 30
building tree 13 of 30
building tree 14 of 30
building tree 15 of 30
building tree 16 of 30
building tree 17 of 30
building tree 18 of 30
building tree 19 of 30
building tree 20 of 30
building tree 21 of 30
building tree 22 of 30
building tree 23 of 30
building tree 24 of 30
building tree 25 of 30
building tree 26 of 30
building tree 27 of 30
building tree 28 of 30
building tree 29 of 30
building tree 30 of 30

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   15.9s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    4.7s finished



2.45093891761
building tree 1 of 30
building tree 2 of 30
building tree 3 of 30
building tree 4 of 30
building tree 5 of 30
building tree 6 of 30
building tree 7 of 30
building tree 8 of 30
building tree 9 of 30
building tree 10 of 30
building tree 11 of 30
building tree 12 of 30
building tree 13 of 30
building tree 14 of 30
building tree 15 of 30
building tree 16 of 30
building tree 17 of 30
building tree 18 of 30
building tree 19 of 30
building tree 20 of 30
building tree 21 of 30
building tree 22 of 30
building tree 23 of 30
building tree 24 of 30
building tree 25 of 30
building tree 26 of 30
building tree 27 of 30
building tree 28 of 30
building tree 29 of 30
building tree 30 of 30

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   14.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    6.6s finished



2.45039644082
building tree 1 of 30
building tree 2 of 30
building tree 3 of 30
building tree 4 of 30
building tree 5 of 30
building tree 6 of 30
building tree 7 of 30
building tree 8 of 30
building tree 9 of 30
building tree 10 of 30
building tree 11 of 30
building tree 12 of 30
building tree 13 of 30
building tree 14 of 30
building tree 15 of 30
building tree 16 of 30
building tree 17 of 30
building tree 18 of 30
building tree 19 of 30
building tree 20 of 30
building tree 21 of 30
building tree 22 of 30
building tree 23 of 30
building tree 24 of 30
building tree 25 of 30
building tree 26 of 30
building tree 27 of 30
building tree 28 of 30
building tree 29 of 30
building tree 30 of 30

[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   14.8s finished
[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:    4.4s finished



2.44922342067
[2.4509389176101113, 2.4503964408169558, 2.4492234206727344]
Average: 2.4501862597


In [14]:
addedColumns

['Year',
 'Month',
 'Day',
 'Hour',
 'Minute',
 'DayOfWeekRecode',
 'DistrictNORTHERN',
 'DistrictPARK',
 'DistrictINGLESIDE',
 'DistrictBAYVIEW',
 'DistrictRICHMOND',
 'DistrictCENTRAL',
 'DistrictTARAVAL',
 'DistrictTENDERLOIN',
 'DistrictMISSION',
 'DistrictSOUTHERN',
 'StreetCornerFlag',
 'BlockNumber',
 'CommonStreetARSON',
 'CommonStreetASSAULT',
 'CommonStreetBAD CHECKS',
 'CommonStreetBRIBERY',
 'CommonStreetBURGLARY',
 'CommonStreetDISORDERLY CONDUCT',
 'CommonStreetDRIVING UNDER THE INFLUENCE',
 'CommonStreetDRUG/NARCOTIC',
 'CommonStreetDRUNKENNESS',
 'CommonStreetEMBEZZLEMENT',
 'CommonStreetEXTORTION',
 'CommonStreetFAMILY OFFENSES',
 'CommonStreetFORGERY/COUNTERFEITING',
 'CommonStreetFRAUD',
 'CommonStreetGAMBLING',
 'CommonStreetKIDNAPPING',
 'CommonStreetLARCENY/THEFT',
 'CommonStreetLIQUOR LAWS',
 'CommonStreetLOITERING',
 'CommonStreetMISSING PERSON',
 'CommonStreetNON-CRIMINAL',
 'CommonStreetOTHER OFFENSES',
 'CommonStreetPORNOGRAPHY/OBSCENE MAT',
 'CommonStreetPRO

To make a submission file, we modified the submission function from [this script](https://www.kaggle.com/shifanmao/sf-crime/random-forest-2/code). 

In [19]:
def make_submission(clf, predictors, path='my_submission.csv'):
    '''This function will take in a trained model, a list of predictors, and an optional 
    filepath and create a submissision file for us.'''
   
    test_data = pd.read_csv('test.csv')
    
    test_data, newColumns = recodeData(test_data)
    
    #clf.fit(trainX[predictors], trainY)
    predictions = clf.predict_proba(test_data[newColumns])

    submission = pd.DataFrame({
        'Id': test_data.Id
    })
    
    for i in range(predictions.shape[1]):
        submission[Categories[i]] = predictions[:,i]
    submission.to_csv(path, index=False)

    print(" -- Wrote submission to file {}.".format(path))


In [18]:
clf.fit(X, y)
print "model fitted with all data"

building tree 1 of 30
building tree 2 of 30
building tree 3 of 30
building tree 4 of 30
building tree 5 of 30
building tree 6 of 30
building tree 7 of 30
building tree 8 of 30
building tree 9 of 30
building tree 10 of 30
building tree 11 of 30
building tree 12 of 30
building tree 13 of 30
building tree 14 of 30
building tree 15 of 30
building tree 16 of 30
building tree 17 of 30
building tree 18 of 30
building tree 19 of 30
building tree 20 of 30
building tree 21 of 30
building tree 22 of 30
building tree 23 of 30
building tree 24 of 30
building tree 25 of 30
building tree 26 of 30
building tree 27 of 30
building tree 28 of 30
building tree 29 of 30
building tree 30 of 30
model fitted with all data


[Parallel(n_jobs=1)]: Done  30 out of  30 | elapsed:   30.7s finished


In [None]:
make_submission(clf, columnsToUse)

Best Score: 	
2.44156

## Accounting for address data
Okay, so what I'm going to do here, is to go category by category and get the top streets for each crime category. Ideally we'd be doing a one-hot encoding of all the streets in the dataset, but unfortunately, we don't have enough memory to do that with the thousands of streets we have. As a result, we're doing something that feels a little like overfitting. 

In [16]:
#Group by category and address 2. 
countsByStreet = crimeData.groupby(['Category','Address1']).count().reset_index()
countsByStreet2 = crimeData.groupby(['Category','Address2']).count().reset_index()

#The number of most popular streets to filter out for all crimes
mostPopStreetsNum = 3
#The number of streets for each crime to actually get
howManyTopStreets = 5

#Get the most popular streets for all crimes
streetCounts =  crimeData.groupby(['Address1']).count().reset_index().sort_values(by =['Dates'], ascending=[0]).groupby('Address1').head(mostPopStreetsNum)
popStreets = streetCounts.head(mostPopStreetsNum)['Address1'].tolist()

streetCounts2 =  crimeData[crimeData.Address2 != ''].groupby(['Address2']).count().reset_index().sort_values(by =['Dates'], ascending=[0]).groupby('Address2').head(mostPopStreetsNum)
popStreets2 = streetCounts2.head(mostPopStreetsNum)['Address2'].tolist()

popStreets = list(set(popStreets + popStreets2))

#Loop through each crime and find popular streets that aren't in the overall list of most popular streets
popularCategories = {}
for category in Categories:
    categoryCounts = countsByStreet[countsByStreet.Category == category]
    categoryCounts2 = countsByStreet2[countsByStreet2.Category == category]
    categoryCounts2 = categoryCounts2[categoryCounts2.Address2 != '']
    
    maxCountsByStreet = categoryCounts.sort_values(by =['Category', 'Dates'], ascending=[1,0]).groupby('Category').head(howManyTopStreets)
    maxCountsByStreet2 = categoryCounts2.sort_values(by =['Category', 'Dates'], ascending=[1,0]).groupby('Category').head(howManyTopStreets)
    
    address1Streets = [street for street in maxCountsByStreet['Address1'].tolist() if not(street in popStreets)]
    address2Streets = [street for street in maxCountsByStreet2['Address2'].tolist() if not(street not in popStreets)]
    
    popularCategories[category] = list(set(address1Streets + address2Streets))


In [17]:
popularCategories

{'ARSON': ['ELLIS ST', 'EDDY ST', 'MISSION ST'],
 'ASSAULT': ['MARKET ST', 'TURK ST', 'EDDY ST', 'MISSION ST', 'JONES ST'],
 'BAD CHECKS': ['VANNESS AV', 'MISSION ST', 'CALIFORNIA ST'],
 'BRIBERY': ['6TH ST', 'OAKDALE AV', 'BRYANT ST', 'MISSION ST'],
 'BURGLARY': ['OFARRELL ST', 'SUTTER ST'],
 'DISORDERLY CONDUCT': ['ELLIS ST', 'CAPP ST', 'SHOTWELL ST', 'HAIGHT ST'],
 'DRIVING UNDER THE INFLUENCE': ['19TH AV', 'GEARY BL', 'MISSION ST'],
 'DRUG/NARCOTIC': ['TURK ST', 'ELLIS ST', 'JONES ST', 'MISSION ST'],
 'DRUNKENNESS': ['BROADWAY ST',
  '6TH ST',
  'MARKET ST',
  'MISSION ST',
  'HAIGHT ST'],
 'EMBEZZLEMENT': ['OFARRELL ST', 'MARKET ST', 'POST ST', 'MISSION ST'],
 'EXTORTION': ['PINE ST', 'GEARY BL'],
 'FAMILY OFFENSES': ['MARKET ST',
  'EDDY ST',
  'JONES ST',
  'POTRERO AV',
  'MISSION ST'],
 'FORGERY/COUNTERFEITING': ['VANNESS AV',
  'MARKET ST',
  'POWELL ST',
  'JONES ST',
  'MISSION ST'],
 'FRAUD': ['MARKET ST', 'POWELL ST', 'MISSION ST', 'STOCKTON ST'],
 'GAMBLING': ['JEFFERSON

From here, we can just copy this dictionary to the address recoding fuction. We've tried this with various permutations of the number of streets to filter out and the number of streets to actually find. Some of them appear to boost performance a little, but not tons. (Probably because we're doing a fair ammount of overfitting). 