# Model Iteration 1
Here, I want to implement a very simple model. To do this, I'm not going to create many new features; I'll just recode the date field and the categorical variables, such as police district.

## Importing Everything!

In [None]:
% matplotlib inline

import shapefile
import pandas as pd
import numpy as np
import itertools

import matplotlib.pyplot as plt
from matplotlib.patches import Polygon
from matplotlib.collections import PatchCollection
from matplotlib import cm
from datetime import datetime
from ipywidgets import widgets  
from IPython.display import display

from sklearn import cross_validation
from sklearn.ensemble import RandomForestClassifier
from sklearn.cross_validation import KFold
from sklearn.cross_validation import train_test_split
from sklearn.datasets import load_iris
from sklearn import tree
from sklearn.metrics import log_loss


## Loading in the data
Now, I need to load in the data

In [None]:
# Read the training set; the path is relative, so train.csv must sit in the
# notebook's working directory.
crimeData = pd.read_csv('train.csv')

## Helper functions for recoding data
Here are the helper functions for recoding data. We'll add more as we create some new features

In [None]:
def recodeData(df, isTrain = False):
    '''Recode the SF crime dataframe and report which feature columns were added.

    The date and police-district helpers add their new columns to ``df`` in
    place. When ``isTrain`` is true, the 'Descript' and 'Resolution' columns
    are additionally dropped from ``df`` (also in place).

    Parameters:
        df: dataframe as loaded from the crime CSV; must contain the 'Dates'
            and 'PdDistrict' columns the helpers read.
        isTrain: set to True for the training frame to drop 'Descript' and
            'Resolution'.

    Returns:
        A tuple ``(df, addedColumns)``: the same dataframe object that was
        passed in, and the list of column names added by the helpers (date
        columns first, then district columns).
    '''
    # Both helpers mutate df and return the names of the columns they created.
    addedColumns = []
    addedColumns += recodeDates(df)
    addedColumns += recodePoliceDistricts(df)

    if isTrain:
        columnsToDrop = ['Descript', 'Resolution']
        try:
            df.drop(columnsToDrop, axis=1, inplace=True)
        except (KeyError, ValueError):
            # drop() raises when the labels are missing, e.g. when this cell
            # is re-run on a frame that was already recoded (KeyError in
            # current pandas; older releases raised ValueError). Any other
            # error is allowed to propagate instead of being swallowed.
            print("already recoded")

    return df, addedColumns

In [None]:
def recodeDates(df):
    '''Split the 'Dates' column of ``df`` into numeric date/time features.

    The column is parsed with the fixed format '%Y-%m-%d %H:%M:%S' and the
    following columns are added to ``df`` in place:

        DateTime         parsed timestamp
        Year, Month, Day, Hour, Minute
        DayOfWeekRecode  day of week as an integer, Monday = 0 ... Sunday = 6
        MinuteOfWeek     minutes elapsed since Monday 00:00

    Parameters:
        df: dataframe with a 'Dates' column of timestamp strings.

    Returns:
        The list of added feature column names ('DateTime' is added to the
        frame but is not part of the returned list).
    '''
    # Parse the whole column at once rather than calling strptime per row.
    stamps = pd.to_datetime(df['Dates'], format='%Y-%m-%d %H:%M:%S')
    df['DateTime'] = stamps

    parts = stamps.dt
    df['Year'] = parts.year
    df['Month'] = parts.month
    df['Day'] = parts.day
    df['Hour'] = parts.hour
    df['Minute'] = parts.minute
    # .dt.dayofweek uses the same numbering as datetime.weekday().
    df['DayOfWeekRecode'] = parts.dayofweek
    df['MinuteOfWeek'] = parts.dayofweek * 24 * 60 + parts.hour * 60 + parts.minute

    return ['Year', 'Month', 'Day', 'Hour', 'Minute', 'DayOfWeekRecode', 'MinuteOfWeek']

In [None]:
def recodePoliceDistricts(df):
    '''One-hot encode the 'PdDistrict' column of ``df`` in place.

    For every distinct district value a 0/1 column named
    'District' + <district> is added to ``df``.

    Parameters:
        df: dataframe with a 'PdDistrict' column of district names.

    Returns:
        The list of added column names, in alphabetical district order.
    '''
    # Sort the districts so the column order does not depend on the order in
    # which districts first appear in the data; two frames with the same set
    # of districts then get identically ordered columns. Missing values are
    # skipped (such rows get 0 in every district column).
    districts = sorted(df['PdDistrict'].dropna().unique())

    newColumns = []
    for district in districts:
        columnName = 'District' + district
        df[columnName] = (df['PdDistrict'] == district).astype(int)
        newColumns.append(columnName)

    return newColumns

## Recoding Columns
Here, I want to do some recoding of the columns. To do this, I'm going to use our handy-dandy helper function.  

In [None]:
# recodeData adds the date and district feature columns to crimeData and,
# because isTrain is True, also tries to drop 'Descript' and 'Resolution'.
# addedColumns holds the names of the newly created feature columns.
crimeData, addedColumns = recodeData(
    crimeData, isTrain = True)
# Last expression in the cell, so the summary statistics are displayed.
crimeData.describe()

## Model Iteration 1
Now that I've done some recoding, I'm going to create my model. To do this, I'm going to fit a random forest classifier on half of the data and score its predicted probabilities with log loss on the other half.

In [None]:
# Features: every recoded column plus the raw X / Y columns.
columnsToUse = addedColumns + ['X','Y']

X = crimeData[columnsToUse]
y = crimeData['Category']

# Hold out half of the rows for scoring; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.5, random_state=42)

# min_samples_split must be at least 2 when given as an integer:
# scikit-learn 0.18+ raises on 1. 2 is the smallest valid value, so any node
# holding two or more samples may still be split.
clf = RandomForestClassifier(n_estimators=10, max_depth=None, min_samples_split=2, random_state=0)
clf.fit(X_train, y_train)
probs = clf.predict_proba(X_test)

# NOTE(review): log_loss lines up the columns of probs with the labels it finds
# in y_test; if a rare category is absent from y_test this may fail or
# misalign -- consider passing labels=clf.classes_ where the installed
# scikit-learn supports it.
score = log_loss(y_test, probs)
print(score)

This submission function came from [this script](https://www.kaggle.com/shifanmao/sf-crime/random-forest-2/code)

We modified this slightly!

In [None]:
def make_submission(clf, path='my_submission.csv'):
    '''Predict class probabilities for test.csv and write a submission file.

    Parameters:
        clf: fitted classifier exposing ``predict_proba`` and ``classes_``.
        path: optional output file path for the CSV.

    The file has a header row ("ID" followed by the class names) and one row
    per test record: its Id followed by the probability of each class,
    formatted to two decimals.
    '''
    test_data = pd.read_csv('test.csv')

    test_data, addedColumns = recodeData(test_data)

    # The feature matrix must have the same columns, in the same order, as the
    # one the classifier was fitted on. Newer scikit-learn records those names
    # on the estimator when it is fitted on a DataFrame; otherwise fall back
    # to the recoded columns plus the raw X / Y columns, which is how this
    # notebook builds its training features.
    trainedColumns = getattr(clf, 'feature_names_in_', None)
    if trainedColumns is not None:
        columnsToUse = list(trainedColumns)
    else:
        columnsToUse = addedColumns + ['X', 'Y']

    X_test_data = test_data[columnsToUse]
    ids = test_data['Id']

    #Predict the probabilities of each crime.
    y_prob = clf.predict_proba(X_test_data)

    with open(path, 'w') as f:
        f.write("ID,")
        f.write(','.join(clf.classes_))
        f.write('\n')

        # NOTE(review): two-decimal formatting writes small probabilities as
        # 0.00; confirm that this is acceptable for the competition metric.
        for rowId, probs in zip(ids, y_prob):
            # Build a real list: concatenating a list with map() only works
            # on Python 2, where map returns a list.
            fields = [str(rowId)] + ["%.2f" % p for p in probs.tolist()]
            f.write(','.join(fields))
            f.write('\n')
    print(" -- Wrote submission to file {}.".format(path))


In [None]:
# Write predictions for test.csv using the default output path, my_submission.csv.
make_submission(clf)