In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

plt.style.use('ggplot')
%matplotlib notebook

In [2]:
# Load data into dataframes
%time train = pd.read_csv(r'data/train.csv')
%time test = pd.read_csv(r'data/test.csv')


Wall time: 1.7 s
Wall time: 1.46 s


In [3]:
#Enrich
def enrich(df):
    #remove outliers
    df['Dates'] = pd.to_datetime(df.Dates)

    #turn dates into np.datetime64
    
    df['Hour'] = df.Dates.dt.hour
    df['Year'] = df.Dates.dt.year
    df['Month'] = df.Dates.dt.month
    df['DayOfWeek'] = df.Dates.dt.dayofweek
    df['Day'] = df.Dates.dt.day
    df['DayDates'] = df.Dates.dt.date #actual date without the hours
    df['MonthlyDates'] = df.Dates.map(lambda dt: datetime(dt.year, dt.month, 1))
    
    df = df[df.Y<40]
    
%time enrich(train)
%time enrich(test)

Wall time: 8.69 s
Wall time: 8.7 s


In [4]:
def build_data_for_sklearn(df, is_target=False):
    x_columns = ['Hour','DayOfWeek','Day','Month','Year','X','Y']
    y_columns = ['Category']
    
    #x_columns = ['Hour','DayOfWeek','Day','Month','Year','X','Y','Resolution']
    
    #categorical_columns = ['Category','Resolution']
    #categorical_columns = ['Resolution']

    #x_data = pd.get_dummies(df[x_columns],columns=categorical_columns)
    
    x_data = df[x_columns]
    y_data = None if is_target else df[y_columns]
    
    return x_data, y_data

In [5]:
%time x_data,y_data = build_data_for_sklearn(train)

Wall time: 15.6 ms


In [6]:
print(x_data.shape)
print(y_data.shape)

(878049, 7)
(878049, 1)


# Random Forest

In [7]:
from sklearn import ensemble
rf = ensemble.RandomForestClassifier(n_jobs=-1)

In [8]:
%time rf.fit(x_data, y_data)

  if __name__ == '__main__':


Wall time: 8.98 s


RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=-1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [9]:
x_target, dummy= build_data_for_sklearn(test,is_target=True)
x_target.shape

(884262, 7)

In [10]:
y_target = rf.predict(x_target)
y_target

  # Prepare submission  

In [13]:
%time sampleSubmission = pd.read_csv(r'data/sampleSubmission.csv')
sampleSubmission.columns

Wall time: 1.9 s


Index(['Id', 'ARSON', 'ASSAULT', 'BAD CHECKS', 'BRIBERY', 'BURGLARY',
       'DISORDERLY CONDUCT', 'DRIVING UNDER THE INFLUENCE', 'DRUG/NARCOTIC',
       'DRUNKENNESS', 'EMBEZZLEMENT', 'EXTORTION', 'FAMILY OFFENSES',
       'FORGERY/COUNTERFEITING', 'FRAUD', 'GAMBLING', 'KIDNAPPING',
       'LARCENY/THEFT', 'LIQUOR LAWS', 'LOITERING', 'MISSING PERSON',
       'NON-CRIMINAL', 'OTHER OFFENSES', 'PORNOGRAPHY/OBSCENE MAT',
       'PROSTITUTION', 'RECOVERED VEHICLE', 'ROBBERY', 'RUNAWAY',
       'SECONDARY CODES', 'SEX OFFENSES FORCIBLE', 'SEX OFFENSES NON FORCIBLE',
       'STOLEN PROPERTY', 'SUICIDE', 'SUSPICIOUS OCC', 'TREA', 'TRESPASS',
       'VANDALISM', 'VEHICLE THEFT', 'WARRANTS', 'WEAPON LAWS'],
      dtype='object')

In [26]:
tosubmit = pd.DataFrame({'Id' : test.Id.values, '' : y_target})
#tosubmit.set_index('Id',inplace=True)
tosubmit.head().T

Unnamed: 0,0,1,2,3,4
,LARCENY/THEFT,WEAPON LAWS,VANDALISM,OTHER OFFENSES,OTHER OFFENSES
Id,0,1,2,3,4


In [27]:
tosubmit = pd.get_dummies(tosubmit,prefix_sep='')
print(len(tosubmit.columns))
tosubmit.head().T

39


Unnamed: 0,0,1,2,3,4
Id,0.0,1.0,2.0,3.0,4.0
ARSON,0.0,0.0,0.0,0.0,0.0
ASSAULT,0.0,0.0,0.0,0.0,0.0
BAD CHECKS,0.0,0.0,0.0,0.0,0.0
BRIBERY,0.0,0.0,0.0,0.0,0.0
BURGLARY,0.0,0.0,0.0,0.0,0.0
DISORDERLY CONDUCT,0.0,0.0,0.0,0.0,0.0
DRIVING UNDER THE INFLUENCE,0.0,0.0,0.0,0.0,0.0
DRUG/NARCOTIC,0.0,0.0,0.0,0.0,0.0
DRUNKENNESS,0.0,0.0,0.0,0.0,0.0


In [35]:
categories = sorted(train.Category.unique())
for cat in categories:
    if cat not in tosubmit:
        print('adding null column for category:%s'%cat)
        tosubmit[cat]=0

adding null column for category:TREA


In [36]:
tosubmit = tosubmit[sampleSubmission.columns]
if len(tosubmit.columns) != (len(categories)+1):
    print('submit data is inconsistent with categories passed')

In [38]:
tosubmit.to_csv(r'data\rf_20160517.csv',index=False)
!explorer data