In [1]:
from __future__ import print_function

import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

plt.style.use('ggplot')
%matplotlib notebook

In [2]:
# Load the train and test CSVs into DataFrames.
# `parse_dates=['Dates']` asks pandas to parse the `Dates` column into
# timestamps, which the `.dt` accessor used in `enrich` below relies on.
# `%time` reports how long each read takes.
%time train = pd.read_csv(r'data/train.csv', parse_dates=['Dates'])
%time test = pd.read_csv(r'data/test.csv', parse_dates=['Dates'])


CPU times: user 3.73 s, sys: 307 ms, total: 4.03 s
Wall time: 8.07 s
CPU times: user 3.12 s, sys: 295 ms, total: 3.41 s
Wall time: 7 s


In [3]:
#Enrich
def enrich(df):
    """Add calendar features derived from the ``Dates`` column, in place.

    The columns ``Hour``, ``Year``, ``Month`` and ``DayOfWeek`` (pandas'
    ``dt.dayofweek``, i.e. Monday=0) are written onto ``df`` itself and
    nothing is returned.  An existing column with one of these names is
    overwritten.

    ``Dates`` is run through ``pd.to_datetime`` first, so the function also
    works when the frame was read without ``parse_dates`` (the ``.dt``
    accessor raises ``AttributeError`` on plain strings).  The ``Dates``
    column itself is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Dates`` column holding timestamps or parseable
        date strings.
    """
    # Leaves an already-parsed datetime column as is; parses strings otherwise.
    dates = pd.to_datetime(df['Dates'])

    df['Hour'] = dates.dt.hour
    df['Year'] = dates.dt.year
    df['Month'] = dates.dt.month
    df['DayOfWeek'] = dates.dt.dayofweek

    # Features that were tried and are currently disabled:
    #df['Day'] = dates.dt.day
    #df['DayDates'] = dates.dt.date #actual date without the hours
    #df['MonthlyDates'] = dates.map(lambda dt: datetime(dt.year, dt.month, 1))

    # Outlier removal is disabled as well.  Note that rebinding ``df`` inside
    # this function would only change the local name, not the caller's frame,
    # so this filter has to be applied by the caller if it is ever re-enabled.
    #df = df[df.Y<40]
    
# Add the calendar feature columns to both frames. `enrich` assigns the new
# columns directly on the frame it is given, so no reassignment is needed.
%time enrich(train)
%time enrich(test)

CPU times: user 484 ms, sys: 112 ms, total: 596 ms
Wall time: 2.24 s
CPU times: user 437 ms, sys: 80.8 ms, total: 518 ms
Wall time: 547 ms


In [4]:
def build_data_for_sklearn(df, is_target=False):
    """Split a frame into a feature table and a label array.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the feature columns listed below and, unless
        ``is_target`` is true, a ``Category`` column.
    is_target : bool, default False
        When true the frame is treated as data to predict on: no labels are
        read and ``None`` is returned in their place.

    Returns
    -------
    (X, y)
        ``X`` is a DataFrame holding the selected feature columns (with any
        categorical feature one-hot encoded); ``y`` is the ``Category``
        values as an array, or ``None`` when ``is_target`` is true.
    """
    base_features = ['Hour', 'DayOfWeek', 'Month', 'Year', 'X', 'Y']
    categorical_features = []  # e.g. ['PdDistrict']
    label_name = 'Category'

    features = df[base_features + categorical_features]
    if categorical_features:
        # One-hot encode only the declared categorical columns.
        features = pd.get_dummies(features, columns=categorical_features)

    if is_target:
        return features, None
    return features, df[label_name].values

In [5]:
%time X,y = build_data_for_sklearn(train)
print(X.shape)
print(y.shape)

CPU times: user 41.3 ms, sys: 66.6 ms, total: 108 ms
Wall time: 1.05 s
(878049, 6)
(878049,)


# KFolds, Cross Validation, Pipelines and Preprocessors

In [6]:
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.pipeline import Pipeline

# `sklearn.cross_validation` was deprecated in scikit-learn 0.18 and removed in
# 0.20 in favour of `sklearn.model_selection`.  In the new API StratifiedKFold
# no longer receives the labels at construction time; cross_val_score hands
# them over when it splits.  Prefer the new module and fall back to the old one
# so the notebook runs on either generation of scikit-learn.
try:
    from sklearn.model_selection import StratifiedKFold, cross_val_score
    cv = StratifiedKFold(n_splits=5)
except ImportError:
    from sklearn.cross_validation import StratifiedKFold, cross_val_score
    cv = StratifiedKFold(y,n_folds=5)

# Naive Bayes

In [7]:
from sklearn.naive_bayes import BernoulliNB, GaussianNB, MultinomialNB 
def _nb_pipeline(scaler, step_name, model):
    """Return a two-step Pipeline: the given scaler, then the given NB model."""
    return Pipeline([
        ('scaler', scaler),
        (step_name, model),
    ])


# One scaled pipeline per Naive Bayes variant.
# NOTE(review): MinMaxScaler is presumably paired with MultinomialNB because
# that model expects non-negative features -- confirm.
estimators = [
    _nb_pipeline(StandardScaler(), 'bernoulli', BernoulliNB()),
    _nb_pipeline(StandardScaler(), 'gaussian', GaussianNB()),
    _nb_pipeline(MinMaxScaler(), 'multinomial', MultinomialNB()),
]

In [12]:
from sklearn.cluster import KMeans
kmeans = KMeans(n_jobs=-1)
%time kmeans.fit(X)

descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensional Fortran
array, use 'a.T.view(...).T' instead
  obj_bytes_view = obj.view(self.np.uint8)
descriptor assignment is deprecated. To maintain
the Fortran contiguity of a multidimensio

CPU times: user 1.88 s, sys: 1.47 s, total: 3.35 s
Wall time: 2min 4s


KMeans(copy_x=True, init='k-means++', max_iter=300, n_clusters=8, n_init=10,
    n_jobs=-1, precompute_distances='auto', random_state=None, tol=0.0001,
    verbose=0)

In [14]:
# Sanity check: number of cluster labels produced by the fit
# (expected to match the number of rows in X).
len(kmeans.labels_)

878049

# Random Forest

In [9]:
from sklearn import ensemble
estimator = ensemble.RandomForestClassifier(n_jobs=-1,n_estimators=50)
%time score = cross_val_score(estimator,X,y,cv=cv)
score

KeyboardInterrupt: 

# Prepare submission

In [None]:
# Load the sample submission file and show its column names; the column order
# is reused further down to arrange the columns of the submission frame.
%time sampleSubmission = pd.read_csv(r'data/sampleSubmission.csv')
sampleSubmission.columns

In [None]:
tosubmit = pd.DataFrame({'Id' : test.Id.values, '' : y_target})
#tosubmit.set_index('Id',inplace=True)
tosubmit.head().T

In [None]:
# One-hot encode the predicted category; with an empty prefix separator the
# indicator columns are named by concatenating column name and category value.
tosubmit = pd.get_dummies(tosubmit, prefix_sep='')
print(tosubmit.shape[1])
tosubmit.head()

In [None]:
# Every category seen in training needs a column in the submission; add an
# all-zero column for each one that is not present yet.
categories = list(train.Category.unique())
categories.sort()
for cat in categories:
    if cat in tosubmit:
        continue
    print('adding null column for category:%s'%cat)
    tosubmit[cat]=0

In [None]:
# Arrange the columns in the order of the sample submission, then verify the
# width: one column per category plus one more.
tosubmit = tosubmit[sampleSubmission.columns]
if not tosubmit.shape[1] == len(categories) + 1:
    print('submit data is inconsistent with categories passed')

In [None]:
# tosubmit.to_csv(r'data\rf_20160517.csv',index=False)
# !explorer data

# Cross Validation

In [28]:
from sklearn.pipeline import Pipeline
from sklearn.cross_validation import cross_val_score, StratifiedKFold

%time cv=StratifiedKFold(y,n_folds=4,shuffle=True)

CPU times: user 9.12 s, sys: 119 ms, total: 9.24 s
Wall time: 9.37 s


# Random Forest

In [None]:
from sklearn.ensemble import RandomForestClassifier

rf = RandomForestClassifier(n_jobs=-1,n_estimators=50)
%time scores = cross_val_score(rf,X,y,cv=cv)
scores

# SGDClassifier

In [42]:
from sklearn.linear_model import SGDClassifier
clf = SGDClassifier()
%time scores = cross_val_score(clf,X,y,cv=cv)
scores

CPU times: user 1min 36s, sys: 1.29 s, total: 1min 37s
Wall time: 1min 37s


array([ 0.10511739,  0.14370641,  0.04946106,  0.19909339])