In [None]:
from __future__ import print_function

import numpy as np
import pandas as pd
from datetime import datetime
import matplotlib.pyplot as plt

# Use the ggplot style sheet for every figure created after this point.
plt.style.use('ggplot')
# Select the interactive `notebook` Matplotlib backend.
# NOTE(review): presumably unsupported in JupyterLab / Notebook 7 — confirm,
# and consider `%matplotlib widget` or `%matplotlib inline` instead.
%matplotlib notebook

In [None]:
# Read the training and test sets from the relative `data/` directory into
# DataFrames. `%time` reports how long each read takes.
%time train = pd.read_csv(r'data/train.csv')
%time test = pd.read_csv(r'data/test.csv')


In [None]:
#Enrich
def enrich(df, max_y=40):
    """Add calendar features derived from ``Dates`` and return the non-outlier rows.

    The frame is modified in place: ``Dates`` is parsed to datetimes and the
    columns ``Hour``, ``Year``, ``Month``, ``DayOfWeek``, ``Day``, ``DayDates``
    and ``MonthlyDates`` are added. No row is ever removed from ``df`` itself,
    so callers that ignore the return value keep the full frame plus the new
    columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs a ``Dates`` column that ``pd.to_datetime`` can parse and a
        numeric ``Y`` column (presumably the latitude -- verify against the
        data description).
    max_y : float, default 40
        Exclusive upper bound on ``Y``. Rows with ``Y >= max_y`` (or a missing
        ``Y``) are treated as outliers and left out of the returned frame.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the enriched rows whose ``Y`` is below
        ``max_y``.
    """
    # Parse once; every feature below is read from the datetime accessor.
    df['Dates'] = pd.to_datetime(df['Dates'])
    stamps = df['Dates'].dt

    df['Hour'] = stamps.hour
    df['Year'] = stamps.year
    df['Month'] = stamps.month
    df['DayOfWeek'] = stamps.dayofweek
    df['Day'] = stamps.day
    df['DayDates'] = stamps.date  # calendar date without the time of day
    # First day of each row's month, computed column-wise instead of calling
    # a Python lambda once per row.
    df['MonthlyDates'] = stamps.to_period('M').dt.to_timestamp()

    # The previous `df = df[df.Y<40]` only rebound the local name, so the
    # outlier filter was silently discarded. Returning the filtered copy makes
    # it usable (`train_clean = enrich(train)`) without dropping rows from a
    # frame whose row count the caller may still rely on.
    return df.loc[df['Y'] < max_y].copy()
    
# Add the date-derived columns to both frames. `enrich` assigns the new
# columns directly on the frame it is given, so `train` and `test` are
# modified in place. Each call is timed.
%time enrich(train)
%time enrich(test)

In [None]:
def build_data_for_sklearn(df, is_target=False):
    """Split a frame into the feature matrix and label vector for scikit-learn.

    Parameters
    ----------
    df : pandas.DataFrame
        Must hold the feature columns ``Hour``, ``DayOfWeek``, ``Day``,
        ``Month``, ``Year``, ``X`` and ``Y``; unless ``is_target`` is set it
        must also hold ``Category``.
    is_target : bool, default False
        When True the frame is treated as unlabeled data to predict on and the
        ``Category`` column is not read.

    Returns
    -------
    tuple
        ``(features, labels)`` where ``features`` is ``df`` restricted to the
        feature columns (in the order above) and ``labels`` is a NumPy array of
        ``Category`` values, or ``None`` when ``is_target`` is True.
    """
    # An earlier experiment also fed a one-hot encoded 'Resolution' column via
    # pd.get_dummies; it is not part of the current feature set.
    feature_names = ['Hour','DayOfWeek','Day','Month','Year','X','Y']
    features = df[feature_names]

    if is_target:
        return features, None
    return features, df['Category'].values

In [None]:
# Build the training feature frame and the matching array of Category labels.
%time x_data,y_data = build_data_for_sklearn(train)

In [None]:
# Quick sanity check: the feature frame and label array should have the same
# number of rows.
print(x_data.shape)
print(y_data.shape)

# Random Forest

In [None]:
from sklearn import ensemble
# Random forest with library-default hyper-parameters; n_jobs=-1 asks
# scikit-learn to use all available cores.
# NOTE(review): no random_state is passed, so results are presumably not
# repeatable between runs — confirm whether a fixed seed is wanted.
rf = ensemble.RandomForestClassifier(n_jobs=-1)

In [None]:
# Fit the forest on the full training set (timed).
%time rf.fit(x_data, y_data)

In [None]:
# Build the feature frame for the test set. With is_target=True no labels are
# read, so the second return value (`dummy`) is None.
x_target, dummy= build_data_for_sklearn(test,is_target=True)
x_target.shape

In [None]:
# Predict one category label per test row with the fitted forest.
y_target = rf.predict(x_target)
y_target

# Prepare submission

In [None]:
# Load the sample submission. Its column list is displayed here and used
# further down to order the columns of the real submission.
%time sampleSubmission = pd.read_csv(r'data/sampleSubmission.csv')
sampleSubmission.columns

In [None]:
# Pair each test Id with its predicted category. The label column is
# deliberately named '' (empty string): the following cell one-hot encodes it
# with prefix_sep='', so presumably the indicator columns end up named exactly
# after the categories — verify on the get_dummies output.
tosubmit = pd.DataFrame({'Id' : test.Id.values, '' : y_target})
#tosubmit.set_index('Id',inplace=True)
tosubmit.head().T

In [None]:
# One-hot encode the predicted category. With prefix_sep='' and a label column
# named '', each indicator column is named exactly after its category.
tosubmit = pd.get_dummies(tosubmit,prefix_sep='')
# pandas >= 2.0 returns boolean indicator columns, which would be written to
# the CSV as True/False and would mix with the integer 0 columns added for
# unseen categories. Cast them to uint8 (the pre-2.0 result) so every
# category column holds 0/1. 'Id' is left as it is.
indicator_cols = tosubmit.columns.drop('Id')
tosubmit = tosubmit.astype(dict.fromkeys(indicator_cols, 'uint8'))
print(len(tosubmit.columns))
tosubmit.head()

In [None]:
# Categories that were never predicted have no indicator column yet; give each
# of them an all-zero column so the submission lists every training category.
categories = sorted(train.Category.unique())
for cat in categories:
    if cat in tosubmit:
        continue
    print('adding null column for category:%s'%cat)
    tosubmit[cat]=0

In [None]:
# Put the columns in the sample submission's order, then check the width:
# one column per training category plus the 'Id' column.
tosubmit = tosubmit[sampleSubmission.columns]
if tosubmit.shape[1] != 1 + len(categories):
    print('submit data is inconsistent with categories passed')

In [None]:
# tosubmit.to_csv(r'data\rf_20160517.csv',index=False)
# !explorer data

# Cross Validation

In [None]:
from sklearn.cross_validation import cross_val_score, StratifiedKFold
from sklearn.ensemble import RandomForestClassifier

%time cv=StratifiedKFold(y_data,n_folds=5)
%time scores = cross_val_score(RandomForestClassifier(n_jobs=-1,n_estimators=20),x_data,y_data,cv=cv)
scores