In [1]:
import numpy as np
np.random.seed(123)

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.sparse import csr_matrix, hstack

In [2]:
print "Data Preprocessing Begins..."
## Read data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## Set test loss to NaN
test['loss'] = np.nan

## Response and IDs
y = train['loss'].values
id_train = train['id'].values
id_test = test['id'].values

## Stack train test
ntrain = train.shape[0]
tr_te = pd.concat((train, test), axis = 0)

## Preprocessing and transforming to sparse data
sparse_data = []

f_cat = [f for f in tr_te.columns if 'cat' in f]
for f in f_cat:
    dummy = pd.get_dummies(tr_te[f].astype('category'))
    tmp = csr_matrix(dummy)
    sparse_data.append(tmp)

f_num = [f for f in tr_te.columns if 'cont' in f]
scaler = StandardScaler()
tmp = csr_matrix(scaler.fit_transform(tr_te[f_num]))
sparse_data.append(tmp)

del(tr_te, train, test) # Deleting tr_te, train, test like garbage collection

## sparse train and test data
xtr_te = hstack(sparse_data, format = 'csr')
xtrain = xtr_te[:ntrain, :]
xtest = xtr_te[ntrain:, :]

print('Dim train', xtrain.shape)
print('Dim test', xtest.shape)

del(xtr_te, sparse_data, tmp)

print "Data Preprocessing Ends..."

Data Preprocessing Begins...
('Dim train', (188318, 1190))
('Dim test', (125546, 1190))
Data Preprocessing Ends...


In [3]:
train = pd.DataFrame(xtrain.toarray())
target = pd.DataFrame(xtest.toarray())

In [4]:
print train.shape
print target.shape
print y.shape

(188318, 1190)
(125546, 1190)
(188318,)


# Random Forest

In [5]:
%%time

rf = RandomForestRegressor()
rf_params = {'n_estimators': [100], 'min_samples_leaf': [2], 'min_samples_split': [4]}
rf_grid = GridSearchCV(rf, rf_params, cv=5, verbose=1, n_jobs=2, scoring='neg_mean_absolute_error')
rf_grid.fit(train, y)
rf_score = rf_grid.cv_results_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed: 175.1min finished


CPU times: user 1h 5min 18s, sys: 55.8 s, total: 1h 6min 14s
Wall time: 4h 5min 55s


In [9]:
np.sqrt(-rf_score['mean_test_score'])

array([ 1971.33230627])

In [7]:
rf_score['mean_test_score']

{'mean_fit_time': array([ 3642.52908106]),
 'mean_score_time': array([ 3.25899296]),
 'mean_test_score': array([-3886151.06174344]),
 'mean_train_score': array([-882690.69072449]),
 'param_min_samples_leaf': masked_array(data = [2],
              mask = [False],
        fill_value = ?),
 'param_min_samples_split': masked_array(data = [4],
              mask = [False],
        fill_value = ?),
 'param_n_estimators': masked_array(data = [100],
              mask = [False],
        fill_value = ?),
 'params': ({'min_samples_leaf': 2,
   'min_samples_split': 4,
   'n_estimators': 100},),
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([-3904584.8602431]),
 'split0_train_score': array([-887201.03108566]),
 'split1_test_score': array([-3911557.51979802]),
 'split1_train_score': array([-881482.54125619]),
 'split2_test_score': array([-4062504.0305523]),
 'split2_train_score': array([-851372.03587125]),
 'split3_test_score': array([-3894547.36808593]),
 'split3_train_s

# Support Vector Machine Regressor

In [None]:
%%time

from sklearn.svm import SVR

svr = SVR()
svr_params = {'kernel': ['rbf'], 'C': [1.0]}
svr_grid = GridSearchCV(svr, svr_params, cv=5, verbose=3, n_jobs=2, scoring='neg_mean_absolute_error')
svr_grid.fit(train, y)
svr_score = svr_grid.cv_results_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] kernel=rbf, C=1.0 ...............................................
[CV] kernel=rbf, C=1.0 ...............................................


# K Nearnest Neighbour

In [None]:
%%time

from sklearn.neighbors import KNeighborsRegressor


