In [1]:
import numpy as np
np.random.seed(123)

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.sparse import csr_matrix, hstack

In [2]:
print "Data Preprocessing Begins..."
## Read data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## Set test loss to NaN
test['loss'] = np.nan

## Response and IDs
y = train['loss'].values
id_train = train['id'].values
id_test = test['id'].values

## Stack train test
ntrain = train.shape[0]
tr_te = pd.concat((train, test), axis = 0)

## Preprocessing and transforming to sparse data
sparse_data = []

f_cat = [f for f in tr_te.columns if 'cat' in f]
for f in f_cat:
    dummy = pd.get_dummies(tr_te[f].astype('category'))
    tmp = csr_matrix(dummy)
    sparse_data.append(tmp)

f_num = [f for f in tr_te.columns if 'cont' in f]
scaler = StandardScaler()
tmp = csr_matrix(scaler.fit_transform(tr_te[f_num]))
sparse_data.append(tmp)

del(tr_te, train, test) # Deleting tr_te, train, test like garbage collection

## sparse train and test data
xtr_te = hstack(sparse_data, format = 'csr')
xtrain = xtr_te[:ntrain, :]
xtest = xtr_te[ntrain:, :]

print('Dim train', xtrain.shape)
print('Dim test', xtest.shape)

del(xtr_te, sparse_data, tmp)

print "Data Preprocessing Ends..."

Data Preprocessing Begins...
('Dim train', (188318, 1190))
('Dim test', (125546, 1190))
Data Preprocessing Ends...


In [3]:
train = pd.DataFrame(xtrain.toarray())
target = pd.DataFrame(xtest.toarray())

In [4]:
print train.shape
print target.shape
print y.shape

(188318, 1190)
(125546, 1190)
(188318,)


In [21]:
print y[0:10]

[  2213.18   1283.6    3005.09    939.85   2763.85   5142.87   1132.22
   3585.75  10280.2    6184.59]


# Linear Regression

In [5]:
from sklearn.linear_model import LinearRegression

In [6]:
%%time

lr = LinearRegression()
lr_params = {}
lr_grid = GridSearchCV(lr, lr_params, cv=5, verbose=5, scoring='neg_mean_absolute_error')
lr_grid.fit(train, y)
lr_score = lr_grid.cv_results_
print lr_score

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV]  ................................................................




[CV] ..................... , score=-255953457197.963135, total=  31.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   31.8s remaining:    0.0s


[CV] ..................... , score=-104286246548.485825, total=  29.9s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.0min remaining:    0.0s


[CV] ...................... , score=-83666974998.574509, total=  28.1s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  1.5min remaining:    0.0s


[CV] ..................... , score=-171772380297.980011, total=  29.4s
[CV]  ................................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  2.0min remaining:    0.0s


[CV] ...................... , score=-65855828358.473419, total=  28.1s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  2.5min finished


{'rank_test_score': array([1], dtype=int32), 'split4_test_score': array([ -6.58558284e+10]), 'mean_score_time': array([ 0.04852362]), 'std_test_score': array([  6.97607764e+10]), 'std_train_score': array([ 2.74936666]), 'split1_train_score': array([-1292.71955136]), 'split0_test_score': array([ -2.55953457e+11]), 'mean_test_score': array([ -1.36307163e+11]), 'std_score_time': array([ 0.00992465]), 'split2_train_score': array([-1285.02110505]), 'split0_train_score': array([-1286.62751503]), 'params': ({},), 'std_fit_time': array([ 1.21254671]), 'split4_train_score': array([-1289.64678564]), 'split2_test_score': array([ -8.36669750e+10]), 'split3_test_score': array([ -1.71772380e+11]), 'mean_train_score': array([-1288.1121719]), 'mean_fit_time': array([ 29.34211197]), 'split3_train_score': array([-1286.54590243]), 'split1_test_score': array([ -1.04286247e+11])}
CPU times: user 5min 50s, sys: 17.1 s, total: 6min 7s
Wall time: 3min 2s


In [19]:
print lr_grid.best_params_
print 'Testing Score: ', lr_score['mean_test_score']
print 'Training Score: ', lr_score['mean_train_score']
print lr_score['split1_test_score']
print lr_score['params']

{}
Testing Score:  [ -1.36307163e+11]
Training Score:  [-1288.1121719]
[ -1.04286247e+11]
({},)


# Ridge Regression

In [23]:
from sklearn.linear_model import Ridge

In [6]:
%%time

ridge = Ridge()
ridge_params = {'alpha': [0, 0.5, 1]}
ridge_grid = GridSearchCV(ridge, ridge_params, cv=5, verbose=10, scoring='neg_mean_absolute_error')
ridge_grid.fit(train, y)
ridge_score = ridge_grid.cv_results_
print ridge_score

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] alpha=0 .........................................................
[CV] ............ alpha=0, score=-12185129304286.949219, total=  59.1s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:   59.5s remaining:    0.0s


[CV] ............ alpha=0, score=-20274655533760.578125, total=  53.3s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:  1.9min remaining:    0.0s


[CV] ............ alpha=0, score=-15571520972675.888672, total=  51.9s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:  2.8min remaining:    0.0s


[CV] ............ alpha=0, score=-59076042165007.476562, total=  51.6s
[CV] alpha=0 .........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:  3.6min remaining:    0.0s


[CV] ............ alpha=0, score=-19183642920069.519531, total=  51.4s
[CV] alpha=0.5 .......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:  4.5min remaining:    0.0s


[CV] .................... alpha=0.5, score=-1307.883157, total=   7.0s
[CV] alpha=0.5 .......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed:  4.6min remaining:    0.0s


[CV] .................... alpha=0.5, score=-1286.911971, total=   6.9s
[CV] alpha=0.5 .......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed:  4.7min remaining:    0.0s


[CV] .................... alpha=0.5, score=-1302.040743, total=   6.9s
[CV] alpha=0.5 .......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed:  4.8min remaining:    0.0s


[CV] .................... alpha=0.5, score=-1308.394153, total=   6.8s
[CV] alpha=0.5 .......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed:  5.0min remaining:    0.0s


[CV] .................... alpha=0.5, score=-1296.707493, total=   6.9s
[CV] alpha=1 .........................................................
[CV] ...................... alpha=1, score=-1307.444786, total=   6.9s
[CV] alpha=1 .........................................................
[CV] ...................... alpha=1, score=-1286.661660, total=   6.8s
[CV] alpha=1 .........................................................
[CV] ...................... alpha=1, score=-1302.007380, total=   6.9s
[CV] alpha=1 .........................................................
[CV] ...................... alpha=1, score=-1308.164960, total=   6.8s
[CV] alpha=1 .........................................................
[CV] ...................... alpha=1, score=-1296.234262, total=   6.8s


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  5.7min finished


{'rank_test_score': array([3, 2, 1], dtype=int32), 'split4_test_score': array([ -1.91836429e+13,  -1.29670749e+03,  -1.29623426e+03]), 'mean_score_time': array([ 0.05279422,  0.04340591,  0.04262638]), 'std_test_score': array([  1.71470324e+13,   7.97694477e+00,   7.97698822e+00]), 'std_train_score': array([ 5.59252372,  2.42294137,  2.4823798 ]), 'split1_train_score': array([-1299.38312882, -1292.60559056, -1293.19705978]), 'split0_test_score': array([ -1.21851293e+13,  -1.30788316e+03,  -1.30744479e+03]), 'mean_test_score': array([ -2.52580509e+13,  -1.30038748e+03,  -1.30010259e+03]), 'std_score_time': array([ 0.01234605,  0.00145701,  0.00082212]), 'split2_train_score': array([-1293.17861484, -1286.14683794, -1286.63940474]), 'param_alpha': masked_array(data = [0 0.5 1],
             mask = [False False False],
       fill_value = ?)
, 'split0_train_score': array([-1286.28439212, -1288.12383397, -1288.67258803]), 'params': ({'alpha': 0}, {'alpha': 0.5}, {'alpha': 1}), 'std_fit_time

In [15]:
# alpha = 1 is the best

print ridge_grid.best_params_
print ridge_score['mean_test_score']
print ridge_score['params']

NameError: name 'ridge_grid' is not defined

##### 'mean_test_score': array([ -2.52580509e+13,  -1.30038748e+03,  -1.30010259e+03])
##### 'mean_train_score': [-1294.14163864, -1288.75891243, -1289.30182262]

# Lasso Regression

In [7]:
from sklearn.linear_model import Lasso

In [8]:
%%time

lasso = Lasso()
lasso_params = {'alpha': [0.001, 0.01, 0.1]}
lasso_grid = GridSearchCV(lasso, lasso_params, cv=5, verbose=10, scoring='neg_mean_absolute_error')
lasso_grid.fit(train, y)
lasso_score = lasso_grid.cv_results_
print lasso_score

Fitting 5 folds for each of 3 candidates, totalling 15 fits
[CV] alpha=0.001 .....................................................




[CV] .................. alpha=0.001, score=-1308.593914, total= 5.1min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:  5.1min remaining:    0.0s


[CV] .................. alpha=0.001, score=-1288.165399, total= 5.3min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed: 10.4min remaining:    0.0s


[CV] .................. alpha=0.001, score=-1302.029099, total= 5.4min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed: 15.8min remaining:    0.0s


[CV] .................. alpha=0.001, score=-1308.748071, total= 5.4min
[CV] alpha=0.001 .....................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed: 21.2min remaining:    0.0s


[CV] .................. alpha=0.001, score=-1297.891505, total= 6.0min
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 27.2min remaining:    0.0s


[CV] ................... alpha=0.01, score=-1307.322043, total= 6.5min
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   6 out of   6 | elapsed: 33.7min remaining:    0.0s


[CV] ................... alpha=0.01, score=-1287.231474, total= 6.8min
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   7 out of   7 | elapsed: 40.5min remaining:    0.0s


[CV] ................... alpha=0.01, score=-1301.340764, total= 7.6min
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   8 out of   8 | elapsed: 48.1min remaining:    0.0s


[CV] ................... alpha=0.01, score=-1307.551453, total= 6.6min
[CV] alpha=0.01 ......................................................


[Parallel(n_jobs=1)]: Done   9 out of   9 | elapsed: 54.7min remaining:    0.0s


[CV] ................... alpha=0.01, score=-1297.176764, total= 6.4min
[CV] alpha=0.1 .......................................................
[CV] .................... alpha=0.1, score=-1304.643747, total= 4.9min
[CV] alpha=0.1 .......................................................
[CV] .................... alpha=0.1, score=-1285.607694, total= 5.0min
[CV] alpha=0.1 .......................................................
[CV] .................... alpha=0.1, score=-1299.677042, total= 4.6min
[CV] alpha=0.1 .......................................................
[CV] .................... alpha=0.1, score=-1305.632310, total= 5.4min
[CV] alpha=0.1 .......................................................
[CV] .................... alpha=0.1, score=-1295.441861, total= 5.3min


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed: 86.4min finished


{'rank_test_score': array([3, 2, 1], dtype=int32), 'split4_test_score': array([-1297.89150534, -1297.1767642 , -1295.44186061]), 'mean_score_time': array([ 0.05173259,  0.06347156,  0.08040247]), 'std_test_score': array([ 7.65654665,  7.52813564,  7.28172265]), 'std_train_score': array([ 2.27338586,  2.29467585,  2.52681447]), 'split1_train_score': array([-1291.73544383, -1292.21290015, -1294.83260671]), 'split0_test_score': array([-1308.59391355, -1307.32204279, -1304.64374734]), 'mean_test_score': array([-1301.08557395, -1300.1244758 , -1298.20050603]), 'std_score_time': array([ 0.01179661,  0.01224095,  0.06874835]), 'split2_train_score': array([-1285.49265582, -1285.88230206, -1287.81182199]), 'param_alpha': masked_array(data = [0.001 0.01 0.1],
             mask = [False False False],
       fill_value = ?)
, 'split0_train_score': array([-1287.35798139, -1287.80445878, -1290.05850755]), 'params': ({'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1}), 'std_fit_time': array([ 18.65022

In [14]:
print lasso_grid.best_params_
print "Test score: ", lasso_score['mean_test_score']
print "Train score: ", lasso_score['mean_test_score']
print lasso_score['params']

{'alpha': 0.1}
[-1301.08557395 -1300.1244758  -1298.20050603]
({'alpha': 0.001}, {'alpha': 0.01}, {'alpha': 0.1})


In [None]:
### 'mean_train_score': array([-1287.96156639, -1288.39282065, -1290.57952161])

# Random Forest

In [5]:
%%time

rf = RandomForestRegressor()
rf_params = {'n_estimators': [100], 'min_samples_leaf': [2], 'min_samples_split': [4]}
rf_grid = GridSearchCV(rf, rf_params, cv=5, verbose=1, scoring='neg_mean_absolute_error')
rf_grid.fit(train, y)
rf_score = rf_grid.cv_results_

Fitting 5 folds for each of 1 candidates, totalling 5 fits


[Parallel(n_jobs=2)]: Done   5 out of   5 | elapsed: 175.1min finished


CPU times: user 1h 5min 18s, sys: 55.8 s, total: 1h 6min 14s
Wall time: 4h 5min 55s


In [9]:
np.sqrt(-rf_score['mean_test_score'])

array([ 1971.33230627])

In [7]:
rf_score['mean_test_score']

{'mean_fit_time': array([ 3642.52908106]),
 'mean_score_time': array([ 3.25899296]),
 'mean_test_score': array([-3886151.06174344]),
 'mean_train_score': array([-882690.69072449]),
 'param_min_samples_leaf': masked_array(data = [2],
              mask = [False],
        fill_value = ?),
 'param_min_samples_split': masked_array(data = [4],
              mask = [False],
        fill_value = ?),
 'param_n_estimators': masked_array(data = [100],
              mask = [False],
        fill_value = ?),
 'params': ({'min_samples_leaf': 2,
   'min_samples_split': 4,
   'n_estimators': 100},),
 'rank_test_score': array([1], dtype=int32),
 'split0_test_score': array([-3904584.8602431]),
 'split0_train_score': array([-887201.03108566]),
 'split1_test_score': array([-3911557.51979802]),
 'split1_train_score': array([-881482.54125619]),
 'split2_test_score': array([-4062504.0305523]),
 'split2_train_score': array([-851372.03587125]),
 'split3_test_score': array([-3894547.36808593]),
 'split3_train_s

# Support Vector Machine Regressor

In [None]:
%%time
# Takes too long to run

# from sklearn.svm import SVR

# svr = SVR()
# svr_params = {'kernel': ['rbf'], 'C': [1.0]}
# svr_grid = GridSearchCV(svr, svr_params, cv=5, verbose=3, scoring='neg_mean_absolute_error')
# svr_grid.fit(train, y)
# svr_score = svr_grid.cv_results_

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] kernel=rbf, C=1.0 ...............................................
[CV] kernel=rbf, C=1.0 ...............................................
