## Boxcox transform all continuous features
## Then, normalize all continuous features
## log1p target variable

In [1]:
import numpy as np
np.random.seed(123)

import pandas as pd
from sklearn.metrics import mean_absolute_error
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import RandomForestRegressor
from scipy.sparse import csr_matrix, hstack
from scipy.stats import skew
from scipy.stats import boxcox
from scipy import stats

In [2]:
print "Data Preprocessing Begins..."
## Read data
train = pd.read_csv('../data/train.csv')
test = pd.read_csv('../data/test.csv')

## Set test loss to NaN
test['loss'] = np.nan

## Response and IDs
y = np.log1p(train['loss'].values)
print 'BEFORE BOXCOX Skewness of target: ', skew(train['loss'])
print 'AFTER BOXCOX Skewness of target: ', skew(y)
id_train = train['id'].values
id_test = test['id'].values

## Stack train test
ntrain = train.shape[0]
tr_te = pd.concat((train, test), axis = 0)

## Preprocessing and transforming to sparse data

### Categorical Features
# Every column whose name contains 'cat' is one-hot encoded and stored as its
# own sparse block; sparse_data starts out holding exactly these blocks.
f_cat = [name for name in tr_te.columns if 'cat' in name]
sparse_data = [
    csr_matrix(pd.get_dummies(tr_te[name].astype('category')))
    for name in f_cat
]

### Continuous Features ###
skewed_feats = (tr_te[tr_te.dtypes[tr_te.dtypes == 'float64'].index]).apply(lambda x: skew(x.dropna()))
print 'BEFORE BOXCOX Skewness of all features: \n', skewed_feats
print '\nFeatures with skewness > 0.75: \n', skewed_feats[skewed_feats > 0.75]

# Skewness correction
f_num = [f for f in tr_te.columns if 'cont' in f]
for col in f_num:
    tr_te[col] = boxcox(tr_te[col]+1)[0]
    
# Normalizing
scaler = StandardScaler()
skewed_feats = (tr_te[tr_te.dtypes[tr_te.dtypes == 'float64'].index]).apply(lambda x: skew(x.dropna()))
print 'AFTER BOXCOX Skewness of all features: \n', skewed_feats
print '\nFeatures with skewness > 0.75: \n', skewed_feats[skewed_feats > 0.75]

tmp = csr_matrix(scaler.fit_transform(tr_te[f_num]))
sparse_data.append(tmp)

# Release the raw frames; only the sparse feature blocks are used from here on
del tr_te, train, test

### sparse train and test data
# Column-stack all feature blocks, then cut the rows back at the train/test boundary
xtr_te = hstack(sparse_data, format='csr')
xtrain, xtest = xtr_te[:ntrain, :], xtr_te[ntrain:, :]

# Under Python 2 each of these prints a (label, shape) tuple
print('Dim train', xtrain.shape)
print('Dim test', xtest.shape)

# The stacked matrix and the individual blocks are no longer needed
del xtr_te, sparse_data, tmp

print("Data Preprocessing Ends...")

Data Preprocessing Begins...
BEFORE BOXCOX Skewness of target:  3.79492814968
AFTER BOXCOX Skewness of target:  0.0966188199022
BEFORE BOXCOX Skewness of all features: 
cont1     0.513205
cont2    -0.311146
cont3    -0.007023
cont4     0.417559
cont5     0.679610
cont6     0.458413
cont7     0.825889
cont8     0.673237
cont9     1.067247
cont10    0.352116
cont11    0.281139
cont12    0.291997
cont13    0.376138
cont14    0.250673
loss      3.794928
dtype: float64

Features with skewness > 0.75: 
cont7    0.825889
cont9    1.067247
loss     3.794928
dtype: float64
AFTER BOXCOX Skewness of all features: 
cont1     0.004049
cont2    -0.101841
cont3    -0.027565
cont4     0.051627
cont5     0.205682
cont6     0.038833
cont7     0.054710
cont8     0.124111
cont9    -0.012046
cont10    0.020122
cont11    0.023397
cont12    0.024871
cont13    0.092823
cont14    0.068549
loss      3.794928
dtype: float64

Features with skewness > 0.75: 
loss    3.794928
dtype: float64
('Dim train', (188318, 1

In [3]:
# Densify the sparse matrices into DataFrames.
# `train` holds the training-row features; `target` holds the remaining (test) rows'
# features -- it is NOT the response variable, which lives in `y`.
train, target = [pd.DataFrame(matrix.toarray()) for matrix in (xtrain, xtest)]

In [4]:
# Sanity check: print the shapes of the train features, the test features and the response
for obj in (train, target, y):
    print(obj.shape)

(188318, 1190)
(125546, 1190)
(188318,)


In [5]:
# Preview the first five rows of the dense training features
train.head(n=5)

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,1180,1181,1182,1183,1184,1185,1186,1187,1188,1189
0,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,-0.892941,1.110234,-0.871854,-0.989339,1.123537,1.647215,0.450314,0.56387,1.403248,1.00762
1,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,1.0,0.0,...,1.559821,-0.135754,-0.112264,0.747696,-0.74906,-0.231931,-0.702751,-0.546698,0.683287,-0.839662
2,1.0,0.0,0.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,...,-0.245001,-1.016579,-1.040409,-1.236757,-1.615922,-0.941361,-0.467237,-0.508036,-1.696411,1.208015
3,0.0,1.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,-0.086043,-0.125015,-0.429909,-0.867203,-1.013009,-0.200335,-0.76122,-0.796694,0.657995,0.591953
4,1.0,0.0,0.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,...,1.12142,-1.816004,-1.698643,-1.486329,-2.073037,-1.737857,-1.508394,-1.529282,-1.303842,-0.155206


# Ridge Regression

In [8]:
from sklearn.linear_model import Ridge

In [9]:
%%time

# 5-fold cross-validation of Ridge over a one-point alpha grid.
# The score is the negated mean absolute error of y, i.e. on the log1p scale of the loss.
ridge = Ridge()
ridge_params = dict(alpha=[1])
ridge_grid = GridSearchCV(
    estimator=ridge,
    param_grid=ridge_params,
    scoring='neg_mean_absolute_error',
    cv=5,
    verbose=10,
)
ridge_grid.fit(train, y)

# Keep the full per-fold results for inspection in the next cell
ridge_score = ridge_grid.cv_results_
print(ridge_score)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] alpha=1 .........................................................
[CV] ......................... alpha=1, score=-0.441068, total=   9.3s
[CV] alpha=1 .........................................................


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    9.8s remaining:    0.0s


[CV] ......................... alpha=1, score=-0.437678, total=   8.7s
[CV] alpha=1 .........................................................


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   18.8s remaining:    0.0s


[CV] ......................... alpha=1, score=-0.443295, total=   7.8s
[CV] alpha=1 .........................................................


[Parallel(n_jobs=1)]: Done   3 out of   3 | elapsed:   27.0s remaining:    0.0s


[CV] ......................... alpha=1, score=-0.442910, total=   7.5s
[CV] alpha=1 .........................................................


[Parallel(n_jobs=1)]: Done   4 out of   4 | elapsed:   34.9s remaining:    0.0s


[CV] ......................... alpha=1, score=-0.438264, total=   7.9s


[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   43.2s remaining:    0.0s
[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed:   43.2s finished


{'rank_test_score': array([1], dtype=int32), 'split4_test_score': array([-0.43826449]), 'mean_score_time': array([ 0.0750154]), 'std_test_score': array([ 0.00231524]), 'std_train_score': array([ 0.00054402]), 'split1_train_score': array([-0.43878047]), 'split0_test_score': array([-0.44106821]), 'mean_test_score': array([-0.44064321]), 'std_score_time': array([ 0.03205147]), 'split2_train_score': array([-0.43737973]), 'param_alpha': masked_array(data = [1],
             mask = [False],
       fill_value = ?)
, 'split0_train_score': array([-0.43800907]), 'params': ({'alpha': 1},), 'std_fit_time': array([ 0.61320803]), 'split4_train_score': array([-0.43852283]), 'split2_test_score': array([-0.44329523]), 'split3_test_score': array([-0.44290993]), 'mean_train_score': array([-0.43804414]), 'mean_fit_time': array([ 8.18830957]), 'split3_train_score': array([-0.4375286]), 'split1_test_score': array([-0.43767816])}
CPU times: user 1min 41s, sys: 12.2 s, total: 1min 53s
Wall time: 51.4 s


In [10]:
# Only alpha = 1 was searched, so it is trivially the best candidate

print(ridge_grid.best_params_)
for key in ('mean_test_score', 'params'):
    print(ridge_score[key])

{'alpha': 1}
[-0.44064321]
({'alpha': 1},)


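# Why is MAE so low? The CV score above is the MAE of log1p(loss), not of loss itself. Let's try a train_test_split and compute MAE on the original loss scale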

In [6]:
from sklearn.model_selection import train_test_split

In [20]:
# Five independent 70/30 hold-out splits, seeded 0, 42, 84, 126, 168 (42*i for i in 0..4).
# A fresh Ridge(alpha=1) is fitted per split; predictions and truths stay on the log1p scale.
y_pred, y_true = [], []
for seed in range(0, 5 * 42, 42):
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.30, random_state=seed)
    ridge_train_test = Ridge(alpha=1)
    ridge_train_test.fit(X_train, y_train)
    y_pred.append(ridge_train_test.predict(X_test))
    y_true.append(y_test)

In [21]:
def mae_func(pred, true):
    """Mean absolute error after mapping both arguments back with np.expm1."""
    return mean_absolute_error(np.expm1(pred), np.expm1(true))

# One MAE per hold-out split, in split order
mae = [mae_func(pred, true) for pred, true in zip(y_pred, y_true)]
print(mae)

[1252.9339720442265, 1259.6141218380044, 1262.0755635083622, 1238.6085712513468, 1245.4792312021364]


# Random Forest

In [8]:
%%time

rf = RandomForestRegressor(n_estimators=100, min_samples_split=4, min_samples_leaf=2)

# Five independent 70/30 hold-out splits, seeded 0, 42, 84, 126, 168 (42*i for i in 0..4).
# The same `rf` object is refitted on every split, so after the loop it holds the
# fit from the last split only.
y_pred, y_true = [], []
for seed in range(0, 5 * 42, 42):
    X_train, X_test, y_train, y_test = train_test_split(
        train, y, test_size=0.30, random_state=seed)
    rf.fit(X_train, y_train)
    y_pred.append(rf.predict(X_test))
    y_true.append(y_test)

CPU times: user 2h 42min 37s, sys: 1min 11s, total: 2h 43min 49s
Wall time: 2h 47min 37s


In [9]:
def mae_func(pred, true):
    """Mean absolute error after mapping both arguments back with np.expm1."""
    return mean_absolute_error(np.expm1(pred), np.expm1(true))

# One MAE per hold-out split, in split order
mae = [mae_func(pred, true) for pred, true in zip(y_pred, y_true)]
print(mae)

[1205.8325374129463, 1208.3821283146708, 1216.9554735298184, 1209.6614144118996, 1197.4536438551063]


In [12]:
# Average of the per-split MAEs
np.asarray(mae).mean()

1207.6570395048882

In [22]:
# The evaluation loop leaves `rf` fitted on only the 70% training part of its last
# split. Refit on every training row so the submitted predictions (and any model
# saved afterwards) use all of the labelled data. This is one more full forest fit.
rf.fit(train, y)

# The model predicts log1p(loss); np.expm1 maps the predictions back to the loss scale.
# `target` holds the test-set features, in the same row order as id_test.
rf_final_pred = pd.DataFrame({'id': id_test, 'loss': np.expm1(rf.predict(target))})
rf_final_pred.to_csv('submission_rf.csv', index=False)

In [25]:
# Persist the fitted random forest to disk with maximum compression (compress=9).
# NOTE(review): newer scikit-learn releases no longer ship sklearn.externals.joblib;
# on those versions use the standalone `import joblib` instead -- confirm against the
# installed version.
from sklearn.externals import joblib
joblib.dump(rf, 'rf_model.pkl', compress=9)

# To reload the model later (joblib files are pickles -- only load files you trust):
# >>> from sklearn.externals import joblib
# >>> model_clone = joblib.load('rf_model.pkl')

['rf_model.pkl']