In [3]:
import pandas as pd
from sklearn.cross_validation import train_test_split
from sklearn import preprocessing
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import f_regression
import numpy as np
from sklearn import linear_model
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import KFold
from sklearn.metrics import mean_absolute_error
import matplotlib.pyplot as plt
%matplotlib inline
from sklearn.preprocessing import LabelEncoder
import xgboost as xgb

In [5]:
train = pd.read_csv("self_train.csv",index_col=0)
X_test = pd.read_csv("self_test.csv",index_col=0)

In [6]:
X_train = train.drop("loss",axis=1)
Y_train = train['loss']

In [7]:
catFeatureslist = [x for x in train.columns[0:-1] if 'cat' in x]
for cf in catFeatureslist:
    le = LabelEncoder()
    le.fit(X_train[cf].unique())
    X_train[cf] = le.transform(X_train[cf])

In [8]:
for cf in catFeatureslist:
    le = LabelEncoder()
    le.fit(X_test[cf].unique())
    X_test[cf] = le.transform(X_test[cf])

In [9]:
xgb_model = xgb.XGBRegressor()

#when in doubt, use xgboost
# parameters = {'objective':['reg:linear'],
#               'learning_rate': [0.7], #so called `eta` value
#               'max_depth': [12],
#               'min_child_weight': [100],
#               'silent': [1],
#               'subsample': [0.7],
#               'colsample_bytree': [0.7],
#               'n_estimators': [100], #number of trees
#               'seed': [1337]}
parameters = {'nthread':[1], #when use hyperthread, xgboost may become slower
              'objective':['reg:linear'],
              'learning_rate': [0.06], #so called `eta` value
              'max_depth': [4,6,8,10],
              'silent': [1],
              'subsample': [0.7],
              'colsample_bytree': [0.4,0.6,0.8,1],
              'n_estimators': [100], #number of trees
              'seed': [1337]}

In [10]:
def _score_func(estimator, X, y):
    return mean_absolute_error(np.expm1(y), np.expm1(estimator.predict(X)))

In [11]:
clf = GridSearchCV(xgb_model, parameters, 
                   cv=KFold(len(Y_train), n_folds = 5,shuffle = True), 
                   scoring=_score_func,
                   verbose=2, refit=True)

clf.fit(X_train, Y_train)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV] colsample_bytree=0.5, learning_rate=0.15, silent=1, objective=reg:linear, seed=1337, max_depth=8, subsample=0.9, n_estimators=300, nthread=4, min_child_weight=11 
[CV]  colsample_bytree=0.5, learning_rate=0.15, silent=1, objective=reg:linear, seed=1337, max_depth=8, subsample=0.9, n_estimators=300, nthread=4, min_child_weight=11 - 2.1min
[CV] colsample_bytree=0.5, learning_rate=0.15, silent=1, objective=reg:linear, seed=1337, max_depth=8, subsample=0.9, n_estimators=300, nthread=4, min_child_weight=11 
[CV]  colsample_bytree=0.5, learning_rate=0.15, silent=1, objective=reg:linear, seed=1337, max_depth=8, subsample=0.9, n_estimators=300, nthread=4, min_child_weight=11 - 2.1min
[CV] colsample_bytree=0.5, learning_rate=0.15, silent=1, objective=reg:linear, seed=1337, max_depth=8, subsample=0.9, n_estimators=300, nthread=4, min_child_weight=11 
[CV]  colsample_bytree=0.5, learning_rate=0.15, silent=1, objective=reg:linear, see

[Parallel(n_jobs=1)]: Done   5 out of   5 | elapsed: 10.5min finished


GridSearchCV(cv=sklearn.cross_validation.KFold(n=188318, n_folds=5, shuffle=True, random_state=None),
       error_score='raise',
       estimator=XGBRegressor(base_score=0.5, colsample_bylevel=1, colsample_bytree=1, gamma=0,
       learning_rate=0.1, max_delta_step=0, max_depth=3,
       min_child_weight=1, missing=None, n_estimators=100, nthread=-1,
       objective='reg:linear', reg_alpha=0, reg_lambda=1,
       scale_pos_weight=1, seed=0, silent=True, subsample=1),
       fit_params={}, iid=True, n_jobs=1,
       param_grid={'colsample_bytree': [0.5], 'learning_rate': [0.15], 'silent': [1], 'n_estimators': [300], 'seed': [1337], 'objective': ['reg:linear'], 'subsample': [0.9], 'max_depth': [8], 'nthread': [4], 'min_child_weight': [11]},
       pre_dispatch='2*n_jobs', refit=True,
       scoring=<function _score_func at 0x7f8a70677bf8>, verbose=2)

In [12]:
best_parameters, score, _ = max(clf.grid_scores_, key=lambda x: x[1])

In [13]:
score

1156.3303601152261

In [None]:
def xg_eval_mae(yhat, dtrain):
    y = dtrain.get_label()
    return 'mae', mean_absolute_error(np.exp(y)-shift,
                                      np.exp(yhat)-shift)

In [15]:
param = best_parameters
X_train, X_validation, Y_train, Y_validation = train_test_split(X_train, Y_train, train_size=0.9)
dtrain = xgb.DMatrix(X_train, label=Y_train)
dvalid = xgb.DMatrix(X_validation,Y_validation)
dtest = xgb.DMatrix(X_test)
evallist  = [(dvalid,'eval'), (dtrain,'train')]
bst = xgb.train(param,
                d_train,
                100000,
                evallist,
                early_stopping_rounds=50,
                feval = xg_eval_mae)

In [None]:
bst = xgb.train(param,
                dtrain,
                100000,
                evallist,
                early_stopping_rounds=50)
bst.save_model('xgboostfinal.model')

In [16]:
ypred = bst.predict(X_test)

In [17]:
prediction = np.expm1(ypred)

In [18]:
submit = pd.read_csv("sample_submission.csv")

In [19]:
submit["loss"] = prediction
submit.to_csv("xgbsubsetcvtrial.csv",index=False)

In [None]:
bst = xgb.train(param,
                dtrain,
                100000,
                evallist,
                early_stopping_rounds=50)


[0]	eval-mae:6.10635	train-mae:6.10862
Multiple eval metrics have been passed: 'train-mae' will be used for early stopping.

Will train until train-mae hasn't improved in 50 rounds.
[1]	eval-mae:5.19038	train-mae:5.19254
[2]	eval-mae:4.41213	train-mae:4.41397
[3]	eval-mae:3.75041	train-mae:3.75222
[4]	eval-mae:3.18824	train-mae:3.18988
[5]	eval-mae:2.71046	train-mae:2.71199
[6]	eval-mae:2.3044	train-mae:2.30598
[7]	eval-mae:1.96009	train-mae:1.96146
[8]	eval-mae:1.66835	train-mae:1.6699
[9]	eval-mae:1.42207	train-mae:1.42403
[10]	eval-mae:1.21581	train-mae:1.2182
[11]	eval-mae:1.04574	train-mae:1.0482
[12]	eval-mae:0.907695	train-mae:0.909452
[13]	eval-mae:0.797873	train-mae:0.798281
[14]	eval-mae:0.711371	train-mae:0.710618
[15]	eval-mae:0.643989	train-mae:0.64226
[16]	eval-mae:0.592412	train-mae:0.589586
[17]	eval-mae:0.553243	train-mae:0.549014
[18]	eval-mae:0.523904	train-mae:0.518307
[19]	eval-mae:0.501959	train-mae:0.495033
[20]	eval-mae:0.48514	train-mae:0.477381
[21]	eval-mae:0