In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn import preprocessing
import xgboost as xgb
color = sns.color_palette()

%matplotlib inline

pd.options.mode.chained_assignment = None  # default='warn'
pd.options.display.max_columns = 999



In [2]:
train = pd.read_csv("clean_train2.csv")
El = pd.read_csv("clean_train_El1.csv")
xgb = pd.read_csv("clean_train_Xgb.csv")
lgb = pd.read_csv("clean_train_Lgb.csv")
rf  = pd.read_csv("clean_train_Rf.csv")
ada = pd.read_csv("clean_train_Ada.csv")
gbm = pd.read_csv("clean_train_Gbm.csv")
nnt = pd.read_csv("clean_train_Keras.csv")

In [3]:
stack = El[["ID", "El1"]]
stack["xgb"] = xgb.Xgboost
stack["lgb"] = lgb.Lgb
stack["rf"] = rf.Rf
stack["ada"] = ada.Ada
stack["gbm"] = gbm.Gbm
stack["nnt"] = nnt.Keras

print stack.shape

(3693, 8)


In [4]:
y = train["y"]
#train = train.drop("y", axis=1)
print("Stack shape : ", stack.shape)

('Stack shape : ', (3693, 8))


In [5]:
#test = pd.read_csv("clean_test2.csv")
El = pd.read_csv("clean_test_El1.csv")
xgb = pd.read_csv("clean_test_Xgb.csv")
lgb = pd.read_csv("clean_test_Lgb.csv")
rf  = pd.read_csv("clean_test_Rf.csv")
ada = pd.read_csv("clean_test_Ada.csv")
gbm = pd.read_csv("clean_test_Gbm.csv")
nnt = pd.read_csv("clean_test_Keras.csv")

test = El[["ID", "El1"]]
test["xgb"] = xgb.Xgboost
test["lgb"] = lgb.Lgb
test["rf"] = rf.Rf
test["ada"] = ada.Ada
test["gbm"] = gbm.Gbm
test["nnt"] = nnt.Keras

print("Test shape : ", test.shape)

('Test shape : ', (4209, 8))


In [6]:
import xgboost as xgb
from sklearn.metrics import r2_score

class XGBoostReg():
    def __init__(self, num_boost_round=10, **kwargs):
        self.clf = None
        self.num_boost_round = num_boost_round
        self.params = kwargs
        self.params.update({'objective': 'reg:linear'})
 
    def fit(self, X, y, num_boost_round=None):
        num_boost_round = num_boost_round or self.num_boost_round
        dtrain = xgb.DMatrix(X, label=y)
        self.clf = xgb.train(params=self.params, dtrain=dtrain, num_boost_round=num_boost_round)
 
    def predict(self, X):
        dtest = xgb.DMatrix(X)
        return self.clf.predict(dtest)
 
    def score(self, X, y):
        Y = self.predict(X)
        return r2_score(y, Y)
 
    def get_params(self, deep=True):
        return self.params
 
    def set_params(self, **params):
        if 'num_boost_round' in params:
            self.num_boost_round = params.pop('num_boost_round')
        if 'objective' in params:
            del params['objective']
        self.params.update(params)
        return self

In [7]:
model = XGBoostReg(
        eval_metric = 'rmse',
        nthread = 4,
        eta = 0.004,
        max_depth = 4,
        subsample = 0.9,
        colsample_bytree = 1.0,
        silent = 1,
        )

parameters = {
        'num_boost_round': [1050],
        'eta': [0.005],
        'max_depth': [4],
        'subsample': [0.8],
        'colsample_bytree': [0.75, 0.8],
    }

In [8]:
from sklearn.grid_search import GridSearchCV
from sklearn.metrics import make_scorer

r2_scorer = make_scorer(r2_score)
grid = GridSearchCV(model, parameters, scoring=r2_scorer, cv=5)
grid_obj = grid.fit(stack, y)

grid_best = grid_obj.best_estimator_
print grid_best.params
print grid_best.num_boost_round



{'colsample_bytree': 0.75, 'silent': 1, 'eval_metric': 'rmse', 'nthread': 4, 'subsample': 0.8, 'eta': 0.005, 'objective': 'reg:linear', 'max_depth': 4}
1050


In [9]:
from sklearn.model_selection import StratifiedKFold
skf = StratifiedKFold(n_splits = 5, shuffle = True, random_state = 44)

train_pred = [0 for i in range(stack.shape[0])]
test_pred = [0 for i in range(test.shape[0])]

for train_index, test_index in skf.split(train, y):
    x0, x1 = stack.iloc[train_index], stack.iloc[test_index]
    y0, y1 = y.iloc[train_index], y.iloc[test_index] 
    grid_best.fit(x0, y0)                
    
    pred = grid_best.predict(x1)
    print r2_score(y1, pred)
#    for ii, idx in enumerate(test_index):
#        train_pred[idx] = pred[ii]
    
    pred_test = grid_best.predict(test)
    for ii, val in enumerate(pred_test):
        test_pred[ii] += val



0.590478302926
0.64428195624
0.582384940321
0.603262574399
0.558582486976


In [10]:
# make predictions from CV and save results
y_pred = map(lambda x: x/5.0, test_pred)
output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': y_pred})
output.to_csv('sub_stack_xgb_cv.csv', index=False)

In [15]:
# make predictions and save results
grid_best.fit(stack, y)
y_pred = grid_best.predict(test)

output = pd.DataFrame({'id': test['ID'].astype(np.int32), 'y': y_pred})
output.to_csv('sub_stack_xgb.csv', index=False)