In [1]:
from collections import Counter
import numpy as np
import pandas as pd
import xgboost as xgb
from scipy.sparse import csr_matrix
from sklearn.model_selection import KFold


In [2]:
cd /Users/williamzhou/Documents/github/RussianHousing/data/best

/Users/williamzhou/Documents/github/RussianHousing/data/best


In [3]:
x_train = pd.read_csv('./x_train.csv')
y_train = pd.read_csv('./y_train.csv')
x_test  = pd.read_csv('./x_test.csv')

In [4]:
full_sq = x_test['full_sq']

In [5]:
x_train = np.array(x_train)
y_train = np.array(y_train)
x_test = np.array(x_test)


In [6]:
SEED = 0
NFOLDS = 10
NTHREADS = 4

In [7]:
def get_oof(clf, x_train, y_train, x_test):
    ntrain = x_train.shape[0]
    ntest = x_test.shape[0]
    oof_train = np.zeros((ntrain,))
    oof_test = np.zeros((ntest,))
    oof_test_skf = np.empty((NFOLDS, ntest))
    
    
    kf = KFold(n_splits=NFOLDS, shuffle=True, random_state=SEED).split(x_train)

    for i, (train_index, test_index) in enumerate(kf):
        print(len(train_index))
        print(len(test_index))
        x_tr = x_train[train_index]
        print(x_tr.shape)
        y_tr = y_train[train_index]
        print(y_train.shape)
        x_te = x_train[test_index]
        print(x_te.shape)
        
        print('Start training fold {}'.format(i))
        clf.train(x_tr, y_tr)
        print('Finish training fold {}'.format(i))
        oof_train[test_index] = clf.predict(x_te)
        oof_test_skf[i, :] = clf.predict(x_test)
        print('Finish predicting fold {}'.format(i))
    oof_test[:] = oof_test_skf.mean(axis=0)
    
    return oof_train.reshape(-1, 1), oof_test.reshape(-1, 1)


In [8]:
class XgbWrapper(object):
    def __init__(self, seed=0, params=None):
        self.param = params
        self.param['seed'] = seed
        self.nrounds = params.pop('nrounds', 1000)

    def train(self, x_train, y_train):
        dtrain = xgb.DMatrix(x_train, label=y_train)
        self.gbdt = xgb.train(self.param, dtrain, self.nrounds)

    def predict(self, x):
        return self.gbdt.predict(xgb.DMatrix(x))

In [9]:
class SklearnWrapper(object):
    def __init__(self, clf, seed=0, params=None):
        params['random_state'] = seed
        self.clf = clf(**params)

    def train(self, x_train, y_train):
        self.clf.fit(x_train, y_train)

    def predict(self, x):
        return self.clf.predict(x)

In [10]:

xgb1_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'early_stopping_rounds':20,
    'silent': 0}

xgb2_params = {
    'eta': 0.04,
    'max_depth': 5,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'early_stopping_rounds':20,
    'silent': 0}

xgb3_params = {
    'eta': 0.03,
    'max_depth': 5,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'early_stopping_rounds':20,
    'silent': 0}

xgb4_params = {
    'eta': 0.02,
    'max_depth': 5,
    'subsample': 1,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'early_stopping_rounds':20,
    'silent': 0}

et_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.5,
    'max_depth': 12,
    'min_samples_leaf': 2,
}

rf_params = {
    'n_jobs': 16,
    'n_estimators': 100,
    'max_features': 0.2,
    'max_depth': 8,
    'min_samples_leaf': 2,
}

In [11]:
%%time
dtrain = xgb.DMatrix(x_train, label=y_train)

xg1 = XgbWrapper(seed=SEED, params=xgb1_params)
xg1_train, xg1_test = get_oof(xg1, x_train, y_train, x_test)

xg2 = XgbWrapper(seed=SEED, params=xgb2_params)
xg2_train, xg2_test = get_oof(xg2, x_train, y_train, x_test)

xg3 = XgbWrapper(seed=SEED, params=xgb3_params)
xg3_train, xg3_test = get_oof(xg3, x_train, y_train, x_test)

xg4 = XgbWrapper(seed=SEED, params=xgb4_params)
xg4_train, xg4_test = get_oof(xg4, x_train, y_train, x_test)

et = SklearnWrapper(clf=ExtraTreesRegressor, seed=SEED, params=et_params)
et_train , et_test =  get_oof(et, x_train, y_train, x_test)

rf = SklearnWrapper(clf=RandomForestRegressor, seed=SEED, params=rf_params)
rf_train , rf_test =  get_oof(rf, x_train, y_train, x_test)


27423
3048
(27423, 40)
(30471, 1)
(3048, 40)
Start training fold 0


KeyboardInterrupt: 

In [None]:
X_train_stack = np.concatenate([x_train, 
                                pd.DataFrame(xg1_train),
                                pd.DataFrame(xg2_train),
                                pd.DataFrame(xg3_train),
                                pd.DataFrame(xg4_train),
                                pd.DataFrame(et_train),
                                pd.DataFrame(rf_train),], axis=1)
xgtrain_stack = xgb.DMatrix(X_train_stack,y_train)


X_test_stack = np.concatenate([x_test, 
                                pd.DataFrame(xg1_test),
                                pd.DataFrame(xg2_test),
                                pd.DataFrame(xg3_test),
                                pd.DataFrame(xg4_test),
                                pd.DataFrame(et_test),
                                pd.DataFrame(rf_test),], axis=1)

xgtest_stack = xgb.DMatrix(X_test_stack)

In [None]:
final_xgb_params = {
    'eta': 0.05,
    'max_depth': 5,
    'subsample': 0.7,
    'colsample_bytree': 0.7,
    'objective': 'reg:linear',
    'eval_metric': 'rmse',
    'silent': 0}

cv_output = xgb.cv(final_xgb_params, xgtrain_stack, 
                       num_boost_round=1000, 
                       early_stopping_rounds=20,
                       verbose_eval=50, show_stdv=False)
test_rmse = cv_output.loc[len(cv_output)-1,'test-rmse-mean']
print(test_rmse)  
num_boost_rounds = len(cv_output)
model = xgb.train(dict(final_xgb_params, silent=0), xgtrain_stack, num_boost_round=num_boost_rounds)


In [None]:
y_predict = model.predict(xgtest_stack)
y_predic_all_sq = (y_predict)*full_sq
df_sub = pd.DataFrame({'id': range(30474,30474+7662), 'price_doc': y_predic_all_sq})
df_sub.to_csv('./sub.csv', index=False)
df_sub.head()