In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.metrics import mean_absolute_error
import warnings
warnings.filterwarnings('ignore')
import gc

import lightgbm as lgb
from catboost import CatBoostRegressor

In [2]:
# Load the train and test frames from their HDF5 stores (both saved under key 'df').
Train_data = pd.read_hdf('output/train_tree.h5', key='df')
Test_data = pd.read_hdf('output/test_tree.h5', key='df')

In [3]:
# Model inputs are all training columns other than the 'price' target
# and the 'SaleID' column.
numerical_cols = Train_data.columns
feature_cols = [name for name in numerical_cols if name not in ('price', 'SaleID')]

In [4]:
# Restrict both frames to the same feature columns, then report their shapes
# (one shape per line).
X_data = Train_data.loc[:, feature_cols]
X_test = Test_data.loc[:, feature_cols]
print(X_data.shape, X_test.shape, sep='\n')

(149999, 83)
(50000, 83)


In [5]:
# Convert the feature frames and the 'price' target to plain NumPy arrays so
# the cross-validation loops can index them with integer fold indices.
X_data, X_test = np.array(X_data), np.array(X_test)
Y_data = np.array(Train_data['price'])

In [6]:
# Custom evaluation metric (reported during training; it is not the loss being optimised)
def myFeval(preds, xgbtrain):
    """Return the mean absolute error after applying ``expm1`` to both sides.

    Parameters
    ----------
    preds : array-like
        Model predictions, on the same scale as the dataset labels.
    xgbtrain : object exposing ``get_label()``
        The dataset being evaluated. Despite the name, this notebook passes
        the function to LightGBM through ``feval``.

    Returns
    -------
    tuple
        ``('myFeval', mae, False)``: the metric name, its value, and a flag
        that is ``False`` because a lower value is better.
    """
    truth = np.expm1(xgbtrain.get_label())
    guess = np.expm1(preds)
    return 'myFeval', mean_absolute_error(truth, guess), False

In [7]:
# LightGBM training parameters shared by every cross-validation fold.
param = {'boosting_type': 'gbdt',
         'num_leaves': 31,
         'max_depth': -1,  # no explicit depth limit
         'lambda_l2': 2,  # L2 regularisation, to limit overfitting
         # Minimum number of samples per leaf, to limit overfitting (has not
         # needed much tuning). 'min_child_samples' is an alias of this same
         # parameter, so it must not be listed a second time.
         'min_data_in_leaf': 20,
         'objective': 'regression_l1',
         'learning_rate': 0.1,

         # Row / column subsampling, re-drawn every iteration.
         'feature_fraction': 0.8,
         'bagging_freq': 1,
         'bagging_fraction': 0.8,
         'bagging_seed': 11,
         'metric': 'mae',
         }

In [8]:
# 10-fold cross-validated LightGBM training.
#   oof_lgb               - out-of-fold predictions for every training row
#   predictions_lgb       - test-set predictions averaged over the folds
#   predictions_train_lgb - full-training-set predictions averaged over the folds
folds = KFold(n_splits=10, shuffle=True, random_state=2018)
oof_lgb = np.zeros(len(X_data))
predictions_lgb = np.zeros(len(X_test))
predictions_train_lgb = np.zeros(len(X_data))

# Effectively unbounded; early stopping decides the real number of rounds.
num_round = 100000000

# Early stopping and periodic logging are supplied as callbacks because the
# `early_stopping_rounds` / `verbose_eval` keyword arguments of lgb.train were
# removed in LightGBM 4.0. `log_evaluation` was named `print_evaluation`
# before LightGBM 3.3, hence the fallback.
_log_evaluation = getattr(lgb, 'log_evaluation', None) or lgb.print_evaluation

for fold_, (trn_idx, val_idx) in enumerate(folds.split(X_data, Y_data)):
    print("fold n°{}".format(fold_ + 1))
    trn_data = lgb.Dataset(X_data[trn_idx], Y_data[trn_idx])
    val_data = lgb.Dataset(X_data[val_idx], Y_data[val_idx])

    # Fresh callbacks per fold: stop after 300 rounds without improvement on
    # the validation set, and log every 300 rounds.
    clf = lgb.train(param, trn_data, num_round, valid_sets=[trn_data, val_data],
                    feval=myFeval,
                    callbacks=[lgb.early_stopping(300), _log_evaluation(300)])

    # Always predict with the best iteration found by early stopping.
    oof_lgb[val_idx] = clf.predict(X_data[val_idx], num_iteration=clf.best_iteration)
    predictions_lgb += clf.predict(X_test, num_iteration=clf.best_iteration) / folds.n_splits
    predictions_train_lgb += clf.predict(X_data, num_iteration=clf.best_iteration) / folds.n_splits

# Out-of-fold MAE after mapping predictions and targets back with expm1.
print("lightgbm score: {:<8.8f}".format(mean_absolute_error(np.expm1(oof_lgb), np.expm1(Y_data))))

fold n°1
Training until validation scores don't improve for 300 rounds
[300]	training's l1: 0.120579	training's myFeval: 534.189	valid_1's l1: 0.126745	valid_1's myFeval: 565.699
[600]	training's l1: 0.112965	training's myFeval: 485.805	valid_1's l1: 0.121117	valid_1's myFeval: 531.142
[900]	training's l1: 0.108599	training's myFeval: 460.751	valid_1's l1: 0.118802	valid_1's myFeval: 517.842
[1200]	training's l1: 0.105277	training's myFeval: 441.469	valid_1's l1: 0.11716	valid_1's myFeval: 508.457
[1500]	training's l1: 0.102629	training's myFeval: 425.902	valid_1's l1: 0.116144	valid_1's myFeval: 503.295
[1800]	training's l1: 0.100525	training's myFeval: 413.355	valid_1's l1: 0.11557	valid_1's myFeval: 499.986
[2100]	training's l1: 0.0984783	training's myFeval: 403.099	valid_1's l1: 0.114993	valid_1's myFeval: 496.666
[2400]	training's l1: 0.0965147	training's myFeval: 393.271	valid_1's l1: 0.114509	valid_1's myFeval: 493.859
[2700]	training's l1: 0.0949543	training's myFeval: 385.464	

[1500]	training's l1: 0.102419	training's myFeval: 425.243	valid_1's l1: 0.113136	valid_1's myFeval: 487.401
[1800]	training's l1: 0.100093	training's myFeval: 412.263	valid_1's l1: 0.112532	valid_1's myFeval: 483.518
[2100]	training's l1: 0.0980607	training's myFeval: 402.136	valid_1's l1: 0.111977	valid_1's myFeval: 479.616
[2400]	training's l1: 0.0965235	training's myFeval: 393.108	valid_1's l1: 0.111652	valid_1's myFeval: 477.385
[2700]	training's l1: 0.0950681	training's myFeval: 385.927	valid_1's l1: 0.111258	valid_1's myFeval: 475.441
[3000]	training's l1: 0.0938239	training's myFeval: 379.378	valid_1's l1: 0.111131	valid_1's myFeval: 474.637
[3300]	training's l1: 0.0925504	training's myFeval: 373.109	valid_1's l1: 0.110991	valid_1's myFeval: 473.576
[3600]	training's l1: 0.0913526	training's myFeval: 367.954	valid_1's l1: 0.110852	valid_1's myFeval: 472.803
[3900]	training's l1: 0.0904091	training's myFeval: 363.325	valid_1's l1: 0.110636	valid_1's myFeval: 471.51
[4200]	traini

[300]	training's l1: 0.120554	training's myFeval: 533.11	valid_1's l1: 0.125482	valid_1's myFeval: 556.082
[600]	training's l1: 0.11294	training's myFeval: 485.418	valid_1's l1: 0.120213	valid_1's myFeval: 525.77
[900]	training's l1: 0.108367	training's myFeval: 459.731	valid_1's l1: 0.117521	valid_1's myFeval: 512.229
[1200]	training's l1: 0.104916	training's myFeval: 440.425	valid_1's l1: 0.116039	valid_1's myFeval: 503.9
[1500]	training's l1: 0.102402	training's myFeval: 425.626	valid_1's l1: 0.115172	valid_1's myFeval: 499.054
[1800]	training's l1: 0.100126	training's myFeval: 413.391	valid_1's l1: 0.114499	valid_1's myFeval: 495.447
[2100]	training's l1: 0.0982928	training's myFeval: 403.707	valid_1's l1: 0.113839	valid_1's myFeval: 492.447
[2400]	training's l1: 0.0965089	training's myFeval: 395.037	valid_1's l1: 0.113424	valid_1's myFeval: 490.345
[2700]	training's l1: 0.0950605	training's myFeval: 387.032	valid_1's l1: 0.113067	valid_1's myFeval: 488.655
[3000]	training's l1: 0.

Early stopping, best iteration is:
[3736]	training's l1: 0.0912376	training's myFeval: 365.584	valid_1's l1: 0.111378	valid_1's myFeval: 497.041
lightgbm score: 481.85991485


In [10]:
# Test-set output
# NOTE(review): the values are written on the scale the model was trained on
# (the CV score applies expm1 before computing MAE) -- confirm that whatever
# consumes this file expects that scale.
predictions = predictions_lgb  # alias: the clipping below also modifies predictions_lgb
np.putmask(predictions, predictions < 0, 0)  # replace negative predictions with 0, in place
sub = pd.DataFrame({'SaleID': Test_data.SaleID, 'price': predictions})
sub.to_csv('submit/lgb_test.csv', index=False)

In [11]:
# Out-of-fold (training-set) output
np.putmask(oof_lgb, oof_lgb < 0, 0)  # replace negative predictions with 0, in place
sub = pd.DataFrame({'SaleID': Train_data.SaleID, 'price': oof_lgb})
sub.to_csv('submit/lgb_train.csv', index=False)

In [12]:
# 10-fold cross-validated CatBoost training.
#   oof_cb               - out-of-fold predictions for every training row
#   predictions_cb       - test-set predictions averaged over the folds
#   predictions_train_cb - full-training-set predictions averaged over the folds
kfolder = KFold(n_splits=10, shuffle=True, random_state=2018)
oof_cb = np.zeros(len(X_data))
predictions_cb = np.zeros(len(X_test))
predictions_train_cb = np.zeros(len(X_data))

# The hyper-parameters are the same for every fold, so build them once.
cb_params = {
    'n_estimators': 100000000,  # effectively unbounded; early stopping ends training
    'loss_function': 'MAE',
    'eval_metric': 'MAE',
    'learning_rate': 0.1,
    'depth': 6,
    'use_best_model': True,
    'subsample': 0.6,
    'bootstrap_type': 'Bernoulli',
    'reg_lambda': 3,
    'one_hot_max_size': 2,
}

for fold_, (train_index, vali_index) in enumerate(kfolder.split(X_data, Y_data), start=1):
    print("fold n°{}".format(fold_))
    k_x_train, k_y_train = X_data[train_index], Y_data[train_index]
    k_x_vali, k_y_vali = X_data[vali_index], Y_data[vali_index]

    # Train the model, stopping after 600 iterations without improvement
    # on the validation fold.
    model_cb = CatBoostRegressor(**cb_params)
    model_cb.fit(k_x_train, k_y_train, eval_set=[(k_x_vali, k_y_vali)], verbose=300, early_stopping_rounds=600)

    # best_iteration_ is a zero-based tree index, whereas ntree_end is the
    # index of the first tree NOT used. Without the +1 the best tree itself
    # would be dropped from every prediction.
    best_ntrees = model_cb.best_iteration_ + 1
    oof_cb[vali_index] = model_cb.predict(k_x_vali, ntree_end=best_ntrees)
    predictions_cb += model_cb.predict(X_test, ntree_end=best_ntrees) / kfolder.n_splits
    predictions_train_cb += model_cb.predict(X_data, ntree_end=best_ntrees) / kfolder.n_splits

# Out-of-fold MAE after mapping predictions and targets back with expm1.
print("catboost score: {:<8.8f}".format(mean_absolute_error(np.expm1(oof_cb), np.expm1(Y_data))))

fold n°1
0:	learn: 0.9069449	test: 0.9138640	best: 0.9138640 (0)	total: 216ms	remaining: 250d 8h 5m 4s
300:	learn: 0.1301680	test: 0.1333727	best: 0.1333727 (300)	total: 8.75s	remaining: 33d 15h 34m 48s
600:	learn: 0.1204003	test: 0.1248898	best: 0.1248898 (600)	total: 17.9s	remaining: 34d 12h 38m 47s
900:	learn: 0.1154646	test: 0.1209636	best: 0.1209636 (900)	total: 29.8s	remaining: 38d 6h 23m 49s
1200:	learn: 0.1123289	test: 0.1186767	best: 0.1186767 (1200)	total: 41.4s	remaining: 39d 22h 47s
1500:	learn: 0.1099717	test: 0.1171527	best: 0.1171527 (1500)	total: 54.5s	remaining: 42d 1h 4m 15s
1800:	learn: 0.1080825	test: 0.1159750	best: 0.1159750 (1800)	total: 1m 5s	remaining: 42d 8h 7m 34s
2100:	learn: 0.1066571	test: 0.1151626	best: 0.1151626 (2100)	total: 1m 16s	remaining: 42d 9h 17m 51s
2400:	learn: 0.1052346	test: 0.1144977	best: 0.1144962 (2398)	total: 1m 28s	remaining: 42d 13h 20m 17s
2700:	learn: 0.1039842	test: 0.1138567	best: 0.1138562 (2699)	total: 1m 39s	remaining: 42d 18h 

12000:	learn: 0.0887819	test: 0.1084658	best: 0.1084465 (11776)	total: 7m 15s	remaining: 41d 23h 2m 20s
12300:	learn: 0.0885495	test: 0.1084305	best: 0.1084304 (12298)	total: 7m 25s	remaining: 41d 22h 49m 30s
12600:	learn: 0.0883196	test: 0.1084222	best: 0.1084109 (12580)	total: 7m 36s	remaining: 41d 22h 32m 35s
12900:	learn: 0.0880794	test: 0.1083715	best: 0.1083668 (12899)	total: 7m 47s	remaining: 41d 22h 10m 3s
13200:	learn: 0.0878161	test: 0.1083576	best: 0.1083576 (13200)	total: 7m 58s	remaining: 41d 21h 49m
13500:	learn: 0.0875820	test: 0.1083164	best: 0.1083125 (13461)	total: 8m 8s	remaining: 41d 21h 26m 12s
13800:	learn: 0.0873436	test: 0.1083594	best: 0.1083117 (13527)	total: 8m 19s	remaining: 41d 21h 11m 44s
14100:	learn: 0.0871182	test: 0.1083828	best: 0.1083117 (13527)	total: 8m 30s	remaining: 41d 21h 15m 37s
Stopped by overfitting detector  (600 iterations wait)

bestTest = 0.1083117331
bestIteration = 13527

Shrink model to first 13528 iterations.
fold n°3
0:	learn: 0.908

5400:	learn: 0.0973311	test: 0.1091964	best: 0.1091964 (5400)	total: 3m 20s	remaining: 43d 1h 4m 22s
5700:	learn: 0.0967648	test: 0.1090679	best: 0.1090640 (5696)	total: 3m 31s	remaining: 43d 40m 37s
6000:	learn: 0.0962278	test: 0.1090136	best: 0.1089803 (5902)	total: 3m 43s	remaining: 43d 1h 30m 49s
6300:	learn: 0.0956480	test: 0.1088438	best: 0.1088411 (6294)	total: 3m 54s	remaining: 43d 1h 35m 28s
6600:	learn: 0.0951755	test: 0.1087860	best: 0.1087791 (6578)	total: 4m 5s	remaining: 43d 1h 24m 47s
6900:	learn: 0.0946898	test: 0.1087868	best: 0.1087712 (6697)	total: 4m 16s	remaining: 43d 1h 13m 10s
7200:	learn: 0.0942199	test: 0.1086840	best: 0.1086807 (7195)	total: 4m 27s	remaining: 43d 1h 8m 22s
7500:	learn: 0.0938030	test: 0.1086357	best: 0.1086357 (7500)	total: 4m 39s	remaining: 43d 1h 58m 38s
7800:	learn: 0.0934271	test: 0.1085724	best: 0.1085724 (7800)	total: 4m 50s	remaining: 43d 1h 8m 11s
8100:	learn: 0.0930479	test: 0.1085632	best: 0.1085341 (7902)	total: 5m	remaining: 42d 23

9600:	learn: 0.0905356	test: 0.1105449	best: 0.1105449 (9600)	total: 5m 58s	remaining: 43d 5h 46m 58s
9900:	learn: 0.0902559	test: 0.1105459	best: 0.1105360 (9863)	total: 6m 9s	remaining: 43d 4h 41m 48s
10200:	learn: 0.0899465	test: 0.1104950	best: 0.1104923 (10188)	total: 6m 20s	remaining: 43d 3h 44m 38s
10500:	learn: 0.0896362	test: 0.1104405	best: 0.1104374 (10464)	total: 6m 31s	remaining: 43d 3h 31m 41s
10800:	learn: 0.0893353	test: 0.1103491	best: 0.1103491 (10796)	total: 6m 42s	remaining: 43d 3h 38m 29s
11100:	learn: 0.0890418	test: 0.1103015	best: 0.1102972 (11052)	total: 6m 53s	remaining: 43d 3h 31m 31s
11400:	learn: 0.0887856	test: 0.1102753	best: 0.1102723 (11399)	total: 7m 4s	remaining: 43d 3h 5m 40s
11700:	learn: 0.0885101	test: 0.1102601	best: 0.1102356 (11595)	total: 7m 16s	remaining: 43d 2h 56m 11s
12000:	learn: 0.0882324	test: 0.1102393	best: 0.1102325 (11831)	total: 7m 27s	remaining: 43d 2h 49m 38s
12300:	learn: 0.0879682	test: 0.1101916	best: 0.1101916 (12300)	total: 

3300:	learn: 0.1020718	test: 0.1120031	best: 0.1120031 (3300)	total: 2m 4s	remaining: 43d 14h 25m 8s
3600:	learn: 0.1012334	test: 0.1116486	best: 0.1116486 (3600)	total: 2m 15s	remaining: 43d 14h 37m
3900:	learn: 0.1003260	test: 0.1112642	best: 0.1112642 (3900)	total: 2m 27s	remaining: 43d 16h 8m 53s
4200:	learn: 0.0995667	test: 0.1109751	best: 0.1109751 (4200)	total: 2m 38s	remaining: 43d 14h 42m 39s
4500:	learn: 0.0988542	test: 0.1107023	best: 0.1107020 (4499)	total: 2m 49s	remaining: 43d 12h 50m 39s
4800:	learn: 0.0981698	test: 0.1105061	best: 0.1105061 (4800)	total: 3m	remaining: 43d 11h 41m 45s
5100:	learn: 0.0975283	test: 0.1103613	best: 0.1103556 (5096)	total: 3m 11s	remaining: 43d 12h 2m 36s
5400:	learn: 0.0968584	test: 0.1102282	best: 0.1102282 (5400)	total: 3m 22s	remaining: 43d 11h 24m 26s
5700:	learn: 0.0963462	test: 0.1101520	best: 0.1101338 (5681)	total: 3m 34s	remaining: 43d 10h 55m 25s
6000:	learn: 0.0957342	test: 0.1100792	best: 0.1100743 (5984)	total: 3m 45s	remaining

9600:	learn: 0.0908091	test: 0.1107380	best: 0.1107380 (9600)	total: 6m 1s	remaining: 43d 13h 18m 10s
9900:	learn: 0.0904811	test: 0.1106926	best: 0.1106916 (9898)	total: 6m 12s	remaining: 43d 12h 46m 9s
10200:	learn: 0.0901846	test: 0.1106291	best: 0.1106290 (10196)	total: 6m 23s	remaining: 43d 12h 37m 17s
10500:	learn: 0.0899051	test: 0.1105749	best: 0.1105749 (10500)	total: 6m 34s	remaining: 43d 12h 16m 14s
10800:	learn: 0.0896280	test: 0.1105568	best: 0.1105471 (10739)	total: 6m 45s	remaining: 43d 11h 23m 17s
11100:	learn: 0.0893162	test: 0.1105221	best: 0.1105220 (11099)	total: 6m 56s	remaining: 43d 10h 20m 3s
11400:	learn: 0.0890323	test: 0.1105013	best: 0.1104926 (11357)	total: 7m 7s	remaining: 43d 9h 42m 48s
11700:	learn: 0.0887208	test: 0.1104931	best: 0.1104776 (11669)	total: 7m 18s	remaining: 43d 9h 31m 45s
12000:	learn: 0.0884694	test: 0.1104668	best: 0.1104602 (11968)	total: 7m 29s	remaining: 43d 9h 4m 24s
12300:	learn: 0.0882144	test: 0.1104410	best: 0.1104398 (12295)	tot

6600:	learn: 0.0946078	test: 0.1081990	best: 0.1081980 (6598)	total: 4m 6s	remaining: 43d 3h 25m 53s
6900:	learn: 0.0941349	test: 0.1081847	best: 0.1081800 (6849)	total: 4m 17s	remaining: 43d 3h 37m 29s
7200:	learn: 0.0936445	test: 0.1080615	best: 0.1080615 (7200)	total: 4m 28s	remaining: 43d 3h 29m 46s
7500:	learn: 0.0931464	test: 0.1080313	best: 0.1080262 (7494)	total: 4m 39s	remaining: 43d 3h 31s
7800:	learn: 0.0927146	test: 0.1079750	best: 0.1079718 (7772)	total: 4m 50s	remaining: 43d 2h 9m 3s
8100:	learn: 0.0923321	test: 0.1078980	best: 0.1078980 (8100)	total: 5m 1s	remaining: 43d 2h 37m 46s
8400:	learn: 0.0919826	test: 0.1078785	best: 0.1078616 (8291)	total: 5m 12s	remaining: 43d 2h 24m 36s
8700:	learn: 0.0915531	test: 0.1078353	best: 0.1078341 (8659)	total: 5m 23s	remaining: 43d 2h 9m 14s
9000:	learn: 0.0911920	test: 0.1077642	best: 0.1077599 (8966)	total: 5m 34s	remaining: 43d 59m 35s
9300:	learn: 0.0908523	test: 0.1077212	best: 0.1077140 (9191)	total: 5m 45s	remaining: 42d 23h

In [13]:
# Test-set output
# NOTE(review): the values are written on the scale the model was trained on
# (the CV score applies expm1 before computing MAE) -- confirm that whatever
# consumes this file expects that scale.
predictions = predictions_cb  # alias: the clipping below also modifies predictions_cb
np.putmask(predictions, predictions < 0, 0)  # replace negative predictions with 0, in place
sub = pd.DataFrame({'SaleID': Test_data.SaleID, 'price': predictions})
sub.to_csv('submit/cab_test.csv', index=False)

In [14]:
# Validation (out-of-fold) output
np.putmask(oof_cb, oof_cb < 0, 0)  # replace negative predictions with 0, in place
sub = pd.DataFrame({'SaleID': Train_data.SaleID, 'price': oof_cb})
sub.to_csv('submit/cab_train.csv', index=False)