In [21]:
import os
import pickle

import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_absolute_error, make_scorer
from xgboost.sklearn import XGBRegressor
from matplotlib import pyplot
import matplotlib.pyplot as plt
from bayes_opt import BayesianOptimization

# data_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'user_data')
# model_dir = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'models')
# output_dir = os.path.join(os.path.dirname(os.path.dirname(os.path.abspath(__file__))), 'prediction_result')

# data_path = os.path.join(data_dir, 'train_tree.csv')
# test_data_path = os.path.join(data_dir, 'test_tree.csv')
# model_path = os.path.join(model_dir, 'xgb.pkl')
# output_path = os.path.join(output_dir, 'xgb.csv')
data_path = './user_data/train_tree.csv'
test_data_path = './user_data/test_tree.csv'
model_path = './model/xgb.pkl'
output_path = './prediction_result/xgb.csv'

In [22]:
df_train = pd.read_csv(data_path)
df_test = pd.read_csv(test_data_path)

In [23]:
print(df_train.shape)
print(df_test.shape)

(143636, 595)
(50000, 594)


In [24]:
df_train.head()

Unnamed: 0,name,model,brand,power,kilometer,regionCode,price,v_0,v_1,v_2,...,v_12_multiply_v_13,v_12_div_v_13,v_12_add_v_14,v_12_minus_v_14,v_12_multiply_v_14,v_12_div_v_14,v_13_add_v_14,v_13_minus_v_14,v_13_multiply_v_14,v_13_div_v_14
0,7.504,7.35,7.547,60,12.5,8.45,7.523,43.34,3.967,0.05026,...,-1.925,-3.04394,-1.506,-3.336,-2.215,-2.646393,1.71,-0.11945,0.7275,0.869397
1,7.727,8.38,8.62,0,15.0,8.805,8.19,45.3,5.234,0.138,...,1.775,0.598188,-0.785,-1.276,-0.253,-4.197103,-1.478,-1.968,-0.4229,-7.016361
2,8.05,9.15,9.04,163,12.5,8.375,8.734,45.97,4.824,1.319,...,-1.304,-1.879853,1.335,1.795,-0.3599,-6.80688,-1.0625,-0.6025,0.1915,3.620964
3,8.19,8.96,8.52,193,15.0,8.08,7.785,45.7,4.492,-0.05063,...,1.224,0.205823,-0.9805,-0.02316,0.2402,1.048399,-2.918,-1.96,1.167,5.093704
4,8.05,6.93,7.688,68,5.0,9.125,8.555,44.38,2.031,0.5723,...,2.639,0.32849,2.855,-0.992,1.791,0.484075,4.758,0.911,5.453,1.473639


In [25]:
y = df_train.pop('price').values
X = df_train.values

In [19]:
def my_score(y_predict, y_true):
    '''
    自定义评估指标
    :return:
    '''
    print(y_true)
    print('===')
    print(y_predict)
    
    score = - mean_absolute_error(np.exp(y_true), np.nan_to_num(np.exp(y_predict)))
#     return 'exp_score', score
#     label = y_predict.get_label()
#     score = - mean_absolute_error(np.exp(label), np.exp(preds))
    return 'exp_score', score
# score = make_scorer(my_score,greater_is_better=True)

In [26]:
def log_transfer(func):  #定义一个将数据转为log的闭包函数
    def wrapper(y, yhat):
        print(y)
        print('='*100)
        print(yhat)
        result = func(np.exp(y), np.nan_to_num(np.exp(yhat)))
        return result
    return wrapper

In [27]:
def xgb_cv(n_estimators, learning_rate, gamma, min_child_weight, max_depth, colsample_bytree, subsample):
    param = {
        'objective': 'reg:squarederror',
        'matric': my_score,
        'random_state': 2020,
        "early_stopping_rounds": 50,
        #         "tree_method": "gpu_hist",
        #         "gpu_id": 0
    }
    param['n_estimators'] = int(n_estimators)
#     param['learning_rate'] = float(learning_rate)
#     param['gamma'] = float(gamma)
#     param['min_child_weight'] = float(min_child_weight)
#     param['max_depth'] = int(max_depth)
#     param['colsample_bytree'] = float(colsample_bytree)
    val = cross_val_score(XGBRegressor(**param),
                          X, y, scoring=make_scorer(log_transfer(mean_absolute_error)), cv=2).mean()
    return val


xgb_bo = BayesianOptimization(
    xgb_cv,
    {'n_estimators': (10, 100),
     'learning_rate': (0.03, 0.3),
     'gamma': (0, 0.5),
     'min_child_weight': (10, 200),
     'max_depth': (4, 10),
     'colsample_bytree': (0.1, 1),
     'subsample': (0.5, 1),
     }
)
xgb_bo.maximize()

|   iter    |  target   | colsam... |   gamma   | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------------------
[7.523 8.19  8.734 ... 7.41  6.746 8.1  ]
[7.6312566 7.5287952 8.688154  ... 7.368878  6.8022404 7.016978 ]
[9.734 8.945 9.58  ... 8.92  8.516 8.45 ]
[9.778195 8.885736 9.446625 ... 8.903965 8.554507 8.452547]
| [0m 1       [0m | [0m 590.6   [0m | [0m 0.3231  [0m | [0m 0.1612  [0m | [0m 0.1591  [0m | [0m 4.585   [0m | [0m 169.4   [0m | [0m 90.01   [0m | [0m 0.9212  [0m |


KeyboardInterrupt: 

In [None]:
xgb_bo.max['params']

In [None]:
X