In [2]:
import warnings
warnings.filterwarnings('ignore')

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import missingno as msno
#显示所有列
pd.set_option('display.max_columns', None)
#显示所有行
pd.set_option('display.max_rows', None)
#设置value的显示长度为100，默认为50
pd.set_option('max_colwidth',100)

def reduce_mem_usage(df):

    start_mem = df.memory_usage().sum() 
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))
    
    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() 
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))
    return df

data = pd.read_csv('data1.csv', sep=',')
data = reduce_mem_usage(data)

endl_name = ['x77', 'x30', 'x33', 'x61', 'x26', 'x2', 'x274', 'x6', 'x28', 'x54', 'x162', 'x320', 'x315', 'x245', 'x273', 'x191', 'x169', 'x130', 'x182', 'x317', 'x123', 'x4', 'x174', 'x16']

X = data[endl_name]
Y = data['y']

Memory usage of dataframe is 553928.00 MB
Memory usage after optimization is: 147028.00 MB
Decreased by 73.5%


In [50]:
from xgboost.sklearn import XGBRegressor
from bayes_opt import BayesianOptimization
from sklearn.metrics import mean_absolute_error, mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score
from sklearn.metrics import make_scorer
# xx_train, xx_test, yy_train, yy_test = train_test_split(X, Y, test_size=0.3, random_state=0)

def xgb_cv(n_estimators, max_depth, subsample, colsample_bytree, learning_rate, min_child_weight):
    res = cross_val_score( 
        XGBRegressor(n_estimators=int(n_estimators), 
                     max_depth=int(max_depth),
                     learning_rate=learning_rate, 
#                      scale_pos_weight=1.0,
#                      base_score=0.5, 
                     random_state=10, 
                     subsample=float(subsample),
                     colsample_bytree = float(colsample_bytree),
#                      min_child_weight = int(min_child_weight)
                     
        ),
        X, Y, scoring=make_scorer(mean_squared_error)
    ).mean()
    return 1-res

gbdt_op = BayesianOptimization(
        xgb_cv,
       {
           
            'n_estimators': (100, 500),
            'max_depth': (3, 15),
           
            'subsample': (0.5, 1),
            'colsample_bytree': (0.5, 1),
            'learning_rate':(0.01, 0.2),
            'min_child_weight':(2,100)
#             'min_child_weight': (0, 20),
#             'max_delta_step': (0, 2),
        },
        random_state=66,
    )

gbdt_op.maximize()

|   iter    |  target   | colsam... | learni... | max_depth | min_ch... | n_esti... | subsample |
-------------------------------------------------------------------------------------------------
| [0m 1       [0m | [0m 0.9561  [0m | [0m 0.5771  [0m | [0m 0.0354  [0m | [0m 7.352   [0m | [0m 68.55   [0m | [0m 177.8   [0m | [0m 0.6256  [0m |
| [0m 2       [0m | [0m 0.9536  [0m | [0m 0.8792  [0m | [0m 0.1159  [0m | [0m 9.178   [0m | [0m 47.84   [0m | [0m 134.9   [0m | [0m 0.9145  [0m |
| [0m 3       [0m | [0m 0.954   [0m | [0m 0.6493  [0m | [0m 0.01596 [0m | [0m 11.14   [0m | [0m 90.54   [0m | [0m 305.8   [0m | [0m 0.7696  [0m |
| [95m 4       [0m | [95m 0.9564  [0m | [95m 0.8322  [0m | [95m 0.1305  [0m | [95m 7.241   [0m | [95m 4.611   [0m | [95m 166.1   [0m | [95m 0.9397  [0m |
| [0m 5       [0m | [0m 0.9508  [0m | [0m 0.5339  [0m | [0m 0.08013 [0m | [0m 4.386   [0m | [0m 11.44   [0m | [0m 133.5   [0m | [0m 0

In [55]:
gbdt_op.max

{'target': 0.9563555765897036,
 'params': {'colsample_bytree': 0.8321637581704533,
  'learning_rate': 0.13047078350081956,
  'max_depth': 7.241033617708029,
  'min_child_weight': 4.6109662584884,
  'n_estimators': 166.1159358189439,
  'subsample': 0.9396594956925226}}

In [91]:
# xgbmodel = XGBRegressor(n_estimators=424, 
#                      max_depth=14,
#                      learning_rate= 0.15354501758169445, 
#                      random_state=10, 
#                      subsample=0.8245215977200513,
#                      colsample_bytree = 0.8109203086197454）
                        
xgbmodel = XGBRegressor(n_estimators=100, max_depth=2,
                     learning_rate=0.13, 
                     random_state=66, subsample=0.7,
                     colsample_bytree = 0.6)
# xgbmodel.fit(x_train,y_train)
# xx_train, xx_test, yy_train, yy_test = train_test_split(X, Y, test_size=0.3, random_state=0)

xgbmodel.fit(X, Y)

XGBRegressor(base_score=0.5, booster=None, colsample_bylevel=1,
             colsample_bynode=1, colsample_bytree=0.6, gamma=0, gpu_id=-1,
             importance_type='gain', interaction_constraints=None,
             learning_rate=0.13, max_delta_step=0, max_depth=2,
             min_child_weight=1, missing=nan, monotone_constraints=None,
             n_estimators=100, n_jobs=0, num_parallel_tree=1,
             objective='reg:squarederror', random_state=66, reg_alpha=0,
             reg_lambda=1, scale_pos_weight=1, subsample=0.7, tree_method=None,
             validate_parameters=False, verbosity=None)

In [92]:
y_pred= xgbmodel.predict(X)

In [100]:
mean_squared_error(Y, y_pred), mean_absolute_error(Y, y_pred)

(0.014129029, 0.09144071)

In [94]:
xgbmodel2 = XGBRegressor(n_estimators=166, max_depth=7,
                     learning_rate=0.13, 
                     random_state=66, subsample=0.93,
                     colsample_bytree = 0.83)

In [101]:
error = y_pred - Y

In [104]:
(error**2).mean()**0.5

0.11886555937279743

In [110]:
y_pred.reshape(-1,1)

array([[1.3753949 ],
       [1.2590351 ],
       [1.303601  ],
       [1.2804633 ],
       [1.341224  ],
       [1.3209068 ],
       [1.3029152 ],
       [1.3273654 ],
       [1.3671188 ],
       [1.3938401 ],
       [1.2995529 ],
       [1.4129889 ],
       [1.3944073 ],
       [1.3380663 ],
       [1.5775281 ],
       [1.4713745 ],
       [1.309644  ],
       [1.4339995 ],
       [1.2985985 ],
       [1.2679257 ],
       [1.3407257 ],
       [1.1758838 ],
       [1.4165163 ],
       [1.4423895 ],
       [1.464776  ],
       [1.4648402 ],
       [1.4266336 ],
       [1.3427565 ],
       [1.3837293 ],
       [1.3390483 ],
       [1.3870273 ],
       [1.3572204 ],
       [1.0919614 ],
       [1.057198  ],
       [1.2064166 ],
       [1.3623703 ],
       [1.3425831 ],
       [1.2996393 ],
       [1.2571728 ],
       [1.2932085 ],
       [1.4180137 ],
       [1.2559315 ],
       [1.4409219 ],
       [1.2180843 ],
       [1.1851115 ],
       [1.213416  ],
       [1.2504911 ],
       [1.231

In [98]:
y_pred2= xgbmodel2.predict(X)

In [99]:
mean_squared_error(Y, y_pred2)

7.455699e-07

In [90]:
mean_squared_error(yy_train, y_train_pred2), mean_squared_error(yy_test, y_test_pred2)

(6.0845326e-07, 0.04700239)