# 导入模块

In [16]:
import joblib
import warnings
import numpy as np
import pandas as pd
from typing import List
import statsmodels.api as sm
import matplotlib.pyplot as plt
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_curve, auc

warnings.filterwarnings("ignore")
pd.set_option("display.width", 10000)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
plt.rcParams['font.sans-serif'] = ['FangSong']
plt.rcParams['axes.unicode_minus'] = False
plt.style.use("Solarize_Light2")

# 加载数据

In [2]:
train = pd.read_csv("../dataset/train.csv")
train.head()

Unnamed: 0,id,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_index,cons_conf_index,lending_rate3m,nr_employed,subscribe
0,1,51,admin.,divorced,professional.course,no,yes,yes,cellular,aug,mon,4621,1,112,2,failure,1.4,90.81,-35.53,0.69,5219.74,no
1,2,50,services,married,high.school,unknown,yes,no,cellular,may,mon,4715,1,412,2,nonexistent,-1.8,96.33,-40.58,4.05,4974.79,yes
2,3,48,blue-collar,divorced,basic.9y,no,no,no,cellular,apr,wed,171,0,1027,1,failure,-1.8,96.33,-44.74,1.5,5022.61,no
3,4,26,entrepreneur,single,high.school,yes,yes,yes,cellular,aug,fri,359,26,998,0,nonexistent,1.4,97.08,-35.55,5.11,5222.87,yes
4,5,45,admin.,single,university.degree,no,no,no,cellular,nov,tue,3178,1,240,4,success,-3.4,89.82,-33.83,1.17,4884.7,no


# 特征工程

In [26]:
# 列出需要标准化的数值型特征和需要独热编码的类别型特征
numeric_features = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate', 
                    'cons_price_index', 'cons_conf_index', 'lending_rate3m', 'nr_employed']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
# 定义ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('poly', PolynomialFeatures(degree=3, include_bias=False), numeric_features),  # 标准化数值型特征
        ('cat', OneHotEncoder(), categorical_features)  # 独热编码类别型特征
    ])

# 定义包含数据处理和特征工程的Pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)  # 数据处理和特征工程
])

# 在训练集上拟合数据处理和特征工程的Pipeline
X_transformed = preprocessing_pipeline.fit_transform(train)

y = train["subscribe"].apply(lambda x: 1 if x == "yes" else 0)

# 贝叶斯搜索

In [29]:
%%time
# 定义参数搜索范围
param_space = {
    'num_leaves': (5, 50),
    'max_depth': (3, 15),
    'learning_rate': (0.01, 0.2, 'log-uniform'),
    'n_estimators': (50, 200),
    'subsample_for_bin': (20000, 300000),
    'class_weight_ratio': (1, 10),
    'min_split_gain': (0.0, 1.0),
    'min_child_weight': (0.5, 10),
    'min_child_samples': (5, 20),
    'subsample': (0.5, 1.0),
    'subsample_freq': (1, 10),
    'colsample_bytree': (0.5, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
    'importance_type': ['split', 'gain']
}

lgbm_model = LGBMClassifier(boosting_type="gbdt", objective="binary", random_state=12)

def objective_function(params):
    # 将权重比例转换为字典形式
    class_weight = {0: 1, 1: params['class_weight_ratio']}
    params.pop('class_weight_ratio')  # 从参数中移除权重比例
    lgbm_model.set_params(class_weight=class_weight, **params)
    return -np.mean(cross_val_score(lgbm_model, X_transformed, y, cv=5, scoring='roc_auc'))

# 使用BayesSearchCV进行贝叶斯调参
bayes_search = BayesSearchCV(
    lgbm_model,
    param_space,
    n_iter=50,
    cv=5,
    scoring='roc_auc',
    random_state=42,
    n_jobs=-1
)

# 执行贝叶斯搜索
bayes_search.fit(X_transformed, y)

# 输出最佳参数
print("Best parameters found: ", bayes_search.best_params_)

# 输出最佳交叉验证分数
print("Best cross-validation score: {:.4f}".format(-bayes_search.best_score_))

Best parameters found:  OrderedDict([('class_weight_ratio', 6), ('colsample_bytree', 0.6308093864100461), ('importance_type', 'split'), ('learning_rate', 0.022858304724422775), ('max_depth', 14), ('min_child_samples', 20), ('min_child_weight', 1.2657660576147527), ('min_split_gain', 1.0), ('n_estimators', 132), ('num_leaves', 35), ('reg_alpha', 0.34232009882038966), ('reg_lambda', 0.43458471477802046), ('subsample', 0.8907117965787229), ('subsample_for_bin', 168097), ('subsample_freq', 6)])
Best cross-validation score: -0.8909
CPU times: total: 4min 8s
Wall time: 6min 2s


# 训练模型

In [31]:
# 划分训练集与验证集
x_train, x_valid, y_train, y_valid = train_test_split(X_transformed, y, test_size=0.3, random_state=12)

# 得到贝叶斯搜索得到的最佳参数
best_params = bayes_search.best_params_

# 将最佳参数设置到LightGBM模型中
lgbm_model.set_params(**best_params)

# 训练模型
lgbm_model.fit(x_train, y_train, 
               eval_set=[(x_train, y_train), (x_valid, y_valid)], 
               eval_metric=['binary_logloss', 'auc'], 
               early_stopping_rounds=10)

[1]	training's binary_logloss: 0.38306	training's auc: 0.870385	valid_1's binary_logloss: 0.379432	valid_1's auc: 0.857163
[2]	training's binary_logloss: 0.376582	training's auc: 0.879533	valid_1's binary_logloss: 0.373654	valid_1's auc: 0.863683
[3]	training's binary_logloss: 0.370562	training's auc: 0.890197	valid_1's binary_logloss: 0.36828	valid_1's auc: 0.871294
[4]	training's binary_logloss: 0.364961	training's auc: 0.893263	valid_1's binary_logloss: 0.363251	valid_1's auc: 0.872811
[5]	training's binary_logloss: 0.359834	training's auc: 0.893181	valid_1's binary_logloss: 0.358787	valid_1's auc: 0.87377
[6]	training's binary_logloss: 0.354913	training's auc: 0.895521	valid_1's binary_logloss: 0.354325	valid_1's auc: 0.874383
[7]	training's binary_logloss: 0.350353	training's auc: 0.897814	valid_1's binary_logloss: 0.350287	valid_1's auc: 0.875273
[8]	training's binary_logloss: 0.346114	training's auc: 0.898519	valid_1's binary_logloss: 0.346571	valid_1's auc: 0.875492
[9]	trainin

[74]	training's binary_logloss: 0.239733	training's auc: 0.92816	valid_1's binary_logloss: 0.267279	valid_1's auc: 0.884824
[75]	training's binary_logloss: 0.239053	training's auc: 0.928607	valid_1's binary_logloss: 0.267017	valid_1's auc: 0.884931
[76]	training's binary_logloss: 0.238392	training's auc: 0.928939	valid_1's binary_logloss: 0.26665	valid_1's auc: 0.885129
[77]	training's binary_logloss: 0.237756	training's auc: 0.929233	valid_1's binary_logloss: 0.266293	valid_1's auc: 0.885225
[78]	training's binary_logloss: 0.23712	training's auc: 0.92959	valid_1's binary_logloss: 0.265988	valid_1's auc: 0.885394
[79]	training's binary_logloss: 0.236498	training's auc: 0.929838	valid_1's binary_logloss: 0.265771	valid_1's auc: 0.885361
[80]	training's binary_logloss: 0.235874	training's auc: 0.930087	valid_1's binary_logloss: 0.265477	valid_1's auc: 0.885415
[81]	training's binary_logloss: 0.235298	training's auc: 0.930278	valid_1's binary_logloss: 0.26512	valid_1's auc: 0.885565
[82]	

# 使用模型

模型得分：686/0.96

In [32]:
test = pd.read_csv("../dataset/test.csv")
use_test = test.iloc[:, 1:]

test_transformed = preprocessing_pipeline.transform(use_test)
test["subscribe"] = lgbm_model.predict(test_transformed)
test["subscribe"] = test["subscribe"].apply(lambda x: "yes" if x == 1 else "no")

test[["id", "subscribe"]].to_csv("../results/submission.csv", index=False)