# 导入模块

In [10]:
import time
import json
import joblib
import logging
import warnings
import pandas as pd
from typing import List
from skopt import BayesSearchCV
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PolynomialFeatures
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from lightgbm import LGBMClassifier

warnings.filterwarnings("ignore")
pd.set_option("display.width", 10000)
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)

# 加载数据

In [2]:
train = pd.read_csv("../dataset/train.csv")
train.head()

Unnamed: 0,id,age,job,marital,education,default,housing,loan,contact,month,day_of_week,duration,campaign,pdays,previous,poutcome,emp_var_rate,cons_price_index,cons_conf_index,lending_rate3m,nr_employed,subscribe
0,1,51,admin.,divorced,professional.course,no,yes,yes,cellular,aug,mon,4621,1,112,2,failure,1.4,90.81,-35.53,0.69,5219.74,no
1,2,50,services,married,high.school,unknown,yes,no,cellular,may,mon,4715,1,412,2,nonexistent,-1.8,96.33,-40.58,4.05,4974.79,yes
2,3,48,blue-collar,divorced,basic.9y,no,no,no,cellular,apr,wed,171,0,1027,1,failure,-1.8,96.33,-44.74,1.5,5022.61,no
3,4,26,entrepreneur,single,high.school,yes,yes,yes,cellular,aug,fri,359,26,998,0,nonexistent,1.4,97.08,-35.55,5.11,5222.87,yes
4,5,45,admin.,single,university.degree,no,no,no,cellular,nov,tue,3178,1,240,4,success,-3.4,89.82,-33.83,1.17,4884.7,no


# 特征工程

In [3]:
# 列出需要标准化的数值型特征和需要独热编码的类别型特征
numeric_features = ['age', 'duration', 'campaign', 'pdays', 'previous', 'emp_var_rate', 
                    'cons_price_index', 'cons_conf_index', 'lending_rate3m', 'nr_employed']
categorical_features = ['job', 'marital', 'education', 'default', 'housing', 'loan', 'contact', 'month', 'day_of_week', 'poutcome']
# 定义ColumnTransformer
preprocessor = ColumnTransformer(
    transformers=[
        ('poly', PolynomialFeatures(degree=4, include_bias=False), numeric_features),  # 标准化数值型特征
        ('cat', OneHotEncoder(), categorical_features)  # 独热编码类别型特征
    ])

# 定义包含数据处理和特征工程的Pipeline
preprocessing_pipeline = Pipeline([
    ('preprocessor', preprocessor)  # 数据处理和特征工程
])

# 在训练集上拟合数据处理和特征工程的Pipeline
X_transformed = preprocessing_pipeline.fit_transform(train)

y = train["subscribe"].apply(lambda x: 1 if x == "yes" else 0)

In [4]:
joblib.dump(preprocessing_pipeline, "../preprocessing_pipeline.pkl")

['../preprocessing_pipeline.pkl']

# 贝叶斯搜索

In [5]:
%%time
# 定义参数搜索范围
param_space = {
    'num_leaves': (5, 200),
    'max_depth': (3, 15),
    'learning_rate': (0.01, 0.1, 'log-uniform'),
    'n_estimators': (50, 500),
    'subsample_for_bin': (20000, 300000),
    'class_weight_ratio': (1, 10),
    'min_split_gain': (0.0, 1.0),
    'min_child_weight': (0.5, 10),
    'min_child_samples': (5, 20),
    'subsample': (0.5, 1.0),
    'subsample_freq': (1, 10),
    'colsample_bytree': (0.5, 1.0),
    'reg_alpha': (0.0, 1.0),
    'reg_lambda': (0.0, 1.0),
    'importance_type': ['split', 'gain']
}

lgbm_model = LGBMClassifier(boosting_type="gbdt", objective="binary", random_state=12)

def objective_function(params):
    # 将权重比例转换为字典形式
    class_weight = {0: 1, 1: params['class_weight_ratio']}
    params.pop('class_weight_ratio')
    lgbm_model.set_params(class_weight=class_weight, **params)
    return -np.mean(cross_val_score(lgbm_model, X_transformed, y, cv=5, scoring='roc_auc'))

# 使用BayesSearchCV进行贝叶斯调参
bayes_search = BayesSearchCV(
    lgbm_model,
    param_space,
    n_iter=100,
    cv=5,
    scoring='roc_auc',
    random_state=12,
    n_jobs=-1
)

# 执行贝叶斯搜索
bayes_search.fit(X_transformed, y)

# 输出最佳参数
print("Best parameters found: ", bayes_search.best_params_)

# 输出最佳交叉验证分数
print("Best cross-validation score: {:.4f}".format(-bayes_search.best_score_))

Best parameters found:  OrderedDict([('class_weight_ratio', 7), ('colsample_bytree', 0.8023081537462333), ('importance_type', 'split'), ('learning_rate', 0.019998641631705356), ('max_depth', 7), ('min_child_samples', 20), ('min_child_weight', 7.787166513959646), ('min_split_gain', 0.5390892508171149), ('n_estimators', 333), ('num_leaves', 155), ('reg_alpha', 0.7036459700774684), ('reg_lambda', 0.9016214271191245), ('subsample', 1.0), ('subsample_for_bin', 41379), ('subsample_freq', 1)])
Best cross-validation score: -0.8903
CPU times: total: 22min 36s
Wall time: 1h 30min 7s


# 训练模型

In [6]:
# 划分训练集与验证集
x_train, x_valid, y_train, y_valid = train_test_split(X_transformed, y, test_size=0.3, random_state=12)

# 得到贝叶斯搜索得到的最佳参数
best_params = bayes_search.best_params_

# 将最佳参数设置到LightGBM模型中
lgbm_model.set_params(**best_params)

# 训练模型
lgbm_model.fit(x_train, y_train, 
               eval_set=[(x_train, y_train), (x_valid, y_valid)], 
               eval_metric=['binary_logloss', 'auc'], 
               early_stopping_rounds=10)
joblib.dump(lgbm_model, "best_model.pkl")

[1]	training's binary_logloss: 0.383879	training's auc: 0.88821	valid_1's binary_logloss: 0.380204	valid_1's auc: 0.854273
[2]	training's binary_logloss: 0.37815	training's auc: 0.898361	valid_1's binary_logloss: 0.375209	valid_1's auc: 0.858519
[3]	training's binary_logloss: 0.372961	training's auc: 0.901436	valid_1's binary_logloss: 0.370586	valid_1's auc: 0.865249
[4]	training's binary_logloss: 0.368057	training's auc: 0.903737	valid_1's binary_logloss: 0.366312	valid_1's auc: 0.868396
[5]	training's binary_logloss: 0.363303	training's auc: 0.905719	valid_1's binary_logloss: 0.362198	valid_1's auc: 0.869127
[6]	training's binary_logloss: 0.358771	training's auc: 0.906803	valid_1's binary_logloss: 0.35833	valid_1's auc: 0.868884
[7]	training's binary_logloss: 0.354511	training's auc: 0.907477	valid_1's binary_logloss: 0.354619	valid_1's auc: 0.869628
[8]	training's binary_logloss: 0.350411	training's auc: 0.907676	valid_1's binary_logloss: 0.351164	valid_1's auc: 0.869214
[9]	trainin

[67]	training's binary_logloss: 0.243866	training's auc: 0.934423	valid_1's binary_logloss: 0.273478	valid_1's auc: 0.883968
[68]	training's binary_logloss: 0.243046	training's auc: 0.93471	valid_1's binary_logloss: 0.273021	valid_1's auc: 0.884087
[69]	training's binary_logloss: 0.242176	training's auc: 0.935017	valid_1's binary_logloss: 0.272635	valid_1's auc: 0.883963
[70]	training's binary_logloss: 0.241311	training's auc: 0.935331	valid_1's binary_logloss: 0.27221	valid_1's auc: 0.884078
[71]	training's binary_logloss: 0.240477	training's auc: 0.935559	valid_1's binary_logloss: 0.27189	valid_1's auc: 0.884075
[72]	training's binary_logloss: 0.239644	training's auc: 0.935812	valid_1's binary_logloss: 0.271505	valid_1's auc: 0.884026
[73]	training's binary_logloss: 0.238899	training's auc: 0.936028	valid_1's binary_logloss: 0.271102	valid_1's auc: 0.884066
[74]	training's binary_logloss: 0.23808	training's auc: 0.936286	valid_1's binary_logloss: 0.270745	valid_1's auc: 0.884141
[75]

[135]	training's binary_logloss: 0.205496	training's auc: 0.950358	valid_1's binary_logloss: 0.259561	valid_1's auc: 0.886553
[136]	training's binary_logloss: 0.205134	training's auc: 0.950535	valid_1's binary_logloss: 0.259504	valid_1's auc: 0.886576
[137]	training's binary_logloss: 0.204866	training's auc: 0.950654	valid_1's binary_logloss: 0.259396	valid_1's auc: 0.88666
[138]	training's binary_logloss: 0.204396	training's auc: 0.950936	valid_1's binary_logloss: 0.259317	valid_1's auc: 0.886693
[139]	training's binary_logloss: 0.203922	training's auc: 0.951224	valid_1's binary_logloss: 0.259251	valid_1's auc: 0.886688
[140]	training's binary_logloss: 0.203627	training's auc: 0.951332	valid_1's binary_logloss: 0.259215	valid_1's auc: 0.886681
[141]	training's binary_logloss: 0.203365	training's auc: 0.951405	valid_1's binary_logloss: 0.259191	valid_1's auc: 0.886687
[142]	training's binary_logloss: 0.20312	training's auc: 0.951497	valid_1's binary_logloss: 0.25914	valid_1's auc: 0.88

[210]	training's binary_logloss: 0.188424	training's auc: 0.959147	valid_1's binary_logloss: 0.257153	valid_1's auc: 0.887612
[211]	training's binary_logloss: 0.18829	training's auc: 0.959202	valid_1's binary_logloss: 0.257136	valid_1's auc: 0.887613
[212]	training's binary_logloss: 0.188115	training's auc: 0.959277	valid_1's binary_logloss: 0.257106	valid_1's auc: 0.887638
[213]	training's binary_logloss: 0.188065	training's auc: 0.959292	valid_1's binary_logloss: 0.257119	valid_1's auc: 0.887634
[214]	training's binary_logloss: 0.187946	training's auc: 0.959342	valid_1's binary_logloss: 0.257076	valid_1's auc: 0.887662
[215]	training's binary_logloss: 0.187804	training's auc: 0.9594	valid_1's binary_logloss: 0.257024	valid_1's auc: 0.887727
[216]	training's binary_logloss: 0.18752	training's auc: 0.959595	valid_1's binary_logloss: 0.25708	valid_1's auc: 0.887674
[217]	training's binary_logloss: 0.187453	training's auc: 0.959629	valid_1's binary_logloss: 0.257083	valid_1's auc: 0.8876

['best_model.pkl']

In [11]:
# 获取当前时间戳
timestamp = time.time()

# 将时间戳转换为 struct_time 对象
struct_time = time.localtime(timestamp)

best_score = lgbm_model.best_score_
score_dict = {
    time.strftime('%Y-%m-%d', struct_time): {
        "TRAIN_AUC": best_score["training"]["auc"],
        "TRAIN_LOSS": best_score["training"]["binary_logloss"],
        "VALID_AUC": best_score["valid_1"]["auc"],
        "VALID_LOSS": best_score["valid_1"]["binary_logloss"]
    }
}

with open("../logging/best_score_log.json", "w") as json_file:
    json.dump(score_dict, json_file)

In [14]:
best_iteration = lgbm_model.best_iteration_
# 将 struct_time 对象格式化为字符串
formatted_date = time.strftime('%Y-%m-%d %H:%M:%S', struct_time)
log = f"{formatted_date} best_iteration={best_iteration} best_score={best_score}"

# 配置日志
logging.basicConfig(filename='../logging/model_train_logging.log', level=logging.INFO)

logging.info(log)
log

"2023-11-19 22:52:32 best_iteration=215 best_score=defaultdict(<class 'collections.OrderedDict'>, {'training': OrderedDict([('binary_logloss', 0.1878041416622663), ('auc', 0.9594001455201256)]), 'valid_1': OrderedDict([('binary_logloss', 0.2570238353747708), ('auc', 0.8877268814589666)])})"

# 使用模型

模型得分：686/0.96

In [15]:
test = pd.read_csv("../dataset/test.csv")
use_test = test.iloc[:, 1:]

test_transformed = preprocessing_pipeline.transform(use_test)
test["subscribe"] = lgbm_model.predict(test_transformed)
test["subscribe"] = test["subscribe"].apply(lambda x: "yes" if x == 1 else "no")

test[["id", "subscribe"]].to_csv("../results/submission_v1.3.csv", index=False)