In [None]:
import numpy as np
import pandas as pd
import optuna
from optuna.integration import LightGBMPruningCallback
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split, GridSearchCV, StratifiedKFold
from sklearn.metrics import accuracy_score, classification_report
import lightgbm as lgb
from lightgbm import log_evaluation, early_stopping
import warnings

# 忽略警告
warnings.filterwarnings("ignore")

# 1. 数据导入与预处理

In [None]:
# Load the raw datasets.
train_data = pd.read_csv('train.csv')
test_data = pd.read_csv('test.csv')
# Stack the two files row-wise. An outer pd.merge without `on=` is a join on every
# shared column: it re-sorts the rows and can collapse or multiply rows whose shared
# columns happen to match, which is not what "combine the two files" means.
full_data = pd.concat([train_data, test_data], ignore_index=True, sort=False)

train_data.info()
test_data.info()
full_data.info()

# ---- Preprocessing ----

# Label column; it must never be imputed or standardized.
target_column = 'sample_flag'

# Drop columns that are not used as features.
drop_columns = ['user_id', 'term_brand', 'term_price', 'change_equip_period_avg', 'join_date']
full_data = full_data.drop(columns=drop_columns)

# Integer-encode the string-valued columns (factorize encodes missing values as -1).
factorize_columns = ['zfk_type', 'jt_5gwl_flag', 'jt_5gzd_flag', 'avg3_llb_flag']
for col in factorize_columns:
    full_data[col] = full_data[col].factorize()[0]

# Mean-impute missing feature values. Assign the result back instead of calling
# fillna(inplace=True) on a column selection, which does not reliably update the
# parent frame under pandas copy-on-write.
feature_columns = full_data.columns.drop(target_column, errors='ignore')
missing_columns = full_data[feature_columns].isnull().any()
for col in missing_columns[missing_columns].index:
    full_data[col] = full_data[col].fillna(full_data[col].mean())

# area_code values are large; shift them so the smallest value is 0.
full_data['area_code'] -= full_data['area_code'].min()

# Adjust dtypes (age becomes float so that it is standardized below).
full_data['age'] = full_data['age'].astype('float64')
full_data['sl_flag'] = full_data['sl_flag'].astype('int64')
full_data['sl_type'] = full_data['sl_type'].astype('int64')
full_data['jt_5gzd_flag'] = full_data['jt_5gzd_flag'].astype('int64')

# Standardize (z-score) the float features. The column list is taken from full_data:
# taking it from train_data would name columns that were dropped above (KeyError) and
# would miss columns cast to float above. The label is excluded.
normal_columns = full_data.select_dtypes(include='float64').columns.drop(target_column, errors='ignore')
for col in normal_columns:
    col_std = full_data[col].std()
    if col_std == 0 or pd.isna(col_std):
        # A constant column cannot be scaled; centre it and avoid dividing by zero.
        full_data[col] = full_data[col] - full_data[col].mean()
    else:
        full_data[col] = (full_data[col] - full_data[col].mean()) / col_std

# Rows without a label cannot be used for supervised training; drop them rather than
# inventing a label by imputation.
# NOTE(review): this only has an effect if one of the input files lacks sample_flag values.
full_data = full_data.dropna(subset=[target_column])
full_data[target_column] = full_data[target_column].astype('int64')

# Save the processed table.
full_data.to_csv('processed_data.csv', index=False)

In [None]:
# Load the preprocessed table and separate features from the label.
processed_data = pd.read_csv('processed_data.csv')
X = processed_data.drop('sample_flag', axis=1)
# sample_flag starts at 1; shift it exactly once so class ids start at 0.
# (Shifting again after the split would produce the labels -1..1.)
y = processed_data['sample_flag'] - 1
# Hold out 20% of the rows for evaluation.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# 2. 模型调参

### 2.1 自动调参

In [None]:
# Use Optuna first to find a reasonably good parameter set that later manual tuning starts from.
def objective(trial, X, y):
    """Optuna objective: mean 5-fold cross-validated accuracy of an LGBMClassifier.

    Args:
        trial: the optuna trial used to sample hyper-parameters and report progress.
        X: feature table; must support positional row selection through ``.iloc``.
        y: class labels aligned row-by-row with ``X`` (Series or array-like).

    Returns:
        The mean accuracy over the folds (higher is better, so the study must maximize).

    Raises:
        optuna.TrialPruned: when the pruner decides, from the running mean accuracy
            reported after each fold, that the trial is not worth finishing.
    """
    # Search space.
    param_grid = {
        "n_estimators": trial.suggest_int("n_estimators", 100, 1000, step=100),
        "learning_rate": trial.suggest_float("learning_rate", 0.01, 0.3),
        "num_leaves": trial.suggest_int("num_leaves", 20, 250, step=10),
        "max_depth": trial.suggest_int("max_depth", -1, 8, step=1),
        "min_data_in_leaf": trial.suggest_int("min_data_in_leaf", 50, 300, step=50),
        # LightGBM requires max_bin > 1, so the range must not start at 1.
        "max_bin": trial.suggest_int("max_bin", 15, 255, step=10),
        "lambda_l1": trial.suggest_int("lambda_l1", 0, 50, step=5),
        "lambda_l2": trial.suggest_int("lambda_l2", 0, 50, step=5),
        "min_gain_to_split": trial.suggest_float("min_gain_to_split", 0, 15),
        # The upper bound is a multiple of the step counted from the lower bound.
        "bagging_fraction": trial.suggest_float("bagging_fraction", 0.2, 0.9, step=0.1),
        "bagging_freq": trial.suggest_categorical("bagging_freq", [1]),
        "feature_fraction": trial.suggest_float("feature_fraction", 0.2, 0.9, step=0.1),
        "random_state": 2021,
        'early_stopping_rounds': 200,
        'num_class': 3,
        'verbose': -1,
        'metric': 'multi_logloss'
    }
    # 5-fold stratified cross-validation.
    n_splits = 5
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=1121218)

    # Index labels by position so the folds line up with X.iloc whatever index y carries.
    y_values = np.asarray(y)

    cv_scores = np.empty(n_splits)
    for idx, (train_idx, valid_idx) in enumerate(cv.split(X, y_values)):
        X_fold_train, X_fold_valid = X.iloc[train_idx], X.iloc[valid_idx]
        y_fold_train, y_fold_valid = y_values[train_idx], y_values[valid_idx]

        # Fit LightGBM on this fold; the validation part drives early stopping.
        model = lgb.LGBMClassifier(objective="multiclass", **param_grid)
        model.fit(
            X_fold_train,
            y_fold_train,
            eval_set=[(X_fold_valid, y_fold_valid)],
            eval_metric='multi_logloss',
            # Silence per-iteration logging through a callback; the fit(verbose=...)
            # argument no longer exists in LightGBM 4.
            callbacks=[log_evaluation(period=0)],
        )
        # Score the fold with the metric the study optimizes.
        preds = model.predict(X_fold_valid)
        cv_scores[idx] = accuracy_score(y_fold_valid, preds)

        # Report the running mean accuracy so the pruner compares trials on the same
        # quantity (and direction) the study optimizes.
        trial.report(float(np.mean(cv_scores[:idx + 1])), step=idx)
        if trial.should_prune():
            raise optuna.TrialPruned()

    return float(np.mean(cv_scores))


# The objective returns accuracy, so the study has to maximize it; minimizing would
# select the worst parameter set. The sampler is seeded for reproducible runs.
study = optuna.create_study(
    direction="maximize",
    study_name="LGBM Classifier",
    sampler=optuna.samplers.TPESampler(seed=2021),
)
func = lambda trial: objective(trial, X, y)
study.optimize(func, n_trials=20)

In [None]:
# Report the best objective value and the parameter set that produced it.
best_trial_params = study.best_params
print(f"\tBest Accuracy: {study.best_value:.5f}")
print(f"\tBest params:")
for param_name in best_trial_params:
    print(f"\t\t{param_name}: {best_trial_params[param_name]}")


# Best value (accuracy): 0.88743
# Best params:
#     n_estimators: 300
#     learning_rate: 0.23028022823241723
#     num_leaves: 240
#     max_depth: 4
#     min_data_in_leaf: 150
#     max_bin: 50
#     lambda_l1: 35
#     lambda_l2: 0
#     min_gain_to_split: 14.897198600908983
#     bagging_fraction: 0.5
#     bagging_freq: 1
#     feature_fraction: 0.6000000000000001

In [None]:
# Train a model with the parameters found by the automatic search.
params = study.best_params
model_autobest = lgb.LGBMClassifier(objective="multiclass", **params)
# lgb.plot_metric draws the evaluation history recorded during fit, so an eval_set is
# required; without it there is nothing to plot and plot_metric fails.
model_autobest.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='multi_logloss',
    callbacks=[log_evaluation(period=0)],  # record the metric without printing every round
)
y_pred = model_autobest.predict(X_test)

# Visualize the training process (hold-out multi_logloss per boosting round).
lgb.plot_metric(model_autobest)
plt.show()

# Evaluate on the hold-out split.
print('accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

### 2.2 进一步调参

#### 2.2.1 寻找最优的迭代次数

In [None]:
# Start manual tuning from the automatically tuned parameters, with the learning
# rate fixed at 0.1 for the tuning steps that follow.
params_test = {**study.best_params, 'learning_rate': 0.1}

In [None]:
# Find the best number of boosting rounds with cross-validation and early stopping.
data_train = lgb.Dataset(X_train, y_train)
call_backs = [log_evaluation(period=50), early_stopping(stopping_rounds=30)]

# lgb.cv uses the native API, so the objective and class count must be given explicitly
# (the sklearn wrapper infers them, the native API does not). n_estimators is removed
# because, as an alias of num_iterations, it would override num_boost_round.
cv_params = {key: value for key, value in params_test.items() if key != 'n_estimators'}
cv_params.update({
    'objective': 'multiclass',
    'num_class': int(y_train.nunique()),
    'verbose': -1,
})
cv_results = lgb.cv(
    cv_params,
    data_train,
    num_boost_round=1000,
    nfold=5,
    stratified=False,
    shuffle=True,
    metrics='multi_logloss',
    seed=0,
    callbacks=call_backs,  # without this the early-stopping callback never runs
)

# The result key is 'valid multi_logloss-mean' in LightGBM 4 and 'multi_logloss-mean'
# in LightGBM 3; accept either.
metric_key = next(key for key in cv_results if key.endswith('multi_logloss-mean'))
cv_logloss = cv_results[metric_key]

# With early stopping the history is cut at the best round, so its length is that round.
best_n_estimators = len(cv_logloss)
params_test['n_estimators'] = best_n_estimators

print('best n_estimators:', best_n_estimators)
# Log-loss is lower-is-better, so the best score is the minimum.
print('best cv score:', pd.Series(cv_logloss).min())

#### 2.2.2 搜索最优max_depth和num_leaves

In [None]:
# Candidate values for the tree-shape parameters explored by the first grid search.
params_test_1 = dict(
    max_depth=range(1, 8),
    num_leaves=range(200, 250, 5),
)

In [None]:
# Grid-search max_depth and num_leaves with 5-fold CV, scored by accuracy.
gbm = lgb.LGBMClassifier(**params_test)
gsearch = GridSearchCV(gbm, param_grid=params_test_1, scoring='accuracy', cv=5, n_jobs=4, error_score='raise')
gsearch.fit(X_train, y_train)

# Carry BOTH tuned values into the working parameter set; the best num_leaves was
# previously computed but never stored, so later steps kept using the old value.
best_max_depth = gsearch.best_params_['max_depth']
best_num_leaves = gsearch.best_params_['num_leaves']
params_test['max_depth'] = best_max_depth
params_test['num_leaves'] = best_num_leaves

print('参数的最佳取值:{0}'.format(gsearch.best_params_))
print('最佳模型得分:{0}'.format(gsearch.best_score_))
print(gsearch.cv_results_['mean_test_score'])
print(gsearch.cv_results_['params'])

#### 2.2.3 搜索最优min_data_in_leaf和max_bin

In [None]:
# Candidate values for min_data_in_leaf and max_bin explored by the second grid search.
params_test_2 = dict(
    min_data_in_leaf=range(100, 200, 10),
    max_bin=range(20, 80, 5),
)

In [None]:
# Grid-search min_data_in_leaf and max_bin with 5-fold CV, scored by accuracy.
gbm = lgb.LGBMClassifier(**params_test)
gsearch = GridSearchCV(
    gbm,
    param_grid=params_test_2,
    scoring='accuracy',
    cv=5,
    n_jobs=4,
    error_score='raise',
)
gsearch.fit(X_train, y_train)

# Store the winning values in the working parameter set.
best_min_data_in_leaf = gsearch.best_params_['min_data_in_leaf']
best_max_bin = gsearch.best_params_['max_bin']
params_test.update(min_data_in_leaf=best_min_data_in_leaf, max_bin=best_max_bin)

# Print the best parameters, the best score, and the full per-candidate results.
for report_item in (
    '参数的最佳取值:{0}'.format(gsearch.best_params_),
    '最佳模型得分:{0}'.format(gsearch.best_score_),
    gsearch.cv_results_['mean_test_score'],
    gsearch.cv_results_['params'],
):
    print(report_item)

#### 2.2.4 搜索最优bagging_fraction和feature_fraction

In [None]:
# Candidate row- and column-subsampling fractions (0.2 to 0.9) for the third grid search.
params_test_3 = dict(
    bagging_fraction=[tenths / 10 for tenths in range(2, 10)],
    feature_fraction=[tenths / 10 for tenths in range(2, 10)],
)

In [None]:
# Grid-search bagging_fraction and feature_fraction with 5-fold CV, scored by accuracy.
gbm = lgb.LGBMClassifier(**params_test)
gsearch = GridSearchCV(
    gbm,
    param_grid=params_test_3,
    scoring='accuracy',
    cv=5,
    n_jobs=4,
    error_score='raise',
)
gsearch.fit(X_train, y_train)

# Store the winning values in the working parameter set.
best_bagging_fraction = gsearch.best_params_['bagging_fraction']
best_feature_fraction = gsearch.best_params_['feature_fraction']
params_test.update(
    bagging_fraction=best_bagging_fraction,
    feature_fraction=best_feature_fraction,
)

# Print the best parameters, the best score, and the full per-candidate results.
search_outcome = gsearch.cv_results_
print('参数的最佳取值:{0}'.format(gsearch.best_params_))
print('最佳模型得分:{0}'.format(gsearch.best_score_))
print(search_outcome['mean_test_score'])
print(search_outcome['params'])

#### 2.2.5 搜索最优lambda_l1和lambda_l2

In [None]:
# Candidate L1 / L2 regularization strengths for the fourth grid search.
params_test_4 = dict(
    lambda_l1=range(20, 50, 2),
    lambda_l2=range(0, 20, 2),
)

In [None]:
# Grid-search lambda_l1 and lambda_l2 with 5-fold CV, scored by accuracy.
gbm = lgb.LGBMClassifier(**params_test)
gsearch = GridSearchCV(
    gbm,
    param_grid=params_test_4,
    scoring='accuracy',
    cv=5,
    n_jobs=4,
    error_score='raise',
)
gsearch.fit(X_train, y_train)

# Store the winning values in the working parameter set.
best_lambda_l1 = gsearch.best_params_['lambda_l1']
best_lambda_l2 = gsearch.best_params_['lambda_l2']
params_test.update(lambda_l1=best_lambda_l1, lambda_l2=best_lambda_l2)

# Print the best parameters, the best score, and the full per-candidate results.
for report_item in (
    '参数的最佳取值:{0}'.format(gsearch.best_params_),
    '最佳模型得分:{0}'.format(gsearch.best_score_),
    gsearch.cv_results_['mean_test_score'],
    gsearch.cv_results_['params'],
):
    print(report_item)

# 3. 模型训练

In [None]:
# Final model: the manually tuned parameters with the learning rate found by Optuna.
# Copy the dict so that params_test is not modified through an alias.
params_best = dict(params_test)
params_best['learning_rate'] = study.best_params['learning_rate']
model_best = lgb.LGBMClassifier(**params_best)
# lgb.plot_metric draws the evaluation history recorded during fit, so an eval_set is
# required; without it there is nothing to plot and plot_metric fails.
model_best.fit(
    X_train,
    y_train,
    eval_set=[(X_test, y_test)],
    eval_metric='multi_logloss',
    callbacks=[log_evaluation(period=0)],  # record the metric without printing every round
)
y_pred = model_best.predict(X_test)

# Visualize the training process (hold-out multi_logloss per boosting round).
lgb.plot_metric(model_best)
plt.show()

# Evaluate on the hold-out split.
print('accuracy:', accuracy_score(y_test, y_pred))
print(classification_report(y_test, y_pred))

# 4. 模型预测

In [None]:
# Load and preprocess the submission set (testB) for prediction.
# NOTE(review): this preprocessing differs from the one used to build the training
# features earlier in this notebook: area_code is dropped here but kept there,
# jt_5gzd_flag is not encoded here, zfk_type / jt_5gwl_flag use explicit maps instead of
# factorize codes, and scaling uses testB statistics. The resulting column set probably
# does not match what the model was trained on -- confirm before trusting predictions.
test_data = pd.read_csv("../output/DATA/testB.csv")
test_data = test_data.drop(columns=["area_code", "join_date", "term_brand", "term_price", "change_equip_period_avg"])

# Encode the string flags as integers.
test_data["zfk_type"] = test_data["zfk_type"].map({'是': 1, '否': 0})
test_data["jt_5gwl_flag"] = test_data["jt_5gwl_flag"].map({"is_5gwl_user": 1})

# Fill the categorical-like columns with their most frequent value (0 for the 5G flag).
test_data["avg3_llb_flag"] = test_data["avg3_llb_flag"].fillna(test_data["avg3_llb_flag"].mode()[0])
test_data["sl_flag"] = test_data["sl_flag"].fillna(test_data["sl_flag"].mode()[0])
test_data["sl_type"] = test_data["sl_type"].fillna(test_data["sl_type"].mode()[0])
test_data["jt_5gwl_flag"] = test_data["jt_5gwl_flag"].fillna(int(0))

# Fill the remaining gaps with column means. numeric_only=True is needed because
# DataFrame.mean() raises on non-numeric columns in pandas 2.
test_data = test_data.fillna(test_data.mean(numeric_only=True))

test_data["avg3_llb_flag"] = test_data["avg3_llb_flag"].astype("int64")
test_data["sl_flag"] = test_data["sl_flag"].astype("int64")
test_data["sl_type"] = test_data["sl_type"].astype("int64")
test_data["jt_5gwl_flag"] = test_data["jt_5gwl_flag"].astype("int64")

# Standardize the float features. user_id is an identifier that is written to the
# submission file, so it is never scaled even if it was parsed as float.
float_columns = test_data.select_dtypes(include='float64').columns.drop('user_id', errors='ignore')
for col in float_columns:
    test_data[col] = (test_data[col] - test_data[col].mean()) / test_data[col].std()

# Feature matrix passed to the model (every column except the identifier).
test = np.array(test_data.drop("user_id", axis=1))

In [None]:
# Predict the class of every submission row.
# model_best is a scikit-learn style LGBMClassifier: predict() returns 1-D class labels,
# so argmax over axis=1 has to be applied to the class probabilities instead. The
# previous num_iteration=gbm.best_iteration argument referred to an unfitted estimator
# and an attribute the sklearn wrapper does not have, so it is dropped.
class_probabilities = model_best.predict_proba(test)
# Column k of the probabilities belongs to the k-th class in sorted order; +1 maps the
# 0-based column position back to the 1-based sample_flag numbering.
predict = np.argmax(class_probabilities, axis=1) + 1
test_data['prediction'] = predict

In [None]:
# Write the submission file: one row per user with the predicted class as "predtype".
output = test_data[["user_id", "prediction"]].rename(columns={"prediction": "predtype"})
output.to_csv("../output/output.csv", index=False, encoding="utf-8")