<a href="https://colab.research.google.com/github/mamekin05108/signatecup2024summer/blob/main/lightcat.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
# Mount Google Drive at /content/drive; the CSV inputs and the submission file live under this path.
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [2]:
!pip install catboost



In [3]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import lightgbm as lgb
from sklearn import metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

import xgboost as xgb
from xgboost import XGBClassifier


from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool
from sklearn.isotonic import IsotonicRegression


# Suppress all warnings for the rest of the notebook
warnings.filterwarnings("ignore")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [4]:
# Load the preprocessed train / test feature tables from Google Drive.
df_train = pd.read_csv("/content/drive/MyDrive/Signate/2024summer/fdata/df_train_allnum.csv")
df_test = pd.read_csv("/content/drive/MyDrive/Signate/2024summer/fdata/df_test_allnum.csv")
# The sample submission is read without a header row, so its columns get integer labels (0, 1, ...).
ss = pd.read_csv("/content/drive/MyDrive/Signate/2024summer/data/sample_submit.csv", header=None)

In [5]:
# Name of the label column.
target = "ProdTaken"


# Candidate categorical feature names.
# NOTE: this list is replaced by an empty list a few lines below, so it currently has no effect.
cols_category = [
    "Gender",
    "ProductPitched",
    "Designation",
    "MaritalStatus",
    "OwnCar",
    "Occupation",
    "TypeofContact",
    "Agebin",
    "Incomebin",
    "Occupation_Designation"
]


# Override: no column is declared categorical when the models are trained.
cols_category=[]

In [6]:
# Split the training frame into the feature table and the label vector.
train_x = df_train.drop(columns=target)
train_y = df_train[target]

In [7]:
# Test features: the test frame without the label column.
test_x = df_test.drop(columns=target)

In [8]:
# Preview the first rows of the test feature table.
test_x.head()

Unnamed: 0,id,Age,TypeofContact,DurationOfPitch,Gender,NumberOfPersonVisiting,NumberOfFollowups,NumberOfTrips,Passport,MonthlyIncome,...,PitchSatisfactionScore_5,Designation_AVP,Designation_Executive,Designation_Manager,Designation_Senior Manager,Designation_VP,MaritalStatus_未婚,MaritalStatus_独身,MaritalStatus_結婚,MaritalStatus_離婚
0,3489,0.938701,1,-0.2121,0,-1.908669,0.361786,2.196148,0,1.863454,...,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0
1,3490,-0.922728,1,-0.35674,1,-1.908669,0.361786,0.459028,1,-0.742793,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,1.0,0.0
2,3491,-1.439792,1,-0.790661,1,-1.908669,0.361786,-1.278092,0,-1.272115,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0
3,3492,-1.853443,0,-1.079941,0,-1.908669,0.361786,-1.278092,0,-1.273769,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0
4,3493,0.214812,0,-1.079941,0,-1.908669,0.361786,-1.278092,0,-1.155267,...,0.0,0.0,1.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0


# モデル


In [9]:
!pip install optuna




In [10]:
import optuna

In [11]:
# Objective function for tuning LightGBM hyperparameters with Optuna
def objective_lgb(trial, X, y):
    """Score one Optuna trial of LightGBM hyperparameters by cross-validated AUC.

    Args:
        trial: Optuna trial used to sample ``n_estimators``, ``learning_rate``,
            ``colsample_bytree`` and ``subsample``.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target labels aligned with ``X``; also selected with ``.iloc``.

    Returns:
        Mean ROC AUC over the 5 stratified validation folds.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`;
    # `log=True` keeps the log-uniform sampling of the learning rate.
    # NOTE(review): LightGBM only applies `subsample` when `subsample_freq` > 0, which is
    # not set here -- confirm whether bagging is actually intended.
    params = {
        'n_estimators': trial.suggest_int('n_estimators', 100, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'random_seed': 0
    }

    model = lgb.LGBMClassifier(**params)
    # Fixed split seed so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=815)
    auc_scores = []

    for train_idx, val_idx in cv.split(X, y):
        trn_x, trn_y = X.iloc[train_idx], y.iloc[train_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
        # The validation fold doubles as the early-stopping set (patience: 100 rounds).
        model.fit(
            trn_x, trn_y,
            eval_set=[(val_x, val_y)],
            eval_metric='auc',
            callbacks=[lgb.early_stopping(100, verbose=False)],
        )
        preds = model.predict_proba(val_x)[:, 1]
        auc_scores.append(roc_auc_score(val_y, preds))

    return np.mean(auc_scores)

Best LightGBM parameters: {'n_estimators': 3310, 'learning_rate': 0.054222174375827735, 'colsample_bytree': 0.5021459390774948, 'subsample': 0.8448409944410017}

In [12]:
# Objective function for tuning CatBoost hyperparameters with Optuna
def objective_cat(trial, X, y, cols_category):
    """Score one Optuna trial of CatBoost hyperparameters by cross-validated AUC.

    Args:
        trial: Optuna trial used to sample ``iterations``, ``learning_rate``
            and ``colsample_bylevel``.
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target labels aligned with ``X``; also selected with ``.iloc``.
        cols_category: Categorical feature names passed to each CatBoost ``Pool``.

    Returns:
        Mean ROC AUC over the 5 stratified validation folds.
    """
    # `suggest_float` replaces the deprecated `suggest_uniform` / `suggest_loguniform`;
    # `log=True` keeps the log-uniform sampling of the learning rate.
    params = {
        'iterations': trial.suggest_int('iterations', 100, 5000),
        'learning_rate': trial.suggest_float('learning_rate', 1e-4, 1e-1, log=True),
        'colsample_bylevel': trial.suggest_float('colsample_bylevel', 0.5, 1.0),
        'random_seed': 0,
        'verbose': 100,
        'use_best_model': True,
        # Same 100-round patience as the LightGBM objective and the final CatBoost model,
        # so trials are scored under the stopping rule used for the final fit.
        'early_stopping_rounds': 100
    }

    model = CatBoostClassifier(**params)
    # Fixed split seed so every trial is scored on the same folds.
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=815)
    auc_scores = []

    for train_idx, val_idx in cv.split(X, y):
        trn_x, trn_y = X.iloc[train_idx], y.iloc[train_idx]
        val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]
        train_pool = Pool(data=trn_x, label=trn_y, cat_features=cols_category)
        val_pool = Pool(data=val_x, label=val_y, cat_features=cols_category)
        # The validation fold is the eval set used for best-model selection and early stopping.
        model.fit(train_pool, eval_set=val_pool, verbose=False)
        preds = model.predict_proba(val_x)[:, 1]
        auc_scores.append(roc_auc_score(val_y, preds))

    return np.mean(auc_scores)

Best CatBoost parameters: {'iterations': 2429, 'learning_rate': 0.015201103669488437, 'colsample_bylevel': 0.6320629073348337}

In [13]:
# Create the Optuna studies and search for the best parameters of each base model
def tune_params(X, y, cols_category, n_trials=50):
    """Tune LightGBM and CatBoost with two independent Optuna studies.

    Args:
        X: Feature table forwarded to the objective functions.
        y: Target labels forwarded to the objective functions.
        cols_category: Categorical feature names forwarded to the CatBoost objective.
        n_trials: Number of trials run by each study (default 50).

    Returns:
        Tuple ``(best_params_lgb, best_params_cat)`` holding the best sampled
        parameters of each study.
    """
    def opt_lgb(trial):
        return objective_lgb(trial, X, y)

    def opt_cat(trial):
        return objective_cat(trial, X, y, cols_category)

    # Both objectives return a mean AUC, hence direction='maximize'.
    study_lgb = optuna.create_study(direction='maximize')
    study_lgb.optimize(opt_lgb, n_trials=n_trials)
    print(f"Best LightGBM parameters: {study_lgb.best_params}")

    study_cat = optuna.create_study(direction='maximize')
    study_cat.optimize(opt_cat, n_trials=n_trials)
    print(f"Best CatBoost parameters: {study_cat.best_params}")

    return study_lgb.best_params, study_cat.best_params

In [14]:
# Tune the base models, train them, and generate the meta-features
def train_base_models_with_optuna(X, y, cols_category):
    """Tune both base models with Optuna, then train them with cross-validation.

    Args:
        X: Feature table.
        y: Target labels aligned with ``X``.
        cols_category: Categorical feature names forwarded to tuning and training.

    Returns:
        Tuple ``(base_models, meta_features, oof_predictions)``: the dict of
        base models, and the two arrays returned by ``train_base_models``.
    """
    best_params_lgb, best_params_cat = tune_params(X, y, cols_category)

    print("Final LightGBM Parameters:", best_params_lgb)
    print("Final CatBoost Parameters:", best_params_cat)

    # Optuna's best_params only holds the sampled values; re-apply the fixed seed used by
    # the tuning objectives so the final models match the configuration that was scored.
    fixed_params = {'random_seed': 0}

    base_models = {
        'LightGBM': lgb.LGBMClassifier(**{**best_params_lgb, **fixed_params}),
        'CatBoost': CatBoostClassifier(**{**best_params_cat, **fixed_params}, early_stopping_rounds=100)
    }

    meta_features, oof_predictions = train_base_models(X, y, base_models, cols_category)

    return base_models, meta_features, oof_predictions

In [15]:
def train_base_models(X, y, models, cols_category, n_splits=5):
    """Fit every base model with stratified K-fold CV and collect out-of-fold predictions.

    Args:
        X: Feature table; rows are selected positionally with ``.iloc``.
        y: Target labels aligned with ``X``; also selected with ``.iloc``.
        models: Mapping of display name -> classifier. LightGBM and CatBoost
            classifiers are fit with the validation fold as eval set; any other
            estimator is fit with a plain ``fit(X, y)``.
        cols_category: Categorical feature names forwarded to LightGBM / CatBoost.
        n_splits: Number of CV folds.

    Returns:
        Tuple ``(meta_features, oof_predictions)``: an array of shape
        ``(len(X), len(models))`` with one column of out-of-fold positive-class
        probabilities per model, and the row-wise average of those columns.

    Note:
        Each model object is refit on every fold, so when this function returns
        it holds the fit from the last fold only.
    """
    meta_features = np.zeros((len(X), len(models)))
    oof_predictions = np.zeros(len(X))

    # Every model is evaluated on the same folds (fixed split seed).
    cv = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=815)

    for i, (model_name, model) in enumerate(models.items()):
        print(f"Training {model_name}...")
        oof_predictions_model = np.zeros(len(X))

        for fold, (trn_idx, val_idx) in enumerate(cv.split(X, y), start=1):
            trn_x, trn_y = X.iloc[trn_idx], y.iloc[trn_idx]
            val_x, val_y = X.iloc[val_idx], y.iloc[val_idx]

            if isinstance(model, lgb.LGBMClassifier):
                model.fit(
                    trn_x, trn_y,
                    eval_set=[(val_x, val_y)],
                    # Track AUC as in the Optuna objective so early stopping follows
                    # the same criterion the hyperparameters were tuned with.
                    eval_metric='auc',
                    callbacks=[lgb.early_stopping(100, verbose=False)],
                    categorical_feature=cols_category,
                )
            elif isinstance(model, CatBoostClassifier):
                train_pool = Pool(data=trn_x, label=trn_y, cat_features=cols_category)
                val_pool = Pool(data=val_x, label=val_y, cat_features=cols_category)
                model.fit(train_pool, eval_set=val_pool, verbose=False)
            else:
                model.fit(trn_x, trn_y)

            # Only the held-out rows of this fold are filled in.
            oof_predictions_model[val_idx] = model.predict_proba(val_x)[:, 1]

            auc = roc_auc_score(val_y, oof_predictions_model[val_idx])
            print(f"  Fold {fold}, AUC: {auc:.4f}")

        meta_features[:, i] = oof_predictions_model
        # Running equal-weight average of the models' out-of-fold predictions.
        oof_predictions += oof_predictions_model / len(models)

        auc = roc_auc_score(y, oof_predictions_model)
        print(f"{model_name} OOF AUC: {auc:.4f}")

    return meta_features, oof_predictions

In [16]:
def train_meta_model(meta_features, y, meta_model, n_splits=5):
    """Cross-validate the meta-model on the stacked base-model predictions.

    Args:
        meta_features: Array indexed by row-index arrays, one row per sample.
        y: Target labels; selected positionally with ``.iloc``.
        meta_model: Classifier exposing ``fit`` and ``predict_proba``. It is refit
            on every fold, so it keeps the last fold's fit on return.
        n_splits: Number of stratified folds.

    Returns:
        Array of out-of-fold positive-class probabilities, one per sample.
    """
    splitter = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=815)
    oof_predictions = np.zeros(len(y))

    fold = 0
    for fit_rows, holdout_rows in splitter.split(meta_features, y):
        fold += 1

        meta_model.fit(meta_features[fit_rows], y.iloc[fit_rows])
        # Column 1 of predict_proba is the positive-class probability.
        oof_predictions[holdout_rows] = meta_model.predict_proba(meta_features[holdout_rows])[:, 1]

        fold_auc = roc_auc_score(y.iloc[holdout_rows], oof_predictions[holdout_rows])
        print(f"Meta Model Fold {fold}, AUC: {fold_auc:.4f}")

    overall_auc = roc_auc_score(y, oof_predictions)
    print(f"Meta Model OOF AUC: {overall_auc:.4f}")

    return oof_predictions

In [17]:
def stacking_predict(X, base_models, meta_model):
    """Predict positive-class probabilities with the stacked ensemble.

    Args:
        X: Features passed unchanged to every base model's ``predict_proba``.
        base_models: Mapping of name -> fitted classifier; iteration order fixes
            the column order of the meta-features.
        meta_model: Fitted classifier applied to the stacked base-model outputs.

    Returns:
        1-D array with the meta-model's positive-class probability per row.
    """
    positive_scores = []
    for _name, fitted in base_models.items():
        # Column 1 of predict_proba is the positive-class probability.
        positive_scores.append(fitted.predict_proba(X)[:, 1])

    stacked = np.column_stack(positive_scores)
    return meta_model.predict_proba(stacked)[:, 1]

In [None]:
# Tune the base models with Optuna, train them with cross-validation, and build the out-of-fold meta-features
base_models, meta_features, oof_predictions_base = train_base_models_with_optuna(train_x, train_y, cols_category)


[I 2024-08-18 16:21:10,280] A new study created in memory with name: no-name-a330f39c-008d-405d-a899-ffd6ad7ab723


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000567 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000518 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:Bo

[I 2024-08-18 16:21:12,408] Trial 0 finished with value: 0.8293935495963878 and parameters: {'n_estimators': 4600, 'learning_rate': 0.056908769933251474, 'colsample_bytree': 0.5840907599010865, 'subsample': 0.8437817320143863}. Best is trial 0 with value: 0.8293935495963878.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000546 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000506 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:Bo

[I 2024-08-18 16:21:15,906] Trial 1 finished with value: 0.8251752858369684 and parameters: {'n_estimators': 2574, 'learning_rate': 0.004077761250969111, 'colsample_bytree': 0.7152525240096559, 'subsample': 0.7590287547349139}. Best is trial 0 with value: 0.8293935495963878.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000560 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000482 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:Bo

[I 2024-08-18 16:21:19,064] Trial 2 finished with value: 0.8250694254452329 and parameters: {'n_estimators': 3658, 'learning_rate': 0.0002347975473564264, 'colsample_bytree': 0.6172443565315799, 'subsample': 0.9931899154638488}. Best is trial 0 with value: 0.8293935495963878.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000557 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000540 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:Bo

[I 2024-08-18 16:21:21,803] Trial 3 finished with value: 0.786732338798282 and parameters: {'n_estimators': 4703, 'learning_rate': 0.00020482292419493584, 'colsample_bytree': 0.9960150851867944, 'subsample': 0.6637715445790826}. Best is trial 0 with value: 0.8293935495963878.


[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000560 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.142243 -> initscore=-1.796785
[LightGBM] [Info] Start training from score -1.796785
[LightGBM] [Info] Number of positive: 397, number of negative: 2394
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000477 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 683
[LightGBM] [Info] Number of data points in the train set: 2791, number of used features: 41
[LightGBM] [Info] [binary:Bo

In [None]:
# Display the out-of-fold meta-feature array (one column per base model).
meta_features

In [None]:
# Display the out-of-fold meta-feature array (one column per base model).
meta_features

In [None]:
# Hyperparameters for the logistic-regression meta-model.
params_logistic = {
    "penalty": 'l2',             # L2 regularization
    "C": 1.0,                    # inverse of regularization strength (smaller = stronger)
    "solver": 'lbfgs',           # optimization algorithm
    "max_iter": 100,             # maximum number of iterations
    #"class_weight":'balanced',    # class weighting (currently disabled)
    "random_state": 1506            # random seed
}

In [None]:
# Define the meta-model and train it with cross-validation on the base-model meta-features.
# train_meta_model refits meta_model on every fold, so it keeps the last fold's fit afterwards.
from sklearn.linear_model import LogisticRegression
meta_model = LogisticRegression(**params_logistic)
oof_predictions_meta = train_meta_model(meta_features, train_y, meta_model)


Meta Model Fold 1, AUC: 0.8172
Meta Model Fold 2, AUC: 0.8446
Meta Model Fold 3, AUC: 0.8359
Meta Model Fold 4, AUC: 0.8211
Meta Model Fold 5, AUC: 0.8641
Meta Model OOF AUC: 0.8356

In [None]:
# AUC of the stacked model's out-of-fold predictions over the whole training set, printed with 8 decimals.
final_auc = roc_auc_score(train_y, oof_predictions_meta)
print(f"Final Stacking Model OOF AUC: {final_auc:.8f}")

In [None]:
# Predict on the test set with the stacked ensemble.
# The second argument must be the dict of fitted base models: stacking_predict iterates over
# `base_models.items()`, which the out-of-fold prediction array does not provide.
test_predictions = stacking_predict(test_x, base_models, meta_model)

In [None]:
# Submission counter; the submission cell increments it and embeds it in the output file name.
count = 0

In [None]:
from datetime import datetime
import pytz
# Increment the submission counter (re-running this cell bumps it again)
count += 1
# Get the current time in Japan (Asia/Tokyo)
japan_tz = pytz.timezone('Asia/Tokyo')
now = datetime.now(japan_tz)
timestamp = now.strftime("%Y%m%d_%H%M%S")

# The file name embeds the timestamp and the zero-padded counter.
file_name = f"/content/drive/MyDrive/Signate/2024summer/lightcatxgb_en_{timestamp}_{count:03d}.csv"
# Column 1 of the header-less sample submission receives the predicted probabilities.
ss[1] = test_predictions
ss.to_csv(file_name, header=False, index=False)