<a href="https://colab.research.google.com/github/mamekin05108/signatecup2024summer/blob/main/Catboost_20240831.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Mount Google Drive at /content/drive so the notebook can read and write
# files under that path (Colab-only).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
import re
import gc
import random
import os
import copy
import lightgbm as lgb
from sklearn import metrics
from sklearn.metrics import f1_score, roc_auc_score
from sklearn.model_selection import StratifiedKFold

# Suppress all warnings for the rest of the notebook.
warnings.filterwarnings("ignore")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.



In [None]:

!pip install catboost



In [None]:
# Let pandas render up to 100 columns and 100 rows before truncating a frame.
pd.set_option(
    'display.max_columns', 100,
    'display.max_rows', 100,
)

In [None]:

from sklearn.calibration import calibration_curve, CalibratedClassifierCV
from sklearn.metrics import roc_auc_score
from catboost import CatBoostClassifier, Pool

In [None]:
# Load the train/test feature tables and the sample submission from Drive.
# The sample submission is read with header=None, i.e. its first line is data.
df_train = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Signate/2024summer/fdata/df_trainV13C.csv",
)
df_test = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Signate/2024summer/fdata/df_testV13C.csv",
)
ss = pd.read_csv(
    filepath_or_buffer="/content/drive/MyDrive/Signate/2024summer/data/sample_submit.csv",
    header=None,
)

In [None]:
def summarize_dataframe(df):
    """Build a per-column overview of ``df``.

    For every column the summary records its name, its dtype, the number of
    null entries, the number of distinct values as returned by
    ``Series.unique`` and the fraction of rows that are null.

    Args:
        df: DataFrame to describe.

    Returns:
        A DataFrame with one row per column of ``df`` and the columns
        "列名", "列の型", "NaNである行の数", "値の種類", "NaNの率".
        The null rate is ``None`` when ``df`` has no rows.
    """
    n_rows = len(df)

    def describe(name):
        # One summary record for a single column.
        series = df[name]
        n_missing = series.isnull().sum()
        missing_rate = n_missing / n_rows if n_rows > 0 else None
        return [name, series.dtype, n_missing, len(series.unique()), missing_rate]

    header = ["列名", "列の型", "NaNである行の数", "値の種類", "NaNの率"]
    return pd.DataFrame([describe(name) for name in df.columns], columns=header)

In [None]:
# Summarise dtype, null count, distinct-value count and null rate for every
# column of the training frame, then display the table.
summary_df = summarize_dataframe(df_train)
summary_df

Unnamed: 0,列名,列の型,NaNである行の数,値の種類,NaNの率
0,id,int64,0,3489,0.0
1,Age,int64,0,44,0.0
2,TypeofContact,float64,0,2,0.0
3,CityTier,float64,0,3,0.0
4,DurationOfPitch,int64,0,33,0.0
5,Occupation,float64,0,3,0.0
6,Gender,float64,0,1,0.0
7,NumberOfPersonVisiting,float64,0,4,0.0
8,NumberOfFollowups,int64,0,6,0.0
9,ProductPitched,float64,0,5,0.0


In [None]:
# Config cell: name of the label column that the model predicts.
target = "ProdTaken"

In [None]:
# Default feature lists.
# default_categorical_features is kept as a reference list only; the
# categorical list used for training is derived from the DataFrame columns below.
default_categorical_features = [
    'TypeofContact',
    'CityTier',
    'Occupation',
    'Gender',
    'ProductPitched',
    'PreferredPropertyStar',
    'Passport',
    'PitchSatisfactionScore',
    'Designation',
    'MaritalStatus',
    'OwnCar',
]

default_numerical_features = [
    'id',
    'Age',
    'DurationOfPitch',
    'MonthlyIncome',
    'NumberOfPersonVisiting',
    'NumberOfFollowups',
    'NumberOfTrips',
    'NumberOfChildren',
]

# Additional (engineered) numerical features.
added_numerical_features = [
    'HasChildren',
    'IncomePerChild',
    'AnnualIncomePerTrip',
    'PitchPlusFollowups',
    'AdultTravelers',
]

# All numerical features: defaults followed by the added ones.
numerical_features = [*default_numerical_features, *added_numerical_features]

# Start from every column of the training frame.
features = list(df_train.columns)

# Every column that is not listed as numerical is treated as categorical
# (column order of the frame is preserved).
categorical_features = [name for name in features if name not in numerical_features]

print(f'数値特徴量: {numerical_features}')
print(f'カテゴリカル特徴量: {categorical_features}')

# Columns excluded from training (identifier, label and dropped features);
# they are removed from all three lists.
remove_list = ['id', target, 'MaritalStatus','NumberOfPersonVisiting','NumberOfFollowups > NumberOfPersonVisiting','NumberOfChildren']
excluded = set(remove_list)
features = [name for name in features if name not in excluded]
numerical_features = [name for name in numerical_features if name not in excluded]
categorical_features = [name for name in categorical_features if name not in excluded]

print(f'学習に使用する特徴量: {features}')

# Cast the categorical columns to integer dtype in both frames.
for col in categorical_features:
    for frame in (df_train, df_test):
        frame[col] = frame[col].astype(int)

数値特徴量: ['id', 'Age', 'DurationOfPitch', 'MonthlyIncome', 'NumberOfPersonVisiting', 'NumberOfFollowups', 'NumberOfTrips', 'NumberOfChildren', 'HasChildren', 'IncomePerChild', 'AnnualIncomePerTrip', 'PitchPlusFollowups', 'AdultTravelers']
カテゴリカル特徴量: ['TypeofContact', 'CityTier', 'Occupation', 'Gender', 'ProductPitched', 'PreferredPropertyStar', 'Passport', 'PitchSatisfactionScore', 'Designation', 'ProdTaken', 'MaritalStatus', 'OwnCar', 'ProductPitched_Designation', 'Single', 'NumberOfFollowups > NumberOfPersonVisiting']
学習に使用する特徴量: ['Age', 'TypeofContact', 'CityTier', 'DurationOfPitch', 'Occupation', 'Gender', 'NumberOfFollowups', 'ProductPitched', 'PreferredPropertyStar', 'NumberOfTrips', 'Passport', 'PitchSatisfactionScore', 'Designation', 'MonthlyIncome', 'OwnCar', 'IncomePerChild', 'AnnualIncomePerTrip', 'PitchPlusFollowups', 'AdultTravelers', 'ProductPitched_Designation', 'Single']


In [None]:
# Alias of the categorical feature list; passed to CatBoost's Pool as
# cat_features in the training loop.
cols_category=categorical_features

In [None]:
# Split the training frame into the predictors (every column except the label)
# and the label column itself.
train_x = df_train.drop(columns=[target])
train_y = df_train[target]

In [None]:

# CatBoost hyper-parameters (AUC-evaluated variant).
# NOTE: the following cell assigns `params_cat` again, so this dict is
# replaced before the training loop reads it.
params_cat = dict(
    iterations=1000,        # counterpart of n_estimators
    eval_metric='AUC',
    depth=1,
    l2_leaf_reg=8,
    learning_rate=0.25,
    colsample_bylevel=0.8,  # counterpart of colsample_bytree
    one_hot_max_size=25,
    random_seed=0,
    verbose=0,
    use_best_model=True,    # setting for early stopping
)

In [None]:
# CatBoost hyper-parameters handed to CatBoostClassifier in the training loop.
# eval_metric is deliberately not set here (the 'AUC' entry is disabled).
params_cat = dict(
    learning_rate=0.25,
    depth=1,
    l2_leaf_reg=6,
    iterations=1000,
    random_seed=0,
    one_hot_max_size=25,
)

In [None]:

import pickle

In [None]:
def seed_everything(seed):
    """Seed the Python and NumPy global RNGs and export PYTHONHASHSEED.

    Args:
        seed: Value passed to ``random.seed`` and ``np.random.seed``; its
            string form is stored in the ``PYTHONHASHSEED`` environment
            variable.
    """
    for seeder in (random.seed, np.random.seed):
        seeder(seed)
    os.environ['PYTHONHASHSEED'] = f'{seed}'



In [None]:
# Seed-averaged CatBoost training with stratified 7-fold cross-validation.
#
# For every seed:
#   1. split the training data into 7 stratified folds (shuffled with the seed),
#   2. fit one CatBoost model per fold with early stopping on the held-out fold,
#   3. pickle each fold model to Drive and store its out-of-fold probabilities,
#   4. report the out-of-fold AUC,
#   5. average the fold models' test probabilities.
#
# Results: `list_predictions_test` gets one test-probability vector per seed,
# `list_Mean_AUC` gets one out-of-fold AUC per seed.
#
# NOTE(review): the held-out fold is used both for early stopping / best
# iteration selection and for the out-of-fold score, so the reported AUC may be
# optimistic.
list_feature_importances = []  # list meant for feature importances; never filled in this cell

seeds = [10, 20, 30, 40, 50]
list_predictions_test = []
list_Mean_AUC = []

# Directory that receives the pickled fold models; created if it is missing so
# a fresh run does not fail on the first dump.
model_dir = '/content/drive/MyDrive/Signate/2024summer/models'
os.makedirs(model_dir, exist_ok=True)

for seed in seeds:

    # NOTE(review): `params_cat` is passed unchanged for every seed, so `seed`
    # only drives the fold split and the Python/NumPy RNGs. If `params_cat`
    # fixes `random_seed`, the CatBoost models themselves are not re-seeded;
    # confirm that this is intended.
    seed_everything(seed)

    cv = StratifiedKFold(n_splits=7, shuffle=True, random_state=seed)
    oof_preds_y = np.zeros(len(train_x))
    list_cutoff = []  # kept for compatibility; never populated in this cell
    list_models = []

    for fold, (trn_idx, val_idx) in enumerate(cv.split(train_x, train_y)):
        # The splitter yields positional indices, so index rows AND labels by
        # position (.iloc); label-based `train_y[idx]` is only correct when the
        # index happens to be a default RangeIndex.
        trn_x = train_x[features].iloc[trn_idx, :]
        trn_y = train_y.iloc[trn_idx]
        val_x = train_x[features].iloc[val_idx, :]
        val_y = train_y.iloc[val_idx]
        train_pool = Pool(data=trn_x, label=trn_y, cat_features=cols_category)
        val_pool = Pool(data=val_x, label=val_y, cat_features=cols_category)

        # Early stopping is configured once, in fit(), instead of being passed
        # to both the constructor and fit().
        model_cat = CatBoostClassifier(**params_cat)
        model_cat.fit(train_pool, eval_set=[val_pool], early_stopping_rounds=100,
                      verbose=0, use_best_model=True)

        valid_y_proba = model_cat.predict_proba(val_pool)[:, 1]
        list_models.append(model_cat)

        # Use a context manager so the file handle is closed (and flushed to
        # Drive) deterministically instead of being leaked.
        model_path = f'{model_dir}/catboost_fold_v12_K7_{seed}_{fold}.pkl'
        with open(model_path, 'wb') as model_file:
            pickle.dump(model_cat, model_file)

        oof_preds_y[val_idx] = valid_y_proba
        del trn_x, trn_y, val_x, val_y, train_pool, val_pool
        gc.collect()

    # Out-of-fold AUC over the whole training set for this seed (a single
    # number, so no averaging is needed).
    AUC_score = roc_auc_score(train_y, oof_preds_y)
    print(f"AUC_score: {AUC_score}")
    list_Mean_AUC.append(AUC_score)

    # Average the positive-class probabilities of all fold models on the test set.
    preds_y_proba = np.zeros(len(df_test))
    for model in list_models:
        feature_names = model.feature_names_
        preds_y_proba += model.predict_proba(df_test[feature_names])[:, 1] / len(list_models)
    preds_y = preds_y_proba

    list_predictions_test.append(preds_y)


AUC_score: 0.841088644164452
AUC_score: 0.8403704311430078
AUC_score: 0.8416279764146375
AUC_score: 0.8381764517586805
AUC_score: 0.8416797576905282


In [None]:
# Gather the per-seed test probabilities into one table (one column per seed)
# and add their row-wise mean as the ensemble prediction.
proba_columns = [f'Proba_Seed_{seed}' for seed in seeds]
df_predictions = pd.DataFrame(
    np.column_stack(list_predictions_test),
    columns=proba_columns,
).assign(Average_Proba=lambda frame: frame.mean(axis=1))

In [None]:
# One-row table of the out-of-fold AUC per seed plus their mean.
# The columns hold AUC values, so they are labelled AUC_Seed_* (the previous
# Proba_Seed_* labels were copied from the prediction table).
auc_columns = [f'AUC_Seed_{seed}' for seed in seeds]
df_AUC = pd.DataFrame(np.column_stack(list_Mean_AUC), columns=auc_columns)
df_AUC['Average_AUC'] = df_AUC.mean(axis=1)

In [None]:
# Display the per-seed and averaged test probabilities.
df_predictions

Unnamed: 0,Proba_Seed_10,Proba_Seed_20,Proba_Seed_30,Proba_Seed_40,Proba_Seed_50,Average_Proba
0,0.071049,0.071739,0.072591,0.068145,0.074423,0.071589
1,0.403821,0.372773,0.366884,0.349073,0.354165,0.369343
2,0.304962,0.316022,0.321858,0.316791,0.317718,0.315470
3,0.472093,0.492378,0.494121,0.481974,0.485823,0.485278
4,0.243213,0.259144,0.243238,0.258914,0.251560,0.251214
...,...,...,...,...,...,...
3484,0.206250,0.202307,0.205076,0.208217,0.204923,0.205355
3485,0.011218,0.009937,0.009274,0.011775,0.010483,0.010538
3486,0.596365,0.582106,0.582728,0.580441,0.600576,0.588443
3487,0.198281,0.210940,0.203729,0.217925,0.217524,0.209680


In [None]:
# Display the per-seed out-of-fold AUC and its average.
df_AUC

Unnamed: 0,Proba_Seed_10,Proba_Seed_20,Proba_Seed_30,Proba_Seed_40,Proba_Seed_50,Average_AUC
0,0.841089,0.84037,0.841628,0.838176,0.84168,0.840589


In [None]:
# Repeated display of df_AUC (same table as the previous cell).
df_AUC

Unnamed: 0,Proba_Seed_10,Proba_Seed_20,Proba_Seed_30,Proba_Seed_40,Proba_Seed_50,Average_AUC
0,0.841089,0.84037,0.841628,0.838176,0.84168,0.840589


In [None]:
# Submission counter; the next cell increments it and embeds it in the output file name.
count = 199

In [None]:

from datetime import datetime
import pytz

# Write the seed-averaged probabilities into the sample-submission frame and
# save it to Drive as a timestamped, numbered CSV without header or index.

# Increment the counter so every submission file gets a new number.
# NOTE: re-running this cell increments `count` again.
count += 1

# Current time in Japan, used in the file name.
japan_tz = pytz.timezone('Asia/Tokyo')
now = datetime.now(japan_tz)
timestamp = now.strftime("%Y%m%d_%H%M%S")

file_name = f"/content/drive/MyDrive/Signate/2024summer/catboost_{timestamp}_{count:03d}.csv"

# Assigning a Series aligns on the index, so a length or index mismatch would
# silently put NaN into the submission. Fail loudly and assign by position.
if len(ss) != len(df_predictions):
    raise ValueError(
        f"sample submission has {len(ss)} rows but there are "
        f"{len(df_predictions)} predictions"
    )
ss[1] = df_predictions['Average_Proba'].to_numpy()
ss.to_csv(file_name, header=False, index=False)

In [None]:
# Repeated display of df_predictions (same table as shown in an earlier cell).
df_predictions

Unnamed: 0,Proba_Seed_10,Proba_Seed_20,Proba_Seed_30,Proba_Seed_40,Proba_Seed_50,Average_Proba
0,0.071049,0.071739,0.072591,0.068145,0.074423,0.071589
1,0.403821,0.372773,0.366884,0.349073,0.354165,0.369343
2,0.304962,0.316022,0.321858,0.316791,0.317718,0.315470
3,0.472093,0.492378,0.494121,0.481974,0.485823,0.485278
4,0.243213,0.259144,0.243238,0.258914,0.251560,0.251214
...,...,...,...,...,...,...
3484,0.206250,0.202307,0.205076,0.208217,0.204923,0.205355
3485,0.011218,0.009937,0.009274,0.011775,0.010483,0.010538
3486,0.596365,0.582106,0.582728,0.580441,0.600576,0.588443
3487,0.198281,0.210940,0.203729,0.217925,0.217524,0.209680
