In [1]:
# Cell 1: Library Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import roc_auc_score
import logging
import os
from datetime import datetime
import klib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
import xgboost as xgb


In [2]:
# Cell 2: Setup Logging
class Logger:
    def __init__(self):
        self.logger = self.setup_logging()
    
    def setup_logging(self):
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_file_name = f'training_{timestamp}.log'

        if os.path.exists('training.log'):
            os.remove('training.log')

        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)

        console_handler = logging.StreamHandler()
        file_handler = logging.FileHandler(log_file_name)

        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)
        return logger
    
    def info(self, message):
        self.logger.info(message)
    
    def error(self, message):
        self.logger.error(message)

logger = Logger()
logger.info("Setup and Imports complete.")


2024-07-14 00:46:13,556 - __main__ - INFO - Setup and Imports complete.


In [3]:
# Cell 3: Data Handling Class
class DataHandler:
    def __init__(self, logger):
        self.logger = logger

    def load_data(self, train_path, test_path):
        try:
            train_df = pd.read_csv(train_path)
            test_df = pd.read_csv(test_path)
            self.logger.info("Datasets loaded successfully.")
            return train_df, test_df
        except Exception as e:
            self.logger.error(f"Error loading datasets: {e}")
            return None, None

    def rename_columns(self, df, column_mapping):
        df.rename(columns=column_mapping, inplace=True)
        self.logger.info("Columns renamed.")
        return df

data_handler = DataHandler(logger)
train_df, test_df = data_handler.load_data("klib_full_trainset.csv", "klib_full_testset.csv")

# Ensure datasets are loaded correctly
if train_df is not None and test_df is not None:
    new_column_names = {
        'gender': 'Gender',
        'age': 'Age',
        'driving_license': 'Driving_License',
        'region_code': 'Region_Code',
        'previously_insured': 'Previously_Insured',
        'vehicle_age': 'Vehicle_Age',
        'vehicle_damage': 'Vehicle_Damage',
        'annual_premium': 'Annual_Premium',
        'policy_sales_channel': 'Policy_Sales_Channel',
        'vintage': 'Vintage',
        'response': 'Response'
    }

    train_df = data_handler.rename_columns(train_df, new_column_names)
    test_df = data_handler.rename_columns(test_df, new_column_names)

    # Split the data into training and validation sets before preprocessing
    X = train_df.drop(columns=['Response'])
    y = train_df['Response']
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

    logger.info(f"Training set shape: {X_train.shape}")
    logger.info(f"Validation set shape: {X_val.shape}")
else:
    raise ValueError("Failed to load datasets. Check the file paths and try again.")


2024-07-14 00:46:25,148 - __main__ - INFO - Datasets loaded successfully.
2024-07-14 00:46:25,151 - __main__ - INFO - Columns renamed.
2024-07-14 00:46:25,152 - __main__ - INFO - Columns renamed.
2024-07-14 00:46:30,173 - __main__ - INFO - Training set shape: (9172186, 9)
2024-07-14 00:46:30,174 - __main__ - INFO - Validation set shape: (2293047, 9)


In [4]:
# Cell 4: Feature Engineering Classes
class InteractionFeatures(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.feature_names = None

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        X['Age_Annual_Premium'] = X['Age'] * X['Annual_Premium']
        X['Age_Vintage'] = X['Age'] * X['Vintage']
        X['Annual_Premium_Vintage'] = X['Annual_Premium'] * X['Vintage']
        X['Age_Region_Code'] = X['Age'] * X['Region_Code']
        X['Vintage_Region_Code'] = X['Vintage'] * X['Region_Code']
        X['Annual_Premium_Region_Code'] = X['Annual_Premium'] * X['Region_Code']
        self.feature_names = X.columns.tolist()
        return X

class PolynomialFeatureGeneration(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        self.feature_names = None

    def fit(self, X, y=None):
        self.poly.fit(X[['Age', 'Annual_Premium', 'Vintage']])
        return self

    def transform(self, X):
        poly_features = self.poly.transform(X[['Age', 'Annual_Premium', 'Vintage']])
        poly_feature_names = self.poly.get_feature_names_out(['Age', 'Annual_Premium', 'Vintage'])
        poly_df = pd.DataFrame(poly_features, columns=[f'poly_{name.replace(" ", "_")}' for name in poly_feature_names], index=X.index)
        X = pd.concat([X, poly_df], axis=1)
        self.feature_names = X.columns.tolist()
        return X

logger.info("Custom transformers defined.")


2024-07-14 00:46:30,184 - __main__ - INFO - Custom transformers defined.


In [5]:
# Cell 5: Preprocessing and Model Preparation
class PreprocessingPipeline:
    def __init__(self, logger):
        self.logger = logger
        self.pipeline = self.create_pipeline()

    def create_pipeline(self):
        pipeline = Pipeline([
            ('interactions', InteractionFeatures()),
            ('poly_features', PolynomialFeatureGeneration()),
            ('scaling', StandardScaler(with_mean=False))  # Preserve feature names
        ])
        self.logger.info("Preprocessing pipeline defined.")
        return pipeline

    def preprocess_data(self, X_train, X_val, y_train):
        X_train_preprocessed = self.pipeline.fit_transform(X_train, y_train)
        X_val_preprocessed = self.pipeline.transform(X_val)

        feature_names = self.pipeline.named_steps['poly_features'].feature_names
        X_train_preprocessed = pd.DataFrame(X_train_preprocessed, columns=feature_names)
        X_val_preprocessed = pd.DataFrame(X_val_preprocessed, columns=feature_names)

        self.logger.info(f"Training set after preprocessing: {X_train_preprocessed.shape}")
        self.logger.info(f"Validation set after preprocessing: {X_val_preprocessed.shape}")
        return X_train_preprocessed, X_val_preprocessed

# Usage Example
preprocessor = PreprocessingPipeline(logger)
X_train_preprocessed, X_val_preprocessed = preprocessor.preprocess_data(X_train, X_val, y_train)

2024-07-14 00:46:30,197 - __main__ - INFO - Preprocessing pipeline defined.
2024-07-14 00:46:35,547 - __main__ - INFO - Training set after preprocessing: (9172186, 21)
2024-07-14 00:46:35,548 - __main__ - INFO - Validation set after preprocessing: (2293047, 21)


In [6]:
# Cell 6: Hyperparameter Optimization with Optuna
def objective(trial):
    param = {
        'eval_metric': 'auc',
        'early_stopping_rounds': 100,
        'lambda': trial.suggest_float('lambda', 1e-3, 10.0, log=True),
        'alpha': trial.suggest_float('alpha', 1e-3, 10.0, log=True),
        'colsample_bytree': trial.suggest_categorical('colsample_bytree', [0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 1.0]),
        'subsample': trial.suggest_categorical('subsample', [0.4, 0.5, 0.6, 0.7, 0.8, 1.0]),
        'learning_rate': trial.suggest_categorical('learning_rate', [0.008, 0.01, 0.012, 0.014, 0.016, 0.018, 0.02]),
        'n_estimators': trial.suggest_int('n_estimators', 5, 1000),
        'max_depth': trial.suggest_categorical('max_depth', [5, 7, 9, 11, 13, 15, 17]),
        'random_state': 42,
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 300),
    }
    
    y_train_series = pd.Series(y_train)  # Convert y_train to pandas Series
    ratio = float(y_train_series.value_counts()[0]) / y_train_series.value_counts()[1]
    model = xgb.XGBClassifier(**param, scale_pos_weight=ratio)
    
    model.fit(
        X_train_preprocessed, y_train_series,
        eval_set=[(X_val_preprocessed, y_val)],
        verbose=True
    )
    
    y_pred = model.predict_proba(X_val_preprocessed)[:, 1]
    auc = roc_auc_score(y_val, y_pred)
    return auc

logger.info("Starting hyperparameter optimization with Optuna.")
study = optuna.create_study(direction='maximize')
study.optimize(objective, n_trials=20)

logger.info(f"Best trial parameters: {study.best_trial.params}")
logger.info(f"Best trial AUC: {study.best_trial.value}")


2024-07-14 00:46:35,558 - __main__ - INFO - Starting hyperparameter optimization with Optuna.
[I 2024-07-14 00:46:35,559] A new study created in memory with name: no-name-b07b39f3-c720-4491-9cc0-daca6ea42c42


[0]	validation_0-auc:0.86208
[1]	validation_0-auc:0.86274
[2]	validation_0-auc:0.86380
[3]	validation_0-auc:0.86382
[4]	validation_0-auc:0.86373
[5]	validation_0-auc:0.86373
[6]	validation_0-auc:0.86373
[7]	validation_0-auc:0.86398
[8]	validation_0-auc:0.86395
[9]	validation_0-auc:0.86398
[10]	validation_0-auc:0.86395
[11]	validation_0-auc:0.86395
[12]	validation_0-auc:0.86411
[13]	validation_0-auc:0.86408
[14]	validation_0-auc:0.86413
[15]	validation_0-auc:0.86416
[16]	validation_0-auc:0.86416
[17]	validation_0-auc:0.86415
[18]	validation_0-auc:0.86416
[19]	validation_0-auc:0.86416
[20]	validation_0-auc:0.86419
[21]	validation_0-auc:0.86429
[22]	validation_0-auc:0.86429
[23]	validation_0-auc:0.86429
[24]	validation_0-auc:0.86437
[25]	validation_0-auc:0.86443
[26]	validation_0-auc:0.86448
[27]	validation_0-auc:0.86453
[28]	validation_0-auc:0.86453
[29]	validation_0-auc:0.86453
[30]	validation_0-auc:0.86456
[31]	validation_0-auc:0.86458
[32]	validation_0-auc:0.86458
[33]	validation_0-au

[I 2024-07-14 00:54:19,932] Trial 0 finished with value: 0.8769453978423168 and parameters: {'lambda': 0.004302549695256806, 'alpha': 3.510644059149492, 'colsample_bytree': 1.0, 'subsample': 0.7, 'learning_rate': 0.008, 'n_estimators': 580, 'max_depth': 13, 'min_child_weight': 186}. Best is trial 0 with value: 0.8769453978423168.


[0]	validation_0-auc:0.84021
[1]	validation_0-auc:0.86081
[2]	validation_0-auc:0.85968
[3]	validation_0-auc:0.85377
[4]	validation_0-auc:0.85665
[5]	validation_0-auc:0.85934
[6]	validation_0-auc:0.86109
[7]	validation_0-auc:0.85943
[8]	validation_0-auc:0.86036
[9]	validation_0-auc:0.85938
[10]	validation_0-auc:0.85710
[11]	validation_0-auc:0.85786
[12]	validation_0-auc:0.86008
[13]	validation_0-auc:0.86040
[14]	validation_0-auc:0.85992
[15]	validation_0-auc:0.85858
[16]	validation_0-auc:0.85983
[17]	validation_0-auc:0.86051
[18]	validation_0-auc:0.86063
[19]	validation_0-auc:0.86041
[20]	validation_0-auc:0.86080
[21]	validation_0-auc:0.86003
[22]	validation_0-auc:0.85904
[23]	validation_0-auc:0.85948
[24]	validation_0-auc:0.85981
[25]	validation_0-auc:0.86006
[26]	validation_0-auc:0.86025
[27]	validation_0-auc:0.85956
[28]	validation_0-auc:0.85977
[29]	validation_0-auc:0.86031
[30]	validation_0-auc:0.86082
[31]	validation_0-auc:0.86122
[32]	validation_0-auc:0.86160
[33]	validation_0-au

[I 2024-07-14 01:08:10,083] Trial 1 finished with value: 0.8780962564969836 and parameters: {'lambda': 0.33181469384367357, 'alpha': 0.39546097010226333, 'colsample_bytree': 0.4, 'subsample': 1.0, 'learning_rate': 0.008, 'n_estimators': 988, 'max_depth': 11, 'min_child_weight': 171}. Best is trial 1 with value: 0.8780962564969836.


[0]	validation_0-auc:0.86502
[1]	validation_0-auc:0.86661
[2]	validation_0-auc:0.86712
[3]	validation_0-auc:0.86755
[4]	validation_0-auc:0.86776
[5]	validation_0-auc:0.86829
[6]	validation_0-auc:0.86839
[7]	validation_0-auc:0.86873
[8]	validation_0-auc:0.86886
[9]	validation_0-auc:0.86890
[10]	validation_0-auc:0.86888
[11]	validation_0-auc:0.86890
[12]	validation_0-auc:0.86896
[13]	validation_0-auc:0.86899
[14]	validation_0-auc:0.86912
[15]	validation_0-auc:0.86922
[16]	validation_0-auc:0.86924
[17]	validation_0-auc:0.86928
[18]	validation_0-auc:0.86937
[19]	validation_0-auc:0.86944
[20]	validation_0-auc:0.86950
[21]	validation_0-auc:0.86954
[22]	validation_0-auc:0.86955
[23]	validation_0-auc:0.86962
[24]	validation_0-auc:0.86966
[25]	validation_0-auc:0.86969
[26]	validation_0-auc:0.86974
[27]	validation_0-auc:0.86981
[28]	validation_0-auc:0.86984
[29]	validation_0-auc:0.86988
[30]	validation_0-auc:0.86994
[31]	validation_0-auc:0.86997
[32]	validation_0-auc:0.87004
[33]	validation_0-au

[I 2024-07-14 01:15:52,338] Trial 2 finished with value: 0.8799084503501599 and parameters: {'lambda': 1.4156872114675867, 'alpha': 0.022247928649320205, 'colsample_bytree': 1.0, 'subsample': 0.8, 'learning_rate': 0.02, 'n_estimators': 427, 'max_depth': 17, 'min_child_weight': 35}. Best is trial 2 with value: 0.8799084503501599.


[0]	validation_0-auc:0.85691
[1]	validation_0-auc:0.85692
[2]	validation_0-auc:0.85700
[3]	validation_0-auc:0.85706
[4]	validation_0-auc:0.85707
[5]	validation_0-auc:0.85710
[6]	validation_0-auc:0.85713
[7]	validation_0-auc:0.85713
[8]	validation_0-auc:0.85713
[9]	validation_0-auc:0.85713
[10]	validation_0-auc:0.85720
[11]	validation_0-auc:0.85745
[12]	validation_0-auc:0.85810
[13]	validation_0-auc:0.85809
[14]	validation_0-auc:0.85833
[15]	validation_0-auc:0.85840
[16]	validation_0-auc:0.85837
[17]	validation_0-auc:0.85845
[18]	validation_0-auc:0.85843
[19]	validation_0-auc:0.85850
[20]	validation_0-auc:0.85852
[21]	validation_0-auc:0.85854
[22]	validation_0-auc:0.85859
[23]	validation_0-auc:0.85848
[24]	validation_0-auc:0.85862
[25]	validation_0-auc:0.85875
[26]	validation_0-auc:0.85877
[27]	validation_0-auc:0.85877
[28]	validation_0-auc:0.85879
[29]	validation_0-auc:0.85883
[30]	validation_0-auc:0.85882
[31]	validation_0-auc:0.85883
[32]	validation_0-auc:0.85884
[33]	validation_0-au

[I 2024-07-14 01:21:52,983] Trial 3 finished with value: 0.8728974393702207 and parameters: {'lambda': 1.6164536277344024, 'alpha': 2.906631860168991, 'colsample_bytree': 1.0, 'subsample': 1.0, 'learning_rate': 0.008, 'n_estimators': 553, 'max_depth': 9, 'min_child_weight': 258}. Best is trial 2 with value: 0.8799084503501599.


[0]	validation_0-auc:0.84087
[1]	validation_0-auc:0.84144
[2]	validation_0-auc:0.84905
[3]	validation_0-auc:0.85129
[4]	validation_0-auc:0.85278
[5]	validation_0-auc:0.85414
[6]	validation_0-auc:0.85383
[7]	validation_0-auc:0.85373
[8]	validation_0-auc:0.85366
[9]	validation_0-auc:0.85354
[10]	validation_0-auc:0.85354
[11]	validation_0-auc:0.85365
[12]	validation_0-auc:0.85295
[13]	validation_0-auc:0.85293
[14]	validation_0-auc:0.85297
[15]	validation_0-auc:0.85305
[16]	validation_0-auc:0.85333
[17]	validation_0-auc:0.85324
[18]	validation_0-auc:0.85315
[19]	validation_0-auc:0.85300
[20]	validation_0-auc:0.85297
[21]	validation_0-auc:0.85303
[22]	validation_0-auc:0.85285
[23]	validation_0-auc:0.85289
[24]	validation_0-auc:0.85292
[25]	validation_0-auc:0.85283
[26]	validation_0-auc:0.85281
[27]	validation_0-auc:0.85291
[28]	validation_0-auc:0.85299
[29]	validation_0-auc:0.85286
[30]	validation_0-auc:0.85302
[31]	validation_0-auc:0.85307
[32]	validation_0-auc:0.85306
[33]	validation_0-au

[I 2024-07-14 01:24:41,844] Trial 4 finished with value: 0.8656246680241338 and parameters: {'lambda': 0.009524685582041361, 'alpha': 0.018159409254738006, 'colsample_bytree': 0.9, 'subsample': 0.8, 'learning_rate': 0.02, 'n_estimators': 270, 'max_depth': 5, 'min_child_weight': 51}. Best is trial 2 with value: 0.8799084503501599.


[0]	validation_0-auc:0.86544
[1]	validation_0-auc:0.86704
[2]	validation_0-auc:0.86825
[3]	validation_0-auc:0.86853
[4]	validation_0-auc:0.86869
[5]	validation_0-auc:0.86874
[6]	validation_0-auc:0.86883
[7]	validation_0-auc:0.86907
[8]	validation_0-auc:0.86914
[9]	validation_0-auc:0.86917
[10]	validation_0-auc:0.86917
[11]	validation_0-auc:0.86931
[12]	validation_0-auc:0.86945
[13]	validation_0-auc:0.86944
[14]	validation_0-auc:0.86948
[15]	validation_0-auc:0.86956
[16]	validation_0-auc:0.86957
[17]	validation_0-auc:0.86962
[18]	validation_0-auc:0.86968
[19]	validation_0-auc:0.86976
[20]	validation_0-auc:0.86979
[21]	validation_0-auc:0.86986
[22]	validation_0-auc:0.86988
[23]	validation_0-auc:0.86993
[24]	validation_0-auc:0.86995
[25]	validation_0-auc:0.87001
[26]	validation_0-auc:0.87006
[27]	validation_0-auc:0.87011
[28]	validation_0-auc:0.87014
[29]	validation_0-auc:0.87018
[30]	validation_0-auc:0.87021
[31]	validation_0-auc:0.87023
[32]	validation_0-auc:0.87029
[33]	validation_0-au

[I 2024-07-14 01:38:57,739] Trial 5 finished with value: 0.8808186687737728 and parameters: {'lambda': 0.0012938762247671051, 'alpha': 0.4528042170268573, 'colsample_bytree': 1.0, 'subsample': 0.6, 'learning_rate': 0.02, 'n_estimators': 817, 'max_depth': 17, 'min_child_weight': 127}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.85669
[1]	validation_0-auc:0.85713
[2]	validation_0-auc:0.85823
[3]	validation_0-auc:0.85839
[4]	validation_0-auc:0.85847
[5]	validation_0-auc:0.85865
[6]	validation_0-auc:0.85864
[7]	validation_0-auc:0.85876
[8]	validation_0-auc:0.85865
[9]	validation_0-auc:0.85879
[10]	validation_0-auc:0.85879
[11]	validation_0-auc:0.85886
[12]	validation_0-auc:0.85890
[13]	validation_0-auc:0.85895
[14]	validation_0-auc:0.85901
[15]	validation_0-auc:0.85933
[16]	validation_0-auc:0.85933
[17]	validation_0-auc:0.85935
[18]	validation_0-auc:0.85944
[19]	validation_0-auc:0.85941
[20]	validation_0-auc:0.85949
[21]	validation_0-auc:0.85953
[22]	validation_0-auc:0.85955
[23]	validation_0-auc:0.85960
[24]	validation_0-auc:0.85985
[25]	validation_0-auc:0.86024
[26]	validation_0-auc:0.86029
[27]	validation_0-auc:0.86036
[28]	validation_0-auc:0.86049
[29]	validation_0-auc:0.86058
[30]	validation_0-auc:0.86075
[31]	validation_0-auc:0.86076
[32]	validation_0-auc:0.86082
[33]	validation_0-au

[I 2024-07-14 01:48:43,767] Trial 6 finished with value: 0.8790224363867613 and parameters: {'lambda': 0.026792302517231765, 'alpha': 0.0942313096333794, 'colsample_bytree': 1.0, 'subsample': 0.5, 'learning_rate': 0.018, 'n_estimators': 735, 'max_depth': 9, 'min_child_weight': 79}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.84736
[1]	validation_0-auc:0.86626
[2]	validation_0-auc:0.86464
[3]	validation_0-auc:0.85911
[4]	validation_0-auc:0.86175
[5]	validation_0-auc:0.86466
[6]	validation_0-auc:0.86641
[7]	validation_0-auc:0.86523
[8]	validation_0-auc:0.86630
[9]	validation_0-auc:0.86513
[10]	validation_0-auc:0.86317
[11]	validation_0-auc:0.86392
[12]	validation_0-auc:0.86611
[13]	validation_0-auc:0.86648
[14]	validation_0-auc:0.86591
[15]	validation_0-auc:0.86476
[16]	validation_0-auc:0.86603
[17]	validation_0-auc:0.86674
[18]	validation_0-auc:0.86694
[19]	validation_0-auc:0.86662
[20]	validation_0-auc:0.86706
[21]	validation_0-auc:0.86639
[22]	validation_0-auc:0.86552
[23]	validation_0-auc:0.86604
[24]	validation_0-auc:0.86643
[25]	validation_0-auc:0.86669
[26]	validation_0-auc:0.86701
[27]	validation_0-auc:0.86642
[28]	validation_0-auc:0.86666
[29]	validation_0-auc:0.86720
[30]	validation_0-auc:0.86767
[31]	validation_0-auc:0.86805
[32]	validation_0-auc:0.86838
[33]	validation_0-au

[I 2024-07-14 01:56:10,218] Trial 7 finished with value: 0.8776486742924686 and parameters: {'lambda': 0.37646028897471573, 'alpha': 0.012993133324736603, 'colsample_bytree': 0.4, 'subsample': 0.6, 'learning_rate': 0.012, 'n_estimators': 378, 'max_depth': 17, 'min_child_weight': 290}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.85265
[1]	validation_0-auc:0.86054
[2]	validation_0-auc:0.86079
[3]	validation_0-auc:0.85552
[4]	validation_0-auc:0.85963
[5]	validation_0-auc:0.86143
[6]	validation_0-auc:0.86257
[7]	validation_0-auc:0.86156
[8]	validation_0-auc:0.86197
[9]	validation_0-auc:0.86195
[10]	validation_0-auc:0.86208
[11]	validation_0-auc:0.86201
[12]	validation_0-auc:0.86277
[13]	validation_0-auc:0.86313
[14]	validation_0-auc:0.86335
[15]	validation_0-auc:0.86330
[16]	validation_0-auc:0.86356
[17]	validation_0-auc:0.86369
[18]	validation_0-auc:0.86370
[19]	validation_0-auc:0.86352
[20]	validation_0-auc:0.86365
[21]	validation_0-auc:0.86373
[22]	validation_0-auc:0.86388
[23]	validation_0-auc:0.86383
[24]	validation_0-auc:0.86399
[25]	validation_0-auc:0.86409
[26]	validation_0-auc:0.86407
[27]	validation_0-auc:0.86385
[28]	validation_0-auc:0.86397
[29]	validation_0-auc:0.86408
[30]	validation_0-auc:0.86421
[31]	validation_0-auc:0.86431
[32]	validation_0-auc:0.86432
[33]	validation_0-au

[I 2024-07-14 02:02:08,199] Trial 8 finished with value: 0.8741323057993174 and parameters: {'lambda': 8.445838544541342, 'alpha': 2.5496200375981757, 'colsample_bytree': 0.6, 'subsample': 0.5, 'learning_rate': 0.01, 'n_estimators': 416, 'max_depth': 11, 'min_child_weight': 78}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.84912
[1]	validation_0-auc:0.85786
[2]	validation_0-auc:0.85817
[3]	validation_0-auc:0.85292
[4]	validation_0-auc:0.85714
[5]	validation_0-auc:0.85884
[6]	validation_0-auc:0.86001
[7]	validation_0-auc:0.85950
[8]	validation_0-auc:0.85989
[9]	validation_0-auc:0.86001
[10]	validation_0-auc:0.86015
[11]	validation_0-auc:0.86002
[12]	validation_0-auc:0.86071
[13]	validation_0-auc:0.86085
[14]	validation_0-auc:0.86110
[15]	validation_0-auc:0.86121
[16]	validation_0-auc:0.86149
[17]	validation_0-auc:0.86164
[18]	validation_0-auc:0.86169
[19]	validation_0-auc:0.86157
[20]	validation_0-auc:0.86188
[21]	validation_0-auc:0.86185
[22]	validation_0-auc:0.86196
[23]	validation_0-auc:0.86189
[24]	validation_0-auc:0.86203
[25]	validation_0-auc:0.86215
[26]	validation_0-auc:0.86215
[27]	validation_0-auc:0.86203
[28]	validation_0-auc:0.86215
[29]	validation_0-auc:0.86227
[30]	validation_0-auc:0.86237
[31]	validation_0-auc:0.86245
[32]	validation_0-auc:0.86246
[33]	validation_0-au

[I 2024-07-14 02:07:21,173] Trial 9 finished with value: 0.875998742232396 and parameters: {'lambda': 0.1401579778110949, 'alpha': 0.03718457879987855, 'colsample_bytree': 0.6, 'subsample': 0.4, 'learning_rate': 0.02, 'n_estimators': 366, 'max_depth': 9, 'min_child_weight': 48}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.82921
[1]	validation_0-auc:0.85549
[2]	validation_0-auc:0.85422
[3]	validation_0-auc:0.85308
[4]	validation_0-auc:0.84702
[5]	validation_0-auc:0.84124
[6]	validation_0-auc:0.84906
[7]	validation_0-auc:0.84432
[8]	validation_0-auc:0.84186
[9]	validation_0-auc:0.83956
[10]	validation_0-auc:0.83611
[11]	validation_0-auc:0.84087
[12]	validation_0-auc:0.84527
[13]	validation_0-auc:0.84785
[14]	validation_0-auc:0.84632
[15]	validation_0-auc:0.84402
[16]	validation_0-auc:0.84728
[17]	validation_0-auc:0.84965
[18]	validation_0-auc:0.84764
[19]	validation_0-auc:0.84624
[20]	validation_0-auc:0.84467
[21]	validation_0-auc:0.84313
[22]	validation_0-auc:0.84155
[23]	validation_0-auc:0.84366
[24]	validation_0-auc:0.84221
[25]	validation_0-auc:0.84424
[26]	validation_0-auc:0.84288
[27]	validation_0-auc:0.84157
[28]	validation_0-auc:0.84343
[29]	validation_0-auc:0.84238
[30]	validation_0-auc:0.84439
[31]	validation_0-auc:0.84621
[32]	validation_0-auc:0.84758
[33]	validation_0-au

[I 2024-07-14 02:07:57,180] Trial 10 finished with value: 0.8554862738418751 and parameters: {'lambda': 0.0020227310746188344, 'alpha': 0.0011095920816129696, 'colsample_bytree': 0.3, 'subsample': 0.6, 'learning_rate': 0.014, 'n_estimators': 41, 'max_depth': 7, 'min_child_weight': 127}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.85867
[1]	validation_0-auc:0.86630
[2]	validation_0-auc:0.86836
[3]	validation_0-auc:0.86774
[4]	validation_0-auc:0.86855
[5]	validation_0-auc:0.86924
[6]	validation_0-auc:0.86932
[7]	validation_0-auc:0.86933
[8]	validation_0-auc:0.86969
[9]	validation_0-auc:0.86982
[10]	validation_0-auc:0.86991
[11]	validation_0-auc:0.87023
[12]	validation_0-auc:0.87016
[13]	validation_0-auc:0.87036
[14]	validation_0-auc:0.87034
[15]	validation_0-auc:0.87060
[16]	validation_0-auc:0.87075
[17]	validation_0-auc:0.87080
[18]	validation_0-auc:0.87087
[19]	validation_0-auc:0.87090
[20]	validation_0-auc:0.87093
[21]	validation_0-auc:0.87094
[22]	validation_0-auc:0.87089
[23]	validation_0-auc:0.87108
[24]	validation_0-auc:0.87105
[25]	validation_0-auc:0.87107
[26]	validation_0-auc:0.87109
[27]	validation_0-auc:0.87121
[28]	validation_0-auc:0.87126
[29]	validation_0-auc:0.87128
[30]	validation_0-auc:0.87136
[31]	validation_0-auc:0.87136
[32]	validation_0-auc:0.87136
[33]	validation_0-au

[I 2024-07-14 02:22:00,898] Trial 11 finished with value: 0.8789689110260487 and parameters: {'lambda': 3.020688839711756, 'alpha': 0.4556500912961612, 'colsample_bytree': 0.8, 'subsample': 0.8, 'learning_rate': 0.02, 'n_estimators': 811, 'max_depth': 17, 'min_child_weight': 2}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.86040
[1]	validation_0-auc:0.86717
[2]	validation_0-auc:0.86686
[3]	validation_0-auc:0.86265
[4]	validation_0-auc:0.86603
[5]	validation_0-auc:0.86752
[6]	validation_0-auc:0.86903
[7]	validation_0-auc:0.86777
[8]	validation_0-auc:0.86862
[9]	validation_0-auc:0.86755
[10]	validation_0-auc:0.86802
[11]	validation_0-auc:0.86816
[12]	validation_0-auc:0.86889
[13]	validation_0-auc:0.86924
[14]	validation_0-auc:0.86883
[15]	validation_0-auc:0.86894
[16]	validation_0-auc:0.86931
[17]	validation_0-auc:0.86964
[18]	validation_0-auc:0.87004
[19]	validation_0-auc:0.86983
[20]	validation_0-auc:0.87008
[21]	validation_0-auc:0.87016
[22]	validation_0-auc:0.87032
[23]	validation_0-auc:0.87030
[24]	validation_0-auc:0.87036
[25]	validation_0-auc:0.87044
[26]	validation_0-auc:0.87059
[27]	validation_0-auc:0.87031
[28]	validation_0-auc:0.87052
[29]	validation_0-auc:0.87083
[30]	validation_0-auc:0.87114
[31]	validation_0-auc:0.87136
[32]	validation_0-auc:0.87149
[33]	validation_0-au

[I 2024-07-14 02:37:34,819] Trial 12 finished with value: 0.8807582450087627 and parameters: {'lambda': 0.03750078277030615, 'alpha': 0.0029189986241410113, 'colsample_bytree': 0.5, 'subsample': 0.8, 'learning_rate': 0.016, 'n_estimators': 734, 'max_depth': 17, 'min_child_weight': 131}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.85768
[1]	validation_0-auc:0.86495
[2]	validation_0-auc:0.86490
[3]	validation_0-auc:0.86054
[4]	validation_0-auc:0.86402
[5]	validation_0-auc:0.86544
[6]	validation_0-auc:0.86701
[7]	validation_0-auc:0.86572
[8]	validation_0-auc:0.86651
[9]	validation_0-auc:0.86552
[10]	validation_0-auc:0.86594
[11]	validation_0-auc:0.86609
[12]	validation_0-auc:0.86680
[13]	validation_0-auc:0.86717
[14]	validation_0-auc:0.86680
[15]	validation_0-auc:0.86697
[16]	validation_0-auc:0.86734
[17]	validation_0-auc:0.86769
[18]	validation_0-auc:0.86804
[19]	validation_0-auc:0.86784
[20]	validation_0-auc:0.86809
[21]	validation_0-auc:0.86818
[22]	validation_0-auc:0.86838
[23]	validation_0-auc:0.86836
[24]	validation_0-auc:0.86842
[25]	validation_0-auc:0.86852
[26]	validation_0-auc:0.86868
[27]	validation_0-auc:0.86837
[28]	validation_0-auc:0.86858
[29]	validation_0-auc:0.86889
[30]	validation_0-auc:0.86921
[31]	validation_0-auc:0.86942
[32]	validation_0-auc:0.86957
[33]	validation_0-au

[I 2024-07-14 02:52:41,260] Trial 13 finished with value: 0.8807139943984365 and parameters: {'lambda': 0.03077090468814122, 'alpha': 0.0010100731657061073, 'colsample_bytree': 0.5, 'subsample': 0.6, 'learning_rate': 0.016, 'n_estimators': 790, 'max_depth': 15, 'min_child_weight': 130}. Best is trial 5 with value: 0.8808186687737728.


[0]	validation_0-auc:0.86056
[1]	validation_0-auc:0.86776
[2]	validation_0-auc:0.86869
[3]	validation_0-auc:0.86799
[4]	validation_0-auc:0.86929
[5]	validation_0-auc:0.86962
[6]	validation_0-auc:0.87014
[7]	validation_0-auc:0.86985
[8]	validation_0-auc:0.87020
[9]	validation_0-auc:0.87027
[10]	validation_0-auc:0.87056
[11]	validation_0-auc:0.87071
[12]	validation_0-auc:0.87096
[13]	validation_0-auc:0.87105
[14]	validation_0-auc:0.87117
[15]	validation_0-auc:0.87118
[16]	validation_0-auc:0.87119
[17]	validation_0-auc:0.87129
[18]	validation_0-auc:0.87136
[19]	validation_0-auc:0.87139
[20]	validation_0-auc:0.87142
[21]	validation_0-auc:0.87153
[22]	validation_0-auc:0.87154
[23]	validation_0-auc:0.87159
[24]	validation_0-auc:0.87162
[25]	validation_0-auc:0.87160
[26]	validation_0-auc:0.87166
[27]	validation_0-auc:0.87152
[28]	validation_0-auc:0.87155
[29]	validation_0-auc:0.87162
[30]	validation_0-auc:0.87172
[31]	validation_0-auc:0.87171
[32]	validation_0-auc:0.87175
[33]	validation_0-au

[I 2024-07-14 03:11:23,265] Trial 14 finished with value: 0.8810535336572811 and parameters: {'lambda': 0.0014741474126016326, 'alpha': 0.004272229741122167, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 990, 'max_depth': 17, 'min_child_weight': 212}. Best is trial 14 with value: 0.8810535336572811.


[0]	validation_0-auc:0.86068
[1]	validation_0-auc:0.86780
[2]	validation_0-auc:0.86869
[3]	validation_0-auc:0.86800
[4]	validation_0-auc:0.86929
[5]	validation_0-auc:0.86962
[6]	validation_0-auc:0.87014
[7]	validation_0-auc:0.86985
[8]	validation_0-auc:0.87019
[9]	validation_0-auc:0.87026
[10]	validation_0-auc:0.87054
[11]	validation_0-auc:0.87064
[12]	validation_0-auc:0.87089
[13]	validation_0-auc:0.87101
[14]	validation_0-auc:0.87113
[15]	validation_0-auc:0.87113
[16]	validation_0-auc:0.87114
[17]	validation_0-auc:0.87125
[18]	validation_0-auc:0.87134
[19]	validation_0-auc:0.87136
[20]	validation_0-auc:0.87139
[21]	validation_0-auc:0.87150
[22]	validation_0-auc:0.87151
[23]	validation_0-auc:0.87156
[24]	validation_0-auc:0.87158
[25]	validation_0-auc:0.87157
[26]	validation_0-auc:0.87162
[27]	validation_0-auc:0.87148
[28]	validation_0-auc:0.87151
[29]	validation_0-auc:0.87158
[30]	validation_0-auc:0.87168
[31]	validation_0-auc:0.87167
[32]	validation_0-auc:0.87171
[33]	validation_0-au

[I 2024-07-14 03:28:45,049] Trial 15 finished with value: 0.8810665547964497 and parameters: {'lambda': 0.0011124463405821116, 'alpha': 0.4697409790780375, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 923, 'max_depth': 17, 'min_child_weight': 221}. Best is trial 15 with value: 0.8810665547964497.


[0]	validation_0-auc:0.84087
[1]	validation_0-auc:0.84904
[2]	validation_0-auc:0.85015
[3]	validation_0-auc:0.85069
[4]	validation_0-auc:0.85155
[5]	validation_0-auc:0.85164
[6]	validation_0-auc:0.85225
[7]	validation_0-auc:0.85316
[8]	validation_0-auc:0.85416
[9]	validation_0-auc:0.85412
[10]	validation_0-auc:0.85414
[11]	validation_0-auc:0.85406
[12]	validation_0-auc:0.85470
[13]	validation_0-auc:0.85466
[14]	validation_0-auc:0.85487
[15]	validation_0-auc:0.85481
[16]	validation_0-auc:0.85455
[17]	validation_0-auc:0.85461
[18]	validation_0-auc:0.85458
[19]	validation_0-auc:0.85472
[20]	validation_0-auc:0.85509
[21]	validation_0-auc:0.85508
[22]	validation_0-auc:0.85504
[23]	validation_0-auc:0.85495
[24]	validation_0-auc:0.85504
[25]	validation_0-auc:0.85506
[26]	validation_0-auc:0.85509
[27]	validation_0-auc:0.85504
[28]	validation_0-auc:0.85517
[29]	validation_0-auc:0.85510
[30]	validation_0-auc:0.85504
[31]	validation_0-auc:0.85518
[32]	validation_0-auc:0.85506
[33]	validation_0-au

[I 2024-07-14 03:40:14,651] Trial 16 finished with value: 0.8739344341387254 and parameters: {'lambda': 0.0010557434638112924, 'alpha': 0.006300852415284654, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 960, 'max_depth': 5, 'min_child_weight': 213}. Best is trial 15 with value: 0.8810665547964497.


[0]	validation_0-auc:0.85584
[1]	validation_0-auc:0.86362
[2]	validation_0-auc:0.86451
[3]	validation_0-auc:0.86387
[4]	validation_0-auc:0.86499
[5]	validation_0-auc:0.86521
[6]	validation_0-auc:0.86593
[7]	validation_0-auc:0.86571
[8]	validation_0-auc:0.86602
[9]	validation_0-auc:0.86602
[10]	validation_0-auc:0.86625
[11]	validation_0-auc:0.86630
[12]	validation_0-auc:0.86660
[13]	validation_0-auc:0.86680
[14]	validation_0-auc:0.86694
[15]	validation_0-auc:0.86690
[16]	validation_0-auc:0.86693
[17]	validation_0-auc:0.86706
[18]	validation_0-auc:0.86711
[19]	validation_0-auc:0.86712
[20]	validation_0-auc:0.86715
[21]	validation_0-auc:0.86725
[22]	validation_0-auc:0.86728
[23]	validation_0-auc:0.86730
[24]	validation_0-auc:0.86732
[25]	validation_0-auc:0.86728
[26]	validation_0-auc:0.86731
[27]	validation_0-auc:0.86719
[28]	validation_0-auc:0.86724
[29]	validation_0-auc:0.86730
[30]	validation_0-auc:0.86738
[31]	validation_0-auc:0.86741
[32]	validation_0-auc:0.86744
[33]	validation_0-au

[I 2024-07-14 03:54:59,696] Trial 17 finished with value: 0.8809029542665469 and parameters: {'lambda': 0.006472707460224358, 'alpha': 0.11045192142038793, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 912, 'max_depth': 13, 'min_child_weight': 237}. Best is trial 15 with value: 0.8810665547964497.


[0]	validation_0-auc:0.85861
[1]	validation_0-auc:0.86563
[2]	validation_0-auc:0.86664
[3]	validation_0-auc:0.86602
[4]	validation_0-auc:0.86716
[5]	validation_0-auc:0.86745
[6]	validation_0-auc:0.86802
[7]	validation_0-auc:0.86779
[8]	validation_0-auc:0.86812
[9]	validation_0-auc:0.86814
[10]	validation_0-auc:0.86848
[11]	validation_0-auc:0.86862
[12]	validation_0-auc:0.86887
[13]	validation_0-auc:0.86903
[14]	validation_0-auc:0.86914
[15]	validation_0-auc:0.86914
[16]	validation_0-auc:0.86914
[17]	validation_0-auc:0.86921
[18]	validation_0-auc:0.86927
[19]	validation_0-auc:0.86930
[20]	validation_0-auc:0.86932
[21]	validation_0-auc:0.86944
[22]	validation_0-auc:0.86945
[23]	validation_0-auc:0.86948
[24]	validation_0-auc:0.86949
[25]	validation_0-auc:0.86947
[26]	validation_0-auc:0.86952
[27]	validation_0-auc:0.86941
[28]	validation_0-auc:0.86943
[29]	validation_0-auc:0.86950
[30]	validation_0-auc:0.86958
[31]	validation_0-auc:0.86958
[32]	validation_0-auc:0.86961
[33]	validation_0-au

[I 2024-07-14 04:06:17,752] Trial 18 finished with value: 0.8805419347398724 and parameters: {'lambda': 0.003237815436660901, 'alpha': 9.333294705255685, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 647, 'max_depth': 15, 'min_child_weight': 209}. Best is trial 15 with value: 0.8810665547964497.


[0]	validation_0-auc:0.84568
[1]	validation_0-auc:0.85567
[2]	validation_0-auc:0.85615
[3]	validation_0-auc:0.85654
[4]	validation_0-auc:0.85733
[5]	validation_0-auc:0.85730
[6]	validation_0-auc:0.85837
[7]	validation_0-auc:0.85856
[8]	validation_0-auc:0.85891
[9]	validation_0-auc:0.85872
[10]	validation_0-auc:0.85861
[11]	validation_0-auc:0.85848
[12]	validation_0-auc:0.85888
[13]	validation_0-auc:0.85894
[14]	validation_0-auc:0.85909
[15]	validation_0-auc:0.85899
[16]	validation_0-auc:0.85888
[17]	validation_0-auc:0.85896
[18]	validation_0-auc:0.85900
[19]	validation_0-auc:0.85911
[20]	validation_0-auc:0.85923
[21]	validation_0-auc:0.85934
[22]	validation_0-auc:0.85942
[23]	validation_0-auc:0.85943
[24]	validation_0-auc:0.85941
[25]	validation_0-auc:0.85948
[26]	validation_0-auc:0.85952
[27]	validation_0-auc:0.85945
[28]	validation_0-auc:0.85948
[29]	validation_0-auc:0.85949
[30]	validation_0-auc:0.85954
[31]	validation_0-auc:0.85960
[32]	validation_0-auc:0.85957
[33]	validation_0-au

[I 2024-07-14 04:18:09,405] Trial 19 finished with value: 0.8778279849296188 and parameters: {'lambda': 0.014559541547030692, 'alpha': 0.11539996258701654, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.018, 'n_estimators': 890, 'max_depth': 7, 'min_child_weight': 292}. Best is trial 15 with value: 0.8810665547964497.
2024-07-14 04:18:09,407 - __main__ - INFO - Best trial parameters: {'lambda': 0.0011124463405821116, 'alpha': 0.4697409790780375, 'colsample_bytree': 0.7, 'subsample': 0.7, 'learning_rate': 0.016, 'n_estimators': 923, 'max_depth': 17, 'min_child_weight': 221}
2024-07-14 04:18:09,408 - __main__ - INFO - Best trial AUC: 0.8810665547964497


In [7]:
# Train the final model using the best parameters
best_params = study.best_trial.params
final_model = xgb.XGBClassifier(**best_params)
final_model.fit(X_train_preprocessed, y_train)

# Validate the final model
final_preds = final_model.predict_proba(X_val_preprocessed)[:, 1]
final_auc = roc_auc_score(y_val, final_preds)
logger.info(f"Final model AUC on validation set: {final_auc}")

2024-07-14 04:26:29,272 - __main__ - INFO - Final model AUC on validation set: 0.8809612985184044


In [8]:
# Cell 7: Train the Final Model using DMatrix with Best Parameters
best_params = study.best_trial.params
dtrain = xgb.DMatrix(data=X_train_preprocessed, label=y_train)
dval = xgb.DMatrix(data=X_val_preprocessed, label=y_val)

final_model = xgb.train(
    params=best_params,
    dtrain=dtrain,
    num_boost_round=best_params['n_estimators'],
    evals=[(dval, 'eval')],
    early_stopping_rounds=100
)

# Validate the final model
final_preds = final_model.predict(dval)
final_auc = roc_auc_score(y_val, final_preds)
logger.info(f"Final model AUC on validation set: {final_auc}")

Parameters: { "n_estimators" } are not used.



[0]	eval-rmse:0.32748
[1]	eval-rmse:0.32642
[2]	eval-rmse:0.32542
[3]	eval-rmse:0.32487
[4]	eval-rmse:0.32398
[5]	eval-rmse:0.32304
[6]	eval-rmse:0.32212
[7]	eval-rmse:0.32159
[8]	eval-rmse:0.32068
[9]	eval-rmse:0.31987
[10]	eval-rmse:0.31903
[11]	eval-rmse:0.31829
[12]	eval-rmse:0.31751
[13]	eval-rmse:0.31674
[14]	eval-rmse:0.31600
[15]	eval-rmse:0.31533
[16]	eval-rmse:0.31463
[17]	eval-rmse:0.31394
[18]	eval-rmse:0.31329
[19]	eval-rmse:0.31265
[20]	eval-rmse:0.31202
[21]	eval-rmse:0.31146
[22]	eval-rmse:0.31086
[23]	eval-rmse:0.31032
[24]	eval-rmse:0.30977
[25]	eval-rmse:0.30922
[26]	eval-rmse:0.30871
[27]	eval-rmse:0.30845
[28]	eval-rmse:0.30794
[29]	eval-rmse:0.30741
[30]	eval-rmse:0.30689
[31]	eval-rmse:0.30643
[32]	eval-rmse:0.30595
[33]	eval-rmse:0.30553
[34]	eval-rmse:0.30507
[35]	eval-rmse:0.30465
[36]	eval-rmse:0.30425
[37]	eval-rmse:0.30406
[38]	eval-rmse:0.30362
[39]	eval-rmse:0.30323
[40]	eval-rmse:0.30287
[41]	eval-rmse:0.30249
[42]	eval-rmse:0.30215
[43]	eval-rmse:0.3018

2024-07-14 04:39:50,818 - __main__ - INFO - Final model AUC on validation set: 0.8797911836736267


In [2]:
optuna.visualization.plot_optimization_history(study)

NameError: name 'optuna' is not defined

In [10]:
optuna.visualization.plot_parallel_coordinate(study)

In [11]:
optuna.visualization.plot_slice(study)

In [12]:
optuna.visualization.plot_contour(study)

In [13]:
optuna.visualization.plot_param_importances(study)

In [1]:
study.best_trial

NameError: name 'study' is not defined