In [1]:
# Cell 1: Library Imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import optuna
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.metrics import roc_auc_score
import logging
import os
from datetime import datetime
import klib
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from catboost import CatBoostClassifier


In [2]:
# Cell 2: Setup Logging
class Logger:
    """Small facade over a ``logging.Logger`` that logs to the console and to a file.

    Each instance (re)configures the logger returned by
    ``logging.getLogger(__name__)`` with one ``StreamHandler`` and one
    ``FileHandler`` writing to ``training_<YYYYmmdd_HHMMSS>.log`` in the current
    working directory.

    Attributes:
        logger: the configured ``logging.Logger``.
    """

    def __init__(self):
        self.logger = self.setup_logging()

    def setup_logging(self):
        """Configure the shared logger and return it.

        Returns:
            logging.Logger: logger at INFO level carrying exactly one console
            handler and one file handler, however many times this is called.
        """
        timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
        log_file_name = f'training_{timestamp}.log'

        # NOTE(review): nothing in this class writes 'training.log' (the file
        # handler below uses a timestamped name); presumably this cleans up a
        # file produced by an earlier version - confirm it is still wanted.
        if os.path.exists('training.log'):
            os.remove('training.log')

        logger = logging.getLogger(__name__)
        logger.setLevel(logging.INFO)

        # getLogger() hands back the same object on every call, so handlers
        # added by a previous Logger() (e.g. when the cell is re-run) are still
        # attached. Detach and close them, otherwise every message is emitted
        # once per earlier instantiation and old log files stay open.
        for stale_handler in list(logger.handlers):
            logger.removeHandler(stale_handler)
            stale_handler.close()

        console_handler = logging.StreamHandler()
        file_handler = logging.FileHandler(log_file_name)

        formatter = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
        console_handler.setFormatter(formatter)
        file_handler.setFormatter(formatter)

        logger.addHandler(console_handler)
        logger.addHandler(file_handler)
        return logger

    def info(self, message):
        """Log ``message`` at INFO level."""
        self.logger.info(message)

    def error(self, message):
        """Log ``message`` at ERROR level."""
        self.logger.error(message)

# Create the notebook-wide logger instance; subsequent cells pass or call this object.
logger = Logger()
# Emit a first record so the console/file handlers are visibly working.
logger.info("Setup and Imports complete.")

2024-07-15 00:37:26,180 - __main__ - INFO - Setup and Imports complete.


In [3]:
# Cell 3: Data Handling Class
class DataHandler:
    """Loads the train/test CSV files and renames their columns, logging each step.

    Args:
        logger: object exposing ``info(message)`` and ``error(message)``.
    """

    def __init__(self, logger):
        self.logger = logger

    def load_data(self, train_path, test_path):
        """Read both CSV files with ``pd.read_csv``.

        Args:
            train_path: path of the training CSV.
            test_path: path of the test CSV.

        Returns:
            tuple: ``(train_df, test_df)`` on success. If anything raises while
            loading, the error is logged and ``(None, None)`` is returned.
        """
        try:
            # Training file is read first; a failure there skips the test file.
            loaded = tuple(pd.read_csv(csv_path) for csv_path in (train_path, test_path))
            self.logger.info("Datasets loaded successfully.")
        except Exception as exc:
            self.logger.error(f"Error loading datasets: {exc}")
            loaded = (None, None)
        return loaded

    def rename_columns(self, df, column_mapping):
        """Rename columns of ``df`` in place according to ``column_mapping``.

        Args:
            df: DataFrame to modify (it is mutated, not copied).
            column_mapping: mapping of old column name to new column name.

        Returns:
            The same ``df`` object that was passed in.
        """
        df.rename(inplace=True, columns=column_mapping)
        self.logger.info("Columns renamed.")
        return df

# Directory holding the klib-processed CSV files. The default is the original
# author's local folder; set the INSURANCE_DATA_DIR environment variable to run
# the notebook on another machine without editing this cell.
DATA_DIR = os.environ.get(
    "INSURANCE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\model testing\xgboost\featured engineered",
)
TRAIN_CSV_PATH = os.path.join(DATA_DIR, "klib_full_trainset.csv")
TEST_CSV_PATH = os.path.join(DATA_DIR, "klib_full_testset.csv")

data_handler = DataHandler(logger)
# load_data returns (None, None) on failure; the next cell checks for that.
train_df, test_df = data_handler.load_data(TRAIN_CSV_PATH, TEST_CSV_PATH)

2024-07-15 00:37:37,329 - __main__ - INFO - Datasets loaded successfully.


In [4]:
# Stop immediately if either dataset is missing (load_data yields None on failure).
if train_df is None or test_df is None:
    raise ValueError("Failed to load datasets. Check the file paths and try again.")

# Lower-case column names in the CSV files -> capitalised names used by the
# feature-engineering transformers further down.
new_column_names = {
    'gender': 'Gender',
    'age': 'Age',
    'driving_license': 'Driving_License',
    'region_code': 'Region_Code',
    'previously_insured': 'Previously_Insured',
    'vehicle_age': 'Vehicle_Age',
    'vehicle_damage': 'Vehicle_Damage',
    'annual_premium': 'Annual_Premium',
    'policy_sales_channel': 'Policy_Sales_Channel',
    'vintage': 'Vintage',
    'response': 'Response'
}

# Train frame first, then test frame (each call logs "Columns renamed.").
train_df = data_handler.rename_columns(train_df, new_column_names)
test_df = data_handler.rename_columns(test_df, new_column_names)

# Separate features from the target, then hold out 20% for validation before
# any preprocessing is fitted. The split is stratified on the target and seeded.
X = train_df.drop(columns=['Response'])
y = train_df['Response']
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

logger.info(f"Training set shape: {X_train.shape}")
logger.info(f"Validation set shape: {X_val.shape}")

2024-07-15 00:37:37,341 - __main__ - INFO - Columns renamed.
2024-07-15 00:37:37,342 - __main__ - INFO - Columns renamed.
2024-07-15 00:37:42,487 - __main__ - INFO - Training set shape: (9172186, 9)
2024-07-15 00:37:42,488 - __main__ - INFO - Validation set shape: (2293047, 9)


In [5]:
# Cell 4: Feature Engineering Classes
class InteractionFeatures(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends six pairwise-product columns.

    The products are taken between the ``Age``, ``Annual_Premium``, ``Vintage``
    and ``Region_Code`` columns of the input DataFrame. The input is never
    modified; ``transform`` returns a new DataFrame.

    Attributes:
        feature_names: column names of the most recent ``transform`` output,
            or ``None`` before ``transform`` has been called.
    """

    def __init__(self):
        self.feature_names = None

    def fit(self, X, y=None):
        """Nothing to learn; returns ``self`` so the class works in a Pipeline."""
        return self

    def transform(self, X):
        """Return a copy of ``X`` with the interaction columns added.

        Args:
            X: DataFrame containing ``Age``, ``Annual_Premium``, ``Vintage``
                and ``Region_Code`` columns.

        Returns:
            pandas.DataFrame: all columns of ``X`` followed by the six product
            columns (an existing column of the same name is overwritten in the
            returned frame only).
        """
        # assign() builds a new frame, so the caller's X (e.g. X_train / X_val)
        # is not silently widened as a side effect of running the pipeline.
        X_out = X.assign(
            Age_Annual_Premium=X['Age'] * X['Annual_Premium'],
            Age_Vintage=X['Age'] * X['Vintage'],
            Annual_Premium_Vintage=X['Annual_Premium'] * X['Vintage'],
            Age_Region_Code=X['Age'] * X['Region_Code'],
            Vintage_Region_Code=X['Vintage'] * X['Region_Code'],
            Annual_Premium_Region_Code=X['Annual_Premium'] * X['Region_Code'],
        )
        self.feature_names = X_out.columns.tolist()
        return X_out

In [6]:
class PolynomialFeatureGeneration(BaseEstimator, TransformerMixin):
    """Appends ``poly_``-prefixed PolynomialFeatures columns to a DataFrame.

    A ``PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)``
    is fitted on the ``Age``, ``Annual_Premium`` and ``Vintage`` columns; its
    output is concatenated to the right of the input frame.

    NOTE(review): the pairwise products of these three columns appear to be the
    same values InteractionFeatures already adds under different names - confirm
    whether the duplication is intended.

    Attributes:
        poly: the wrapped ``PolynomialFeatures`` instance.
        feature_names: column names of the most recent ``transform`` output,
            or ``None`` before ``transform`` has been called.
    """

    def __init__(self):
        self.poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
        self.feature_names = None

    def fit(self, X, y=None):
        """Fit the wrapped PolynomialFeatures on the three base columns."""
        base_columns = ['Age', 'Annual_Premium', 'Vintage']
        self.poly.fit(X[base_columns])
        return self

    def transform(self, X):
        """Return ``X`` with the generated columns concatenated column-wise.

        Generated column names are the PolynomialFeatures output names with
        spaces replaced by underscores and a ``poly_`` prefix. Rows keep the
        index of ``X``.
        """
        base_columns = ['Age', 'Annual_Premium', 'Vintage']
        expanded = self.poly.transform(X[base_columns])
        prefixed_names = [
            f'poly_{raw_name.replace(" ", "_")}'
            for raw_name in self.poly.get_feature_names_out(base_columns)
        ]
        poly_df = pd.DataFrame(expanded, columns=prefixed_names, index=X.index)
        combined = pd.concat([X, poly_df], axis=1)
        self.feature_names = combined.columns.tolist()
        return combined

# Progress marker: the custom transformer classes are now defined in the kernel.
logger.info("Custom transformers defined.")

2024-07-15 00:37:42,504 - __main__ - INFO - Custom transformers defined.


In [7]:
# Cell 5: Preprocessing and Model Preparation
class PreprocessingPipeline:
    """Builds and applies the feature-engineering + scaling pipeline.

    Args:
        logger: object exposing ``info(message)``.

    Attributes:
        pipeline: scikit-learn ``Pipeline`` with the steps ``interactions``,
            ``poly_features`` and ``scaling``.
    """

    def __init__(self, logger):
        self.logger = logger
        self.pipeline = self.create_pipeline()

    def create_pipeline(self):
        """Create the (unfitted) three-step preprocessing pipeline."""
        pipeline = Pipeline([
            ('interactions', InteractionFeatures()),
            ('poly_features', PolynomialFeatureGeneration()),
            # with_mean=False only disables centring: features are divided by
            # their standard deviation but not shifted. It does NOT keep column
            # names; the names are re-attached in preprocess_data.
            ('scaling', StandardScaler(with_mean=False))
        ])
        self.logger.info("Preprocessing pipeline defined.")
        return pipeline

    def preprocess_data(self, X_train, X_val, y_train):
        """Fit the pipeline on the training split and transform both splits.

        Args:
            X_train: training features (DataFrame).
            X_val: validation features (DataFrame).
            y_train: training target, forwarded to ``fit_transform``.

        Returns:
            tuple: ``(X_train_preprocessed, X_val_preprocessed)`` as DataFrames
            whose columns are the names recorded by the ``poly_features`` step
            and whose row index matches ``X_train`` / ``X_val`` respectively.
        """
        X_train_preprocessed = self.pipeline.fit_transform(X_train, y_train)
        X_val_preprocessed = self.pipeline.transform(X_val)

        # The scaler is the last step and does not add or rename columns, so
        # the names captured by the preceding step describe the final output.
        feature_names = self.pipeline.named_steps['poly_features'].feature_names

        # Re-attach the original row index as well as the column names. Without
        # it the frames get a fresh 0..n-1 index that no longer lines up with
        # the shuffled index carried by y_train / y_val after the split.
        X_train_preprocessed = pd.DataFrame(
            X_train_preprocessed, columns=feature_names, index=X_train.index
        )
        X_val_preprocessed = pd.DataFrame(
            X_val_preprocessed, columns=feature_names, index=X_val.index
        )

        self.logger.info(f"Training set after preprocessing: {X_train_preprocessed.shape}")
        self.logger.info(f"Validation set after preprocessing: {X_val_preprocessed.shape}")
        return X_train_preprocessed, X_val_preprocessed

# Build the preprocessing pipeline, fit it on the training split only, and
# transform both splits; the results are the model inputs used by later cells.
preprocessor = PreprocessingPipeline(logger)
X_train_preprocessed, X_val_preprocessed = preprocessor.preprocess_data(X_train, X_val, y_train)

2024-07-15 00:37:42,520 - __main__ - INFO - Preprocessing pipeline defined.
2024-07-15 00:37:47,584 - __main__ - INFO - Training set after preprocessing: (9172186, 21)
2024-07-15 00:37:47,585 - __main__ - INFO - Validation set after preprocessing: (2293047, 21)


In [8]:
# Cell 6: Hyperparameter Optimization with Optuna
def objective(trial):
    """Optuna objective: train one CatBoost model and return its validation AUC.

    Reads ``X_train_preprocessed``, ``y_train``, ``X_val_preprocessed`` and
    ``y_val`` from the notebook's global scope.

    Args:
        trial: Optuna trial used to sample the hyperparameters.

    Returns:
        float: ROC AUC of the predicted positive-class probabilities on the
        validation split.
    """
    # Fixed setting first, then the sampled hyperparameters. The suggest_* calls
    # are kept in this exact order so sampling is unchanged.
    search_space = {'eval_metric': 'AUC'}
    search_space['iterations'] = trial.suggest_int('iterations', 500, 1000)
    search_space['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.1)
    search_space['depth'] = trial.suggest_int('depth', 6, 16)  # tree depth
    search_space['l2_leaf_reg'] = trial.suggest_float('l2_leaf_reg', 1e-4, 10.0, log=True)  # L2 regularisation
    search_space['bagging_temperature'] = trial.suggest_float('bagging_temperature', 0.0, 1.0)
    search_space['random_strength'] = trial.suggest_float('random_strength', 0.0, 1.0)
    search_space['border_count'] = trial.suggest_int('border_count', 32, 255)  # splits for numerical features
    search_space['scale_pos_weight'] = trial.suggest_float('scale_pos_weight', 1.0, 10.0)  # positive-class weight

    # verbose=100 prints a progress line every 100 iterations.
    classifier = CatBoostClassifier(verbose=100, **search_space)

    # The validation split doubles as the early-stopping set (100 rounds patience).
    classifier.fit(
        X_train_preprocessed,
        y_train,
        eval_set=(X_val_preprocessed, y_val),
        early_stopping_rounds=100,
    )

    positive_class_scores = classifier.predict_proba(X_val_preprocessed)[:, 1]
    return roc_auc_score(y_val, positive_class_scores)

# NOTE(review): this cell only defines the objective; the study is created and
# run in the next cell, which logs this same message again right before starting.
logger.info("Starting hyperparameter optimization with Optuna.")

2024-07-15 00:37:47,596 - __main__ - INFO - Starting hyperparameter optimization with Optuna.


: 

In [9]:
# Increase Optuna verbosity
optuna.logging.set_verbosity(optuna.logging.DEBUG)

logger.info("Starting hyperparameter optimization with Optuna.")

# Folder that receives the per-run SQLite study file. The default is the
# original author's local folder; set OPTUNA_STUDY_DIR to use another location.
study_dir = os.environ.get(
    'OPTUNA_STUDY_DIR',
    r'C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\model testing\Catboost',
)
# SQLite cannot create missing parent folders, so make sure the folder exists.
os.makedirs(study_dir, exist_ok=True)

# Generate a unique filename for each run
timestamp = datetime.now().strftime('%Y%m%d_%H%M%S')
sqlite_file_path = os.path.join(study_dir, f'optuna_study_{timestamp}.db')

# TPE is Optuna's default sampler for single-objective studies; passing it
# explicitly with a seed (same value as the train/validation split) makes the
# sequence of sampled hyperparameters repeatable across runs.
study = optuna.create_study(
    storage=f'sqlite:///{sqlite_file_path}',
    study_name=f'my_study_{timestamp}',
    direction='maximize',
    sampler=optuna.samplers.TPESampler(seed=42),
)
study.optimize(objective, n_trials=25)  # Adjust the number of trials as needed

logger.info(f"Best trial parameters: {study.best_trial.params}")
logger.info(f"Best trial AUC: {study.best_trial.value}")

2024-07-15 00:37:47,614 - __main__ - INFO - Starting hyperparameter optimization with Optuna.
[I 2024-07-15 00:37:48,345] A new study created in RDB with name: my_study_20240715_003747


0:	test: 0.8430959	best: 0.8430959 (0)	total: 798ms	remaining: 10m 28s
100:	test: 0.8668733	best: 0.8668733 (100)	total: 1m	remaining: 6m 53s
200:	test: 0.8704541	best: 0.8704541 (200)	total: 1m 57s	remaining: 5m 44s
300:	test: 0.8725848	best: 0.8725848 (300)	total: 2m 55s	remaining: 4m 43s
400:	test: 0.8739455	best: 0.8739455 (400)	total: 3m 52s	remaining: 3m 44s
500:	test: 0.8748214	best: 0.8748214 (500)	total: 4m 50s	remaining: 2m 46s
600:	test: 0.8756290	best: 0.8756290 (600)	total: 5m 46s	remaining: 1m 47s
700:	test: 0.8761174	best: 0.8761174 (700)	total: 6m 41s	remaining: 49.8s
787:	test: 0.8765037	best: 0.8765037 (787)	total: 7m 29s	remaining: 0us

bestTest = 0.876503697
bestIteration = 787



[I 2024-07-15 00:45:21,046] Trial 0 finished with value: 0.8765036970247349 and parameters: {'iterations': 788, 'learning_rate': 0.08266837636003951, 'depth': 6, 'l2_leaf_reg': 0.0006653839603779793, 'bagging_temperature': 0.5268966402061496, 'random_strength': 0.40020198621011127, 'border_count': 187, 'scale_pos_weight': 5.964044965343005}. Best is trial 0 with value: 0.8765036970247349.


0:	test: 0.8573434	best: 0.8573434 (0)	total: 3.06s	remaining: 48m 9s
100:	test: 0.8719427	best: 0.8719427 (100)	total: 3m 52s	remaining: 32m 16s
200:	test: 0.8741726	best: 0.8741726 (200)	total: 7m 36s	remaining: 28m 5s
300:	test: 0.8750929	best: 0.8750929 (300)	total: 11m 22s	remaining: 24m 18s
400:	test: 0.8754818	best: 0.8754818 (400)	total: 15m 13s	remaining: 20m 37s
500:	test: 0.8755999	best: 0.8756097 (476)	total: 19m 9s	remaining: 16m 56s
600:	test: 0.8756061	best: 0.8756256 (580)	total: 23m 6s	remaining: 13m 11s
Stopped by overfitting detector  (100 iterations wait)

bestTest = 0.8756256471
bestIteration = 580

Shrink model to first 581 iterations.


[I 2024-07-15 01:11:46,579] Trial 1 finished with value: 0.8756256470517518 and parameters: {'iterations': 944, 'learning_rate': 0.07322030796487806, 'depth': 15, 'l2_leaf_reg': 0.10783585578688876, 'bagging_temperature': 0.11435566336197511, 'random_strength': 0.45667090827051815, 'border_count': 122, 'scale_pos_weight': 1.431125686046817}. Best is trial 0 with value: 0.8765036970247349.


0:	test: 0.8575774	best: 0.8575774 (0)	total: 2.52s	remaining: 26m 33s
100:	test: 0.8656465	best: 0.8656465 (100)	total: 3m 53s	remaining: 20m 28s
200:	test: 0.8694602	best: 0.8694602 (200)	total: 7m 20s	remaining: 15m 45s
300:	test: 0.8718775	best: 0.8718775 (300)	total: 10m 37s	remaining: 11m 43s
400:	test: 0.8734509	best: 0.8734509 (400)	total: 13m 56s	remaining: 8m 3s
500:	test: 0.8746802	best: 0.8746802 (500)	total: 17m 13s	remaining: 4m 32s
600:	test: 0.8757017	best: 0.8757017 (600)	total: 20m 31s	remaining: 1m 5s
632:	test: 0.8759619	best: 0.8759619 (632)	total: 21m 33s	remaining: 0us

bestTest = 0.8759619324
bestIteration = 632



[I 2024-07-15 01:33:26,897] Trial 2 finished with value: 0.8759619323908174 and parameters: {'iterations': 633, 'learning_rate': 0.021490504945601157, 'depth': 14, 'l2_leaf_reg': 0.005320015004774967, 'bagging_temperature': 0.275332315661884, 'random_strength': 0.09416802742280295, 'border_count': 242, 'scale_pos_weight': 4.865618530809193}. Best is trial 0 with value: 0.8765036970247349.


0:	test: 0.8457345	best: 0.8457345 (0)	total: 744ms	remaining: 7m 36s
100:	test: 0.8664239	best: 0.8664239 (100)	total: 1m 1s	remaining: 5m 13s
200:	test: 0.8683264	best: 0.8683264 (200)	total: 2m 1s	remaining: 4m 10s
300:	test: 0.8692684	best: 0.8692684 (300)	total: 3m 1s	remaining: 3m 9s
400:	test: 0.8698032	best: 0.8698032 (400)	total: 4m 2s	remaining: 2m 9s
500:	test: 0.8701778	best: 0.8701778 (500)	total: 5m 3s	remaining: 1m 9s
600:	test: 0.8704567	best: 0.8704567 (600)	total: 6m 5s	remaining: 8.51s
614:	test: 0.8704830	best: 0.8704830 (614)	total: 6m 13s	remaining: 0us

bestTest = 0.87048302
bestIteration = 614



[I 2024-07-15 01:39:43,792] Trial 3 finished with value: 0.8704830200447227 and parameters: {'iterations': 615, 'learning_rate': 0.09602776012440167, 'depth': 7, 'l2_leaf_reg': 1.1889402626460455, 'bagging_temperature': 0.4527591043844478, 'random_strength': 0.2083722843054907, 'border_count': 35, 'scale_pos_weight': 4.235328011651097}. Best is trial 0 with value: 0.8765036970247349.


0:	test: 0.8490549	best: 0.8490549 (0)	total: 764ms	remaining: 11m 30s
100:	test: 0.8634847	best: 0.8634847 (100)	total: 1m 7s	remaining: 8m 59s
200:	test: 0.8673243	best: 0.8673243 (200)	total: 2m 11s	remaining: 7m 38s
300:	test: 0.8694939	best: 0.8694939 (300)	total: 3m 14s	remaining: 6m 28s
400:	test: 0.8710331	best: 0.8710331 (400)	total: 4m 18s	remaining: 5m 24s
500:	test: 0.8722443	best: 0.8722443 (500)	total: 5m 24s	remaining: 4m 20s
600:	test: 0.8730666	best: 0.8730666 (600)	total: 6m 28s	remaining: 3m 15s
700:	test: 0.8738101	best: 0.8738101 (700)	total: 7m 33s	remaining: 2m 11s
800:	test: 0.8743602	best: 0.8743602 (800)	total: 8m 39s	remaining: 1m 6s
900:	test: 0.8748273	best: 0.8748273 (900)	total: 9m 45s	remaining: 1.95s
903:	test: 0.8748345	best: 0.8748345 (903)	total: 9m 47s	remaining: 0us

bestTest = 0.8748345093
bestIteration = 903



[I 2024-07-15 01:49:34,500] Trial 4 finished with value: 0.8748345092511387 and parameters: {'iterations': 904, 'learning_rate': 0.03362739156180403, 'depth': 8, 'l2_leaf_reg': 2.0252122607403105, 'bagging_temperature': 0.30539698180222474, 'random_strength': 0.7057068275372328, 'border_count': 174, 'scale_pos_weight': 7.080110898781119}. Best is trial 0 with value: 0.8765036970247349.


0:	test: 0.8568741	best: 0.8568741 (0)	total: 4.49s	remaining: 1h 10m 24s


Training has stopped (degenerate solution on iteration 84, probably too small l2-regularization, try to increase it)



bestTest = 0.8670375126
bestIteration = 83

Shrink model to first 84 iterations.


[I 2024-07-15 01:55:03,685] Trial 5 finished with value: 0.8670375448444045 and parameters: {'iterations': 941, 'learning_rate': 0.0339022242577684, 'depth': 16, 'l2_leaf_reg': 0.0005424291268140061, 'bagging_temperature': 0.22849207410563943, 'random_strength': 0.508196089662048, 'border_count': 159, 'scale_pos_weight': 7.481409004140483}. Best is trial 0 with value: 0.8765036970247349.


0:	test: 0.8519262	best: 0.8519262 (0)	total: 1.21s	remaining: 12m 54s
100:	test: 0.8701715	best: 0.8701715 (100)	total: 1m 44s	remaining: 9m 16s
200:	test: 0.8734129	best: 0.8734129 (200)	total: 3m 23s	remaining: 7m 25s
300:	test: 0.8750143	best: 0.8750143 (300)	total: 5m 3s	remaining: 5m 42s
400:	test: 0.8758532	best: 0.8758539 (399)	total: 6m 43s	remaining: 4m 1s
500:	test: 0.8763895	best: 0.8763895 (500)	total: 8m 24s	remaining: 2m 20s
600:	test: 0.8767926	best: 0.8767926 (600)	total: 10m 5s	remaining: 40.3s
640:	test: 0.8769058	best: 0.8769066 (639)	total: 10m 45s	remaining: 0us

bestTest = 0.8769066039
bestIteration = 639

Shrink model to first 640 iterations.


[I 2024-07-15 02:05:54,505] Trial 6 finished with value: 0.8769066038508856 and parameters: {'iterations': 641, 'learning_rate': 0.08734764043570355, 'depth': 10, 'l2_leaf_reg': 0.05880114235678777, 'bagging_temperature': 0.48456435876067216, 'random_strength': 0.9865673116409905, 'border_count': 152, 'scale_pos_weight': 2.4866155360121747}. Best is trial 6 with value: 0.8769066038508856.


0:	test: 0.8582429	best: 0.8582429 (0)	total: 5.44s	remaining: 1h 9m 35s
100:	test: 0.8691676	best: 0.8691676 (100)	total: 8m 4s	remaining: 53m 22s
200:	test: 0.8732964	best: 0.8732964 (200)	total: 15m 28s	remaining: 43m 38s
300:	test: 0.8753693	best: 0.8753693 (300)	total: 22m 52s	remaining: 35m 29s
400:	test: 0.8767723	best: 0.8767723 (400)	total: 30m 15s	remaining: 27m 41s
500:	test: 0.8776457	best: 0.8776457 (500)	total: 37m 44s	remaining: 20m 6s
600:	test: 0.8782715	best: 0.8782715 (600)	total: 45m 11s	remaining: 12m 33s
700:	test: 0.8786939	best: 0.8786939 (700)	total: 52m 48s	remaining: 5m 2s


In [None]:
# Train the final model using the best parameters
best_params = study.best_trial.params

# Re-create the model exactly as the objective built it. The trial parameters
# do not include eval_metric, and each trial's AUC was measured on a model
# fitted with the validation split as eval_set and 100-round early stopping.
# Fitting without those would run the full `iterations` count and yield a
# different model from the one whose score selected these parameters.
final_model = CatBoostClassifier(**best_params, eval_metric='AUC', verbose=0)
final_model.fit(
    X_train_preprocessed,
    y_train,
    eval_set=(X_val_preprocessed, y_val),
    early_stopping_rounds=100,
)

# Validate the final model
final_preds = final_model.predict_proba(X_val_preprocessed)[:, 1]
final_auc = roc_auc_score(y_val, final_preds)
logger.info(f"Final model AUC on validation set: {final_auc}")

In [None]:
# Visualization
# A notebook cell only renders the value of its last expression, so figures
# returned by the plot_* helpers must be shown explicitly or they are discarded.
for plot_study in (
    optuna.visualization.plot_optimization_history,
    optuna.visualization.plot_parallel_coordinate,
    optuna.visualization.plot_slice,
    optuna.visualization.plot_contour,
    optuna.visualization.plot_param_importances,
):
    plot_study(study).show()

# Last expression of the cell: displays the best trial's summary.
study.best_trial
