In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
import gc
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from scipy.optimize import minimize
from datetime import datetime
import warnings
import joblib

warnings.filterwarnings("ignore")

In [2]:
# Function to create a log filename with the notebook name and current datetime
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f'kaggle_submission_{current_time}.log'

# Configure logging to save to a file and output to the console
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()  # This ensures logs are also output to the console
    ]
)

In [3]:
# Documenting the purpose and usage of the function
def get_column_stats(df):
    """Get basic statistics for each column in the dataframe."""
    stats = {}
    for col in df.columns:
        if pd.api.types.is_numeric_dtype(df[col]):
            stats[col] = {
                'min': df[col].min(),
                'max': df[col].max(),
                'mean': df[col].mean(),
            }
        else:
            stats[col] = {
                'unique': df[col].nunique()
            }
    return stats

# Log comparison of statistics
def compare_stats(stats_before, stats_after):
    """Compare statistics before and after type conversion."""
    for col in stats_before:
        if stats_before[col] != stats_after[col]:
            logging.warning(f"Column {col} has changed: {stats_before[col]} != {stats_after[col]}")

# Log precision loss
def calculate_precision_loss(stats_before, stats_after):
    """Calculate and log precision loss for numeric columns."""
    for col in stats_before:
        if 'mean' in stats_before[col]:
            mean_before = stats_before[col]['mean']
            mean_after = stats_after[col]['mean']
            precision_loss = abs(mean_before - mean_after) / abs(mean_before) * 100
            logging.info(f"Column {col} precision loss: {precision_loss:.6f}%")

# Memory optimization function
def reduce_mem_usage(df, verbose=True):
    """Iterate through all the columns of a dataframe and modify the data type to reduce memory usage."""
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'Start memory usage of dataframe: {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'End memory usage of dataframe: {end_mem:.2f} MB')
        logging.info(f'Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%')

    return df

# Log any unknown categories during mapping
def safe_map(df, column, mapping):
    """Map categorical values to numerical values and log any unknown categories."""
    unknown_categories = set(df[column]) - set(mapping.keys())
    if unknown_categories:
        logging.warning(f"Unknown categories in column {column}: {unknown_categories}")
    df[column] = df[column].map(mapping)
    return df

# Function to import data with logging
def import_data(path, index_col=None):
    """Import data from a CSV file and optimize memory usage."""
    try:
        df = pd.read_csv(path, index_col=index_col)
        
        # Get column stats before optimization
        stats_before = get_column_stats(df)
        
        df = reduce_mem_usage(df)
        
        # Get column stats after optimization
        stats_after = get_column_stats(df)
        
        # Compare statistics and calculate precision loss
        compare_stats(stats_before, stats_after)
        calculate_precision_loss(stats_before, stats_after)
        
        logging.info(f'Data loaded and memory optimized from {path}')
        return df
    except Exception as e:
        logging.error(f'Error loading data from {path}: {str(e)}')
        return None

# Preprocess data with logging
def preprocess_data(df):
    """Preprocess the dataset."""
    gender_mapping = {'Male': 1, 'Female': 0}
    vehicle_damage_mapping = {'Yes': 1, 'No': 0}
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    
    df = safe_map(df, 'Gender', gender_mapping)
    df = safe_map(df, 'Vehicle_Damage', vehicle_damage_mapping)
    df = safe_map(df, 'Vehicle_Age', vehicle_age_mapping)
    
    df.drop(['Driving_License'], axis=1, inplace=True)
    logging.info("Data preprocessing completed.")
    return df

# Feature engineering function with logging
def feature_engineering(df):
    """Feature engineering on the dataset."""
    df['Previously_Insured_Annual_Premium'] = pd.factorize((df['Previously_Insured'].astype(str) + df['Annual_Premium'].astype(str)))[0]
    df['Previously_Insured_Vehicle_Age'] = pd.factorize((df['Previously_Insured'].astype(str) + df['Vehicle_Age'].astype(str)))[0]
    df['Previously_Insured_Vehicle_Damage'] = pd.factorize((df['Previously_Insured'].astype(str) + df['Vehicle_Damage'].astype(str)))[0]
    df['Previously_Insured_Vintage'] = pd.factorize((df['Previously_Insured'].astype(str) + df['Vintage'].astype(str)))[0]
    logging.info("Feature engineering completed.")
    return df

In [4]:
# Paths to datasets
train_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv"
test_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\test.csv"

# Load and optimize data
train_df = import_data(train_path, index_col='id')
test_df = import_data(test_path, index_col='id')

gc.collect()
logging.info("Data loaded successfully.")

2024-07-24 16:33:13,062 - INFO - Start memory usage of dataframe: 1053.30 MB
2024-07-24 16:33:14,761 - INFO - End memory usage of dataframe: 318.18 MB
2024-07-24 16:33:14,763 - INFO - Decreased by 69.8%
2024-07-24 16:33:15,113 - INFO - Column Age precision loss: 0.000000%
2024-07-24 16:33:15,114 - INFO - Column Driving_License precision loss: 0.000000%
2024-07-24 16:33:15,114 - INFO - Column Region_Code precision loss: 0.000108%
2024-07-24 16:33:15,115 - INFO - Column Previously_Insured precision loss: 0.000000%
2024-07-24 16:33:15,115 - INFO - Column Annual_Premium precision loss: 0.000036%
2024-07-24 16:33:15,115 - INFO - Column Policy_Sales_Channel precision loss: 0.000004%
2024-07-24 16:33:15,115 - INFO - Column Vintage precision loss: 0.000000%
2024-07-24 16:33:15,117 - INFO - Column Response precision loss: 0.000000%
2024-07-24 16:33:15,117 - INFO - Data loaded and memory optimized from C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv
2024-07-24 16:33:20,

In [5]:
# Apply preprocessing
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
logging.info("Data preprocessed successfully.")

# Apply feature engineering
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

gc.collect()
logging.info("Feature engineering completed successfully.")

2024-07-24 16:33:23,048 - INFO - Data preprocessing completed.
2024-07-24 16:33:23,791 - INFO - Data preprocessing completed.
2024-07-24 16:33:23,791 - INFO - Data preprocessed successfully.
2024-07-24 16:33:44,916 - INFO - Feature engineering completed.
2024-07-24 16:33:58,952 - INFO - Feature engineering completed.
2024-07-24 16:33:59,009 - INFO - Feature engineering completed successfully.


In [6]:
# Normalize numeric columns
num_cols = ['Age', 'Region_Code', 'Annual_Premium', 'Policy_Sales_Channel', 'Vintage']
scaler = StandardScaler()
train_df[num_cols] = scaler.fit_transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])
logging.info("Numeric columns normalized.")

2024-07-24 16:34:00,191 - INFO - Numeric columns normalized.


In [7]:
# Separate features and target variable
X = train_df.drop('Response', axis=1)
y = train_df['Response']

In [8]:
# Create Stratified K-Folds
n_splits = 5
skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

In [9]:
# Define CatBoost parameters
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 3000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    # 'task_type': 'GPU',  # Ensure your environment supports GPU
    'random_seed': 42,
    'allow_writing_files': False,
    'verbose': 100,  # Display log every 100 iterations
    # 'thread_count': -1
}

# Initialize lists to store out-of-fold predictions, models, and AUC scores
cat_preds = []
cat_aucs = []

test_pool = Pool(test_df.astype(str), cat_features=X.columns.values)

# CatBoost Model
logging.info("Starting CatBoost training...")
for fold, (train_idx, test_idx) in enumerate(skfold.split(X, y)):
    logging.info(f"---- CatBoost Fold {fold + 1} ----")
    
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[test_idx], y.iloc[test_idx]
    
    train_pool = Pool(X_train.astype(str), y_train, cat_features=X.columns.values)
    valid_pool = Pool(X_valid.astype(str), y_valid, cat_features=X.columns.values)
    
    model = CatBoostClassifier(**cat_params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, verbose=100)
    
    valid_preds = model.predict_proba(X_valid.astype(str))[:, 1]
    auc_score = roc_auc_score(y_valid, valid_preds)
    cat_aucs.append(auc_score)
    logging.info(f"Validation AUC score for fold {fold + 1}: {auc_score:.6f}")
    
    test_pred = model.predict_proba(test_pool)[:, 1]
    cat_preds.append(test_pred)
    
    # Save the model for this fold
    joblib.dump(model, f'catboost_model_fold_{fold + 1}.pkl')
    
    # Clear memory
    del X_train, y_train, X_valid, y_valid, train_pool, valid_pool, model
    gc.collect()


# Calculate overall AUC score for CatBoost
auc_mean_cat = np.mean(cat_aucs)
auc_std_cat = np.std(cat_aucs)
logging.info(f"Overall ROC-AUC Score for CatBoost: {auc_mean_cat:.6f} ± {auc_std_cat:.6f}")

# Average the predictions from each fold for CatBoost
test_pred_cat = np.mean(cat_preds, axis=0)
joblib.dump(test_pred_cat, 'test_pred_cat.pkl')
logging.info("CatBoost models and predictions saved.")

2024-07-24 16:34:55,475 - INFO - Starting CatBoost training...
2024-07-24 16:34:56,645 - INFO - ---- CatBoost Fold 1 ----


0:	test: 0.8696392	best: 0.8696392 (0)	total: 12.6s	remaining: 10h 29m 12s


KeyboardInterrupt: 

In [None]:
# Define LightGBM parameters
lgb_params = {
    'objective': 'binary',
    'metric': 'auc',
    'reg_alpha': 0.03432385172267505,
    'reg_lambda': 0.2998279059616829,
    'colsample_bytree': 0.790292183596673,
    'subsample': 0.9046878168822107,
    'learning_rate': 0.05035039561309864,
    'max_depth': 10,  # Further reduced max depth
    'num_leaves': 31,  # Standard number of leaves
    'min_child_samples': 100,  # Increased min child samples
    'min_child_weight': 1,  # Adjusted min child weight
    'min_split_gain': 0.09978597066868167,
    'max_bin': 255,
    # 'device': 'gpu',
    'early_stopping_rounds': 50,
    'verbose': 1  # Enable verbose mode
}

# Initialize lists to store out-of-fold predictions and AUC scores
lgb_preds = []
lgb_aucs = []

# Train LightGBM model with cross-validation
logging.info("Starting LightGBM training...")
for fold, (train_idx, test_idx) in enumerate(skfold.split(X, y)):
    logging.info(f"---- Fold {fold + 1} ----")
    
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[test_idx], y.iloc[test_idx]

    train_data = lgb.Dataset(X_train, label=y_train)
    valid_data = lgb.Dataset(X_valid, label=y_valid, reference=train_data)
    
    model = lgb.train(
        lgb_params,
        train_data,
        num_boost_round=3000,
        valid_sets=[train_data, valid_data],
    )
    
    valid_preds = model.predict(X_valid, num_iteration=model.best_iteration)
    auc_score = roc_auc_score(y_valid, valid_preds)
    lgb_aucs.append(auc_score)
    logging.info(f"Validation AUC score for fold {fold + 1}: {auc_score:.6f}")
    
    test_pred = model.predict(test_df, num_iteration=model.best_iteration)
    lgb_preds.append(test_pred)
    
    # Save the model for this fold
    joblib.dump(model, f'lgb_model_fold_{fold + 1}.pkl')
    
    # Clear memory
    del X_train, y_train, X_valid, y_valid, train_data, valid_data, model
    gc.collect()

# Calculate overall AUC score for LightGBM
auc_mean_lgb = np.mean(lgb_aucs)
auc_std_lgb = np.std(lgb_aucs)
logging.info(f"Overall ROC-AUC Score for LightGBM: {auc_mean_lgb:.6f} ± {auc_std_lgb:.6f}")

# Average the predictions from each fold for LightGBM
test_pred_lgb = np.mean(lgb_preds, axis=0)
joblib.dump(test_pred_lgb, 'test_pred_lgb.pkl')
logging.info("LightGBM models and predictions saved.")

2024-07-24 16:29:58,129 - INFO - Starting LightGBM training...
2024-07-24 16:29:58,136 - INFO - ---- Fold 1 ----


[LightGBM] [Info] Number of positive: 5642, number of negative: 40377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000897 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1245
[LightGBM] [Info] Number of data points in the train set: 46019, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122602 -> initscore=-1.968022
[LightGBM] [Info] Start training from score -1.968022
Training until validation scores don't improve for 50 rounds


2024-07-24 16:29:59,872 - INFO - Validation AUC score for fold 1: 0.867926


Early stopping, best iteration is:
[225]	training's auc: 0.905537	valid_1's auc: 0.867926


2024-07-24 16:30:00,033 - INFO - ---- Fold 2 ----


[LightGBM] [Info] Number of positive: 5642, number of negative: 40377
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000503 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1242
[LightGBM] [Info] Number of data points in the train set: 46019, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122602 -> initscore=-1.968022
[LightGBM] [Info] Start training from score -1.968022
Training until validation scores don't improve for 50 rounds


2024-07-24 16:30:01,293 - INFO - Validation AUC score for fold 2: 0.858214
2024-07-24 16:30:01,425 - INFO - ---- Fold 3 ----


Early stopping, best iteration is:
[148]	training's auc: 0.894343	valid_1's auc: 0.858214
[LightGBM] [Info] Number of positive: 5641, number of negative: 40378
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.000561 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1244
[LightGBM] [Info] Number of data points in the train set: 46019, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122580 -> initscore=-1.968224
[LightGBM] [Info] Start training from score -1.968224
Training until validation scores don't improve for 50 rounds


2024-07-24 16:30:03,254 - INFO - Validation AUC score for fold 3: 0.864428


Early stopping, best iteration is:
[246]	training's auc: 0.909123	valid_1's auc: 0.864428


2024-07-24 16:30:03,417 - INFO - ---- Fold 4 ----


[LightGBM] [Info] Number of positive: 5641, number of negative: 40378
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.002479 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1238
[LightGBM] [Info] Number of data points in the train set: 46019, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122580 -> initscore=-1.968224
[LightGBM] [Info] Start training from score -1.968224
Training until validation scores don't improve for 50 rounds


2024-07-24 16:30:04,767 - INFO - Validation AUC score for fold 4: 0.862741
2024-07-24 16:30:04,898 - INFO - ---- Fold 5 ----


Early stopping, best iteration is:
[177]	training's auc: 0.899212	valid_1's auc: 0.862741
[LightGBM] [Info] Number of positive: 5642, number of negative: 40378
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.001446 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 1235
[LightGBM] [Info] Number of data points in the train set: 46020, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122599 -> initscore=-1.968046
[LightGBM] [Info] Start training from score -1.968046
Training until validation scores don't improve for 50 rounds


2024-07-24 16:30:06,111 - INFO - Validation AUC score for fold 5: 0.858663
2024-07-24 16:30:06,241 - INFO - Overall ROC-AUC Score for LightGBM: 0.862394 ± 0.003640
2024-07-24 16:30:06,244 - INFO - LightGBM models and predictions saved.


Early stopping, best iteration is:
[145]	training's auc: 0.894664	valid_1's auc: 0.858663


In [None]:
# Define XGBoost parameters
xgb_params = {
    'eval_metric': 'auc',
    'eta': 0.05,
    'alpha': 0.2545607592482198,
    'subsample': 0.8388163485383147,
    'colsample_bytree': 0.2732499701466825,
    'max_depth': 16,
    'min_child_weight': 5,
    'gamma': 0.0017688666476104672,
    'max_bin': 262143,
    # 'tree_method': 'gpu_hist',  # Ensure your environment supports GPU
    # 'predictor': 'gpu_predictor',  # Ensure your environment supports GPU
    'enable_categorical': True,
    'verbose': 100
}

# Initialize lists to store out-of-fold predictions and AUC scores
xgb_preds = []
xgb_aucs = []

# Train XGBoost model with cross-validation
logging.info("Starting XGBoost training...")
for fold, (train_idx, test_idx) in enumerate(skfold.split(X, y)):
    logging.info(f"---- Fold {fold + 1} ----")
    
    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[test_idx], y.iloc[test_idx]
    
    dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)
    
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=3000,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=50,
        verbose_eval=100
    )
    
    valid_preds = model.predict(dvalid, iteration_range=(0, model.best_iteration))
    auc_score = roc_auc_score(y_valid, valid_preds)
    xgb_aucs.append(auc_score)
    logging.info(f"Validation AUC score for fold {fold + 1}: {auc_score:.6f}")
    
    dtest = xgb.DMatrix(test_df, enable_categorical=True)
    test_pred = model.predict(dtest, iteration_range=(0, model.best_iteration))
    xgb_preds.append(test_pred)
    
    # Save the model for this fold
    model.save_model(f'xgb_model_fold_{fold + 1}.json')
    
    # Clear memory
    del X_train, y_train, X_valid, y_valid, dtrain, dvalid, model
    gc.collect()

# Calculate overall AUC score for XGBoost
auc_mean_xgb = np.mean(xgb_aucs)
auc_std_xgb = np.std(xgb_aucs)
logging.info(f"Overall ROC-AUC Score for XGBoost: {auc_mean_xgb:.6f} ± {auc_std_xgb:.6f}")

# Average the predictions from each fold for XGBoost
test_pred_xgb = np.mean(xgb_preds, axis=0)
joblib.dump(test_pred_xgb, 'test_pred_xgb.pkl')
logging.info("XGBoost models and predictions saved.")

2024-07-24 16:30:06,258 - INFO - Starting XGBoost training...
2024-07-24 16:30:06,267 - INFO - ---- Fold 1 ----


[0]	train-auc:0.81600	valid-auc:0.79088
[100]	train-auc:0.97729	valid-auc:0.86356
[128]	train-auc:0.98132	valid-auc:0.86345


2024-07-24 16:30:19,289 - INFO - Validation AUC score for fold 1: 0.865023
2024-07-24 16:30:19,555 - INFO - ---- Fold 2 ----


[0]	train-auc:0.82021	valid-auc:0.79236
[100]	train-auc:0.97922	valid-auc:0.85517
[120]	train-auc:0.98229	valid-auc:0.85483


2024-07-24 16:30:33,386 - INFO - Validation AUC score for fold 2: 0.856486
2024-07-24 16:30:33,649 - INFO - ---- Fold 3 ----


[0]	train-auc:0.82565	valid-auc:0.79679
[100]	train-auc:0.97883	valid-auc:0.86076
[126]	train-auc:0.98195	valid-auc:0.86018


2024-07-24 16:30:48,449 - INFO - Validation AUC score for fold 3: 0.862684
2024-07-24 16:30:48,716 - INFO - ---- Fold 4 ----


[0]	train-auc:0.83482	valid-auc:0.79364
[100]	train-auc:0.97890	valid-auc:0.85769
[175]	train-auc:0.98962	valid-auc:0.85504


2024-07-24 16:31:05,670 - INFO - Validation AUC score for fold 4: 0.858279
2024-07-24 16:31:06,009 - INFO - ---- Fold 5 ----


[0]	train-auc:0.82656	valid-auc:0.79528
[93]	train-auc:0.97640	valid-auc:0.85659


2024-07-24 16:31:17,561 - INFO - Validation AUC score for fold 5: 0.857371
2024-07-24 16:31:17,777 - INFO - Overall ROC-AUC Score for XGBoost: 0.859969 ± 0.003306
2024-07-24 16:31:17,780 - INFO - XGBoost models and predictions saved.


In [None]:
# Normalize AUC scores to sum to 1
total_auc = auc_mean_cat + auc_mean_lgb + auc_mean_xgb

weight_cat = auc_mean_cat / total_auc
weight_lgb = auc_mean_lgb / total_auc
weight_xgb = auc_mean_xgb / total_auc

# Print weights for verification
print(f"Weights - CatBoost: {weight_cat:.4f}, LightGBM: {weight_lgb:.4f}, XGBoost: {weight_xgb:.4f}")

# Blending predictions with calculated weights
blended_preds = (weight_cat * test_pred_cat + weight_lgb * test_pred_lgb + weight_xgb * test_pred_xgb)

# Create the submission DataFrame
submission = pd.DataFrame({
    'id': test_df.index,
    'Response': blended_preds
})

# Save the submission file
submission.to_csv("submission.csv", index=False)

print("Submission file created successfully!")


Weights - CatBoost: 0.3347, LightGBM: 0.3331, XGBoost: 0.3322
Submission file created successfully!


# Kaggle Competition: Auto Insurance Prediction
This notebook is designed to preprocess data, engineer features, train machine learning models, and blend predictions for the Kaggle Auto Insurance Prediction competition.

## 1. Import Libraries
Importing necessary libraries for data processing, model training, and evaluation.

## 2. Memory Optimization
Functions to reduce memory usage by optimizing data types of the dataframe.

## 3. Data Preprocessing
Preprocessing the dataset by mapping categorical values to numerical values and handling missing values.

## 4. Feature Engineering
Creating new features to enhance the predictive power of the models.

## 5. Model Training and Validation
Training CatBoost, LightGBM, and XGBoost models using Stratified K-Fold cross-validation.

## 6. Blending Predictions
Blending predictions from the three models based on their AUC scores to create the final submission.
