In [17]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import logging
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from scipy.optimize import minimize
import optuna
from datetime import datetime
import gc
import warnings

warnings.filterwarnings("ignore")

In [2]:
# Create a log filename with the notebook name and current datetime
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f'kaggle_submission_{current_time}.log'

# Configure logging to save to a file
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler()  # This ensures logs are also output to the console
    ]
)

In [3]:
def reduce_mem_usage(df):
    """Iterate through all the columns of a dataframe and modify the data type to reduce memory usage."""
    start_mem = df.memory_usage().sum() / 1024**2
    logging.info(f'Memory usage of dataframe is {start_mem:.2f} MB')

    for col in df.columns:
        col_type = df[col].dtype
        
        if col_type != object:
            c_min = df[col].min()
            c_max = df[col].max()
            if str(col_type)[:3] == 'int':
                if c_min > np.iinfo(np.int8).min and c_max < np.iinfo(np.int8).max:
                    df[col] = df[col].astype(np.int8)
                elif c_min > np.iinfo(np.int16).min and c_max < np.iinfo(np.int16).max:
                    df[col] = df[col].astype(np.int16)
                elif c_min > np.iinfo(np.int32).min and c_max < np.iinfo(np.int32).max:
                    df[col] = df[col].astype(np.int32)
                elif c_min > np.iinfo(np.int64).min and c_max < np.iinfo(np.int64).max:
                    df[col] = df[col].astype(np.int64)  
            else:
                if c_min > np.finfo(np.float16).min and c_max < np.finfo(np.float16).max:
                    df[col] = df[col].astype(np.float16)
                elif c_min > np.finfo(np.float32).min and c_max < np.finfo(np.float32).max:
                    df[col] = df[col].astype(np.float32)
                else:
                    df[col] = df[col].astype(np.float64)
        else:
            df[col] = df[col].astype('category')
    end_mem = df.memory_usage().sum() / 1024**2
    logging.info(f'Memory usage after optimization is: {end_mem:.2f} MB')
    logging.info(f'Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%')
    
    return df

def import_data(file, **kwargs):
    """Create a dataframe and optimize its memory usage"""
    df = pd.read_csv(file, parse_dates=True, keep_date_col=True, **kwargs)
    df = reduce_mem_usage(df)
    return df


In [4]:
# Paths to datasets
train_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\train.csv"
test_path = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets\test.csv"

# Load and optimize data
train_df = import_data(train_path, index_col='id')
test_df = import_data(test_path, index_col='id')

gc.collect()
logging.info("Data loaded successfully.")


2024-07-23 13:34:43,739 - INFO - Memory usage of dataframe is 1053.30 MB
2024-07-23 13:34:45,387 - INFO - Memory usage after optimization is: 274.30 MB
2024-07-23 13:34:45,387 - INFO - Decreased by 74.0%
2024-07-23 13:34:54,680 - INFO - Memory usage of dataframe is 643.68 MB
2024-07-23 13:34:55,761 - INFO - Memory usage after optimization is: 175.55 MB
2024-07-23 13:34:55,762 - INFO - Decreased by 72.7%
2024-07-23 13:34:55,824 - INFO - Data loaded successfully.


In [5]:
def preprocess_data(df):
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_mapping)
    df.drop(['Driving_License'], axis=1, inplace=True)
    return df

# Apply preprocessing
train_df = preprocess_data(train_df)
test_df = preprocess_data(test_df)
logging.info("Data preprocessed successfully.")


2024-07-23 13:34:56,174 - INFO - Data preprocessed successfully.


In [6]:
# def remove_outliers_iqr(df, column):
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# # Apply outlier removal
# train_df = remove_outliers_iqr(train_df, 'Annual_Premium')
# logging.info("Outliers removed successfully.")

In [7]:
def feature_engineering(df):
    df = df.copy()
    df['Previously_Insured_Annual_Premium'] = pd.factorize((df['Previously_Insured'].astype(str) + df['Annual_Premium'].astype(str)))[0]
    df['Previously_Insured_Vehicle_Age'] = pd.factorize((df['Previously_Insured'].astype(str) + df['Vehicle_Age'].astype(str)))[0]
    df['Previously_Insured_Vehicle_Damage'] = pd.factorize((df['Previously_Insured'].astype(str) + df['Vehicle_Damage'].astype(str)))[0]
    df['Previously_Insured_Vintage'] = pd.factorize((df['Previously_Insured'].astype(str) + df['Vintage'].astype(str)))[0]
    return df

# Apply feature engineering
train_df = feature_engineering(train_df)
test_df = feature_engineering(test_df)

gc.collect()
logging.info("Feature engineering completed successfully.")

2024-07-23 13:35:28,743 - INFO - Feature engineering completed successfully.


In [8]:
# Separate features and target variable
X = train_df.drop('Response', axis=1).values
y = train_df['Response'].values

# Scale features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_df_scaled = scaler.transform(test_df.values)

gc.collect()
logging.info(f"Features and target variable separated and scaled.")

2024-07-23 13:35:31,794 - INFO - Features and target variable separated and scaled.


In [9]:
# Create Stratified K-Folds
skfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)

In [10]:
# Train LightGBM model with validation within each fold
def train_lgbm_with_validation(X_train, y_train, params, num_boost_round=1000):
    X_train_split, X_valid_split, y_train_split, y_valid_split = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)
    
    train_data = lgb.Dataset(X_train_split, label=y_train_split)
    valid_data = lgb.Dataset(X_valid_split, label=y_valid_split, reference=train_data)
    
    bst = lgb.train(
        params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[train_data, valid_data],
    )
    
    gc.collect()
    valid_preds = bst.predict(X_valid_split, num_iteration=bst.best_iteration)
    auc_score = roc_auc_score(y_valid_split, valid_preds)
    logging.info(f'Validation AUC score: {auc_score}')
    return bst, bst.best_iteration

# Define LightGBM parameters
params = {
    'objective': 'binary',
    'metric': 'auc',
    'reg_alpha': 0.03432385172267505,
    'reg_lambda': 0.2998279059616829,
    'colsample_bytree': 0.790292183596673,
    'subsample': 0.9046878168822107,
    'learning_rate': 0.05035039561309864,
    'max_depth': 29,
    'num_leaves': 1474,
    'min_child_samples': 75,
    'min_child_weight': 7.661448090878849,
    'min_split_gain': 0.09978597066868167,
    'max_bin': 499,
    'n_jobs': 8,
    'early_stopping_rounds': 100
}


In [15]:
# Initialize lists to store out-of-fold predictions and AUC scores
lgb_preds = []
lgb_aucs = []

# Perform cross-validation
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_scaled, y)):
    print(f"\n---- Fold {fold + 1} ----\n")
    
    X_train, y_train = X_scaled[train_idx], y[train_idx]
    X_valid, y_valid = X_scaled[test_idx], y[test_idx]
    
    # Train the model with validation to find the best iteration
    model, best_iteration = train_lgbm_with_validation(X_train, y_train, params)
    logging.info(f"Best iteration found: {best_iteration}")

    # Predict on validation set
    valid_preds = model.predict(X_valid, num_iteration=best_iteration)
    auc_score = roc_auc_score(y_valid, valid_preds)
    lgb_aucs.append(auc_score)
    print(f"Validation AUC score for fold {fold + 1}: {auc_score:.6f}")
    
    # Predict on test set
    test_pred = model.predict(test_df_scaled, num_iteration=best_iteration)
    lgb_preds.append(test_pred)
    
    # Clean up to free memory
    del X_train, y_train, X_valid, y_valid, model
    gc.collect()

# Calculate overall AUC score
auc_mean = np.mean(lgb_aucs)
auc_std = np.std(lgb_aucs)
print(f"\n---> Overall ROC-AUC Score: {auc_mean:.6f} ± {auc_std:.6f}\n")

# Average the predictions from each fold
test_pred_lgb = np.mean(lgb_preds, axis=0)


---- Fold 1 ----



KeyboardInterrupt: 

In [12]:
xgb_params = {
    'eval_metric': 'auc',
    'n_estimators': 3000,
    'eta': 0.05,
    'alpha': 0.2545607592482198,
    'subsample': 0.8388163485383147,
    'colsample_bytree': 0.2732499701466825,
    'max_depth': 16,
    'min_child_weight': 5,
    'gamma': 0.0017688666476104672,
    'max_bin': 262143,
    'tree_method': 'hist',
    'device': 'cuda',
    'enable_categorical': True,
    'early_stopping_rounds': 50,
}

In [16]:
# Initialize lists to store out-of-fold predictions and AUC scores
preds_xgb = []
aucs_xgb = []

# Perform cross-validation
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_scaled, y)):
    print(f"\n---- Fold {fold + 1} ----\n")
    
    X_train, y_train = X_scaled[train_idx], y[train_idx]
    X_valid, y_valid = X_scaled[test_idx], y[test_idx]
    
    # Prepare data for XGBoost
    dtrain = xgb.DMatrix(X_train, label=y_train)
    dvalid = xgb.DMatrix(X_valid, label=y_valid)
    
    # Train XGBoost model
    xgb_model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=xgb_params['n_estimators'],
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        early_stopping_rounds=xgb_params['early_stopping_rounds']
    )
    
    # Predict on validation set
    valid_preds = xgb_model.predict(dvalid, ntree_limit=xgb_model.best_ntree_limit)
    auc_score = roc_auc_score(y_valid, valid_preds)
    aucs_xgb.append(auc_score)
    print(f"Validation AUC score for fold {fold + 1}: {auc_score:.6f}")
    
    # Predict on test set
    dtest = xgb.DMatrix(test_df_scaled)
    test_pred = xgb_model.predict(dtest, ntree_limit=xgb_model.best_ntree_limit)
    preds_xgb.append(test_pred)
    
    # Clean up to free memory
    del X_train, y_train, X_valid, y_valid, dtrain, dvalid, xgb_model
    gc.collect()

# Calculate overall AUC score for XGBoost
auc_mean_xgb = np.mean(aucs_xgb)
auc_std_xgb = np.std(aucs_xgb)
print(f"\n---> Overall ROC-AUC Score for XGBoost: {auc_mean_xgb:.6f} ± {auc_std_xgb:.6f}\n")

# Average the predictions from each fold for XGBoost
test_pred_xgb = np.mean(preds_xgb, axis=0)


---- Fold 1 ----

[0]	train-auc:0.84812	valid-auc:0.84825
[1]	train-auc:0.85206	valid-auc:0.85184
[2]	train-auc:0.85625	valid-auc:0.85522
[3]	train-auc:0.85848	valid-auc:0.85752
[4]	train-auc:0.85683	valid-auc:0.85599
[5]	train-auc:0.85928	valid-auc:0.85708
[6]	train-auc:0.86420	valid-auc:0.86112
[7]	train-auc:0.86342	valid-auc:0.86003


KeyboardInterrupt: 

In [None]:
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.075,
    'iterations': 3000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'max_leaves': 512,
    'fold_permutation_block': 64,
    'task_type': 'GPU',
    'random_seed': 42,
    'verbose': False,
    'allow_writing_files': False
}

In [None]:
# Initialize lists to store out-of-fold predictions and AUC scores
preds_cat = []
aucs_cat = []

# Perform cross-validation
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_scaled, y)):
    print(f"\n---- Fold {fold + 1} ----\n")
    
    X_train, y_train = X_scaled[train_idx], y[train_idx]
    X_valid, y_valid = X_scaled[test_idx], y[test_idx]
    
    # Prepare data for CatBoost
    train_pool = Pool(X_train, y_train)
    valid_pool = Pool(X_valid, y_valid)
    
    # Train CatBoost model
    cat_model = CatBoostClassifier(**cat_params)
    cat_model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50)
    
    # Predict on validation set
    valid_preds = cat_model.predict_proba(X_valid)[:, 1]
    auc_score = roc_auc_score(y_valid, valid_preds)
    aucs_cat.append(auc_score)
    print(f"Validation AUC score for fold {fold + 1}: {auc_score:.6f}")
    
    # Predict on test set
    test_pred = cat_model.predict_proba(test_df_scaled)[:, 1]
    preds_cat.append(test_pred)
    
    # Clean up to free memory
    del X_train, y_train, X_valid, y_valid, train_pool, valid_pool, cat_model
    gc.collect()

# Calculate overall AUC score for CatBoost
auc_mean_cat = np.mean(aucs_cat)
auc_std_cat = np.std(aucs_cat)
print(f"\n---> Overall ROC-AUC Score for CatBoost: {auc_mean_cat:.6f} ± {auc_std_cat:.6f}\n")

# Average the predictions from each fold for CatBoost
test_pred_cat = np.mean(preds_cat, axis=0)


In [None]:
# Define a function to calculate the weighted average predictions
def weighted_average(weights, preds):
    weighted_preds = np.average(preds, axis=0, weights=weights)
    return weighted_preds

# Define the objective function to minimize (negative AUC)
def objective(weights, preds, y_true):
    weighted_preds = weighted_average(weights, preds)
    auc = roc_auc_score(y_true, weighted_preds)
    return -auc

# Prepare prediction arrays and true labels
preds = np.array([test_pred_lgb, test_pred_xgb, test_pred_cat])
y_true = y  # Use your actual target variable for the validation set

# Initialize weights
initial_weights = [1/3, 1/3, 1/3]

# Define constraints and bounds for the weights
constraints = ({'type': 'eq', 'fun': lambda w: 1 - sum(w)})
bounds = [(0, 1)] * len(initial_weights)

# Optimize the weights
opt_result = minimize(objective, initial_weights, args=(preds, y_true), method='SLSQP', bounds=bounds, constraints=constraints)

# Extract the optimized weights
optimized_weights = opt_result.x
print("Optimized weights:", optimized_weights)


In [None]:
# Compute the weighted average of the predictions using optimized weights
final_predictions_optimized = np.average(preds, axis=0, weights=optimized_weights)

# Create the submission DataFrame
submission_optimized = pd.DataFrame({'id': test_df.index, 'Response': final_predictions_optimized})

# Save the submission DataFrame to a CSV file
submission_optimized.to_csv("submission.csv", index=False)
print("Optimized weighted average submission file created successfully.")

