In [None]:
%%capture
!pip install klib

In [None]:
import gc
import logging
import os
import warnings
from datetime import datetime

import joblib
import klib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler


# NOTE: this hides every warning category for the rest of the session, so
# deprecation and data-handling warnings raised by later cells are not shown.
warnings.filterwarnings("ignore")

In [None]:
# Build a per-run log filename from the current timestamp so successive runs
# write to separate files.
current_time = datetime.now().strftime("%Y%m%d_%H%M%S")
log_filename = f'kaggle_submission_{current_time}.log'

# Send log records both to the timestamped file and to the console.
# force=True removes (and closes) any handlers already attached to the root
# logger. Without it basicConfig() does nothing once the root logger has a
# handler, so re-running this cell would keep writing to the previous log file.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
    handlers=[
        logging.FileHandler(log_filename),
        logging.StreamHandler(),  # This ensures logs are also output to the console
    ],
    force=True,
)

In [None]:
# Paths to datasets.
# Set the KAGGLE_DATA_DIR environment variable to the folder that contains
# train.csv and test.csv. When it is unset, the original local folder is used,
# so an existing setup keeps working without changes.
DATA_DIR = os.environ.get(
    "KAGGLE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets",
)
train_path = os.path.join(DATA_DIR, "train.csv")
test_path = os.path.join(DATA_DIR, "test.csv")

In [None]:
# Collect per-column summary statistics (used to compare a frame before and after dtype downcasting)
def get_column_stats(df):
    """Summarise every column of ``df`` in a plain dictionary.

    Numeric columns are described by their minimum, maximum and mean; every
    other column is described by its number of distinct values.

    Args:
        df: DataFrame whose columns should be summarised.

    Returns:
        dict keyed by column name, in column order. Each value is either
        ``{'min': ..., 'max': ..., 'mean': ...}`` for a numeric column or
        ``{'unique': ...}`` for any other column.
    """
    def _describe(series):
        # Non-numeric columns only get a distinct-value count.
        if not pd.api.types.is_numeric_dtype(series):
            return {'unique': series.nunique()}
        return {
            'min': series.min(),
            'max': series.max(),
            'mean': series.mean(),
        }

    return {name: _describe(df[name]) for name in df.columns}

# Log comparison of statistics
def compare_stats(stats_before, stats_after):
    """Warn about every column whose statistics differ between two snapshots.

    Args:
        stats_before: per-column statistics taken before the conversion
            (a dict of dicts keyed by column name).
        stats_after: statistics for the same columns taken afterwards.

    One ``logging.warning`` is emitted for each column whose two entries are
    not equal. Nothing is returned.
    """
    # Lazily pair each "before" entry with its "after" entry and keep the ones that differ.
    changed = (
        (col, before, stats_after[col])
        for col, before in stats_before.items()
        if before != stats_after[col]
    )
    for col, before, after in changed:
        logging.warning(f"Column {col} has changed: {before} != {after}")

# Log precision loss
def calculate_precision_loss(stats_before, stats_after):
    """Log the relative change of each numeric column's mean, in percent.

    Args:
        stats_before: per-column statistics taken before the conversion
            (a dict of dicts keyed by column name). Only entries that contain
            a ``'mean'`` key are considered.
        stats_after: statistics for the same columns taken afterwards.

    For every numeric column one INFO record is logged with
    ``abs(mean_before - mean_after) / abs(mean_before) * 100``.
    When ``mean_before`` is zero the relative change is undefined, so the
    function reports 0 if the mean is still zero and ``inf`` otherwise instead
    of dividing by zero.
    """
    for col, before in stats_before.items():
        if 'mean' not in before:
            continue  # non-numeric column: no mean was recorded

        mean_before = before['mean']
        mean_after = stats_after[col]['mean']
        abs_diff = abs(mean_before - mean_after)

        if mean_before == 0:
            # Avoid 0/0 (nan) or x/0 (ZeroDivisionError for plain floats).
            precision_loss = 0.0 if abs_diff == 0 else float('inf')
        else:
            precision_loss = abs_diff / abs(mean_before) * 100

        logging.info(f"Column {col} precision loss: {precision_loss:.6f}%")

# Memory optimization function
def reduce_mem_usage(df, verbose=True):
    """Downcast the columns of ``df`` to smaller dtypes to reduce memory usage.

    The frame is modified in place and also returned.

    Conversion rules:
      * object / pandas string columns -> ``category``
      * signed integer columns   -> smallest of int8/int16/int32 that holds
        the column's min and max (bounds inclusive)
      * unsigned integer columns -> smallest of uint8/uint16/uint32 that holds
        the column's max
      * float64 columns -> float32 when min and max fit the float32 range
        (this can lose precision in the low-order digits)
      * everything else (bool, datetime, category, nullable/extension dtypes,
        float16/float32) is left untouched, so a column is never converted to
        a larger or semantically different dtype.

    Args:
        df: DataFrame to optimise.
        verbose: when True, log the memory usage before and after.

    Returns:
        The same DataFrame object, with downcast columns.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'Start memory usage of dataframe: {start_mem:.2f} MB')

    signed_candidates = (np.int8, np.int16, np.int32)
    unsigned_candidates = (np.uint8, np.uint16, np.uint32)

    for col in df.columns:
        col_type = df[col].dtype

        # Text columns become categoricals.
        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy dtypes are downcast; extension dtypes are left alone.
        if not isinstance(col_type, np.dtype):
            continue

        if np.issubdtype(col_type, np.integer):
            if np.issubdtype(col_type, np.unsignedinteger):
                candidates = unsigned_candidates
            else:
                candidates = signed_candidates
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in candidates:
                if np.dtype(candidate).itemsize >= col_type.itemsize:
                    break  # already at least this small
                limits = np.iinfo(candidate)
                # An empty column yields NaN here; both comparisons are then
                # False and the column keeps its dtype.
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif col_type == np.float64:
            c_min = df[col].min()
            c_max = df[col].max()
            limits = np.finfo(np.float32)
            # NaN (all-missing column) or +/-inf fail this check and stay float64.
            if limits.min <= c_min and c_max <= limits.max:
                df[col] = df[col].astype(np.float32)

    end_mem = df.memory_usage().sum() / 1024**2
    if verbose:
        logging.info(f'End memory usage of dataframe: {end_mem:.2f} MB')
        if start_mem > 0:
            logging.info(f'Decreased by {(100 * (start_mem - end_mem) / start_mem):.1f}%')

    return df

# Function to import data with logging
def import_data(path, index_col=None):
    """Read a CSV file, downcast its columns, and log what the downcast changed.

    Args:
        path: location of the CSV file.
        index_col: column to use as the index (passed to ``pd.read_csv``).

    Returns:
        The loaded, memory-optimised DataFrame, or ``None`` when reading or
        optimising the file fails (the error and its traceback are logged).
        Callers must check for ``None``.

    A failure in the before/after statistics comparison is logged but does not
    discard the data, since that comparison is only diagnostic.
    """
    try:
        df = pd.read_csv(path, index_col=index_col)

        # Snapshot taken before optimisation; reduce_mem_usage modifies df in place.
        stats_before = get_column_stats(df)

        df = reduce_mem_usage(df)
    except Exception as e:
        # logging.exception records the traceback as well as the message.
        logging.exception(f'Error loading data from {path}: {str(e)}')
        return None

    try:
        # Get column stats after optimization and report any differences.
        stats_after = get_column_stats(df)
        compare_stats(stats_before, stats_after)
        calculate_precision_loss(stats_before, stats_after)
    except Exception:
        logging.exception(
            f'Could not compare column statistics for {path}; continuing with the loaded data'
        )

    logging.info(f'Data loaded and memory optimized from {path}')
    return df

# Log any unknown categories during mapping
def safe_map(df, column, mapping):
    """Replace the values of one column through a lookup dictionary.

    Args:
        df: DataFrame to modify (changed in place).
        column: name of the column to translate.
        mapping: dict from current values to replacement values.

    Returns:
        The same DataFrame, with ``column`` replaced by its mapped values.
        Values that have no entry in ``mapping`` are reported in a single
        warning before the mapping is applied.
    """
    unmapped = set(df[column]).difference(mapping.keys())
    if len(unmapped) > 0:
        logging.warning(f"Unknown categories in column {column}: {unmapped}")

    translated = df[column].map(mapping)
    df[column] = translated
    return df

# Preprocess data with logging
def preprocess_data(df):
    """Encode the text columns as integers and drop ``Driving_License``.

    ``Gender``, ``Vehicle_Damage`` and ``Vehicle_Age`` are translated through
    fixed lookup tables via ``safe_map``; the ``Driving_License`` column is
    then removed. The frame is modified in place and returned.

    Args:
        df: DataFrame containing the four columns named above.

    Returns:
        The same DataFrame after encoding and column removal.
    """
    # (column, lookup table) pairs, applied in this order.
    encodings = (
        ('Gender', {'Male': 1, 'Female': 0}),
        ('Vehicle_Damage', {'Yes': 1, 'No': 0}),
        ('Vehicle_Age', {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}),
    )
    for column, lookup in encodings:
        df = safe_map(df, column, lookup)

    df.drop(columns=['Driving_License'], inplace=True)
    logging.info("Data preprocessing completed.")
    return df

# Feature engineering function with logging
def feature_engineering(df):
    """Add four interaction features built from ``Previously_Insured``.

    For each of ``Annual_Premium``, ``Vehicle_Age``, ``Vehicle_Damage`` and
    ``Vintage`` the string form of ``Previously_Insured`` is concatenated with
    the string form of that column, and the resulting strings are turned into
    integer codes with ``pd.factorize``. The codes are stored in a new column
    named ``Previously_Insured_<column>``.

    Note: ``pd.factorize`` numbers values in order of first appearance within
    ``df``, so codes are only comparable between rows that were encoded in the
    same call.

    Args:
        df: DataFrame containing the five source columns (modified in place).

    Returns:
        The same DataFrame with the four new columns appended.
    """
    insured_text = df['Previously_Insured'].astype(str)
    for partner in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
        combined_text = insured_text + df[partner].astype(str)
        codes, _ = pd.factorize(combined_text)
        df[f'Previously_Insured_{partner}'] = codes

    logging.info("Feature engineering completed.")
    return df

def reduce_mem_usage_for_engineered_features(df):
    """Downcast the four engineered ``Previously_Insured_*`` columns of ``df``.

    The columns are copied into a separate frame, optimised there with
    ``reduce_mem_usage``, and written back one Series at a time. Working on an
    explicit copy avoids mutating a column slice of ``df`` (which pandas
    reports as a SettingWithCopyWarning) and avoids assigning a one-column
    DataFrame to a single column.

    Args:
        df: DataFrame containing the four engineered columns (modified in place).

    Returns:
        The same DataFrame with the engineered columns downcast.
    """
    engineered_cols = [
        'Previously_Insured_Annual_Premium',
        'Previously_Insured_Vehicle_Age',
        'Previously_Insured_Vehicle_Damage',
        'Previously_Insured_Vintage',
    ]
    reduced = reduce_mem_usage(df[engineered_cols].copy())
    for col in engineered_cols:
        df[col] = reduced[col]
    return df
    

In [None]:
# Load and optimize data
train_df = import_data(train_path, index_col='id')
test_df = import_data(test_path, index_col='id')

# import_data() logs the error and returns None on failure. Stop here with a
# clear message instead of reporting success and failing later on a None frame.
if train_df is None or test_df is None:
    raise RuntimeError(
        "Failed to load the train/test data; see the logged error for the cause."
    )

gc.collect()
logging.info("Data loaded successfully.")

In [None]:
# Encode the text columns and drop Driving_License in both frames (train first, then test).
# NOTE: preprocess_data changes the frames in place, so this cell is meant to be
# run once per load; reload the data before running it again.
train_df, test_df = (preprocess_data(frame) for frame in (train_df, test_df))
logging.info("Data preprocessed successfully.")

In [None]:
# Apply feature engineering
# feature_engineering() encodes value combinations with pd.factorize, which
# numbers values in order of first appearance within the frame it receives.
# Encoding train and test in separate calls would give the same combination
# different codes in the two frames, so the engineered columns are computed
# once on the stacked feature columns and then split back by row position.
n_train_rows = len(train_df)
stacked_features = pd.concat(
    [train_df.drop(columns=['Response']), test_df],
    axis=0,
    ignore_index=True,
)
stacked_features = feature_engineering(stacked_features)

# Only the columns that feature_engineering added are copied back.
engineered_cols = [col for col in stacked_features.columns if col not in test_df.columns]
for col in engineered_cols:
    codes = stacked_features[col].to_numpy()
    train_df[col] = codes[:n_train_rows]
    test_df[col] = codes[n_train_rows:]

# The stacked copy is only needed while encoding; release it before modelling.
del stacked_features
gc.collect()
logging.info("Feature engineering completed successfully.")

In [None]:
# Downcast the engineered Previously_Insured_* columns in both frames (train first, then test).
train_df, test_df = map(reduce_mem_usage_for_engineered_features, (train_df, test_df))
logging.info("Engineered features memory optimization completed.")

In [None]:
# Standardise the numeric columns.
# The scaler is fitted on the training frame only and then applied to both
# frames, so the test data does not influence the scaling parameters.
num_cols = [
    'Age',
    'Region_Code',
    'Annual_Premium',
    'Policy_Sales_Channel',
    'Vintage',
]
scaler = StandardScaler().fit(train_df[num_cols])
train_df[num_cols] = scaler.transform(train_df[num_cols])
test_df[num_cols] = scaler.transform(test_df[num_cols])
logging.info("Numeric columns normalized.")

In [None]:
# Split the training frame into the label column (y) and the model inputs (X).
y = train_df['Response']
X = train_df.drop(columns=['Response'])

In [None]:
# Cross-validation splitter: stratified, shuffled folds with a fixed seed.
n_splits = 5
skfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)

# Columns that are passed to LightGBM and CatBoost as categorical features.
categorical_features = [
    'Gender',
    'Vehicle_Age',
    'Vehicle_Damage',
    'Previously_Insured_Annual_Premium',
    'Previously_Insured_Vehicle_Age',
    'Previously_Insured_Vehicle_Damage',
    'Previously_Insured_Vintage',
]

# XGBoost containers for the full training set and the test set.
dtrain = xgb.DMatrix(data=X, label=y, enable_categorical=True)
dtest = xgb.DMatrix(data=test_df, enable_categorical=True)

# LightGBM dataset for the full training set.
train_data = lgb.Dataset(data=X, label=y, categorical_feature=categorical_features)

# CatBoost pools for the full training set and the test set.
train_pool = Pool(data=X, label=y, cat_features=categorical_features)
test_pool = Pool(data=test_df, cat_features=categorical_features)

logging.info("Data prepared for modeling.")


In [None]:
import optuna
from optuna.samplers import TPESampler
from catboost import CatBoostClassifier, Pool
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import json
import gc

# Hold out a stratified 20% validation split (fixed seed) for scoring each trial.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Objective function for Optuna
def objective(trial):
    """Train one CatBoost model with trial-sampled hyperparameters.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC of the fitted model on the held-out validation split.
    """
    grow_policy = trial.suggest_categorical('grow_policy', ['SymmetricTree', 'Depthwise', 'Lossguide'])
    params = {
        'loss_function': 'Logloss',
        'eval_metric': 'AUC',
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'iterations': trial.suggest_int('iterations', 1000, 3000),
        'depth': trial.suggest_int('depth', 4, 10),
        'random_strength': trial.suggest_float('random_strength', 0, 10),
        'l2_leaf_reg': trial.suggest_float('l2_leaf_reg', 1e-2, 10, log=True),
        'bagging_temperature': trial.suggest_float('bagging_temperature', 0, 1),
        # 'max_bin' is an alias of 'border_count' in CatBoost; passing both is
        # rejected, so only border_count is tuned.
        'border_count': trial.suggest_int('border_count', 1, 255),
        'grow_policy': grow_policy,
        # NOTE(review): fit() below is given early_stopping_rounds, which is
        # believed to take precedence over od_type - confirm whether this
        # parameter still has any effect.
        'od_type': trial.suggest_categorical('od_type', ['IncToDec', 'Iter']),
        'task_type': 'GPU',
        'verbose': 100
    }
    # min_data_in_leaf only applies to the Depthwise and Lossguide grow
    # policies, so it is sampled only for those.
    if grow_policy != 'SymmetricTree':
        params['min_data_in_leaf'] = trial.suggest_int('min_data_in_leaf', 1, 100)

    trial_train_pool = Pool(X_train, y_train, cat_features=categorical_features)
    trial_valid_pool = Pool(X_valid, y_valid, cat_features=categorical_features)

    model = CatBoostClassifier(**params)
    model.fit(trial_train_pool, eval_set=trial_valid_pool, early_stopping_rounds=50, verbose=100)

    # Score on the same pool used for early stopping so the categorical
    # feature declaration is identical at fit and predict time.
    valid_preds = model.predict_proba(trial_valid_pool)[:, 1]
    auc_score = roc_auc_score(y_valid, valid_preds)

    # Clear memory
    del model, trial_train_pool, trial_valid_pool, valid_preds
    gc.collect()

    return auc_score

# Create study and optimize (seeded sampler so the search is repeatable)
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Log best parameters
best_params = study.best_params
logging.info(f"Best parameters: {best_params}")

# Save the best parameters for later use
with open("best_catboost_params.json", "w") as f:
    json.dump(best_params, f)


In [None]:
import optuna
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import json
import gc

# Hold out a stratified 20% validation split (fixed seed) for scoring each trial.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Objective function for Optuna
def objective(trial):
    """Train one LightGBM model with trial-sampled hyperparameters.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC of the fitted model on the held-out validation split.
    """
    boosting_type = trial.suggest_categorical('boosting_type', ['gbdt', 'dart', 'goss'])
    params = {
        'objective': 'binary',
        'metric': 'auc',
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
        'learning_rate': trial.suggest_float('learning_rate', 0.01, 0.3, log=True),
        'num_leaves': trial.suggest_int('num_leaves', 31, 256),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'min_data_in_leaf': trial.suggest_int('min_data_in_leaf', 1, 100),
        'feature_fraction': trial.suggest_float('feature_fraction', 0.4, 1.0),
        'lambda_l1': trial.suggest_float('lambda_l1', 1e-8, 10.0, log=True),
        'lambda_l2': trial.suggest_float('lambda_l2', 1e-8, 10.0, log=True),
        'min_gain_to_split': trial.suggest_float('min_gain_to_split', 1e-8, 10.0, log=True),
        # The GPU tree learner does not support more than 256 bins per feature,
        # so the search range is capped at 255 (it was 200-300).
        'max_bin': trial.suggest_int('max_bin', 200, 255),
        'boosting_type': boosting_type,
        'device': 'gpu',
        'verbosity': -1,
    }
    # GOSS does its own row sampling and cannot be combined with bagging, so
    # the bagging parameters are only sampled for the other boosting types.
    if boosting_type != 'goss':
        params['bagging_fraction'] = trial.suggest_float('bagging_fraction', 0.4, 1.0)
        params['bagging_freq'] = trial.suggest_int('bagging_freq', 1, 10)

    trial_train_data = lgb.Dataset(X_train, label=y_train, categorical_feature=categorical_features)
    trial_valid_data = lgb.Dataset(X_valid, label=y_valid, reference=trial_train_data, categorical_feature=categorical_features)

    # Early stopping and periodic logging are passed as callbacks: the
    # early_stopping_rounds / verbose_eval keyword arguments were removed from
    # lgb.train in LightGBM 4.0.
    # NOTE(review): num_boost_round is left at LightGBM's default, which is
    # small for the lowest learning rates sampled above - consider raising it.
    model = lgb.train(
        params,
        trial_train_data,
        valid_sets=[trial_train_data, trial_valid_data],
        callbacks=[
            lgb.early_stopping(stopping_rounds=50),
            lgb.log_evaluation(period=100),
        ],
    )

    valid_preds = model.predict(X_valid, num_iteration=model.best_iteration)
    auc_score = roc_auc_score(y_valid, valid_preds)

    # Clear memory
    del model, trial_train_data, trial_valid_data, valid_preds
    gc.collect()

    return auc_score

# Create study and optimize.
# The sampler is referenced through the optuna package (so this cell does not
# depend on an import made in another cell) and seeded for repeatability.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Log best parameters
best_params = study.best_params
logging.info(f"Best parameters: {best_params}")

# Save the best parameters for later use
with open("best_lightgbm_params.json", "w") as f:
    json.dump(best_params, f)


In [None]:
import optuna
import xgboost as xgb
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import json
import gc

# Hold out a stratified 20% validation split (fixed seed) for scoring each trial.
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.2, stratify=y, random_state=42)

# Objective function for Optuna
def objective(trial):
    """Train one XGBoost model with trial-sampled hyperparameters.

    Args:
        trial: the Optuna trial used to sample hyperparameters.

    Returns:
        ROC AUC of the fitted model on the held-out validation split.
    """
    params = {
        # Without an explicit objective xgb.train falls back to its regression
        # default; the label here is binary, so train a logistic classifier.
        'objective': 'binary:logistic',
        'eval_metric': 'auc',
        # suggest_float replaces the deprecated suggest_loguniform / suggest_uniform.
        'eta': trial.suggest_float('eta', 0.01, 0.3, log=True),
        'alpha': trial.suggest_float('alpha', 1e-8, 10.0, log=True),
        'lambda': trial.suggest_float('lambda', 1e-8, 10.0, log=True),
        'subsample': trial.suggest_float('subsample', 0.5, 1.0),
        'colsample_bytree': trial.suggest_float('colsample_bytree', 0.5, 1.0),
        'max_depth': trial.suggest_int('max_depth', 4, 16),
        'min_child_weight': trial.suggest_int('min_child_weight', 1, 10),
        'gamma': trial.suggest_float('gamma', 1e-8, 10.0, log=True),
        'max_bin': trial.suggest_int('max_bin', 200, 300),
        # NOTE(review): 'gpu_hist' / 'predictor' are the pre-2.0 way to select
        # the GPU; newer XGBoost prefers tree_method='hist' with device='cuda'.
        'tree_method': 'gpu_hist',
        'predictor': 'gpu_predictor',
        # enable_categorical is a DMatrix option (set below), not a booster
        # parameter, so it is not passed here.
        'verbosity': 1
    }

    trial_dtrain = xgb.DMatrix(X_train, label=y_train, enable_categorical=True)
    trial_dvalid = xgb.DMatrix(X_valid, label=y_valid, enable_categorical=True)

    model = xgb.train(
        params,
        trial_dtrain,
        num_boost_round=3000,
        # Early stopping monitors the last entry of evals, i.e. 'valid'.
        evals=[(trial_dtrain, 'train'), (trial_dvalid, 'valid')],
        early_stopping_rounds=50,
        verbose_eval=100
    )

    # iteration_range is half-open and best_iteration is a 0-based index, so
    # the end must be best_iteration + 1 to include the best round itself.
    valid_preds = model.predict(trial_dvalid, iteration_range=(0, model.best_iteration + 1))
    auc_score = roc_auc_score(y_valid, valid_preds)

    # Clear memory
    del model, trial_dtrain, trial_dvalid, valid_preds
    gc.collect()

    return auc_score

# Create study and optimize.
# The sampler is referenced through the optuna package (so this cell does not
# depend on an import made in another cell) and seeded for repeatability.
study = optuna.create_study(direction='maximize', sampler=optuna.samplers.TPESampler(seed=42))
study.optimize(objective, n_trials=50)

# Log best parameters
best_params = study.best_params
logging.info(f"Best parameters: {best_params}")

# Save the best parameters for later use
with open("best_xgboost_params.json", "w") as f:
    json.dump(best_params, f)


In [None]:
# # Load the prediction files
# test_pred_cat = pd.read_csv('/kaggle/input/blend-lgbm-xgb-catboost-cat/submission.csv')
# test_pred_lgb = pd.read_csv('/kaggle/input/blend-lgbm-xgb-catboost-xgb/submission.csv')
# test_pred_xgb = pd.read_csv('/kaggle/input/blend-lgbm-xgb-catboost-lgbm/submission.csv')

# # Ensure we are using the correct columns for blending
# blended_preds = (test_pred_cat['Response'] + test_pred_lgb['Response'] + test_pred_xgb['Response']) / 3

# # Create the submission DataFrame
# submission = pd.DataFrame({
#     'id': test_pred_cat['id'],  # Assuming 'id' column is the same in all files
#     'Response': blended_preds
# })

# # Save the submission file
# submission.to_csv("submission.csv", index=False)

# print("Submission file created successfully!")
