In [11]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import klib
import logging
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import optuna
from datetime import datetime
import os
import warnings
import gc

warnings.filterwarnings("ignore")

In [12]:
# Timestamp the log file name so that every run writes to its own file
current_time = f"{datetime.now():%Y%m%d_%H%M%S}"
log_filename = 'kaggle_submission_' + current_time + '.log'

# Route every INFO-and-above record both to the log file and to the console
logging.basicConfig(
    handlers=[
        logging.FileHandler(log_filename),  # persistent copy on disk
        logging.StreamHandler(),            # echo in the notebook / console output
    ],
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
)

In [13]:
def reduce_mem_usage(df, allow_float16=True):
    """Downcast the columns of a dataframe in place to reduce memory usage.

    Each column is handled according to its dtype:

    * object / pandas string columns are converted to ``category``;
    * NumPy signed / unsigned integer columns are cast to the narrowest
      integer type of the same signedness whose range contains the column's
      min and max (bounds are inclusive);
    * NumPy float columns are cast to the narrowest float type whose range
      contains the column's min and max, falling back to ``float64``;
    * every other dtype (bool, datetime, timedelta, category, nullable
      extension types, ...) is left untouched, so the function can safely be
      called again on its own output.

    Columns whose min or max is missing (empty or all-NaN columns) are left
    untouched as well.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to optimise. It is modified in place.
    allow_float16 : bool, default True
        When True, floats whose range fits are stored as ``float16``. Only the
        value *range* is checked, so values may be rounded; pass False to use
        ``float32`` as the narrowest float type.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, after downcasting.
    """
    start_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage of dataframe is {:.2f} MB'.format(start_mem))

    # Candidate target types, narrowest first, keyed by NumPy dtype kind
    int_candidates = {
        'i': (np.int8, np.int16, np.int32, np.int64),
        'u': (np.uint8, np.uint16, np.uint32, np.uint64),
    }
    float_candidates = (np.float16, np.float32) if allow_float16 else (np.float32,)

    for col in df.columns:
        col_type = df[col].dtype

        if col_type == object or isinstance(col_type, pd.StringDtype):
            df[col] = df[col].astype('category')
            continue

        # Only plain NumPy integer / float columns are downcast. Anything else
        # (bool, datetime, category, nullable dtypes) has no meaningful numeric
        # range to test and is skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'u', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()
        if pd.isna(c_min) or pd.isna(c_max):
            # Empty or all-NaN column: nothing to base a decision on
            continue

        if col_type.kind == 'f':
            for candidate in float_candidates:
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Range too wide (or infinite) for the narrower float types
                df[col] = df[col].astype(np.float64)
        else:
            for candidate in int_candidates[col_type.kind]:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break

    end_mem = df.memory_usage().sum() / 1024**2
    print('Memory usage after optimization is: {:.2f} MB'.format(end_mem))
    print('Decreased by {:.1f}%'.format(100 * (start_mem - end_mem) / start_mem))

    return df

def import_data(file, **kwargs):
    """Read a CSV file into a dataframe and optimise its memory usage.

    Parameters
    ----------
    file : str or path-like or file-like
        Passed straight to ``pandas.read_csv``.
    **kwargs
        Extra keyword arguments forwarded to ``pandas.read_csv``.
        ``parse_dates`` defaults to True but can now be overridden by the
        caller; previously passing it raised a duplicate-keyword TypeError.

    Returns
    -------
    pandas.DataFrame
        The loaded frame after ``reduce_mem_usage`` has downcast its columns.
    """
    # `keep_date_col` is no longer passed: it only matters when `parse_dates`
    # combines several columns (never the case with `parse_dates=True`), and
    # it is deprecated in recent pandas releases. Callers that need it can
    # still supply it through **kwargs.
    kwargs.setdefault('parse_dates', True)
    df = pd.read_csv(file, **kwargs)
    return reduce_mem_usage(df)

In [14]:
# Paths to datasets.
# The directory defaults to the original local location but can be overridden
# with the KAGGLE_DATA_DIR environment variable, so the notebook can run on
# another machine without editing this cell.
data_dir = os.environ.get(
    "KAGGLE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets",
)
train_path = os.path.join(data_dir, "train.csv")
test_path = os.path.join(data_dir, "test.csv")

# Load and optimize data ('id' becomes the index of both frames)
train_df = import_data(train_path, index_col='id')
test_df = import_data(test_path, index_col='id')

gc.collect()
logging.info("Data loaded successfully.")

Memory usage of dataframe is 1053.30 MB
Memory usage after optimization is: 274.30 MB
Decreased by 74.0%
Memory usage of dataframe is 643.68 MB


2024-07-23 13:27:18,671 - INFO - Data loaded successfully.


Memory usage after optimization is: 175.55 MB
Decreased by 72.7%


In [15]:
# Preprocess data
def preprocess_data(df):
    """Encode the categorical text columns as integers and drop ``Driving_License``.

    * ``Gender``: 'Male' -> 1, 'Female' -> 0
    * ``Vehicle_Damage``: 'Yes' -> 1, 'No' -> 0
    * ``Vehicle_Age``: '< 1 Year' -> 0, '1-2 Year' -> 1, '> 2 Years' -> 2

    Values that are not keys of these mappings become missing, which is the
    behaviour of ``Series.map`` with a dict.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the four columns named above. It is NOT modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with the encoded columns and without ``Driving_License``.
    """
    # drop() returns a new frame, so the assignments below never touch the
    # caller's data and the raw frame stays available for re-runs.
    df = df.drop(columns=['Driving_License'])
    df['Gender'] = df['Gender'].map({'Male': 1, 'Female': 0})
    df['Vehicle_Damage'] = df['Vehicle_Damage'].map({'Yes': 1, 'No': 0})
    vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
    df['Vehicle_Age'] = df['Vehicle_Age'].map(vehicle_age_mapping)
    return df

# Run the same encoding step on the training split, then on the test split
train_df = train_df.pipe(preprocess_data)
test_df = test_df.pipe(preprocess_data)
logging.info("Data preprocessed successfully.")

2024-07-23 13:27:19,073 - INFO - Data preprocessed successfully.


In [16]:
# # Remove outliers
# def remove_outliers_iqr(df, column):
#     Q1 = df[column].quantile(0.25)
#     Q3 = df[column].quantile(0.75)
#     IQR = Q3 - Q1
#     lower_bound = Q1 - 1.5 * IQR
#     upper_bound = Q3 + 1.5 * IQR
#     return df[(df[column] >= lower_bound) & (df[column] <= upper_bound)]

# # Apply outlier removal
# train_df = remove_outliers_iqr(train_df, 'Annual_Premium')
# logging.info("Outliers removed successfully.")
# logging.info(f"Train dataset size after outlier removal: {train_df.shape}")
# logging.info(f"Train dataset columns after outlier removal:\n{train_df.dtypes}")
# logging.info(f"Train dataset missing values after outlier removal:\n{train_df.isnull().sum()}")
# logging.info(f"Train dataset descriptive statistics after outlier removal:\n{train_df.describe(include='all')}")

In [17]:
# Feature engineering
def feature_engineering(df, *others):
    """Add integer-coded interaction features between ``Previously_Insured``
    and four other columns.

    For each of ``Annual_Premium``, ``Vehicle_Age``, ``Vehicle_Damage`` and
    ``Vintage`` a column ``Previously_Insured_<name>`` is added. It holds the
    ``pd.factorize`` code of the string concatenation of the two values.

    ``pd.factorize`` numbers keys in order of first appearance, so codes
    produced by separate calls on different frames are NOT comparable. Pass
    every frame that must share an encoding (e.g. train and test) in a single
    call: the keys are then factorised jointly, and the same
    (Previously_Insured, value) pair gets the same code in every frame.

    Parameters
    ----------
    df : pandas.DataFrame
        First frame; must contain ``Previously_Insured`` and the four columns
        listed above. Not modified.
    *others : pandas.DataFrame
        Optional additional frames with the same columns, encoded together
        with ``df``. Not modified.

    Returns
    -------
    pandas.DataFrame or tuple of pandas.DataFrame
        A copy of ``df`` with the four new columns when called with a single
        frame (codes identical to factorising that frame alone); otherwise a
        tuple of copies, in argument order.
    """
    frames = [df.copy()] + [other.copy() for other in others]

    for col in ('Annual_Premium', 'Vehicle_Age', 'Vehicle_Damage', 'Vintage'):
        # Stack the raw values before converting to text so that every frame
        # goes through the same dtype and therefore the same string form.
        insured = pd.concat([frame['Previously_Insured'] for frame in frames], ignore_index=True)
        partner = pd.concat([frame[col] for frame in frames], ignore_index=True)
        codes = pd.factorize(insured.astype(str) + partner.astype(str))[0]

        # Hand each frame back its own slice of the jointly computed codes
        start = 0
        for frame in frames:
            stop = start + len(frame)
            frame[f'Previously_Insured_{col}'] = codes[start:stop]
            start = stop

    return frames[0] if not others else tuple(frames)

# Apply feature engineering to both splits in ONE call so the factorised
# interaction codes mean the same thing in train and in test
train_df, test_df = feature_engineering(train_df, test_df)

gc.collect()
logging.info("Feature engineering completed successfully.")

2024-07-23 13:27:52,852 - INFO - Feature engineering completed successfully.


In [18]:
# Separate features and target variable
train_features = train_df.drop('Response', axis=1)

# The arrays below are matched to each other purely by column position, so
# fail loudly if the two frames do not expose the same features in the same
# order (otherwise the scaler and the model would silently mix up columns).
if list(train_features.columns) != list(test_df.columns):
    raise ValueError(
        "Train and test feature columns differ: "
        f"{list(train_features.columns)} vs {list(test_df.columns)}"
    )

X = train_features.values
y = train_df['Response'].values
del train_features

# Scale features: statistics are fitted on the training rows only and then
# applied unchanged to the test rows
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
test_df_scaled = scaler.transform(test_df.values)

gc.collect()
logging.info("Features and target variable separated and scaled.")

2024-07-23 13:27:55,810 - INFO - Features and target variable separated and scaled.


In [19]:
# Stratified 5-fold splitter; shuffling with a fixed seed keeps the folds
# identical from one run to the next
skfold = StratifiedKFold(
    random_state=42,
    shuffle=True,
    n_splits=5,
)

In [20]:
# Train LightGBM model with validation within each fold
def train_lgbm_with_validation(X_train, y_train, params, num_boost_round=1000,
                               early_stopping_rounds=100):
    """Train a LightGBM booster with early stopping on an internal hold-out.

    ``X_train`` / ``y_train`` are split 80/20 (stratified on ``y_train``,
    fixed seed 42). The booster is fitted on the 80 % part and monitored on
    both parts, and the AUC on the 20 % hold-out is logged.

    Parameters
    ----------
    X_train, y_train : array-like
        Features and labels; anything accepted by ``train_test_split`` and
        ``lgb.Dataset``.
    params : dict
        LightGBM parameters. The dict is NOT modified; a copy with
        ``early_stopping_rounds`` set is what gets passed to ``lgb.train``.
    num_boost_round : int, default 1000
        Maximum number of boosting rounds.
    early_stopping_rounds : int, default 100
        Early-stopping patience written into the parameter copy. Like the
        previous hard-coded value, it overrides any ``early_stopping_rounds``
        entry already present in ``params``.

    Returns
    -------
    tuple
        ``(booster, booster.best_iteration)``.
    """
    X_fit, X_holdout, y_fit, y_holdout = train_test_split(
        X_train, y_train, test_size=0.2, random_state=42, stratify=y_train)

    train_data = lgb.Dataset(X_fit, label=y_fit)
    valid_data = lgb.Dataset(X_holdout, label=y_holdout, reference=train_data)

    # Work on a copy so the caller's dict is never altered
    run_params = {**params, 'early_stopping_rounds': early_stopping_rounds}

    bst = lgb.train(
        run_params,
        train_data,
        num_boost_round=num_boost_round,
        valid_sets=[train_data, valid_data],
    )

    gc.collect()
    valid_preds = bst.predict(X_holdout, num_iteration=bst.best_iteration)
    auc_score = roc_auc_score(y_holdout, valid_preds)
    logging.info(f'Validation AUC score: {auc_score}')
    return bst, bst.best_iteration

# LightGBM hyper-parameters for the binary classifier, evaluated with AUC.
# NOTE(review): per the LightGBM parameter docs, `subsample` (bagging_fraction)
# only takes effect when `subsample_freq` (bagging_freq) is non-zero; no such
# key is set here -- confirm whether row subsampling was intended.
params = dict(
    objective='binary',
    metric='auc',
    # regularisation
    reg_alpha=0.03432385172267505,
    reg_lambda=0.2998279059616829,
    # column / row sampling
    colsample_bytree=0.790292183596673,
    subsample=0.9046878168822107,
    learning_rate=0.05035039561309864,
    # tree shape
    max_depth=29,
    num_leaves=1474,
    min_child_samples=75,
    min_child_weight=7.661448090878849,
    min_split_gain=0.09978597066868167,
    max_bin=499,
    # runtime
    n_jobs=8,
    early_stopping_rounds=100,
)

In [21]:
# Per-fold results.
# NOTE: despite its name, `oof_preds` collects each fold model's predictions on
# the TEST set (averaged after the loop), not out-of-fold predictions on the
# training rows. `oof_aucs` collects the AUC measured on each held-out fold.
oof_preds = []
oof_aucs = []

# Perform cross-validation
for fold, (train_idx, test_idx) in enumerate(skfold.split(X_scaled, y)):
    print(f"\n---- Fold {fold + 1} ----\n")

    # `test_idx` indexes the held-out part of the training data for this fold
    X_train, y_train = X_scaled[train_idx], y[train_idx]
    X_valid, y_valid = X_scaled[test_idx], y[test_idx]

    # Train the model with validation to find the best iteration
    model, best_iteration = train_lgbm_with_validation(X_train, y_train, params)
    logging.info(f"Best iteration found: {best_iteration}")

    # Score the held-out fold, which the model never saw during training or
    # early stopping
    valid_preds = model.predict(X_valid, num_iteration=best_iteration)
    auc_score = roc_auc_score(y_valid, valid_preds)
    oof_aucs.append(auc_score)
    print(f"Validation AUC score for fold {fold + 1}: {auc_score:.6f}")

    # Predict on test set
    test_pred = model.predict(test_df_scaled, num_iteration=best_iteration)
    oof_preds.append(test_pred)

    # Clean up to free memory
    del X_train, y_train, X_valid, y_valid, model
    gc.collect()

# Calculate overall AUC score (mean and standard deviation across folds)
auc_mean = np.mean(oof_aucs)
auc_std = np.std(oof_aucs)
print(f"\n---> Overall ROC-AUC Score: {auc_mean:.6f} ± {auc_std:.6f}\n")

# Average the predictions from each fold
test_pred_lgb = np.mean(oof_preds, axis=0)


---- Fold 1 ----

[LightGBM] [Info] Number of positive: 905638, number of negative: 6457432
[LightGBM] [Info] Auto-choosing col-wise multi-threading, the overhead of testing was 0.181401 seconds.
You can set `force_col_wise=true` to remove the overhead.
[LightGBM] [Info] Total Bins 2029
[LightGBM] [Info] Number of data points in the train set: 7363070, number of used features: 13
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.122997 -> initscore=-1.964347
[LightGBM] [Info] Start training from score -1.964347
Training until validation scores don't improve for 100 rounds


In [None]:
# Pair every test id with its fold-averaged prediction and write the result
# to disk without the dataframe's own row index
submission = pd.DataFrame({'id': test_df.index}).assign(Response=test_pred_lgb)
submission.to_csv("submission.csv", index=False)
print("Submission file created successfully.")
