# [Binary Classification of Insurance Cross Selling](https://www.kaggle.com/competitions/playground-series-s4e7)

In [7]:
# pip install \
#     --extra-index-url=https://pypi.nvidia.com \
#     cudf-cu12==24.6.* dask-cudf-cu12==24.6.* cuml-cu12==24.6.* \
#     cugraph-cu12==24.6.* cuspatial-cu12==24.6.* cuproj-cu12==24.6.* \
#     cuxfilter-cu12==24.6.* cucim-cu12==24.6.* pylibraft-cu12==24.6.* \
#     raft-dask-cu12==24.6.* cuvs-cu12==24.6.*

In [8]:
# %pip install matplotlib seaborn scikit-learn category_encoders torchmetrics kagtool

# Preprocess + Training

In [9]:
# # https://docs.rapids.ai/install
%load_ext cudf.pandas

from pathlib import Path
import os
import time
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt, seaborn as sns
from tqdm import tqdm
import gc

import joblib
from scipy import sparse
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score, roc_curve
import category_encoders as ce
from category_encoders import TargetEncoder

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, Subset, TensorDataset
import torch.optim as optim
from torchmetrics.classification import BinaryAUROC
from torchmetrics.functional import roc as torch_roc

np.set_printoptions(linewidth=140)
pd.set_option('display.width', 140)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

if 'KAGGLE_KERNEL_RUN_TYPE' in os.environ:
    path = Path('/kaggle/input/playground-series-s4e7')
else:
    path = Path('playground-series-s4e7')
    
sample_df = pd.read_csv(path/'sample_submission.csv')
test_df = pd.read_csv(path/'test.csv')
df = pd.read_csv(path/'train.csv')
display(f"Number of rows: {len(df)}, Number of columns: {len(df.columns)}")
df.head()

The cudf.pandas extension is already loaded. To reload it, use:
  %reload_ext cudf.pandas


'Number of rows: 11504798, Number of columns: 12'

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0


In [10]:
for c in ["Annual_Premium", "Policy_Sales_Channel", "Vintage", "Age"]:
    print(df[c].nunique())

51728
152
290
66


### Target Encoding Annual_Premium - One Hot Encode Others

In [11]:
def preprocess_data(df, features, scaler=None):
    start_time = time.time()`
    df = df.copy()
    
    # Explicitly cast columns to appropriate dtypes
    df["Vehicle_Age"] = df["Vehicle_Age"].astype('category').cat.rename_categories(
        {"1-2 Year": 1, "< 1 Year": 0, "> 2 Years": 2}).astype('int8')
    df["Gender"] = (df["Gender"] == "Male").astype("int8")
    df["Vehicle_Damage"] = (df["Vehicle_Damage"] == "Yes").astype("int8")
    df["Age"] = df["Age"].astype("int8")
    df["Driving_License"] = df["Driving_License"].astype("int8")
    df["Region_Code"] = df["Region_Code"].astype("int8")
    df["Previously_Insured"] = df["Previously_Insured"].astype("int8")

    # Binning continuous features
    df['Age'] = pd.qcut(df['Age'], q=50, labels=False, duplicates='drop').astype('int8')
    df['Vintage'] = pd.qcut(df['Vintage'], q=50, labels=False, duplicates='drop').astype('int8')
    df['Policy_Sales_Channel'] = pd.qcut(df['Policy_Sales_Channel'], q=50, labels=False, duplicates='drop').astype('int8')

    # Target encode Annual Premium
    if scaler is None:
        mean_response = df.groupby('Annual_Premium')['Response'].mean()
        df['Annual_Premium'] = df['Annual_Premium'].map(mean_response).astype('float16')
        mean_response.to_csv('annual_premium_mean_response.csv', index=True)
    else:
        mean_response = pd.read_csv('annual_premium_mean_response.csv', index_col=0).squeeze()
        df['Annual_Premium'] = df['Annual_Premium'].map(mean_response).astype('float16')
        df['Annual_Premium'] = df['Annual_Premium'].fillna(mean_response.mean()).astype('float16')

    if 'Response' in df:
        df['Response'] = df['Response'].astype('int8')

    # Encode all features with LabelEncoder individually
    for feat in tqdm(features, desc="Label encoding features"):
        if feat != 'Annual_Premium':
            lbl_enc = LabelEncoder()
            df[feat] = lbl_enc.fit_transform(df[feat].astype(str).values)
            df[feat] = df[feat].astype('float16')

    # Normalize Annual Premium
    if scaler is None:
        scaler = StandardScaler()
        df['Annual_Premium'] = scaler.fit_transform(df[['Annual_Premium']])
        
    if 'Response' in df:
        new_df = pd.concat([df['Response'], df[features]], axis=1)
    else:
        new_df = df[features].copy()

    return new_df, scaler

# Example usage
features = [f for f in df.columns if f not in ("id", "Response")]
new_df, scaler = preprocess_data(df, features)
display(new_df.info())

# Test preprocessing on a new dataset
processed_test_df, _ = preprocess_data(test_df, features, scaler=scaler)
display(processed_test_df.head())
display(processed_test_df.head())

Label encoding features: 100%|██████████| 10/10 [00:19<00:00,  1.91s/it]


<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 11 columns):
 #   Column                Dtype  
---  ------                -----  
 0   Response              int8   
 1   Gender                float16
 2   Age                   float16
 3   Driving_License       float16
 4   Region_Code           float16
 5   Previously_Insured    float16
 6   Vehicle_Age           float16
 7   Vehicle_Damage        float16
 8   Annual_Premium        float16
 9   Policy_Sales_Channel  float16
 10  Vintage               float16
dtypes: float16(10), int8(1)
memory usage: 230.4 MB


None

Label encoding features: 100%|██████████| 10/10 [00:12<00:00,  1.29s/it]


Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,0.0,0.0,1.0,42.0,0.0,0.0,0.0,0.12793,2.0,30.0
1,1.0,16.0,1.0,21.0,0.0,1.0,1.0,0.373047,6.0,9.0
2,1.0,16.0,1.0,38.0,0.0,1.0,1.0,0.12793,1.0,39.0
3,0.0,1.0,1.0,42.0,1.0,0.0,0.0,0.009132,8.0,7.0
4,1.0,19.0,1.0,11.0,0.0,1.0,0.0,0.378418,6.0,14.0


Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,0.0,0.0,1.0,42.0,0.0,0.0,0.0,0.12793,2.0,30.0
1,1.0,16.0,1.0,21.0,0.0,1.0,1.0,0.373047,6.0,9.0
2,1.0,16.0,1.0,38.0,0.0,1.0,1.0,0.12793,1.0,39.0
3,0.0,1.0,1.0,42.0,1.0,0.0,0.0,0.009132,8.0,7.0
4,1.0,19.0,1.0,11.0,0.0,1.0,0.0,0.378418,6.0,14.0


### Preprocess

In [12]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import pandas as pd
from torch.utils.data import Dataset, DataLoader, TensorDataset
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from torch.optim.lr_scheduler import OneCycleLR
from tqdm import tqdm  # Import tqdm for progress bars


# credit: https://www.kaggle.com/code/cpmpml/the-power-of-undersampling-cudf-pandas
def down_sampling(X, y, i, half_sample=True):
    majority_class = X[y == 0].copy()
    minority_class = X[y == 1]
    if half_sample:
        sample_size = len(minority_class) // 2  # Half the size of the minority class
    else:
        sample_size = len(minority_class)
    majority_sample, X_rest, y_sample, y_rest = train_test_split(
        majority_class, y[y == 0], train_size=sample_size, random_state=i, stratify=majority_class['Age'])
    X_minimal = pd.concat([majority_sample, minority_class], axis=0)
    y_minimal = pd.concat([y_sample, y[y == 1]])

    return X_minimal, y_minimal, X_rest, y_rest


def prepare_tensors(df, cat_features, cont_features, split=0.05, down_sample=False):
    start_time = time.time()
    
    # Split the data into training and validation sets
    df_train, df_valid = train_test_split(df, test_size=split, stratify=df["Response"])
    ytrain, yvalid = df_train.pop('Response'), df_valid.pop('Response')

    # Down-sample the training dataset if specified
    if down_sample:
        df_train, ytrain, _, _ = down_sampling(df_train, ytrain, i=42, half_sample=False)

    # Prepare feature tensors directly
    xtrain_cat = torch.tensor(df_train[cat_features].values, dtype=torch.float32).to(device)
    xvalid_cat = torch.tensor(df_valid[cat_features].values, dtype=torch.float32).to(device)
    xtrain_cont = torch.tensor(df_train[cont_features].values, dtype=torch.float32).to(device)
    xvalid_cont = torch.tensor(df_valid[cont_features].values, dtype=torch.float32).to(device)

    ytrain_tensor = torch.tensor(ytrain.values, dtype=torch.float32).to(device)
    yvalid_tensor = torch.tensor(yvalid.values, dtype=torch.float32).to(device)

    # Ensure the number of samples matches
    assert xtrain_cat.size(0) == ytrain_tensor.size(0), "Size mismatch between xtrain_cat and ytrain tensors"
    assert xvalid_cat.size(0) == yvalid_tensor.size(0), "Size mismatch between xvalid_cat and yvalid tensors"
    assert xtrain_cont.size(0) == ytrain_tensor.size(0), "Size mismatch between xtrain_cont and ytrain tensors"
    assert xvalid_cont.size(0) == yvalid_tensor.size(0), "Size mismatch between xvalid_cont and yvalid tensors"

    print(f"Tensor preparation done in {time.time() - start_time:.2f} seconds")

    return (
        TensorDataset(xtrain_cat, xtrain_cont, ytrain_tensor),
        TensorDataset(xvalid_cat, xvalid_cont, yvalid_tensor)
    )


# new_df.info()
cat_features = ['Gender', 'Age', 'Driving_License', 'Region_Code', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel', 'Vintage']
cont_features = ['Annual_Premium']
tr_ds, val_ds = prepare_tensors(new_df, cat_features, cont_features, split=0.1, down_sample=False) 
tr_ds[0][0].shape, tr_ds[0][1].shape

Tensor preparation done in 4.23 seconds


(torch.Size([9]), torch.Size([1]))

In [13]:
xtrain_cat = new_df[cat_features].values
type(xtrain_cat), xtrain_cat.dtype, xtrain_cat.shape

(cudf.pandas._wrappers.numpy.ndarray, dtype('float16'), (11504798, 9))

### Embedding + Dense Layer

In [18]:
class EntityEmbeddingModel(nn.Module):
    def __init__(self, data, catcols, contcols, dropout):
        super().__init__()
        self.emb_layers = nn.ModuleList([
            nn.Embedding(data[c].nunique() + 1, min(int(np.ceil(data[c].nunique() / 2)), 50))
            for c in tqdm(catcols, desc="Creating embedding layers")
        ])
        
        total_embed_dim = sum(emb.embedding_dim for emb in self.emb_layers)
        self.num_cont_features = len(contcols)

        self.fc_layers = nn.Sequential(
            nn.BatchNorm1d(total_embed_dim + self.num_cont_features),
            nn.Linear(total_embed_dim + self.num_cont_features, 128),
            nn.ReLU(),
            nn.BatchNorm1d(128),
            nn.Dropout(dropout),
            nn.Linear(128, 64),
            nn.ReLU(),
            nn.BatchNorm1d(64),
            nn.Dropout(dropout),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.BatchNorm1d(32),
            nn.Dropout(dropout),
            nn.Linear(32, 1)
        )
        self._initialize_weights()

    def _initialize_weights(self):
        for m in self.modules():
            if isinstance(m, nn.Linear):
                nn.init.kaiming_uniform_(m.weight, nonlinearity='relu')
                if m.bias is not None:
                    nn.init.constant_(m.bias, 0)
            elif isinstance(m, nn.Embedding):
                nn.init.xavier_uniform_(m.weight)
            elif isinstance(m, nn.BatchNorm1d):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

    def forward(self, x_cat, x_cont):
        x_cat = x_cat.long()
        x_cat = torch.cat([emb_layer(x_cat[:, i]) for i, emb_layer in enumerate(self.emb_layers)], dim=1)
        x = torch.cat([x_cat, x_cont], dim=1)
        return self.fc_layers(x)


def train_and_evaluate_model(tr_ds, val_ds, df, cat_features, cont_features, bs, epochs, lr, max_lr, dropout):
    start_time = time.time()
    train_loader = DataLoader(tr_ds, batch_size=bs, shuffle=True)
    valid_loader = DataLoader(val_ds, batch_size=bs, shuffle=False)
    
    # Initialize model
    model = EntityEmbeddingModel(df, catcols=cat_features, contcols=cont_features, dropout=dropout).to(device)
    
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=lr)
    scheduler = optim.lr_scheduler.OneCycleLR(optimizer, max_lr=max_lr, steps_per_epoch=len(train_loader), epochs=epochs)
    auc_metric = BinaryAUROC().to(device)

    for epoch in range(epochs):
        model.train()
        train_loss = 0.0
        
        for X_cat_batch, X_cont_batch, y_batch in tqdm(train_loader, desc=f"Training Epoch {epoch+1}/{epochs}"):
            X_cat_batch, X_cont_batch, y_batch = X_cat_batch.to(device, non_blocking=True), X_cont_batch.to(device, non_blocking=True), y_batch.to(device, non_blocking=True)
            optimizer.zero_grad()
            output = model(X_cat_batch, X_cont_batch).squeeze(1)
            loss = criterion(output, y_batch.float())
            loss.backward()
            optimizer.step()
            scheduler.step()
            train_loss += loss.item()
        
        model.eval()
        valid_loss = 0.0
        all_outputs, all_targets = [], []
        with torch.no_grad():
            for X_cat_batch, X_cont_batch, y_batch in valid_loader:
                X_cat_batch, X_cont_batch, y_batch = X_cat_batch.to(device, non_blocking=True), X_cont_batch.to(device, non_blocking=True), y_batch.to(device, non_blocking=True)
                output = model(X_cat_batch, X_cont_batch).squeeze(1)
                loss = criterion(output, y_batch.float())
                valid_loss += loss.item()
                all_outputs.append(output)
                all_targets.append(y_batch)

        all_outputs = torch.cat(all_outputs)
        all_targets = torch.cat(all_targets)
        auc = auc_metric(all_outputs, all_targets)
        
        print(
            f"Epoch {epoch+1} done in {time.time() - start_time:.2f} seconds, "
            f"Training Loss: {train_loss/len(train_loader)}, "
            f"Validation Loss: {valid_loss/len(valid_loader)}, "
            f"Validation AUC: {auc:.4f}"
        )
    return model, all_outputs, all_targets
        
# Train and evaluate the model
torch.cuda.empty_cache()
gc.collect()
model, all_outputs, all_targets = train_and_evaluate_model(tr_ds, val_ds, new_df, cat_features, cont_features,
                                                           bs=2048, epochs=7, lr=1e-1, max_lr=1e-1, dropout=0.5)


Creating embedding layers: 100%|██████████| 9/9 [00:02<00:00,  3.36it/s]
Training Epoch 1/7: 100%|██████████| 5056/5056 [02:49<00:00, 29.84it/s]


Epoch 1 done in 186.25 seconds, Training Loss: 0.26223835301927373, Validation Loss: 0.25406744063324777, Validation AUC: 0.8809


Training Epoch 2/7: 100%|██████████| 5056/5056 [02:49<00:00, 29.89it/s]


Epoch 2 done in 369.49 seconds, Training Loss: 0.2567911802708537, Validation Loss: 0.2543600890454024, Validation AUC: 0.8815


Training Epoch 3/7: 100%|██████████| 5056/5056 [02:51<00:00, 29.48it/s]


Epoch 3 done in 554.95 seconds, Training Loss: 0.25666678868988646, Validation Loss: 0.2503640203607464, Validation AUC: 0.8824


Training Epoch 4/7: 100%|██████████| 5056/5056 [02:49<00:00, 29.91it/s]


Epoch 4 done in 738.22 seconds, Training Loss: 0.2556032901040361, Validation Loss: 0.2509207422198774, Validation AUC: 0.8826


Training Epoch 5/7: 100%|██████████| 5056/5056 [02:50<00:00, 29.74it/s]


Epoch 5 done in 922.87 seconds, Training Loss: 0.25427875327406146, Validation Loss: 0.24950039662500292, Validation AUC: 0.8829


Training Epoch 6/7: 100%|██████████| 5056/5056 [02:50<00:00, 29.59it/s]


Epoch 6 done in 1108.31 seconds, Training Loss: 0.2529214698161128, Validation Loss: 0.24939156755858044, Validation AUC: 0.8832


Training Epoch 7/7: 100%|██████████| 5056/5056 [02:50<00:00, 29.61it/s]


Epoch 7 done in 1293.60 seconds, Training Loss: 0.25196926890506965, Validation Loss: 0.24931364901549452, Validation AUC: 0.8832


# Submission

In [23]:
def prepare_test_tensors(df, cat_features, cont_features):
    # Start timing the process
    start_time = time.time()
    
    # Separate categorical and continuous features
    x_cat = df[cat_features].values
    x_cont = df[cont_features].values
    
    # Convert to numpy arrays
    x_cat = np.array(x_cat)
    x_cont = np.array(x_cont)
    
    print(f"x_cat shape: {x_cat.shape}, x_cat dtype: {x_cat.dtype}")
    print(f"x_cont shape: {x_cont.shape}, x_cont dtype: {x_cont.dtype}")
    print(f"Prepare x_cat, x_cont done in {time.time() - start_time:.2f} seconds")
    
    print('x_cat sample:', x_cat[0])
    print('x_cont sample:', x_cont[0])
    
    # Convert to PyTorch tensors with appropriate dtype
    start_time = time.time()
    x_cat_tensor = torch.from_numpy(x_cat).to(device)
    x_cont_tensor = torch.from_numpy(x_cont).to(device)
    
    print(f"x_cat_tensor dtype: {x_cat_tensor.dtype}")
    print(f"x_cont_tensor dtype: {x_cont_tensor.dtype}")
    print(f"x_cat_tensor shape: {x_cat_tensor.shape}")
    print(f"x_cont_tensor shape: {x_cont_tensor.shape}")
    print(f"Tensors conversion done in {time.time() - start_time:.2f} seconds")
    return TensorDataset(x_cat_tensor, x_cont_tensor)

processed_test_df, _ = preprocess_data(test_df, features, scaler=scaler)
display(processed_test_df.head())

test_ds = prepare_test_tensors(processed_test_df, cat_features, cont_features)
len(test_ds)

Label encoding features: 100%|██████████| 10/10 [00:12<00:00,  1.30s/it]


Unnamed: 0,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage
0,0.0,0.0,1.0,42.0,0.0,0.0,0.0,0.12793,2.0,30.0
1,1.0,16.0,1.0,21.0,0.0,1.0,1.0,0.373047,6.0,9.0
2,1.0,16.0,1.0,38.0,0.0,1.0,1.0,0.12793,1.0,39.0
3,0.0,1.0,1.0,42.0,1.0,0.0,0.0,0.009132,8.0,7.0
4,1.0,19.0,1.0,11.0,0.0,1.0,0.0,0.378418,6.0,14.0


x_cat shape: (7669866, 9), x_cat dtype: float16
x_cont shape: (7669866, 1), x_cont dtype: float16
Prepare x_cat, x_cont done in 0.18 seconds
x_cat sample: [ 0.  0.  1. 42.  0.  0.  0.  2. 30.]
x_cont sample: [0.1279]
x_cat_tensor dtype: torch.float16
x_cont_tensor dtype: torch.float16
x_cat_tensor shape: torch.Size([7669866, 9])
x_cont_tensor shape: torch.Size([7669866, 1])
Tensors conversion done in 0.01 seconds


7669866

In [24]:
model.eval()
test_preds = []
with torch.no_grad():
    for X_cat_batch, X_cont_batch in tqdm(DataLoader(test_ds, batch_size=1024, shuffle=False), desc="Processing batches"):
        output = model(X_cat_batch, X_cont_batch).squeeze(1)
        test_preds.append(output)
test_preds = torch.cat(test_preds)
display(test_preds.shape)

Processing batches: 100%|██████████| 7491/7491 [00:59<00:00, 126.75it/s]


torch.Size([7669866])

In [25]:
# Compute ROC curve using torchmetrics
fpr, tpr, thresholds = torch_roc(all_outputs, all_targets.long(), task="binary")

# Convert tensors to numpy arrays for further processing
fpr = fpr.cpu().numpy()
tpr = tpr.cpu().numpy()
thresholds = thresholds.cpu().numpy()

# Calculate the Youden's J statistic for each threshold
youden_j = tpr - fpr

# Find the index of the maximum Youden's J statistic
optimal_index = np.argmax(youden_j)
optimal_threshold = thresholds[optimal_index]
print(f'Optimal Threshold: {optimal_threshold}')

proba = torch.sigmoid(test_preds).cpu().numpy().reshape(-1, 1)
preds_df = pd.DataFrame({
    'id': test_df.id,
    'target': (proba >= optimal_threshold).astype(int).flatten()
})
preds_df.to_csv("subm_mlp_embedding.csv", index=False)
preds_df.head()

Optimal Threshold: 0.1168849989771843


Unnamed: 0,id,target
0,11504798,0
1,11504799,1
2,11504800,1
3,11504801,0
4,11504802,0
