## Step 0. Import libraries

In [1]:
import numpy as np 
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

import math 
import copy
import pickle
import gc

from sklearn.model_selection import StratifiedKFold
from xgboost import XGBClassifier
from sklearn.metrics import roc_auc_score
from sklearn.base import clone

from sklearn.preprocessing import StandardScaler

## Step 1. Load and explore the data

In [2]:
# Notebook-wide configuration.
TARGET = 'Response'  # label column of the training data
SEED = 94            # random seed shared by CV splitting and the models
FOLDS = 10           # number of cross-validation folds (read by the hyper-parameter search cell)

In [3]:
print('Loading Data...')

# Read the train and test tables, then the sample submission file.
train, test = (pd.read_csv(csv_path) for csv_path in ('input/train.csv', 'input/test.csv'))
submission_data = pd.read_csv('input/sample_submission.csv')

print('Data Load Successfully.')

Loading Data...
Data Load Successfully.


In [4]:
train.shape, test.shape

((11504798, 12), (7669866, 11))

In [5]:
train.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11504798 entries, 0 to 11504797
Data columns (total 12 columns):
 #   Column                Dtype  
---  ------                -----  
 0   id                    int64  
 1   Gender                object 
 2   Age                   int64  
 3   Driving_License       int64  
 4   Region_Code           float64
 5   Previously_Insured    int64  
 6   Vehicle_Age           object 
 7   Vehicle_Damage        object 
 8   Annual_Premium        float64
 9   Policy_Sales_Channel  float64
 10  Vintage               int64  
 11  Response              int64  
dtypes: float64(3), int64(6), object(3)
memory usage: 1.0+ GB


In [6]:
train.describe()

Unnamed: 0,id,Age,Driving_License,Region_Code,Previously_Insured,Annual_Premium,Policy_Sales_Channel,Vintage,Response
count,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0
mean,5752398.0,38.38356,0.998022,26.41869,0.4629966,30461.37,112.4254,163.8977,0.1229973
std,3321149.0,14.99346,0.0444312,12.99159,0.4986289,16454.75,54.03571,79.97953,0.3284341
min,0.0,20.0,0.0,0.0,0.0,2630.0,1.0,10.0,0.0
25%,2876199.0,24.0,1.0,15.0,0.0,25277.0,29.0,99.0,0.0
50%,5752398.0,36.0,1.0,28.0,0.0,31824.0,151.0,166.0,0.0
75%,8628598.0,49.0,1.0,35.0,1.0,39451.0,152.0,232.0,0.0
max,11504800.0,85.0,1.0,52.0,1.0,540165.0,163.0,299.0,1.0


In [7]:
# Combine datasets for processing.
# Tag each row with its origin so the two parts can be separated again later.
train['is_train'], test['is_train'] = 1, 0

# `test` has no Response column, so its rows carry NaN there after the concat.
df = pd.concat([train, test])
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,is_train
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0.0,1
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1.0,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0.0,1
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0.0,1
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0.0,1


## Step 2. Data preprocessing

In [8]:
# Check missing values
df.isnull().sum()

id                            0
Gender                        0
Age                           0
Driving_License               0
Region_Code                   0
Previously_Insured            0
Vehicle_Age                   0
Vehicle_Damage                0
Annual_Premium                0
Policy_Sales_Channel          0
Vintage                       0
Response                7669866
is_train                      0
dtype: int64

* **Age and Vehicle_Age (0.77)**:
    Strong positive correlation. Older individuals tend to have older vehicles.
* **Previously_Insured and Vehicle_Damage (-0.84)**:
    Strong negative correlation. If someone is previously insured, their vehicle is less likely to be damaged.
* **Policy_Sales_Channel and Age (-0.60)**:
    Moderate negative correlation. Younger individuals are more likely to be reached through certain sales channels.

In [9]:
def transform_categorical_features(df):
    """Encode the string-valued categorical columns as small integers.

    ``Gender``, ``Vehicle_Age`` and ``Vehicle_Damage`` are overwritten in place
    using fixed lookup tables. A value that is missing from its table becomes
    NaN (``Series.map`` semantics).

    Args:
        df: DataFrame containing the three columns named above.

    Returns:
        The same DataFrame object, with the three columns replaced.
    """
    print('Transforming categorical features..')

    # One lookup table per column; applied in the order listed.
    encodings = {
        'Gender': {'Male': 0, 'Female': 1},
        'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
        'Vehicle_Damage': {'No': 0, 'Yes': 1},
    }
    for column, lookup in encodings.items():
        df[column] = df[column].map(lookup)

    print("Transformed successfully.")
    return df

In [10]:
def create_additional_features(df):
    """Add integer-coded interaction features built from pairs of columns.

    For each pair, the two columns are rendered as strings, joined with an
    underscore, and the combined key is replaced by its ``pd.factorize`` code
    (codes are assigned in order of first appearance). The new columns are
    written onto ``df`` in place.

    Args:
        df: DataFrame containing ``Vehicle_Age``, ``Policy_Sales_Channel``,
            ``Age``, ``Previously_Insured``, ``Vehicle_Damage`` and ``Vintage``.

    Returns:
        The same DataFrame object, with five additional columns.
    """
    print('Creating additional features..')

    def _pair_codes(left, right):
        # The separator keeps distinct pairs distinct: with plain concatenation
        # ('1', '23') and ('12', '3') would both collapse to the key '123'.
        combined = df[left].astype(str) + '_' + df[right].astype(str)
        return pd.factorize(combined)[0]

    # (new column, left source column, right source column)
    interactions = [
        ('Vehicle_Age_Policy_Sales_Channel', 'Vehicle_Age', 'Policy_Sales_Channel'),
        ('Age_Vehicle_Age', 'Age', 'Vehicle_Age'),
        ('Prev_Insured_Vehicle_Damage', 'Previously_Insured', 'Vehicle_Damage'),
        ('Prev_Insured_Vintage', 'Previously_Insured', 'Vintage'),
        ('Policy_Sales_Channel_Age', 'Policy_Sales_Channel', 'Age'),
    ]
    for new_column, left, right in interactions:
        df[new_column] = _pair_codes(left, right)

    return df

In [11]:
def adjust_data_types(df):
    """Cast the float-typed code/amount columns to integers, in place.

    Args:
        df: DataFrame containing ``Region_Code``, ``Annual_Premium`` and
            ``Policy_Sales_Channel``.

    Returns:
        The same DataFrame object, with those three columns cast via
        ``astype(int)``.
    """
    print('Adjusting data types')
    for column in ('Region_Code', 'Annual_Premium', 'Policy_Sales_Channel'):
        df[column] = df[column].astype(int)

    return df

In [12]:
def optimize_memory_usage(df):
    """Downcast numeric columns in place to the smallest dtype covering their range.

    Signed-integer columns are narrowed to int8/int16/int32 when their min and
    max fit (bounds inclusive). Float columns are narrowed to float16 or
    float32 by range, otherwise stored as float64. Columns of any other dtype
    (bool, unsigned, datetime, pandas extension dtypes) are left unchanged.
    Memory usage before and after is printed.

    Args:
        df: DataFrame to shrink. It must not contain ``object`` or
            ``category`` columns.

    Returns:
        The same DataFrame object, with downcast columns.

    Raises:
        ValueError: if a column has dtype ``object`` or ``category``.
    """
    print('Optimizing memory usage')
    start_mem_usage = df.memory_usage().sum() / 1024**2

    for col in df.columns:
        col_type = df[col].dtype
        if col_type.name in ['category', 'object']:
            raise ValueError(f"Column '{col}' is of type '{col_type.name}'")

        # Only plain NumPy signed-int and float columns are downcast. Sending
        # bool or unsigned columns down the float path would change their kind
        # (and can even enlarge them), so they are skipped.
        if not isinstance(col_type, np.dtype) or col_type.kind not in ('i', 'f'):
            continue

        c_min = df[col].min()
        c_max = df[col].max()

        if col_type.kind == 'i':
            # Inclusive bounds: a column whose max is exactly 127 fits int8.
            # If nothing narrower fits, the column keeps its current dtype.
            for candidate in (np.int8, np.int16, np.int32):
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        else:
            # NOTE: the choice is made on range only. float16 has an 11-bit
            # significand, so in-range values may still be rounded.
            for candidate in (np.float16, np.float32):
                limits = np.finfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
            else:
                # Out of float32 range, infinite, or all-NaN (comparisons fail).
                df[col] = df[col].astype(np.float64)

    end_mem_usage = df.memory_usage().sum() / 1024**2
    print(f'------ Memory usage before: {start_mem_usage:.2f} MB')
    print(f'------ Memory usage after: {end_mem_usage:.2f} MB')
    print(f'------ Reduced memory usage by {(100 * (start_mem_usage - end_mem_usage) / start_mem_usage):.1f}%')

    return df

In [13]:
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,is_train
0,0,Male,21,1,35.0,0,1-2 Year,Yes,65101.0,124.0,187,0.0,1
1,1,Male,43,1,28.0,0,> 2 Years,Yes,58911.0,26.0,288,1.0,1
2,2,Female,25,1,14.0,1,< 1 Year,No,38043.0,152.0,254,0.0,1
3,3,Female,35,1,1.0,0,1-2 Year,Yes,2630.0,156.0,76,0.0,1
4,4,Female,36,1,15.0,1,1-2 Year,No,31951.0,152.0,294,0.0,1


In [14]:
# Run the preprocessing steps in order; each step receives the frame
# returned by the previous one.
df = (
    df
    .pipe(transform_categorical_features)
    .pipe(adjust_data_types)
    .pipe(create_additional_features)
    .pipe(optimize_memory_usage)
)

df.head()

Transforming categorical features..
Transformed successfully.
Adjusting data types
Creating additional features..
Optimizing memory usage
------ Memory usage before: 2560.09 MB
------ Memory usage after: 713.17 MB
------ Reduced memory usage by 72.1%


Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,is_train,Vehicle_Age_Policy_Sales_Channel,Age_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage,Policy_Sales_Channel_Age
0,0,0,21,1,35,0,1,1,65101,124,187,0.0,1,0,0,0,0,0
1,1,0,43,1,28,0,2,1,58911,26,288,1.0,1,1,1,0,1,1
2,2,1,25,1,14,1,0,0,38043,152,254,0.0,1,2,2,1,2,2
3,3,1,35,1,1,0,1,1,2630,156,76,0.0,1,3,3,0,3,3
4,4,1,36,1,15,1,1,0,31951,152,294,0.0,1,4,4,1,4,4


In [None]:
# Pairwise correlations between all columns of the combined frame
corr = df.corr()

# Draw the matrix as an annotated heatmap on a fixed -1..1 colour scale
fig, ax = plt.subplots(figsize=(12, 8))
sns.heatmap(corr, annot=True, cmap='coolwarm', fmt='.2f', vmin=-1, vmax=1, ax=ax)
ax.set_title('Correlation Matrix')
plt.show()

In [15]:
from sklearn.preprocessing import MinMaxScaler

# Initialize MinMaxScaler
min_max_scaler = MinMaxScaler()

# Select features to scale
features_to_scale = ['Annual_Premium', 'Vintage', 'Policy_Sales_Channel']

# Learn min/max from the training rows only, so no statistic of the test set
# leaks into preprocessing, then apply the same scaling to every row. Test
# values outside the training range will fall outside [0, 1].
# A NumPy mask is used because the combined frame has repeated index labels
# (train and test were concatenated with their own indexes).
train_rows = (df['is_train'] == 1).to_numpy()
min_max_scaler.fit(df.loc[train_rows, features_to_scale])
df[features_to_scale] = min_max_scaler.transform(df[features_to_scale])

In [16]:
df.head()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Response,is_train,Vehicle_Age_Policy_Sales_Channel,Age_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage,Policy_Sales_Channel_Age
0,0,0,21,1,35,0,1,1,0.116218,0.759259,0.612457,0.0,1,0,0,0,0,0
1,1,0,43,1,28,0,2,1,0.104702,0.154321,0.961938,1.0,1,1,1,0,1,1
2,2,1,25,1,14,1,0,0,0.06588,0.932099,0.844291,0.0,1,2,2,1,2,2
3,3,1,35,1,1,0,1,1,0.0,0.95679,0.228374,0.0,1,3,3,0,3,3
4,4,1,36,1,15,1,1,0,0.054547,0.932099,0.982699,0.0,1,4,4,1,4,4


In [None]:
# Drop the helper binned columns if they are present. In this notebook they are
# only created by the optional binning cell under "Possible improvements", so
# a missing column is ignored instead of raising KeyError.
df = df.drop(columns=['Annual_Premium_Binned', 'Vintage_Binned'], errors='ignore')


In [None]:
df.head()

## Step 4. Split the data

In [17]:
# Separate the combined frame back into its train and test parts; the
# is_train marker is dropped because it is no longer needed afterwards.
train = df[df['is_train'] == 1].drop(columns=['is_train'])
test = df[df['is_train'] == 0].drop(columns=['is_train'])

# Features and label for training
X_train, y_train = train.drop(columns=[TARGET]), train[TARGET]

# Test features (the target column is all-NaN for these rows)
X_test = test.drop(columns=[TARGET])
# NOTE(review): this is the sample submission file, not ground-truth labels.
y_test = submission_data

In [18]:
X_train.shape

(11504798, 16)

### Subsample the data to speed up the training process

In [None]:

# Draw a 1% random sample of the training rows, then pick the labels by the
# sampled index. This pairs features and labels by construction instead of
# relying on two separate sample() calls returning the same rows.
X_train_subsample = X_train.sample(frac=0.01, random_state=42)
y_train_subsample = y_train.loc[X_train_subsample.index]


In [None]:
# 1% random sample of the test features, drawn with the same seed as the training sample
X_test_subsample = X_test.sample(frac=0.01, random_state=42)

In [None]:
# Show both shapes: a cell only displays its last expression, so they are
# returned together as one tuple.
X_train_subsample.shape, X_test_subsample.shape

In [None]:
X_train.head()

In [None]:
import tensorflow as tf
from sklearn.model_selection import train_test_split
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.callbacks import Callback, EarlyStopping


# Custom callback to print additional training information
class CustomCallback(Callback):
    """Print a compact loss/AUC summary after every epoch.

    Expects the log keys 'loss', 'auc', 'val_loss' and 'val_auc', i.e. the
    model must be compiled with an AUC metric named 'auc' and fitted with
    validation data.
    """

    def on_epoch_end(self, epoch, logs=None):
        print(f"Epoch {epoch + 1}/{self.params['epochs']}")
        print(f" - loss: {logs['loss']:.4f} - auc: {logs['auc']:.4f} - val_loss: {logs['val_loss']:.4f} - val_auc: {logs['val_auc']:.4f}")


# The test rows have no labels (y_test is only the sample submission), so a
# stratified 20% of the training data is held out for validation and scoring.
X_fit, X_val, y_fit, y_val = train_test_split(
    X_train, y_train, test_size=0.2, stratify=y_train, random_state=SEED
)

# Build the model
model = Sequential()
model.add(Dense(128, input_dim=X_train.shape[1], activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(64, activation='relu'))
model.add(Dropout(0.5))
model.add(Dense(32, activation='relu'))
model.add(Dense(1, activation='sigmoid'))

# Compile the model with AUC as a metric; the metric is named explicitly so
# the log keys read by CustomCallback are always 'auc' / 'val_auc'.
model.compile(optimizer=Adam(learning_rate=0.001), loss='binary_crossentropy', metrics=[tf.keras.metrics.AUC(name='auc')])

# Define early stopping callback
early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

# Train the model
history = model.fit(
    X_fit,
    y_fit,
    epochs=100,
    batch_size=32,
    validation_data=(X_val, y_val),
    callbacks=[early_stopping, CustomCallback()],
)

# Evaluate the model on the held-out validation data
loss, auc = model.evaluate(X_val, y_val)
print(f"Validation AUC: {auc:.4f}")

# Cross-check the validation AUC with scikit-learn
val_pred_proba = model.predict(X_val).ravel()
roc_auc = roc_auc_score(y_val, val_pred_proba)
print(f"ROC AUC: {roc_auc:.4f}")

# Predict probabilities for the unlabeled test set
y_pred_proba = model.predict(X_test).ravel()

In [23]:
X_train.describe()

Unnamed: 0,id,Gender,Age,Driving_License,Region_Code,Previously_Insured,Vehicle_Age,Vehicle_Damage,Annual_Premium,Policy_Sales_Channel,Vintage,Vehicle_Age_Policy_Sales_Channel,Age_Vehicle_Age,Prev_Insured_Vehicle_Damage,Prev_Insured_Vintage,Policy_Sales_Channel_Age
count,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0,11504800.0
mean,5752398.0,0.458649,38.38356,0.998022,26.41869,0.4629966,0.6031073,0.5026798,0.05177592,0.6878114,0.5325181,8.525531,30.60593,0.6286911,263.7749,210.2554
std,3321149.0,0.4982872,14.99346,0.0444312,12.99159,0.4986289,0.5678559,0.4999928,0.03061149,0.3335538,0.2767458,18.1163,26.20851,0.7044007,162.391,444.9097
min,0.0,0.0,20.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,2876199.0,0.0,24.0,1.0,15.0,0.0,0.0,0.0,0.04213121,0.1728395,0.3079585,2.0,10.0,0.0,123.0,13.0
50%,5752398.0,0.0,36.0,1.0,28.0,0.0,1.0,1.0,0.05431088,0.9259259,0.5397924,2.0,24.0,1.0,255.0,62.0
75%,8628598.0,1.0,49.0,1.0,35.0,1.0,1.0,1.0,0.06849973,0.9320988,0.7681661,5.0,44.0,1.0,396.0,177.0
max,11504800.0,1.0,85.0,1.0,52.0,1.0,2.0,1.0,1.0,1.0,1.0,385.0,190.0,3.0,579.0,6505.0


## Step 5. Train and evaluate the model

In [33]:
def train_and_evaluate(model, X, y, X_test, folds=10, random_state=None):
    """Cross-validate ``model`` with stratified K-fold and average its test predictions.

    For each fold a deep copy of ``model`` is fitted on the training part, with
    the validation part passed as ``eval_set``. The copy then predicts
    positive-class probabilities for the validation rows (stored as
    out-of-fold predictions) and for ``X_test`` (averaged over the folds). The
    per-fold AUC and the mean ± std over folds are printed.

    Args:
        model: Unfitted classifier whose ``fit`` accepts ``eval_set`` and
            ``verbose`` and which provides ``predict_proba`` (for example
            ``XGBClassifier``).
        X: Training features; indexed positionally with ``.iloc``.
        y: Training labels; indexed positionally with ``.iloc``.
        X_test: Features to predict with every fold model.
        folds: Number of stratified folds. Used both for the split and for
            averaging the test predictions.
        random_state: Seed for the shuffled split. ``None`` gives a different
            split on every call.

    Returns:
        Tuple ``(oof_pred_probs, test_pred_probs)``: out-of-fold probabilities
        aligned with the rows of ``X``, and fold-averaged probabilities for
        ``X_test``.
    """
    print(f'Training {model.__class__.__name__}\n')

    scores = []
    oof_pred_probs = np.zeros(X.shape[0])
    test_pred_probs = np.zeros(X_test.shape[0])

    # The split must use the same `folds` that the averaging below divides by.
    skf = StratifiedKFold(n_splits=folds, shuffle=True, random_state=random_state)

    for fold_index, (train_index, val_index) in enumerate(skf.split(X, y)):
        X_fold_train, X_val = X.iloc[train_index], X.iloc[val_index]
        y_fold_train, y_val = y.iloc[train_index], y.iloc[val_index]

        # Fresh copy per fold so no fitted state carries over between folds.
        model_clone = copy.deepcopy(model)
        model_clone.fit(
                X_fold_train,
                y_fold_train,
                eval_set=[(X_val, y_val)],
                verbose=500)

        y_pred_probs = model_clone.predict_proba(X_val)[:, 1]
        oof_pred_probs[val_index] = y_pred_probs

        # Each fold contributes an equal share to the averaged test prediction.
        test_pred_probs += model_clone.predict_proba(X_test)[:, 1] / folds

        auc_score = roc_auc_score(y_val, y_pred_probs)
        scores.append(auc_score)

        print(f'\n--- Fold {fold_index + 1} - AUC: {auc_score:.5f}\n\n')

        # Release the fitted model before the next fold to limit peak memory.
        del model_clone
        gc.collect()

    print(f'------ Average AUC: {np.mean(scores):.5f} ± {np.std(scores):.5f}\n\n')

    return oof_pred_probs, test_pred_probs

In [36]:
# Hyper-parameters for the final XGBoost classifier.
# NOTE(review): both 'alpha' and 'reg_alpha' are set. XGBoost documents `alpha`
# as an alias of `reg_alpha`, so one of the two values is presumably redundant
# — confirm which one is intended.
# NOTE(review): max_bin is far above the usual histogram bin counts — confirm
# it is deliberate.
best_params = {
    'alpha': 1.302348865795227e-06, 
    'max_depth': 15, 
    'learning_rate': 0.061800451723613786, 
    'subsample': 0.7098803046786328, 
    'colsample_bytree': 0.2590672912533101, 
    'min_child_weight': 10, 
    'gamma': 0.8399887056014855, 
    'reg_alpha': 0.0016943548302122801, 
    'max_bin': 71284,
    'early_stopping_rounds': 50
}
# n_estimators is only an upper bound here: with early_stopping_rounds set,
# the logged folds stopped after roughly 1,200-1,500 boosting rounds.
best_xgb_model = XGBClassifier(**best_params, n_estimators=12000, random_state=94, eval_metric="auc")

# Call train_and_evaluate function with XGBClassifier model
oof_pred_probs, predictions = train_and_evaluate(best_xgb_model, X_train, y_train, X_test, folds=10, random_state=SEED)

Training XGBClassifier

[0]	validation_0-auc:0.77383
[500]	validation_0-auc:0.88718
[1000]	validation_0-auc:0.88876
[1500]	validation_0-auc:0.88930
[1523]	validation_0-auc:0.88931

--- Fold 1 - AUC: 0.88932


[0]	validation_0-auc:0.77525
[500]	validation_0-auc:0.88686
[1000]	validation_0-auc:0.88862
[1500]	validation_0-auc:0.88919
[1505]	validation_0-auc:0.88921

--- Fold 2 - AUC: 0.88922


[0]	validation_0-auc:0.77428
[500]	validation_0-auc:0.88699
[1000]	validation_0-auc:0.88878
[1500]	validation_0-auc:0.88928
[1514]	validation_0-auc:0.88929

--- Fold 3 - AUC: 0.88930


[0]	validation_0-auc:0.77341
[500]	validation_0-auc:0.88636
[1000]	validation_0-auc:0.88807
[1500]	validation_0-auc:0.88870
[1538]	validation_0-auc:0.88870

--- Fold 4 - AUC: 0.88870


[0]	validation_0-auc:0.77619
[500]	validation_0-auc:0.88745
[1000]	validation_0-auc:0.88921
[1160]	validation_0-auc:0.88938

--- Fold 5 - AUC: 0.88939


[0]	validation_0-auc:0.77452
[500]	validation_0-auc:0.88713
[1000]	validation_0-auc

In [37]:
# Pair every test id with its fold-averaged predicted probability and
# write the result without the DataFrame index.
submission = pd.DataFrame({'id': X_test['id'], 'Response': predictions})
submission.to_csv('submission.csv', index=False)
submission.head()

Unnamed: 0,id,Response
0,11504798,0.003977
1,11504799,0.414721
2,11504800,0.193119
3,11504801,0.000106
4,11504802,0.055567


## Possible improvements

In [None]:
## Binning some features

from sklearn.preprocessing import LabelEncoder, StandardScaler

# NOTE(review): the bin edges below look like raw-unit thresholds, but the
# MinMaxScaler cell earlier in the notebook rescales 'Vintage' and
# 'Annual_Premium' to [0, 1] — confirm the intended execution order.
# NOTE(review): pd.cut is called with its default right-closed intervals, so a
# value equal to the lowest edge (0) gets no bin — confirm this is acceptable.

# Binning Vintage
bins_vintage = [0, 200, 400, 600, 800, float('inf')]
labels_vintage = ['Very New', 'New', 'Moderately New', 'Experienced', 'Very Experienced']
df['Vintage_Binned'] = pd.cut(
    df['Vintage'],
    bins=bins_vintage,
    labels=labels_vintage,
)

# Binning Annual_Premium
bins_premium = [0, 10000, 30000, 50000, 100000, float('inf')]
labels_premium = ['Very Low', 'Low', 'Moderate', 'High', 'Very High']
df['Annual_Premium_Binned'] = pd.cut(
    df['Annual_Premium'],
    bins=bins_premium,
    labels=labels_premium,
)

# Encoding Policy_Sales_Channel as consecutive integer labels
le = LabelEncoder()
df['Policy_Sales_Channel_Encoded'] = le.fit_transform(df['Policy_Sales_Channel'])

# Dropping original columns now that their derived versions exist
df = df.drop(columns=['Vintage', 'Annual_Premium', 'Policy_Sales_Channel'])

# Integer codes for the binned columns, numbered in order of first appearance
df['Annual_Premium_Binned_Numeric'] = pd.factorize(df['Annual_Premium_Binned'])[0]
df['Vintage_Binned_Numeric'] = pd.factorize(df['Vintage_Binned'])[0]

In [None]:
## Hyper-parameter tuning with RandomizedSearchCV

from sklearn.model_selection import RandomizedSearchCV
from xgboost import XGBClassifier


# Candidate values the random search samples from
xgb_params = {
    'colsample_bylevel': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'colsample_bynode': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'colsample_bytree': [0.1, 0.2, 0.3, 0.5, 0.7, 1.0],
    'gamma': [0, 0.1, 0.5, 0.6051, 1],
    'max_bin': [256, 512, 682, 1024],
    'max_delta_step': [0, 1, 5, 7, 10],
    'max_depth': [3, 5, 10, 20, 50, 68, 100],
    'min_child_weight': [1, 3, 5, 7, 10],
    'n_estimators': [100, 500, 1000, 5000, 10000],
    'reg_alpha': [0, 0.1, 0.4651, 0.5],
    'reg_lambda': [0, 0.1, 0.5, 1],
    'subsample': [0.5, 0.6, 0.7, 0.8, 0.9, 1.0]
}

# Set up cross-validation strategy
# NOTE(review): FOLDS must be defined (e.g. in the configuration cell) before this cell runs.
cv = StratifiedKFold(n_splits=FOLDS, shuffle=True, random_state=SEED)

# Base estimator whose hyper-parameters are searched
xgb_model = XGBClassifier(
    objective="binary:logistic",
    n_jobs=-1,
    random_state=SEED,
    eval_metric="auc",
    verbosity=0,
    tree_method='hist',
)

# Five sampled configurations, each scored by cross-validated ROC AUC
random_search = RandomizedSearchCV(
    estimator=xgb_model,
    param_distributions=xgb_params,
    n_iter=5,
    scoring='roc_auc',
    cv=cv,
    verbose=1,
    random_state=SEED,
)
print(random_search)

# The search runs on the 1% training subsample to keep it fast
random_search.fit(X_train_subsample, y_train_subsample)

print("Best parameters found: ", random_search.best_params_)
print("Best AUC score: ", random_search.best_score_)