In [1]:
import gc
import logging
import os
import warnings
from datetime import datetime

import joblib
import klib
import lightgbm as lgb
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import xgboost as xgb
from catboost import CatBoostClassifier, Pool
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold
from sklearn.preprocessing import StandardScaler


warnings.filterwarnings("ignore")

In [2]:
import pandas as pd
import numpy as np
import gc
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import joblib


In [3]:

# Paths to datasets.
# The directory can be overridden with the KAGGLE_DATA_DIR environment variable so the
# notebook runs on other machines; when it is unset the original local folder is used.
DATA_DIR = os.environ.get(
    "KAGGLE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets",
)
train_path = os.path.join(DATA_DIR, "train.csv")
test_path = os.path.join(DATA_DIR, "test.csv")



In [4]:

def import_data(path, index_col=None):
    """Read the CSV at ``path`` and return it after ``reduce_mem_usage`` has processed it.

    Parameters
    ----------
    path : path of the CSV file, forwarded to ``pd.read_csv``.
    index_col : column to use as the index, forwarded to ``pd.read_csv``
        (``None`` keeps the default index).

    Returns
    -------
    The value returned by ``reduce_mem_usage`` for the loaded frame.
    """
    return reduce_mem_usage(pd.read_csv(path, index_col=index_col))

def reduce_mem_usage(df):
    """Downcast the numeric columns of ``df`` to narrower dtypes to reduce memory usage.

    Integer columns (dtype name starting with ``int``) are cast to the smallest of
    int8/int16/int32/int64 whose range contains the column's min and max. Float columns
    are cast to float32 when min and max lie within the float32 range, otherwise to
    float64. All other columns are left untouched.

    Range checks are inclusive, so a value sitting exactly on a dtype limit (e.g. 127
    for int8) still qualifies for that dtype.

    Note that the float32 cast only checks the value range; it can reduce precision.

    Parameters
    ----------
    df : DataFrame to optimise. It is modified in place.

    Returns
    -------
    The same DataFrame object, with columns downcast where possible.
    """
    int_candidates = (np.int8, np.int16, np.int32, np.int64)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # str(IntervalDtype) is "interval[...]", which would match the "int" prefix test below.
        if isinstance(col_type, pd.IntervalDtype):
            continue

        type_name = str(col_type)
        if type_name[:3] == 'int':
            c_min = df[col].min()
            c_max = df[col].max()
            # Candidates are ordered narrowest first; stop at the first one that fits.
            # If min/max are NaN (empty column) no candidate matches and the column is kept.
            for candidate in int_candidates:
                limits = np.iinfo(candidate)
                if limits.min <= c_min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif type_name[:5] == 'float':
            c_min = df[col].min()
            c_max = df[col].max()
            # NaN or infinite extremes fail the comparison and fall through to float64.
            if float32_limits.min <= c_min and c_max <= float32_limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                df[col] = df[col].astype(np.float64)
    return df

def _pair_codes(df, left, right):
    """Return integer codes identifying each distinct (left, right) value pair in ``df``."""
    # The separator keeps distinct pairs distinct: without it "1" + "12" and "11" + "2"
    # would both become "112" and share a code.
    combined = df[left].astype(str) + '_' + df[right].astype(str)
    return pd.factorize(combined)[0]


def feature_engineering(df):
    """Add binned, interaction and pair-code features to ``df``.

    Steps, in order:
      1. ``Age_Type``: ``Age`` cut into 7 equal-width bins, stored as integer bin codes.
      2. ``Vehicle_Age``, ``Vehicle_Damage`` and ``Previously_Insured`` are replaced by
         their category codes.
         NOTE(review): category codes follow the sorted category order, which may not
         match the natural ordering of ``Vehicle_Age`` - confirm against the raw values.
      3. ``Age_x_*``: ``Age_Type`` multiplied by each of the three encoded columns.
      4. ``Previously_Insured_x_*`` and ``Policy_Sales_Channel_x_*``: factorized codes of
         column pairs.

    Parameters
    ----------
    df : DataFrame containing the columns ``Age``, ``Vehicle_Age``, ``Vehicle_Damage``,
        ``Previously_Insured``, ``Policy_Sales_Channel``, ``Annual_Premium`` and
        ``Vintage``. It is modified in place.

    Returns
    -------
    tuple ``(df, col_pre, col_pro)`` where ``col_pre`` lists the new
    ``Previously_Insured_x_*`` column names and ``col_pro`` the new
    ``Policy_Sales_Channel_x_*`` column names.
    """
    # labels=False yields the bin position instead of an Interval object.
    df['Age_Type'] = pd.cut(df['Age'], bins=7, labels=False)

    encoded_cols = ['Vehicle_Age', 'Vehicle_Damage', 'Previously_Insured']
    for name in encoded_cols:
        df[name] = df[name].astype('category').cat.codes

    for name in encoded_cols:
        df['Age_x_' + name] = df['Age_Type'] * df[name]

    fac_pre = ['Policy_Sales_Channel', 'Vehicle_Damage', 'Annual_Premium', 'Vintage', 'Age_Type']
    col_pre = []
    for partner in fac_pre:
        feature = 'Previously_Insured_x_' + partner
        df[feature] = _pair_codes(df, 'Previously_Insured', partner)
        col_pre.append(feature)

    # Skip the first entry so Policy_Sales_Channel is not paired with itself.
    fac_pro = fac_pre[1:]
    col_pro = []
    for partner in fac_pro:
        feature = 'Policy_Sales_Channel_x_' + partner
        df[feature] = _pair_codes(df, 'Policy_Sales_Channel', partner)
        col_pro.append(feature)
    return df, col_pre, col_pro


In [5]:


# Read both CSVs with `id` as the index; import_data also runs the dtype downcasting
train_df = import_data(train_path, index_col='id')
test_df = import_data(test_path, index_col='id')

# Remember where the training rows end so the stacked frame can be split again below
n_train_rows = len(train_df)

# Stack train on top of test so every transformation sees the same value set
full_df = pd.concat([train_df, test_df], axis=0)

# Cast these columns to the pandas category dtype
less = ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel']
full_df = full_df.astype({name: 'category' for name in less})

# Build the engineered features on the stacked frame
full_df, col_pre, col_pro = feature_engineering(full_df)

# Undo the stacking: leading rows are train, the remainder is test
train_df = full_df.iloc[:n_train_rows, :]
test_df = full_df.iloc[n_train_rows:, :]

# Separate the target from the features
y = train_df['Response']
X = train_df.drop(columns='Response')

# Stratified 80/20 hold-out split
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)



In [6]:

# Each entry is (step name, transformer, columns it is applied to)
preprocessing_steps = [
    ('cat', OneHotEncoder(sparse_output=False, dtype=np.float32), ['Gender', 'Vehicle_Damage']),
    ('minmax', MinMaxScaler(), ['Age', 'Region_Code', 'Previously_Insured', 'Policy_Sales_Channel', 'Vintage']),
    ('ordinal', OrdinalEncoder(categories=[[0, 1, 2]], dtype=np.float32), ['Vehicle_Age']),
    ('robust', RobustScaler(), ['Annual_Premium']),
    ('standard', StandardScaler(), ['Age_Type', 'Age_x_Vehicle_Age', 'Age_x_Vehicle_Damage', 'Age_x_Previously_Insured']),
    ('standard_2', StandardScaler(), col_pre + col_pro),
]

# Columns not named in any step are passed through unchanged
coltrans = ColumnTransformer(transformers=preprocessing_steps, remainder='passthrough')

# Fit on the training split only, then reuse the fitted transformer for the other sets.
# Note: X_train, X_valid and test_df are rebound to the transformer's output here, so
# from this point on they no longer refer to the engineered DataFrames.
X_train = coltrans.fit_transform(X_train)
X_valid = coltrans.transform(X_valid)
test_df = coltrans.transform(test_df.drop(columns='Response'))

gc.collect()



1995

In [7]:
# Count the rows of each class in the training target
response = train_df['Response']
n_negative = int((response == 0).sum())
n_positive = int((response == 1).sum())

# Number of negatives per positive; used as the weight of the positive class
ratio = n_negative / n_positive

class_weights = {0: 1, 1: ratio}

In [1]:
# 5 stratified folds, shuffled with a fixed seed so the splits are repeatable
skfold = StratifiedKFold(
    n_splits=5,
    shuffle=True,
    random_state=42,
)

NameError: name 'StratifiedKFold' is not defined

In [9]:

# Define CatBoost parameters
cat_params = {
    'loss_function': 'Logloss',
    'eval_metric': 'AUC',
    'class_names': [0, 1],
    'learning_rate': 0.05,
    'iterations': 5000,
    'depth': 9,
    'random_strength': 0,
    'l2_leaf_reg': 0.5,
    'task_type': 'GPU',  # Ensure your environment supports GPU
    'allow_writing_files': False,
    'verbose': 100,
    'class_weights': class_weights 
    # 'thread_count': -1
}

# The root logger defaults to WARNING, so plain logging.info() calls print nothing.
# Use a dedicated logger with its own handler so the fold scores are actually shown.
cv_logger = logging.getLogger("catboost_cv")
cv_logger.setLevel(logging.INFO)
cv_logger.propagate = False  # avoid duplicate lines if the root logger is configured too
if not cv_logger.handlers:
    cv_logger.addHandler(logging.StreamHandler())

# Per-fold test predictions and validation AUC scores
cat_preds = []
cat_aucs = []

# Every feature is fed to CatBoost as a categorical string column
cat_feature_names = list(X.columns)

# Build the test pool from the engineered test rows of full_df (the rows after the
# training rows), restricted to the training columns. `test_df` cannot be used here:
# the ColumnTransformer cell rebinds it to the transformer output, whose columns do
# not correspond to the raw columns of X that the model is trained on.
X_test = full_df.iloc[len(X):][cat_feature_names]
test_pool = Pool(X_test.astype(str), cat_features=cat_feature_names)
del X_test

# CatBoost Model
for fold, (train_idx, valid_idx) in enumerate(skfold.split(X, y)):

    X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
    X_valid, y_valid = X.iloc[valid_idx], y.iloc[valid_idx]

    train_pool = Pool(X_train.astype(str), y_train, cat_features=cat_feature_names)
    valid_pool = Pool(X_valid.astype(str), y_valid, cat_features=cat_feature_names)

    model = CatBoostClassifier(**cat_params)
    model.fit(train_pool, eval_set=valid_pool, early_stopping_rounds=50, verbose=100)

    # Score the held-out fold through the same Pool that was used as eval_set
    valid_preds = model.predict_proba(valid_pool)[:, 1]
    auc_score = roc_auc_score(y_valid, valid_preds)
    cat_aucs.append(auc_score)
    cv_logger.info(f"Validation AUC score for fold {fold + 1}: {auc_score:.6f}")

    test_pred = model.predict_proba(test_pool)[:, 1]
    cat_preds.append(test_pred)

    # Clear memory
    del X_train, y_train, X_valid, y_valid, train_pool, valid_pool, model
    gc.collect()


# Calculate overall AUC score for CatBoost
auc_mean_cat = np.mean(cat_aucs)
auc_std_cat = np.std(cat_aucs)
cv_logger.info(f"CatBoost CV AUC: {auc_mean_cat:.6f} +/- {auc_std_cat:.6f}")

# Average the predictions from each fold for CatBoost
test_pred_cat = np.mean(cat_preds, axis=0)


In [None]:


# Load the test CSV again so that `test_df.index` holds the original `id` values
test_df = import_data(test_path, index_col='id')

# Start from the id column, then attach the fold-averaged CatBoost predictions
submission = pd.DataFrame({'id': test_df.index}).assign(Response=test_pred_cat)

# Write the two columns to disk without the positional index
submission.to_csv('submission_cat.csv', index=False)
