In [3]:
import gc
import os

import joblib
import numpy as np
import pandas as pd
import xgboost as xgb
from sklearn.compose import ColumnTransformer
from sklearn.feature_selection import RFE
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, RobustScaler

# Paths to datasets.
# The directory defaults to the original local folder; set the KAGGLE_DATA_DIR
# environment variable to run the notebook on another machine without editing
# this cell.
DATA_DIR = os.environ.get(
    "KAGGLE_DATA_DIR",
    r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets",
)
train_path = os.path.join(DATA_DIR, "train.csv")
test_path = os.path.join(DATA_DIR, "test.csv")

def import_data(path, index_col=None):
    """Read the CSV at ``path`` and return it with memory-optimized dtypes.

    ``index_col`` is forwarded unchanged to ``pd.read_csv``; the loaded frame
    is then passed through ``reduce_mem_usage`` and that result is returned.
    """
    raw_frame = pd.read_csv(path, index_col=index_col)
    compact_frame = reduce_mem_usage(raw_frame)
    return compact_frame

def reduce_mem_usage(df):
    """Downcast numeric columns of ``df`` to the smallest dtype that holds their values.

    Columns whose dtype name starts with ``int`` are cast to the narrowest of
    int8 / int16 / int32 whose range contains the column's min and max
    (bounds inclusive); columns needing a wider range are left unchanged.
    Columns whose dtype name starts with ``float`` are cast to float32 when
    their min and max lie within the float32 range (this rounds values to
    float32 precision), otherwise to float64. All other columns are untouched.

    The frame is modified in place and also returned.
    """
    signed_int_types = (np.int8, np.int16, np.int32)
    float32_limits = np.finfo(np.float32)

    for col in df.columns:
        col_type = df[col].dtype
        # Interval dtypes print as "interval[...]", which would otherwise be
        # mistaken for an integer dtype by the name-prefix test below.
        if isinstance(col_type, pd.IntervalDtype):
            continue

        dtype_name = str(col_type)
        if dtype_name.startswith('int'):
            c_min = df[col].min()
            c_max = df[col].max()
            for candidate in signed_int_types:
                limits = np.iinfo(candidate)
                # Inclusive bounds: a value equal to the dtype's min/max still
                # fits. With strict comparisons a column touching a boundary
                # was pushed to the next wider type (even int32 -> int64).
                if c_min >= limits.min and c_max <= limits.max:
                    df[col] = df[col].astype(candidate)
                    break
        elif dtype_name.startswith('float'):
            c_min = df[col].min()
            c_max = df[col].max()
            if c_min >= float32_limits.min and c_max <= float32_limits.max:
                df[col] = df[col].astype(np.float32)
            else:
                # Out-of-range, infinite or all-NaN columns stay at full width.
                df[col] = df[col].astype(np.float64)
    return df

def feature_engineering(df):
    """Add binned-age, integer-coded and pairwise interaction features to ``df``.

    Steps, in order:
      * ``Age_Type``: ``Age`` cut into 7 equal-width bins, stored as the
        integer bin index.
      * ``Vehicle_Age``, ``Vehicle_Damage``, ``Previously_Insured`` are
        overwritten with their pandas category codes.
      * ``Age_x_*``: product of ``Age_Type`` with each of those three codes.
      * ``Previously_Insured_x_*`` and ``Policy_Sales_Channel_x_*``: one
        integer id per distinct pair of values, obtained by factorizing the
        joined string form of the two columns.

    ``df`` is modified in place (new columns are appended, three existing
    columns are overwritten) and is also returned.

    Returns:
        tuple: ``(df, col_pre, col_pro)`` where ``col_pre`` lists the
        ``Previously_Insured_x_*`` column names and ``col_pro`` the
        ``Policy_Sales_Channel_x_*`` column names, in creation order.
    """
    # Binning age and converting to categorical labels instead of intervals
    df['Age_Type'] = pd.cut(df['Age'], bins=7, labels=False)
    for coded in ('Vehicle_Age', 'Vehicle_Damage', 'Previously_Insured'):
        df[coded] = df[coded].astype('category').cat.codes

    df['Age_x_Vehicle_Age'] = df['Age_Type'] * df['Vehicle_Age']
    df['Age_x_Vehicle_Damage'] = df['Age_Type'] * df['Vehicle_Damage']
    df['Age_x_Previously_Insured'] = df['Age_Type'] * df['Previously_Insured']

    # A separator keeps distinct pairs distinct: without it ("1", "12") and
    # ("11", "2") both become "112" and would share one factorized code.
    # When no such collision exists the resulting codes are unchanged.
    sep = '|'

    fac_pre = ['Policy_Sales_Channel', 'Vehicle_Damage', 'Annual_Premium', 'Vintage', 'Age_Type']
    # The left-hand string column is the same for every pairing, so convert it once.
    insured_str = df['Previously_Insured'].astype(str)
    col_pre = []
    for partner in fac_pre:
        name = 'Previously_Insured_x_' + partner
        df[name] = pd.factorize(insured_str + sep + df[partner].astype(str))[0]
        col_pre.append(name)

    fac_pro = fac_pre[1:]
    channel_str = df['Policy_Sales_Channel'].astype(str)
    col_pro = []
    for partner in fac_pro:
        name = 'Policy_Sales_Channel_x_' + partner
        df[name] = pd.factorize(channel_str + sep + df[partner].astype(str))[0]
        col_pro.append(name)
    return df, col_pre, col_pro


In [4]:
# Read both splits through the memory-optimizing loader, indexed by `id`
# (train first, then test).
train_df, test_df = (
    import_data(csv_path, index_col='id') for csv_path in (train_path, test_path)
)

# Stack train on top of test so both receive identical transformations.
full_df = pd.concat([train_df, test_df])

# Columns to hold as pandas categoricals before feature engineering.
less = ['Gender', 'Vehicle_Age', 'Vehicle_Damage', 'Policy_Sales_Channel']
for col in less:
    full_df[col] = full_df[col].astype('category')

# Engineer features on the stacked frame; also returns the names of the two
# groups of pairwise-interaction columns it created.
engineered = feature_engineering(full_df)
full_df, col_pre, col_pro = engineered


In [5]:
# Undo the stacking: the first len(train_df) rows are train, the rest are test.
n_train = len(train_df)
train_df = full_df.iloc[:n_train]
test_df = full_df.iloc[n_train:]

# Separate the target from the features.
y = train_df['Response']
X = train_df.drop(columns='Response')

# Hold out 20% of the training rows, stratified on the target.
X_train, X_valid, y_train, y_valid = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [6]:
# Column groups, one per preprocessing strategy.
onehot_cols = ['Gender', 'Vehicle_Damage']
minmax_cols = ['Age', 'Region_Code', 'Previously_Insured', 'Policy_Sales_Channel', 'Vintage']
ordinal_cols = ['Vehicle_Age']
robust_cols = ['Annual_Premium']
standard_cols = ['Age_Type', 'Age_x_Vehicle_Age', 'Age_x_Vehicle_Damage', 'Age_x_Previously_Insured']
interaction_cols = col_pre + col_pro

# The order of the transformers fixes the order of the output columns.
preprocessing_steps = [
    ('cat', OneHotEncoder(sparse_output=False, dtype=np.float32), onehot_cols),
    ('minmax', MinMaxScaler(), minmax_cols),
    ('ordinal', OrdinalEncoder(categories=[[0, 1, 2]], dtype=np.float32), ordinal_cols),
    ('robust', RobustScaler(), robust_cols),
    ('standard', StandardScaler(), standard_cols),
    ('standard_2', StandardScaler(), interaction_cols),
]
# remainder='passthrough' keeps the columns not named in any group above.
coltrans = ColumnTransformer(transformers=preprocessing_steps, remainder='passthrough')

# Fit on the training split only, then apply the fitted transformer everywhere.
# NOTE: X_train, X_valid and test_df are overwritten with the transformed
# output, so this cell cannot simply be re-run without re-running the split.
X_train = coltrans.fit_transform(X_train)
X_valid = coltrans.transform(X_valid)
test_df = coltrans.transform(test_df.drop(columns='Response'))

gc.collect()

5348

In [7]:
# Perform feature selection using RFE with XGBoost as the estimator
# (RFE is imported in the top import cell.)
xgb_params = {
    'random_state': 512,
    'objective': "binary:logistic",
    'eval_metric': 'auc',
    'max_depth': 8,
    'min_child_weight': 12,
    'colsample_bytree': 0.5,
    'gamma': 0.2,
    'learning_rate': 0.09093568107192034,
    'subsample': 1.0,
    'reg_alpha': 0.0011852827097616767,
    'reg_lambda': 1.0735757602378362e-06,
    'max_bin': 197818,
    # Ratio of negative to positive rows in the training frame.
    'scale_pos_weight': len(train_df[train_df['Response'] == 0]) / len(train_df[train_df['Response'] == 1]),  # Adjust this based on your dataset
    'tree_method': 'hist',  # Histogram-based tree construction
    'device': 'cuda',  # Ensure your environment supports GPU
}

estimator = xgb.XGBClassifier(**xgb_params)
selector = RFE(estimator, n_features_to_select=19, step=1)
selector = selector.fit(X_train, y_train)

# Transform the training, validation, and test data to keep only selected features
X_train_selected = selector.transform(X_train)
X_valid_selected = selector.transform(X_valid)
X_test_selected = selector.transform(test_df)

# The test matrix is identical for every fold, so build it once up front
# instead of rebuilding it inside the loop.
dtest = xgb.DMatrix(X_test_selected)

# Train XGBoost model with cross-validation on the selected features
xgb_preds = []  # one test-set prediction vector per fold
xgb_aucs = []   # one out-of-fold AUC per fold

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
for fold, (train_idx, valid_idx) in enumerate(skf.split(X_train_selected, y_train)):
    print(f"Training fold {fold + 1}")
    X_train_fold, y_train_fold = X_train_selected[train_idx], y_train.iloc[train_idx]
    X_valid_fold, y_valid_fold = X_train_selected[valid_idx], y_train.iloc[valid_idx]

    dtrain = xgb.DMatrix(X_train_fold, label=y_train_fold)
    dvalid = xgb.DMatrix(X_valid_fold, label=y_valid_fold)

    # Early stopping monitors the last entry of `evals` (the 'valid' set).
    model = xgb.train(
        xgb_params,
        dtrain,
        num_boost_round=3000,
        evals=[(dtrain, 'train'), (dvalid, 'valid')],
        verbose_eval=100,
        early_stopping_rounds=10,
    )

    # best_iteration is a 0-based index while the end of iteration_range is
    # exclusive, so +1 is required to include the best round itself.
    best_range = (0, model.best_iteration + 1)

    valid_preds = model.predict(dvalid, iteration_range=best_range)
    auc_score = roc_auc_score(y_valid_fold, valid_preds)
    xgb_aucs.append(auc_score)

    test_pred = model.predict(dtest, iteration_range=best_range)
    xgb_preds.append(test_pred)

    # Save the model for this fold
    model.save_model(f'xgb_model_fold_{fold + 1}.json')

    # Clear memory
    del X_train_fold, y_train_fold, X_valid_fold, y_valid_fold, dtrain, dvalid, model
    gc.collect()

# Report the out-of-fold scores that were collected above.
print(f"Fold AUCs: {[round(float(a), 5) for a in xgb_aucs]} | mean: {np.mean(xgb_aucs):.5f}")

Training fold 1
[0]	train-auc:0.85702	valid-auc:0.85723
[100]	train-auc:0.87717	valid-auc:0.87674
[200]	train-auc:0.87967	valid-auc:0.87861
[300]	train-auc:0.88064	valid-auc:0.87896
[400]	train-auc:0.88121	valid-auc:0.87907
[450]	train-auc:0.88144	valid-auc:0.87908
Training fold 2
[0]	train-auc:0.85697	valid-auc:0.85713
[100]	train-auc:0.87718	valid-auc:0.87670
[200]	train-auc:0.87972	valid-auc:0.87853
[300]	train-auc:0.88058	valid-auc:0.87884
[367]	train-auc:0.88100	valid-auc:0.87892
Training fold 3
[0]	train-auc:0.85705	valid-auc:0.85733
[100]	train-auc:0.87702	valid-auc:0.87659
[200]	train-auc:0.87963	valid-auc:0.87842
[300]	train-auc:0.88063	valid-auc:0.87881
[400]	train-auc:0.88126	valid-auc:0.87892
[486]	train-auc:0.88168	valid-auc:0.87896
Training fold 4
[0]	train-auc:0.85726	valid-auc:0.85628
[100]	train-auc:0.87728	valid-auc:0.87580
[200]	train-auc:0.87986	valid-auc:0.87778
[300]	train-auc:0.88080	valid-auc:0.87818
[400]	train-auc:0.88141	valid-auc:0.87829
[429]	train-auc:0.88

In [8]:
# Blend the folds: element-wise mean of the per-fold XGBoost test predictions,
# then persist the blended vector to disk.
fold_predictions = np.asarray(xgb_preds)
test_pred_xgb = fold_predictions.mean(axis=0)
joblib.dump(test_pred_xgb, 'test_pred_xgb2.pkl')


['test_pred_xgb2.pkl']

In [9]:
# test_df was overwritten with the transformed features earlier, so reload the
# test CSV to recover the original `id` index.
test_df = import_data(test_path, index_col='id')

# Pair every original test id with its averaged prediction.
submission_columns = {
    'id': test_df.index,
    'Response': test_pred_xgb,
}
submission = pd.DataFrame(submission_columns)

# Write the submission file without the positional index.
submission.to_csv('submission_xgb2.csv', index=False)
