In [3]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans

In [4]:
# Load the datasets
# DATA_DIR is the single place to edit when the competition files live elsewhere;
# it is a machine-specific absolute path, so adjust it before running on another machine.
DATA_DIR = r"C:\Users\paulo\OneDrive\Documents\kaggle_competition_2_datasets"

# Both files use the 'id' column as the row index.
train_df = pd.read_csv(DATA_DIR + r"\train.csv", index_col='id')
test_df = pd.read_csv(DATA_DIR + r"\test.csv", index_col='id')

KeyboardInterrupt: 

In [None]:
# Encode the two-valued string columns as 0/1 integers.
# Series.map leaves NaN for any value that is not a key of the mapping.
binary_encodings = {
    'Gender': {'Male': 1, 'Female': 0},
    'Vehicle_Damage': {'Yes': 1, 'No': 0},
}
for binary_col, encoding in binary_encodings.items():
    train_df[binary_col] = train_df[binary_col].map(encoding)

# Driving_License has limited variability, so it is removed from the frame
train_df = train_df.drop(columns=['Driving_License'])

In [None]:
# Handle continuous variables
continuous_numeric = ['Age', 'Vintage', 'Annual_Premium']

# Fences at 1.5 * IQR below the first and above the third quartile of Annual_Premium
Q1 = train_df['Annual_Premium'].quantile(0.25)
Q3 = train_df['Annual_Premium'].quantile(0.75)
IQR = Q3 - Q1
lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep only rows whose premium lies inside the fences (both ends inclusive).
# A separate outlier-flag column is unnecessary: every flagged row is removed here,
# so the flag would be constant and was dropped again immediately.
# .copy() gives an independent frame so later column assignments do not warn
# about writing to a slice of the unfiltered data.
train_df = train_df[train_df['Annual_Premium'].between(lower_bound, upper_bound)].copy()

In [None]:
# Group rare categories in categorical variables
def group_rare_categories(df, column, threshold=0.01):
    """Replace infrequent values of ``column`` with the label ``'Other'``.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the column. It is modified in place.
    column : str
        Name of the categorical column to collapse.
    threshold : float, default 0.01
        Values whose relative frequency (share of non-missing rows) is
        strictly below this number are treated as rare.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object, returned for convenient reassignment.
    """
    # value_counts(normalize=True) yields each value's share of the non-missing rows
    category_freq = df[column].value_counts(normalize=True)
    rare_categories = category_freq[category_freq < threshold].index

    # Vectorized replacement: keep values that are not rare, write 'Other' elsewhere.
    # This avoids calling a Python lambda once per row.
    is_rare = df[column].isin(rare_categories)
    df[column] = df[column].where(~is_rare, 'Other')
    return df

# Nominal columns whose rare levels (relative frequency below 1%) are collapsed into 'Other'
categorical = [
    'Region_Code',
    'Policy_Sales_Channel',
]
for col in categorical:
    train_df = group_rare_categories(train_df, column=col, threshold=0.01)

In [None]:
# Ordinal Encoding for Vehicle_Age
vehicle_age_mapping = {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2}
train_df['Vehicle_Age'] = train_df['Vehicle_Age'].map(vehicle_age_mapping)

In [None]:
# One-Hot Encoding for other categorical variables
train_df = pd.get_dummies(train_df, columns=categorical, drop_first=True)

# Check columns after one-hot encoding
print("Columns after one-hot encoding:", train_df.columns)

Columns after one-hot encoding: Index(['Gender', 'Age', 'Previously_Insured', 'Vehicle_Age', 'Vehicle_Damage',
       'Annual_Premium', 'Vintage', 'Response', 'Region_Code_3.0',
       'Region_Code_6.0', 'Region_Code_8.0', 'Region_Code_10.0',
       'Region_Code_11.0', 'Region_Code_14.0', 'Region_Code_15.0',
       'Region_Code_18.0', 'Region_Code_21.0', 'Region_Code_28.0',
       'Region_Code_29.0', 'Region_Code_30.0', 'Region_Code_33.0',
       'Region_Code_35.0', 'Region_Code_36.0', 'Region_Code_37.0',
       'Region_Code_41.0', 'Region_Code_45.0', 'Region_Code_46.0',
       'Region_Code_47.0', 'Region_Code_50.0', 'Region_Code_Other',
       'Policy_Sales_Channel_122.0', 'Policy_Sales_Channel_124.0',
       'Policy_Sales_Channel_152.0', 'Policy_Sales_Channel_154.0',
       'Policy_Sales_Channel_160.0', 'Policy_Sales_Channel_Other'],
      dtype='object')


In [None]:
# Feature engineering
def feature_engineering(df):
    """Add interaction, squared and ratio features to ``df``.

    The new columns are written onto ``df`` itself, which is also returned.
    Expects the columns ``Age``, ``Vehicle_Age``, ``Previously_Insured``,
    ``Vehicle_Damage`` and ``Annual_Premium`` to be present and numeric.
    """
    age = df['Age']
    vehicle_age = df['Vehicle_Age']
    insured = df['Previously_Insured']
    damage = df['Vehicle_Damage']

    # Insertion order of this dict fixes the order of the appended columns
    derived = {
        'Age_Vehicle_Age': age * vehicle_age,
        'Age_Previously_Insured': age * insured,
        'Vehicle_Age_Damage': vehicle_age * damage,
        'Previously_Insured_Damage': insured * damage,
        'Age_squared': age ** 2,
        'Vehicle_Age_squared': vehicle_age ** 2,
        # +1 in the denominator guards against division by zero for Age == 0
        'Annual_Premium_per_Age': df['Annual_Premium'] / (age + 1),
    }
    for feature_name, values in derived.items():
        df[feature_name] = values
    return df

# Apply feature engineering: adds the derived columns to the training frame.
# feature_engineering writes the columns onto its argument and returns that same frame.
train_df = feature_engineering(train_df)

In [None]:
# Update the list of continuous variables to include newly created features
engineered_features = [
    'Age_Vehicle_Age', 'Age_Previously_Insured', 'Vehicle_Age_Damage',
    'Previously_Insured_Damage', 'Age_squared', 'Vehicle_Age_squared',
    'Annual_Premium_per_Age'
]
# dict.fromkeys drops repeated names while keeping first-seen order, so running
# this cell more than once does not append the engineered names again.
continuous_numeric = list(dict.fromkeys(continuous_numeric + engineered_features))

# Standardize the continuous variables (zero mean, unit variance per column).
# The scaler is fitted on the whole training frame at this point.
scaler = StandardScaler()
train_df[continuous_numeric] = scaler.fit_transform(train_df[continuous_numeric])

In [None]:
# Apply KMeans clustering
optimal_clusters = 4
kmeans = KMeans(n_clusters=optimal_clusters, random_state=42)
clusters = kmeans.fit_predict(train_df[continuous_numeric])
train_df['Cluster'] = clusters

In [None]:
# Persist the preprocessed training frame to CSV.
# index=False omits the row index from the file; the data was loaded with
# 'id' as its index, so the ids are not stored in this file.
train_df.to_csv('train_lgbm_887_preprocessed.csv', index=False)

In [None]:
import klib

# Run klib's automated cleaning on the preprocessed frame.
# The recorded output of this cell reports 6777 duplicate rows dropped and a
# cleaned shape of (9120748, 44) with no remaining NAs.
# NOTE(review): later cells read the target as 'response' (lowercase) although the
# column is 'Response' before this step — presumably data_cleaning normalizes
# column names; confirm.
klib_train_df = klib.data_cleaning(train_df)

# Save the cleaned frame without its row index; a later cell reloads this file.
klib_train_df.to_csv('klib_train_lgbm_887_preprocessed.csv', index=False)

Shape of cleaned data: (9120748, 44) - Remaining NAs: 0


Dropped rows: 6777
     of which 6777 duplicates. (Rows (first 150 shown): [58025, 98683, 214214, 225653, 267268, 306179, 394591, 416003, 418979, 433434, 440643, 449138, 450905, 453076, 484045, 499695, 517206, 549350, 573611, 576764, 597168, 597879, 601658, 606598, 634487, 643986, 646593, 655753, 666138, 717180, 731054, 731817, 747756, 760299, 779430, 790316, 797797, 803594, 819563, 820936, 833229, 839101, 852663, 873550, 887899, 896426, 904688, 914794, 929632, 933462, 959614, 965370, 965782, 995289, 1004741, 1006519, 1011429, 1013853, 1024007, 1034690, 1050403, 1054263, 1068651, 1071537, 1079843, 1089169, 1102839, 1105408, 1112322, 1114343, 1115089, 1116690, 1134435, 1139591, 1147844, 1160552, 1179456, 1197529, 1207555, 1213203, 1225645, 1235371, 1244917, 1256503, 1258417, 1258496, 1261902, 1281307, 1281432, 1290599, 1291244, 1296217, 1298837, 1302519, 1331908, 1335833, 1339657, 1340488, 1356986, 1363610, 1365283, 1366143, 1377

In [None]:
# Continue with the klib-cleaned frame under the name train_df.
# This only rebinds the name; both names refer to the same object (no copy is made).
train_df = klib_train_df

In [None]:
# Preview the first rows of the frame as a quick visual check
train_df.head()

In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.cluster import KMeans

In [3]:
# Reload the klib-cleaned, preprocessed training data written by the
# preprocessing cells, so modelling can start without repeating those steps.
train_df = pd.read_csv('klib_train_lgbm_887_preprocessed.csv')

In [4]:
# Separate features and target variable
X = train_df.drop('response', axis=1)
y = train_df['response']

In [5]:
# Split the data
# Hold out 20% of the rows for validation, stratified on the target so both
# splits keep the same class balance; the seed makes the split repeatable
X_train, X_val, y_train, y_val = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=42,
)

In [10]:
# LightGBM parameters
# Hyperparameters for the LightGBM classifier
params = dict(
    n_estimators=3000,
    num_leaves=14,
    min_child_samples=44,
    learning_rate=0.013082848414054271,
    max_bin=1024,  # 2^10, i.e. a log_max_bin of 10
    colsample_bytree=0.7020907928739494,
    reg_alpha=2.8809013344332164,
    reg_lambda=0.501392057176914,
)

# Build the classifier with info-level logging and column-wise histogram building
model = lgb.LGBMClassifier(verbose=1, force_col_wise=True, **params)

# Fit on the training split only; the validation split is kept for scoring
model.fit(X_train, y_train)


[LightGBM] [Info] Number of positive: 882799, number of negative: 6413799
[LightGBM] [Info] Total Bins 2719
[LightGBM] [Info] Number of data points in the train set: 7296598, number of used features: 43
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.120988 -> initscore=-1.983110
[LightGBM] [Info] Start training from score -1.983110


In [11]:
# Make predictions
y_train_pred_proba = model.predict_proba(X_train)[:, 1]
y_val_pred_proba = model.predict_proba(X_val)[:, 1]

# Calculate ROC AUC scores
roc_auc_train = roc_auc_score(y_train, y_train_pred_proba)
roc_auc_val = roc_auc_score(y_val, y_val_pred_proba)

# Print ROC AUC scores
print(f'Training ROC AUC Score: {roc_auc_train}')
print(f'Validation ROC AUC Score: {roc_auc_val}')

Training ROC AUC Score: 0.8841588129850312
Validation ROC AUC Score: 0.8842255100186597
