In [10]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import lightgbm as lgb
from sklearn.metrics import roc_auc_score
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import KFold, StratifiedKFold

In [2]:
train_data = pd.read_csv("playground-series-s4e7/train.csv")
test_data = pd.read_csv("playground-series-s4e7/test.csv")

In [3]:
X = train_data.drop(columns=['id', 'Response'])
y = train_data['Response']

In [4]:
mappings = {
    'Gender': {'Male': 0, 'Female': 1},
    'Vehicle_Age': {'< 1 Year': 0, '1-2 Year': 1, '> 2 Years': 2},
    'Vehicle_Damage': {'Yes': 1, 'No': 0}
}

In [23]:
for col, mapping in mappings.items():
    X[col] = X[col].map(mapping)

categorical_cols = ["Gender", "Driving_License", "Region_Code", "Previously_Insured", "Vehicle_Age", "Vehicle_Damage", "Policy_Sales_Channel"]
numerical_cols = [col for col in X.columns if col not in categorical_cols]

In [25]:
from sklearn.impute import SimpleImputer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [26]:
numeric_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='most_frequent')),
])

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numerical_cols),
    ])

X_imputed = preprocessor.fit_transform(X)
X_imputed = pd.DataFrame(X_imputed, columns=numerical_cols)

In [27]:
print("Missing values in imputed data:\n", X_imputed.isnull().sum())
print("Infinity values in imputed data:\n", X_imputed[X_imputed == np.inf].sum())

Missing values in imputed data:
 Age               0
Annual_Premium    0
Vintage           0
dtype: int64
Infinity values in imputed data:
 Age               0.0
Annual_Premium    0.0
Vintage           0.0
dtype: float64


In [28]:
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
X_poly = poly.fit_transform(X_imputed)

In [29]:
print(f"Original shape: {X.shape}")
print(f"Polynomial features shape: {X_poly.shape}")

Original shape: (11504798, 10)
Polynomial features shape: (11504798, 6)


In [30]:
skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)
roc_auc_scores = []

In [31]:
for train_index, val_index in skf.split(X_poly, y):
    X_train, X_val = X_poly[train_index], X_poly[val_index]
    y_train, y_val = y[train_index], y[val_index]
    
    lgbm_classifier = lgb.LGBMClassifier(random_state=0, class_weight='balanced')
    lgbm_classifier.fit(X_train, y_train, eval_set=(X_val, y_val), early_stopping_rounds=100, verbose=100)
    
    val_preds = lgbm_classifier.predict_proba(X_val)[:, 1]
    roc_auc_scores.append(roc_auc_score(y_val, val_preds))

Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.592611
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.592611
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.592377
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.592377
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.592042
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.592042
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.591802
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.591802
Training until validation scores don't improve for 100 rounds
[100]	valid_0's binary_logloss: 0.592048
Did not meet early stopping. Best iteration is:
[100]	valid_0's binary_logloss: 0.592048


In [32]:
print('Average LightGBM Stratified K-Fold ROC-AUC: {:.4f}'.format(np.mean(roc_auc_scores)))

Average LightGBM Stratified K-Fold ROC-AUC: 0.7534


In [33]:
test_ids = test_data['id']
X_test = test_data.drop(columns=['id'])

In [34]:
for col, mapping in mappings.items():
    X_test[col] = X_test[col].map(mapping)

In [35]:
X_test_imputed = preprocessor.transform(X_test)
X_test_imputed = pd.DataFrame(X_test_imputed, columns=numerical_cols)

In [36]:
print("Missing values in test data:\n", X_test_imputed.isnull().sum())
print("Infinity values in test data:\n", X_test_imputed[X_test_imputed == np.inf].sum())

Missing values in test data:
 Age               0
Annual_Premium    0
Vintage           0
dtype: int64
Infinity values in test data:
 Age               0.0
Annual_Premium    0.0
Vintage           0.0
dtype: float64


In [37]:
X_test_poly = poly.transform(X_test_imputed)

In [38]:
print(f"Test original shape: {X_test.shape}")
print(f"Test polynomial features shape: {X_test_poly.shape}")

Test original shape: (7669866, 10)
Test polynomial features shape: (7669866, 6)


In [39]:
lgbm_predictions_test = lgbm_classifier.predict_proba(X_test_poly)[:, 1]

In [41]:
result = pd.DataFrame({'id': test_ids, 'Response': lgbm_predictions_test.flatten()})
result.to_csv("submission7.csv", index=False)