In [1]:
import pandas as pd
import numpy as np
import lightgbm as lgb
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.metrics import roc_auc_score
from category_encoders import TargetEncoder
import logging
import matplotlib.pyplot as plt
import seaborn as sns
import joblib
from imblearn.over_sampling import SMOTE

# Configure logger
logger = logging.getLogger(__name__)
logger.setLevel(logging.INFO)

# Create handlers
console_handler = logging.StreamHandler()
file_handler = logging.FileHandler('training.log')

# Create formatters and add them to the handlers
console_format = logging.Formatter('%(asctime)s - %(name)s - %(levelname)s - %(message)s')
file_format = logging.Formatter('%(asctime)s - %(name)s - %(levellevel)s - %(message)s')
console_handler.setFormatter(console_format)
file_handler.setFormatter(file_format)

# Add handlers to the logger
logger.addHandler(console_handler)
logger.addHandler(file_handler)

# Load datasets
train_df = pd.read_csv(r"C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\preprocessed_train.csv")
test_df = pd.read_csv(r"C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\preprocessed_test.csv")

# Drop ID column from training and test data
train_df.drop(columns=['id'], inplace=True)
test_df.drop(columns=['id'], inplace=True)

logger.info(f"Train dataset shape: {train_df.shape}")
logger.info(f"Test dataset shape: {test_df.shape}")

# Interaction features
interaction_features = {
    'Age_Annual_Premium': train_df['Age'] * train_df['Annual_Premium'],
    'Age_Vintage': train_df['Age'] * train_df['Vintage'],
    'Annual_Premium_Vintage': train_df['Annual_Premium'] * train_df['Vintage'],
    'Age_Region_Code': train_df['Age'] * train_df['Region_Code'],
    'Vintage_Region_Code': train_df['Vintage'] * train_df['Region_Code'],
    'Annual_Premium_Region_Code': train_df['Annual_Premium'] * train_df['Region_Code']
}
train_df = train_df.assign(**interaction_features)

logger.info(f"Interaction features created")

# Target encoding
target_enc = TargetEncoder(cols=['Gender', 'Vehicle_Age', 'Vehicle_Damage'])
train_df = target_enc.fit_transform(train_df, train_df['Response'])

logger.info(f"Target encoding performed for categorical variables.")

# Polynomial features
poly = PolynomialFeatures(degree=2, interaction_only=True, include_bias=False)
poly_features = poly.fit_transform(train_df[['Age', 'Annual_Premium', 'Vintage']])
poly_feature_names = poly.get_feature_names_out(['Age', 'Annual_Premium', 'Vintage'])

# Create a DataFrame for polynomial features ensuring no duplicates with existing feature names
poly_df = pd.DataFrame(poly_features, columns=[f'poly_{name.replace(" ", "_")}' for name in poly_feature_names], index=train_df.index)

# Concatenate polynomial features with train_df ensuring no duplicates
train_df = pd.concat([train_df, poly_df], axis=1)

logger.info(f"Polynomial features created")

# Apply StandardScaler to the features
scaler = StandardScaler()
scaled_features = scaler.fit_transform(train_df.drop(columns=['Response']))
scaled_df = pd.DataFrame(scaled_features, columns=train_df.columns.drop('Response').str.replace(' ', '_'))
scaled_df['Response'] = train_df['Response'].values

# Handle imbalanced data using SMOTE
X_resampled, y_resampled = SMOTE().fit_resample(scaled_df.drop(columns=['Response']), scaled_df['Response'])

logger.info(f"Data resampled using SMOTE. New shape: {X_resampled.shape}")

# Save the transformed datasets to CSV
X_resampled.to_csv('X_transformed.csv', index=False)
y_resampled.to_csv('y_transformed.csv', index=False)

logger.info("Transformed datasets saved to CSV files")

Dask dataframe query planning is disabled because dask-expr is not installed.

You can install it with `pip install dask[dataframe]` or `conda install dask`.
This will raise in a future version.

2024-07-13 17:44:42,825 - __main__ - INFO - Train dataset shape: (11465233, 11)
--- Logging error ---
Traceback (most recent call last):
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\logging\__init__.py", line 464, in format
    return self._format(record)
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\logging\__init__.py", line 460, in _format
    return self._fmt % values
           ~~~~~~~~~~^~~~~~~~
KeyError: 'levellevel'

During handling of the above exception, another exception occurred:

Traceback (most recent call last):
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\logging\__init__.py", line 1160, in emit
    msg = self.format(record)
          ^^^^^^^^^^^^^^^^^^^
  File "c:\Users\paulo\anaconda3\envs\pytorch_env\Lib\logging\__ini

In [None]:
# If needed, load the transformed datasets from CSV
# X_resampled = pd.read_csv('X_transformed.csv')
# y_resampled = pd.read_csv('y_transformed.csv')

# Split the data into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X_resampled, y_resampled, test_size=0.2, stratify=y_resampled, random_state=42)

logger.info(f"Training set shape: {X_train.shape}")
logger.info(f"Validation set shape: {X_val.shape}")

In [None]:
from sklearn.model_selection import RandomizedSearchCV
from sklearn.metrics import make_scorer, roc_auc_score
import lightgbm as lgb

# Define the parameter grid
param_dist = {
    'learning_rate': np.linspace(0.03, 0.1, 10),
    'num_leaves': np.arange(60, 121, 10),
    'max_depth': np.arange(10, 16),
    'min_data_in_leaf': np.arange(10, 51, 10),
    'bagging_fraction': np.linspace(0.6, 0.8, 5),
    'feature_fraction': np.linspace(0.6, 0.8, 5),
    'lambda_l1': np.linspace(0.0, 1.0, 5),
    'lambda_l2': np.linspace(0.0, 1.0, 5),
    'bagging_freq': np.arange(1, 8)
}

# Initialize the LightGBM classifier
lgb_model = lgb.LGBMClassifier(objective='binary', metric='auc')

# Define the scoring function
scorer = make_scorer(roc_auc_score, greater_is_better=True, needs_proba=True)

# Initialize RandomizedSearchCV
random_search = RandomizedSearchCV(
    estimator=lgb_model,
    param_distributions=param_dist,
    n_iter=1,  # Increase number of iterations for better search
    scoring=scorer,
    cv=3,  # 3-fold cross-validation
    verbose=1,
    random_state=42,
    n_jobs=-1
)

# Fit RandomizedSearchCV
random_search.fit(X_train, y_train)

# Get the best trial
best_model = random_search.best_estimator_
best_params = random_search.best_params_
best_score = random_search.best_score_

logger.info(f"Best AUC: {best_score}")
logger.info(f"Best parameters: {best_params}")

# Save the best trial parameters
pd.DataFrame([best_params]).to_csv('best_params_randomsearch.csv', index=False)

# Load the best parameters from the saved file (if needed)
best_params_loaded = pd.read_csv('best_params_randomsearch.csv').to_dict(orient='records')[0]

# Optionally, you can refit the model on the entire training set using the best parameters
best_model_retrained = lgb.LGBMClassifier(**best_params_loaded)
best_model_retrained.fit(X_train, y_train)

# Evaluate the best model on the training set
train_preds = best_model_retrained.predict_proba(X_train)[:, 1]
train_auc = roc_auc_score(y_train, train_preds)

# Evaluate the best model on the validation set
val_preds = best_model_retrained.predict_proba(X_val)[:, 1]
val_auc = roc_auc_score(y_val, val_preds)

logger.info(f"Training AUC with best parameters: {train_auc}")
logger.info(f"Validation AUC with best parameters: {val_auc}")

# Check for overfitting
overfit_threshold = 0.05  # Adjust the threshold as needed
overfit_metric = abs(train_auc - val_auc)
if overfit_metric > overfit_threshold:
    logger.warning(f"Overfitting detected: Train AUC - {train_auc}, Val AUC - {val_auc}, Difference - {overfit_metric}")
else:
    logger.info(f"No overfitting detected: Train AUC - {train_auc}, Val AUC - {val_auc}, Difference - {overfit_metric}")

In [None]:
# Train a LightGBM model with best params
train_data = lgb.Dataset(X_train, label=y_train, free_raw_data=False)
val_data = lgb.Dataset(X_val, label=y_val, reference=train_data, free_raw_data=False)

model = lgb.train(
    best_params_loaded,
    train_data,
    num_boost_round=200,
    valid_sets=[train_data, val_data],
    early_stopping_rounds=20,
    verbose_eval=10
)

# Predict on train and validation set
y_train_pred = model.predict(X_train, num_iteration=model.best_iteration)
y_val_pred = model.predict(X_val, num_iteration=model.best_iteration)

# Evaluation metrics
train_auc = roc_auc_score(y_train, y_train_pred)
val_auc = roc_auc_score(y_val, y_val_pred)

logger.info(f"Final Train AUC: {train_auc}")
logger.info(f"Final Validation AUC: {val_auc}")

# Feature Importance
importance = model.feature_importance(importance_type='split')
feature_names = model.feature_name()

# Create a DataFrame for feature importance
feature_importance_df = pd.DataFrame({
    'Feature': feature_names,
    'Importance': importance
}).sort_values(by='Importance', ascending=False)

# Save feature importance to CSV
feature_importance_df.to_csv("feature_importance.csv", index=False)
logger.info("Feature importance saved as feature_importance.csv")

# Plot feature importance
plt.figure(figsize=(10, 8))
sns.barplot(data=feature_importance_df, x='Importance', y='Feature')
plt.xlabel("Importance")
plt.ylabel("Feature")
plt.title("Feature Importance")
plt.gca().invert_yaxis()
plt.savefig("feature_importance.png")
logger.info("Feature importance plot saved as feature_importance.png")

plt.show()

# Save the model and metrics
joblib.dump(model, "lightgbm_model_best.pkl")
logger.info("Model saved as lightgbm_model_best.pkl")

# Save the metrics to a CSV file
metrics = {
    "train_auc": train_auc,
    "val_auc": val_auc
}

metrics_df = pd.DataFrame(metrics, index=[0])
metrics_df.to_csv("model_metrics.csv", index=False)
logger.info("Model metrics saved as model_metrics.csv")

# Final log message
logger.info("Training and evaluation process completed successfully.")