# In [None]:
# XGBoost Preprocessing Notebook

# Import Libraries
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder
import logging

# Logging setup: send INFO-and-above records from the root logger to a file
# opened in write mode, using a "timestamp - level - message" layout.
_log_settings = {
    'filename': 'preprocessing_xgb.log',
    'filemode': 'w',
    'level': logging.INFO,
    'format': '%(asctime)s - %(levelname)s - %(message)s',
}
logging.basicConfig(**_log_settings)
logger = logging.getLogger()  # no name given, so this is the root logger

# Make sure the 'logs_xgb' directory exists (no error if it already does).
# NOTE(review): the log file configured above is created in the working
# directory, not inside this folder - confirm whether that is intended.
os.makedirs('logs_xgb', exist_ok=True)

# Load Data
# Absolute locations of the raw train/test CSV files.
train_path = "C:/Users/paulo/OneDrive/Documents/Binary-Classification-of-Insurance-Cross-Selling/train.csv"
test_path = "C:/Users/paulo/OneDrive/Documents/Binary-Classification-of-Insurance-Cross-Selling/test.csv"

logger.info("Loading datasets...")
# The 'id' column is used as the row index of each frame instead of being
# kept as a regular column.
train_df, test_df = (
    pd.read_csv(csv_path, index_col='id') for csv_path in (train_path, test_path)
)
logger.info("Datasets loaded successfully.")
logger.info("Train dataset shape: %s", train_df.shape)
logger.info("Test dataset shape: %s", test_df.shape)

# Data Preprocessing
# Work on copies so train_df / test_df keep the data exactly as loaded.
train_xgb = train_df.copy()
test_xgb = test_df.copy()

# Fill missing values.
# The fill value for every column is computed from the TRAINING data only and
# then reused for the test data. Imputing the test set with its own
# median/mode would make the two sets go through different transformations
# and would let test-set statistics influence the features.

# Numeric columns: fill with the training median.
numeric_cols = train_xgb.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    train_median = train_xgb[col].median()
    train_xgb[col] = train_xgb[col].fillna(train_median)
    # Columns that exist only in train are simply skipped for test.
    if col in test_xgb.columns:
        test_xgb[col] = test_xgb[col].fillna(train_median)

# Object (string) columns: fill with the most frequent training value.
object_cols = train_xgb.select_dtypes(include=['object']).columns
for col in object_cols:
    # mode() may return several values on ties; take the first one.
    train_mode = train_xgb[col].mode().iloc[0]
    train_xgb[col] = train_xgb[col].fillna(train_mode)
    if col in test_xgb.columns:
        test_xgb[col] = test_xgb[col].fillna(train_mode)

logger.info("Missing values handled.")

# Encode categorical variables using label encoding.
# One encoder is fitted per column and stored in `label_encoders` so each
# column's string -> integer mapping stays available after the loop (a single
# shared encoder would only remember the last column it was fitted on).
label_encoders = {}
# `le` is kept as a module-level name; after the loop it refers to the encoder
# of the last processed column, as it did before.
le = LabelEncoder()
for col in train_xgb.select_dtypes(include=['object']).columns:
    le = LabelEncoder()
    train_xgb[col] = le.fit_transform(train_xgb[col])
    label_encoders[col] = le

    if col in test_xgb.columns:
        # Encode test values against the categories learned from train.
        # Values that are in le.classes_ get the same integer that
        # le.transform would give; values not seen in train get code -1
        # instead of raising ValueError.
        test_codes = pd.Categorical(test_xgb[col], categories=le.classes_).codes
        unseen_count = int((test_codes == -1).sum())
        if unseen_count:
            logger.warning(
                "Column %s: %d test value(s) not seen in train were encoded as -1.",
                col, unseen_count,
            )
        test_xgb[col] = test_codes.astype('int64')

logger.info("Categorical variables encoded.")

# Save the processed data
# Both frames are written to the working directory. index=False leaves the row
# index (the 'id' column chosen as index when the CSVs were read) out of the
# output files.
for frame, out_name in (
    (train_xgb, 'train_xgb_processed.csv'),
    (test_xgb, 'test_xgb_processed.csv'),
):
    frame.to_csv(out_name, index=False)
logger.info("Processed data saved as CSV.")
