In [1]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import StandardScaler, LabelEncoder
from scipy import stats
import logging

# Setup logger
logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

# Create directories for logs
import os
logs_dir = "logs"
if not os.path.exists(logs_dir):
    os.makedirs(logs_dir)

graphs_dir = "graphs"
if not os.path.exists(graphs_dir):
    os.makedirs(graphs_dir)


In [2]:
# Load datasets
train_df = pd.read_csv(r'C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\train.csv')
test_df = pd.read_csv(r'C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\test.csv')

# Initial inspection
logger.info(f"Train dataset shape: {train_df.shape}")
logger.info(f"Test dataset shape: {test_df.shape}")

# Checking for missing values
logger.info(f"Missing values in train dataset:\n{train_df.isnull().sum()}")
logger.info(f"Missing values in test dataset:\n{test_df.isnull().sum()}")

# Checking for duplicates
logger.info(f"Duplicate rows in train dataset: {train_df.duplicated().sum()}")
logger.info(f"Duplicate rows in test dataset: {test_df.duplicated().sum()}")


INFO:__main__:Train dataset shape: (11504798, 12)
INFO:__main__:Test dataset shape: (7669866, 11)
INFO:__main__:Missing values in train dataset:
id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
Response                0
dtype: int64
INFO:__main__:Missing values in test dataset:
id                      0
Gender                  0
Age                     0
Driving_License         0
Region_Code             0
Previously_Insured      0
Vehicle_Age             0
Vehicle_Damage          0
Annual_Premium          0
Policy_Sales_Channel    0
Vintage                 0
dtype: int64
INFO:__main__:Duplicate rows in train dataset: 0
INFO:__main__:Duplicate rows in test dataset: 0


In [3]:
# Identify outliers using Z-score
z_scores = stats.zscore(train_df.select_dtypes(include=[np.number]))
abs_z_scores = np.abs(z_scores)
outliers = (abs_z_scores > 3).all(axis=1)
train_outliers = train_df[outliers]
logger.info(f"Number of rows with outliers: {train_outliers.shape[0]}")

# Optionally cap outliers or remove them
train_df = train_df[(abs_z_scores < 3).all(axis=1)]


INFO:__main__:Number of rows with outliers: 0


In [4]:
# Encode categorical variables
label_encoders = {}
categorical_features = ['Gender', 'Vehicle_Age', 'Vehicle_Damage']

for feature in categorical_features:
    le = LabelEncoder()
    train_df[feature] = le.fit_transform(train_df[feature])
    test_df[feature] = le.transform(test_df[feature])
    label_encoders[feature] = le

logger.info(f"Encoded categorical features: {categorical_features}")


INFO:__main__:Encoded categorical features: ['Gender', 'Vehicle_Age', 'Vehicle_Damage']


In [5]:
# Scale numerical features
scaler = StandardScaler()
numerical_features = ['Age', 'Annual_Premium', 'Vintage']

train_df[numerical_features] = scaler.fit_transform(train_df[numerical_features])
test_df[numerical_features] = scaler.transform(test_df[numerical_features])

logger.info(f"Scaled numerical features: {numerical_features}")

# Save the preprocessed datasets
train_df.to_csv(r"C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\preprocessed_train.csv", index=False)
test_df.to_csv(r"C:\Users\paulo\OneDrive\Documents\Binary-Classification-of-Insurance-Cross-Selling\preprocessed_test.csv", index=False)

logger.info("Preprocessed datasets saved.")


INFO:__main__:Scaled numerical features: ['Age', 'Annual_Premium', 'Vintage']
INFO:__main__:Preprocessed datasets saved.
