# In [1]:
# PyTorch Preprocessing Notebook

# Import Libraries
import os
import pandas as pd
from sklearn.preprocessing import LabelEncoder, StandardScaler
import logging

# Make sure the 'logs_pytorch' directory exists (no error if it already does).
# NOTE(review): the log file configured below is written to the current
# working directory, not into 'logs_pytorch' - confirm which is intended.
os.makedirs('logs_pytorch', exist_ok=True)

# Route INFO-and-above records to a log file that is truncated on every run
# (filemode='w'). `logger` is the root logger returned by getLogger().
logging.basicConfig(
    filename='preprocessing_pytorch.log',
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger()

# Load Data
# Both CSV files live in one data directory. It defaults to the original
# absolute location, but can be overridden with the INSURANCE_DATA_DIR
# environment variable so the script also runs on other machines.
data_dir = os.environ.get(
    "INSURANCE_DATA_DIR",
    "C:/Users/paulo/OneDrive/Documents/Binary-Classification-of-Insurance-Cross-Selling",
)
train_path = f"{data_dir}/train.csv"
test_path = f"{data_dir}/test.csv"

logger.info("Loading datasets...")
try:
    # Both files use their 'id' column as the row index.
    train_df = pd.read_csv(train_path, index_col='id')
    test_df = pd.read_csv(test_path, index_col='id')
except (OSError, ValueError):
    # Missing/unreadable file (OSError) or malformed CSV / missing 'id'
    # column (ValueError): record the traceback in the log file before
    # letting the error propagate, so the log explains why the run stopped.
    logger.exception("Failed to load datasets from %s", data_dir)
    raise
logger.info("Datasets loaded successfully.")
logger.info(f"Train dataset shape: {train_df.shape}")
logger.info(f"Test dataset shape: {test_df.shape}")

# Data Preprocessing
# Work on copies so the frames loaded from disk stay untouched.
train_pytorch = train_df.copy()
test_pytorch = test_df.copy()

# Fill missing values
# Imputation statistics are computed on the training split only and reused
# for the test split, so both splits go through the same transformation
# (the test data must not influence its own preprocessing).

# Numeric columns: fill with the training median.
# NOTE(review): every int64/float64 column of the training frame is covered,
# which presumably includes the target column - confirm that is intended.
numeric_cols = train_pytorch.select_dtypes(include=['int64', 'float64']).columns
for col in numeric_cols:
    train_median = train_pytorch[col].median()
    train_pytorch[col] = train_pytorch[col].fillna(train_median)
    if col in test_pytorch.columns:
        test_pytorch[col] = test_pytorch[col].fillna(train_median)

# Object (string) columns: fill with the most frequent training value.
categorical_cols = train_pytorch.select_dtypes(include=['object']).columns
for col in categorical_cols:
    train_modes = train_pytorch[col].mode()
    if train_modes.empty:
        # mode() returns an empty Series when the column has no non-missing
        # values; indexing it with [0] would fail, so skip the column.
        logger.warning("Column %r has no non-missing training values; left unfilled.", col)
        continue
    train_mode = train_modes[0]
    train_pytorch[col] = train_pytorch[col].fillna(train_mode)
    if col in test_pytorch.columns:
        test_pytorch[col] = test_pytorch[col].fillna(train_mode)

logger.info("Missing values handled.")

# One-hot encode the categorical columns of the training split.
train_pytorch = pd.get_dummies(train_pytorch)

# Encode the test split the same way, then force it onto the training column
# layout: columns present only in train are created and filled with 0,
# columns present only in test are dropped, and the column order follows train.
test_pytorch = pd.get_dummies(test_pytorch).reindex(
    columns=train_pytorch.columns,
    fill_value=0,
)

logger.info("Categorical variables encoded and aligned.")

# Standardize the features
# Separate the target from the features once per split. Each drop() returns a
# new copy of the frame, so the result is reused instead of being recomputed
# for scaling and again for the column names.
train_features = train_pytorch.drop('Response', axis=1)
# 'Response' may or may not be present in the test frame; tolerate both cases
# consistently everywhere the test features are used.
test_features = test_pytorch.drop('Response', axis=1, errors='ignore')

# Fit the scaler on the training features only and apply the same fitted
# transformation to the test features.
scaler = StandardScaler()
X_train = scaler.fit_transform(train_features)
X_test = scaler.transform(test_features)

# Create the final DataFrame for saving
# The scaled arrays are wrapped in new frames with a default positional index
# (the index of the input frames is not carried over), so the target is
# re-attached by position via .values rather than by index alignment.
train_pytorch_scaled = pd.DataFrame(X_train, columns=train_features.columns)
train_pytorch_scaled['Response'] = train_pytorch['Response'].values
test_pytorch_scaled = pd.DataFrame(X_test, columns=test_features.columns)

logger.info("Features standardized.")

# Write both processed splits to CSV files (relative paths, so they land in
# the current working directory). index=False omits the frame index.
for frame, csv_name in (
    (train_pytorch_scaled, 'train_pytorch_processed.csv'),
    (test_pytorch_scaled, 'test_pytorch_processed.csv'),
):
    frame.to_csv(csv_name, index=False)
logger.info("Processed data saved as CSV.")
