In [1]:
# PyTorch Model Training Notebook

# Import Libraries
import os
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import logging
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset
from sklearn.preprocessing import LabelEncoder, StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score, accuracy_score

# Setting up the logger
logging.basicConfig(level=logging.INFO, filename='model_training_pytorch_base.log', filemode='w',
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# Create directories for storing graphs
os.makedirs('graphs_pytorch_base', exist_ok=True)

# Check for GPU availability
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Load Data
train_path = "C:/Users/paulo/OneDrive/Documents/Binary-Classification-of-Insurance-Cross-Selling/train.csv"
test_path = "C:/Users/paulo/OneDrive/Documents/Binary-Classification-of-Insurance-Cross-Selling/test.csv"

logger.info("Loading datasets...")
train_df = pd.read_csv(train_path, index_col='id')
test_df = pd.read_csv(test_path, index_col='id')
logger.info("Datasets loaded successfully.")
logger.info(f"Train dataset shape: {train_df.shape}")
logger.info(f"Test dataset shape: {test_df.shape}")

# Use a 40% sample of the training data
logger.info("Sampling 40% of the training data...")
train_sample = train_df.sample(frac=0.4, random_state=42)
test_sample = test_df.copy()  # Use the entire test set for final evaluation
logger.info(f"Train sample shape: {train_sample.shape}")
logger.info(f"Test sample shape: {test_sample.shape}")

# Data Preprocessing
# Copy the sample data
train_pytorch = train_sample.copy()
test_pytorch = test_sample.copy()

# Fill missing values
for col in train_pytorch.select_dtypes(include=['int64', 'float64']).columns:
    train_pytorch[col] = train_pytorch[col].fillna(train_pytorch[col].median())
    if col in test_pytorch.columns:
        test_pytorch[col] = test_pytorch[col].fillna(test_pytorch[col].median())

for col in train_pytorch.select_dtypes(include=['object']).columns:
    train_pytorch[col] = train_pytorch[col].fillna(train_pytorch[col].mode()[0])
    if col in test_pytorch.columns:
        test_pytorch[col] = test_pytorch[col].fillna(test_pytorch[col].mode()[0])

logger.info("Missing values handled.")

# Encode categorical variables using one-hot encoding
train_pytorch = pd.get_dummies(train_pytorch)
test_pytorch = pd.get_dummies(test_pytorch)

# Ensure the test set has the same columns as the training set
test_pytorch = test_pytorch.reindex(columns=train_pytorch.columns, fill_value=0)

logger.info("Categorical variables encoded and aligned.")

# Split data into features and target
X = train_pytorch.drop('Response', axis=1)
y = train_pytorch['Response']

# Split into training and validation sets
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=42)

# Standardize the features
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_val = scaler.transform(X_val)
X_test = scaler.transform(test_pytorch.drop('Response', axis=1, errors='ignore'))  # Test data doesn't have 'Response' column

logger.info("Features standardized.")

# Convert data to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32).to(device)
y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).unsqueeze(1).to(device)
X_val_tensor = torch.tensor(X_val, dtype=torch.float32).to(device)
y_val_tensor = torch.tensor(y_val.values, dtype=torch.float32).unsqueeze(1).to(device)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32).to(device)

# Create DataLoader
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)

logger.info("Data converted to PyTorch tensors and DataLoader created.")

# Define the model
class SimpleNN(nn.Module):
    def __init__(self, input_dim):
        super(SimpleNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 128)
        self.fc2 = nn.Linear(128, 64)
        self.fc3 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.relu(self.fc2(x))
        x = self.sigmoid(self.fc3(x))
        return x

input_dim = X_train.shape[1]
model = SimpleNN(input_dim).to(device)
logger.info("Model defined.")

# Define loss and optimizer
criterion = nn.BCELoss()
optimizer = optim.Adam(model.parameters(), lr=0.001)
logger.info("Loss function and optimizer defined.")

# Training loop
num_epochs = 10
model.train()
for epoch in range(num_epochs):
    epoch_loss = 0
    total = 0
    y_true = []
    y_pred = []
    
    for X_batch, y_batch in train_loader:
        # Forward pass
        outputs = model(X_batch)
        loss = criterion(outputs, y_batch)
        
        # Backward pass and optimization
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        
        # Calculate running loss
        epoch_loss += loss.item()
        total += y_batch.size(0)
        
        # Store true and predicted values for ROC AUC calculation
        y_true.extend(y_batch.cpu().numpy())
        y_pred.extend(outputs.detach().cpu().numpy())
    
    # Calculate training ROC AUC
    train_roc_auc = roc_auc_score(y_true, y_pred)
    
    # Calculate metrics on validation set
    model.eval()
    with torch.no_grad():
        val_outputs = model(X_val_tensor)
        val_loss = criterion(val_outputs, y_val_tensor).item()
        val_roc_auc = roc_auc_score(y_val_tensor.cpu(), val_outputs.cpu())
    
    logger.info(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/total:.4f}, Training ROC AUC: {train_roc_auc:.4f}, Validation Loss: {val_loss:.4f}, Validation ROC AUC: {val_roc_auc:.4f}")
    print(f"Epoch [{epoch+1}/{num_epochs}], Loss: {epoch_loss/total:.4f}, Training ROC AUC: {train_roc_auc:.4f}, Validation Loss: {val_loss:.4f}, Validation ROC AUC: {val_roc_auc:.4f}")
    model.train()

# Final predictions on the test set
model.eval()
with torch.no_grad():
    y_pred_pytorch_test = model(X_test_tensor).cpu().numpy().flatten()

# Save final predictions
test_pytorch['Response_PyTorch'] = y_pred_pytorch_test
test_pytorch[['Response_PyTorch']].to_csv('test_predictions_pytorch_base.csv', index=True)

logger.info('Final predictions saved to test_predictions_pytorch_base.csv')
print('Final predictions saved to test_predictions_pytorch_base.csv')

# Visualize loss over epochs (if desired)
loss_values = [loss.item() for epoch in range(num_epochs)]
plt.plot(range(num_epochs), loss_values)
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training Loss Over Epochs')
plt.savefig('graphs_pytorch_base/training_loss.png')
plt.show()
logger.info('Training loss plot saved.')


Epoch [1/10], Loss: 0.0042, Training ROC AUC: 0.8551, Validation Loss: 0.2651, Validation ROC AUC: 0.8576
Epoch [2/10], Loss: 0.0041, Training ROC AUC: 0.8574, Validation Loss: 0.2650, Validation ROC AUC: 0.8582
Epoch [3/10], Loss: 0.0041, Training ROC AUC: 0.8582, Validation Loss: 0.2647, Validation ROC AUC: 0.8588
Epoch [4/10], Loss: 0.0041, Training ROC AUC: 0.8588, Validation Loss: 0.2644, Validation ROC AUC: 0.8592
Epoch [5/10], Loss: 0.0041, Training ROC AUC: 0.8591, Validation Loss: 0.2640, Validation ROC AUC: 0.8597
