In [1]:
# Import Libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import logging

# Configure file-based logging; filemode='w' overwrites the log on every run
logging.basicConfig(
    filename='base_model_analysis.log',
    filemode='w',
    level=logging.INFO,
    format='%(asctime)s - %(levelname)s - %(message)s',
)
logger = logging.getLogger()

# Read the transformed training data from disk
df = pd.read_csv('transformed__train_dataframe.csv')
logger.info(f"Data loaded successfully. Shape: {df.shape}")

# Fraction of the full dataset kept in the working sample
sample_proportion = 0.40
logger.info(f"Using sample proportion: {sample_proportion * 100}%")

# Stratified draw on 'Response': the "train" side of the split is the sample,
# the complementary "test" side is discarded
sample_df, _ = train_test_split(
    df,
    test_size=(1 - sample_proportion),
    stratify=df['Response'],
    random_state=42,
)
logger.info(f"Sampled dataset with {len(sample_df)} samples")

# Persist the sample (without the index column) for later reference
sample_df.to_csv('ideal_sample_dataframe.csv', index=False)
logger.info("Sampled dataset saved to 'ideal_sample_dataframe.csv'")

# Simplified Neural Network Model
class SimplifiedNN(nn.Module):
    """Single-hidden-layer network that emits one raw score per input row.

    Architecture: Linear(input_dim -> 64) -> ReLU -> Dropout(0.3)
    -> Linear(64 -> 1). No sigmoid is applied to the output.
    """

    def __init__(self, input_dim):
        """Create the layers for inputs with ``input_dim`` features."""
        super().__init__()
        # Parameterised layers
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 1)
        # Parameter-free layers
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        """Return the un-activated output of the final linear layer for ``x``."""
        hidden = self.dropout(self.relu(self.fc1(x)))
        # The raw score is returned as-is; no sigmoid is applied here
        return self.fc2(hidden)

# Function to Train and Evaluate the Model
def train_and_evaluate(df, target, epochs=50, random_state=42, device=None):
    """Train a ``SimplifiedNN`` on ``df`` and score it on a held-out split.

    The frame is split 80/20 into train/validation sets, features are
    standardised with statistics fitted on the training part only, and the
    model is optimised with Adam (lr=0.001) on ``BCEWithLogitsLoss``. After
    every epoch the validation ROC AUC is logged and printed.

    Args:
        df: DataFrame holding the feature columns and the ``target`` column.
        target: Name of the label column in ``df``.
        epochs: Number of passes over the training set.
        random_state: Seed forwarded to ``train_test_split``.
        device: Optional ``torch.device`` (or device string) to train on.
            When ``None``, CUDA is used if available, otherwise CPU.

    Returns:
        Tuple ``(model, roc_auc)`` where ``roc_auc`` is the validation ROC AUC
        of the last epoch, or ``nan`` when ``epochs`` is 0 (no evaluation ran).
    """
    # Resolve the device locally instead of relying on a module-level global
    if device is None:
        device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    else:
        device = torch.device(device)

    features = df.drop(target, axis=1)
    input_dim = features.shape[1]
    X = features.values
    y = df[target].values

    X_train, X_val, y_train, y_val = train_test_split(
        X, y, test_size=0.2, random_state=random_state
    )

    # Fit the scaler on the training split only to avoid validation leakage
    feature_scaler = StandardScaler()
    X_train = feature_scaler.fit_transform(X_train)
    X_val = feature_scaler.transform(X_val)

    tensor_x_train = torch.tensor(X_train, dtype=torch.float32)
    tensor_y_train = torch.tensor(y_train, dtype=torch.float32)
    tensor_x_val = torch.tensor(X_val, dtype=torch.float32)
    tensor_y_val = torch.tensor(y_val, dtype=torch.float32)

    train_dataset = TensorDataset(tensor_x_train, tensor_y_train)
    val_dataset = TensorDataset(tensor_x_val, tensor_y_val)
    train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

    model = SimplifiedNN(input_dim).to(device)
    criterion = nn.BCEWithLogitsLoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)

    # CUDA mixed precision only applies on a CUDA device; disable it elsewhere
    use_amp = device.type == "cuda"
    grad_scaler = GradScaler(enabled=use_amp)

    # Defined up front so the return below is valid even when epochs == 0
    roc_auc = float("nan")

    for epoch in range(epochs):
        logger.info(f"Training epoch {epoch+1}/{epochs}")
        model.train()
        for inputs, labels in train_loader:
            inputs, labels = inputs.to(device), labels.to(device)
            optimizer.zero_grad()
            with autocast(enabled=use_amp):
                outputs = model(inputs)
                # Model output is (batch, 1); give the labels the same shape
                loss = criterion(outputs, labels.unsqueeze(1))
            grad_scaler.scale(loss).backward()
            grad_scaler.step(optimizer)
            grad_scaler.update()

        # Validation
        model.eval()
        all_labels = []
        all_outputs = []
        with torch.no_grad():
            for inputs, labels in val_loader:
                outputs = model(inputs.to(device))
                all_labels.extend(labels.tolist())
                # Flatten (batch, 1) scores so roc_auc_score gets a 1-D vector
                all_outputs.extend(outputs.reshape(-1).cpu().tolist())

        # ROC AUC depends only on score ranking, so raw logits are sufficient
        roc_auc = roc_auc_score(all_labels, all_outputs)
        logger.info(f'Epoch {epoch+1}/{epochs}, ROC AUC Score: {roc_auc:.4f}')
        print(f'Epoch {epoch+1}/{epochs}, ROC AUC Score: {roc_auc:.4f}')

    return model, roc_auc

# Select the compute device: CUDA when available, otherwise CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
logger.info(f"Using device: {device}")

# Announce the run in both the log file and on stdout
logger.info("Training and evaluating the model on the ideal sample size...")
print("Training and evaluating the model on the ideal sample size...")

# Fit on the stratified sample for 5 epochs; keep the model and its last score
model, roc_auc = train_and_evaluate(sample_df, 'Response', epochs=5)

logger.info("Model training and evaluation completed.")
print("Model training and evaluation completed.")

# Report the ROC AUC returned by the training run
print(f"Final ROC AUC Score: {roc_auc:.4f}")


Training and evaluating the model on the ideal sample size...
Epoch 1/5, ROC AUC Score: 0.8679
Epoch 2/5, ROC AUC Score: 0.8685
Epoch 3/5, ROC AUC Score: 0.8684
Epoch 4/5, ROC AUC Score: 0.8688
Epoch 5/5, ROC AUC Score: 0.8692
Model training and evaluation completed.
Final ROC AUC Score: 0.8692
