In [2]:
# Import Libraries
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import TensorDataset, DataLoader
from torch.cuda.amp import GradScaler, autocast
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
import logging

# Set up logging
logging.basicConfig(level=logging.INFO, filename='sample_size_analysis.log', filemode='w', 
                    format='%(asctime)s - %(levelname)s - %(message)s')
logger = logging.getLogger()

# Load Data
df = pd.read_csv('transformed__train_dataframe.csv')
logger.info(f"Data loaded successfully. Shape: {df.shape}")

# Simplified Neural Network Model
class SimplifiedNN(nn.Module):
    def __init__(self, input_dim):
        super(SimplifiedNN, self).__init__()
        self.fc1 = nn.Linear(input_dim, 64)
        self.fc2 = nn.Linear(64, 1)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(0.3)

    def forward(self, x):
        x = self.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)  # No sigmoid here
        return x

# Function to Evaluate Sample Sizes
def evaluate_sample_sizes(df, target, sample_sizes, epochs=2, random_state=42):
    results = []
    input_dim = df.drop(target, axis=1).shape[1]
    
    for sample_size in sample_sizes:
        logger.info(f"Processing sample size: {sample_size}")
        print(f"Processing sample size: {sample_size}")
        sample_df, _ = train_test_split(df, stratify=df[target], test_size=(1 - sample_size / len(df)), random_state=random_state)
        
        X = sample_df.drop(target, axis=1).values
        y = sample_df[target].values
        
        X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=random_state)
        
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_val = scaler.transform(X_val)

        tensor_x_train = torch.tensor(X_train, dtype=torch.float32)
        tensor_y_train = torch.tensor(y_train, dtype=torch.float32)
        tensor_x_val = torch.tensor(X_val, dtype=torch.float32)
        tensor_y_val = torch.tensor(y_val, dtype=torch.float32)

        train_dataset = TensorDataset(tensor_x_train, tensor_y_train)
        val_dataset = TensorDataset(tensor_x_val, tensor_y_val)
        train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True)
        val_loader = DataLoader(val_dataset, batch_size=64, shuffle=False)

        model = SimplifiedNN(input_dim).to(device)
        criterion = nn.BCEWithLogitsLoss()
        optimizer = optim.Adam(model.parameters(), lr=0.001)
        scaler = GradScaler()
        
        for epoch in range(epochs):
            logger.info(f"Training epoch {epoch+1}/{epochs} for sample size {sample_size}")
            print(f"Training epoch {epoch+1}/{epochs} for sample size {sample_size}")
            model.train()
            for inputs, labels in train_loader:
                inputs, labels = inputs.to(device), labels.to(device)
                optimizer.zero_grad()
                with autocast():
                    outputs = model(inputs)
                    loss = criterion(outputs, labels.unsqueeze(1))
                scaler.scale(loss).backward()
                scaler.step(optimizer)
                scaler.update()
            
            # Validation
            model.eval()
            all_labels = []
            all_outputs = []
            with torch.no_grad():
                for inputs, labels in val_loader:
                    inputs, labels = inputs.to(device), labels.to(device)
                    outputs = model(inputs)
                    all_labels.extend(labels.cpu().numpy())
                    all_outputs.extend(outputs.cpu().numpy())

            roc_auc = roc_auc_score(all_labels, all_outputs)
            logger.info(f'Sample Size: {sample_size}, Epoch {epoch+1}/{epochs}, ROC AUC Score: {roc_auc:.4f}')
            print(f'Sample Size: {sample_size}, Epoch {epoch+1}/{epochs}, ROC AUC Score: {roc_auc:.4f}')
        
        results.append((sample_size, roc_auc))
    
    return results

# Define Device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
logger.info(f"Using device: {device}")

# Define Sample Sizes to Test (starting from the most meaningful size)
start_size = 2944827
end_size = len(df)
sample_sizes = np.linspace(start_size, end_size, 10, dtype=int)

# Evaluate and Find Ideal Sample Size
logger.info("Evaluating sample sizes...")
print("Evaluating sample sizes...")
results = evaluate_sample_sizes(df, 'Response', sample_sizes, epochs=2)
logger.info("Evaluation completed.")
print("Evaluation completed.")

# Convert Results to DataFrame for Easy Analysis
results_df = pd.DataFrame(results, columns=['Sample Size', 'ROC AUC Score'])
print(results_df)

# Plot Results
plt.figure(figsize=(10, 5))
plt.plot(results_df['Sample Size'], results_df['ROC AUC Score'], marker='o')
plt.xlabel('Sample Size')
plt.ylabel('ROC AUC Score')
plt.title('Sample Size vs. ROC AUC Score')
plt.grid(True)
plt.show()


Evaluating sample sizes...
Processing sample size: 2944827
Training epoch 1/2 for sample size 2944827
Sample Size: 2944827, Epoch 1/2, ROC AUC Score: 0.8684
Training epoch 2/2 for sample size 2944827
Sample Size: 2944827, Epoch 2/2, ROC AUC Score: 0.8693
Processing sample size: 3631793
Training epoch 1/2 for sample size 3631793
Sample Size: 3631793, Epoch 1/2, ROC AUC Score: 0.8688
Training epoch 2/2 for sample size 3631793
Sample Size: 3631793, Epoch 2/2, ROC AUC Score: 0.8689
Processing sample size: 4318759
Training epoch 1/2 for sample size 4318759
Sample Size: 4318759, Epoch 1/2, ROC AUC Score: 0.8666
Training epoch 2/2 for sample size 4318759
Sample Size: 4318759, Epoch 2/2, ROC AUC Score: 0.8677
Processing sample size: 5005726
Training epoch 1/2 for sample size 5005726
Sample Size: 5005726, Epoch 1/2, ROC AUC Score: 0.8683
Training epoch 2/2 for sample size 5005726
Sample Size: 5005726, Epoch 2/2, ROC AUC Score: 0.8686
Processing sample size: 5692692
Training epoch 1/2 for sample

InvalidParameterError: The 'test_size' parameter of train_test_split must be a float in the range (0.0, 1.0), an int in the range [1, inf) or None. Got 0.0 instead.