In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score,confusion_matrix,classification_report
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader, TensorDataset,Dataset,random_split


In [2]:
def load_and_preprocess_data(filepath):
    # Load the dataset
    df = pd.read_csv(filepath)
    
    # Handle missing values (replace '#' with appropriate strategy)
    df = df.replace('#', np.nan)
    df = df.dropna()
    
    # Convert categorical variables
    categorical_columns = ['Menopause', 'Breast', 'Metastasis', 'History']
    for col in categorical_columns:
        df[col] = pd.Categorical(df[col]).codes
    
    # One-hot encode Breast Quadrant
    df = pd.get_dummies(df, columns=['Breast Quadrant'])
    
    # Prepare features and target
    features = df.drop(['S/N', 'Year', 'Diagnosis Result'], axis=1)
    target = (df['Diagnosis Result'] == 'Malignant').astype(int)
    
    return features, target

In [3]:
# Step 2: Custom PyTorch Dataset
class BreastCancerDataset(Dataset):
    def __init__(self, features, labels):
        self.features = torch.FloatTensor(features.values)
        self.labels = torch.FloatTensor(labels.values)
    
    def __len__(self):
        return len(self.labels)
    
    def __getitem__(self, idx):
        return self.features[idx], self.labels[idx]

In [4]:
# Step 3: Neural Network Model
class BreastCancerClassifier(nn.Module):
    def __init__(self, input_dim):
        super().__init__()
        self.network = nn.Sequential(
            nn.Linear(input_dim, 64),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(64, 32),
            nn.ReLU(),
            nn.Dropout(0.2),
            nn.Linear(32, 1),
            nn.Sigmoid()
        )
    def forward(self, x):
        return self.network(x)

In [5]:
def train_model(model, train_loader, val_loader, criterion, optimizer, epochs=50):
    # 1. Determine the computing device (GPU if available, otherwise CPU)
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    model.to(device)  # Move the model to the selected device
    
    # Variable to keep track of the best validation loss
    best_val_loss = float('inf')
    
    # Main training loop: going through the entire dataset multiple times
    for epoch in range(epochs):
        # Set the model to training mode
        model.train()
        train_loss = 0.0
        
        # Iterate through batches of training data
        for features, labels in train_loader:
            # Move features and labels to the selected device
            features, labels = features.to(device), labels.to(device)
            
            # Clear previous gradients
            optimizer.zero_grad()
            
            # Forward pass: get predictions
            outputs = model(features).squeeze()
            
            # Calculate the loss (how wrong the predictions are)
            loss = criterion(outputs, labels)
            
            # Backward pass: calculate gradients
            loss.backward()
            
            # Update model parameters
            optimizer.step()
            
            # Accumulate the training loss
            train_loss += loss.item()
# Validation
        model.eval()
        val_loss = 0.0
        val_predictions = []
        val_true = []
        
        with torch.no_grad():
            for features, labels in val_loader:
                features, labels = features.to(device), labels.to(device)
                outputs = model(features).squeeze()
                loss = criterion(outputs, labels)
                val_loss += loss.item()
                
                predictions = (outputs > 0.5).float()
                val_predictions.extend(predictions.cpu().numpy())
                val_true.extend(labels.cpu().numpy())
        
        train_loss /= len(train_loader)
        val_loss /= len(val_loader)
        val_accuracy = accuracy_score(val_true, val_predictions)
        
        print(f'Epoch {epoch+1}: Train Loss: {train_loss:.4f}, '
              f'Val Loss: {val_loss:.4f}, Val Accuracy: {val_accuracy:.4f}')
        
        if val_loss < best_val_loss:
            best_val_loss = val_loss
            torch.save(model.state_dict(), 'best_model.pth')
    
    return model

In [6]:
def main():
    # File path
    filepath = 'C:/Users/DATA-JOHN/Desktop/Breast-Cancer-Prediction-Model/breast-cancer-dataset.csv'
    
    # Use the load_and_preprocess_data function for consistent preprocessing
    features, target = load_and_preprocess_data(filepath)
    
    # Scale features
    scaler = StandardScaler()
    features_scaled = scaler.fit_transform(features)
    
    # Split data
    X_train, X_temp, y_train, y_temp = train_test_split(
        features_scaled, target, test_size=0.3, random_state=42
    )
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp, y_temp, test_size=0.5, random_state=42
    )
    
    # 4. Create PyTorch datasets and data loaders
    train_dataset = BreastCancerDataset(
        pd.DataFrame(X_train), pd.Series(y_train)
    )
    val_dataset = BreastCancerDataset(
        pd.DataFrame(X_val), pd.Series(y_val)
    )
    test_dataset = BreastCancerDataset(
        pd.DataFrame(X_test), pd.Series(y_test)
    )
    
    train_loader = DataLoader(train_dataset, batch_size=16, shuffle=True)
    val_loader = DataLoader(val_dataset, batch_size=16)
    test_loader = DataLoader(test_dataset, batch_size=16)
    
    # 5. Set up the model, loss function, and optimizer
    model = BreastCancerClassifier(input_dim=features.shape[1])
    criterion = nn.BCELoss()
    optimizer = optim.Adam(model.parameters(), lr=0.001)
    
    # 6. Train the model
    trained_model = train_model(
        model, train_loader, val_loader, criterion, optimizer
    )
    
    # 7. Evaluate the model on test data
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    trained_model.eval()
    test_predictions = []
    test_true = []
    
    with torch.no_grad():
        for features, labels in test_loader:
            features, labels = features.to(device), labels.to(device)
            outputs = trained_model(features).squeeze()
            predictions = (outputs > 0.5).float()
            test_predictions.extend(predictions.cpu().numpy())
            test_true.extend(labels.cpu().numpy())
    
    # 8. Print evaluation metrics
    print("\nTest Set Evaluation:")
    print(classification_report(test_true, test_predictions))
    print("\nConfusion Matrix:")
    print(confusion_matrix(test_true, test_predictions))

if __name__ == '__main__':
    main()

Epoch 1: Train Loss: 0.6601, Val Loss: 0.6723, Val Accuracy: 0.4516
Epoch 2: Train Loss: 0.6353, Val Loss: 0.6464, Val Accuracy: 0.4839
Epoch 3: Train Loss: 0.5964, Val Loss: 0.6192, Val Accuracy: 0.5484
Epoch 4: Train Loss: 0.5471, Val Loss: 0.5886, Val Accuracy: 0.7419
Epoch 5: Train Loss: 0.5192, Val Loss: 0.5537, Val Accuracy: 0.7419
Epoch 6: Train Loss: 0.4572, Val Loss: 0.5134, Val Accuracy: 0.8387
Epoch 7: Train Loss: 0.4091, Val Loss: 0.4724, Val Accuracy: 0.8387
Epoch 8: Train Loss: 0.3865, Val Loss: 0.4359, Val Accuracy: 0.8710
Epoch 9: Train Loss: 0.3454, Val Loss: 0.4043, Val Accuracy: 0.8387
Epoch 10: Train Loss: 0.2962, Val Loss: 0.3803, Val Accuracy: 0.8387
Epoch 11: Train Loss: 0.2703, Val Loss: 0.3609, Val Accuracy: 0.8387
Epoch 12: Train Loss: 0.2527, Val Loss: 0.3466, Val Accuracy: 0.8387
Epoch 13: Train Loss: 0.2526, Val Loss: 0.3367, Val Accuracy: 0.8387
Epoch 14: Train Loss: 0.2456, Val Loss: 0.3304, Val Accuracy: 0.8387
Epoch 15: Train Loss: 0.2375, Val Loss: 0.3