<a href="https://colab.research.google.com/github/omar-omar-om/gradProject-notebooks/blob/main/NN_All_ENCODING.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Install & Import Required Libraries



In [None]:
!pip install torch torchvision torchaudio scikit-learn
!pip install torch torchvision torchaudio

import itertools
import os
import pickle
import time

import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.preprocessing import StandardScaler



Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch)
  Downloading nvidia_cublas_cu12-12.4.5.8-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cufft-cu12==11.2.1.3 (from torch)
  Downloading nvidia_cufft_cu12-11.2.1.3-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-curand-cu12==10.3.5.147 (from torch)
  Downloading nvidia_curand_cu12-10.3.5

# Mount Google Drive

In [None]:
# Colab-only helper: exposes the user's Google Drive inside the runtime.
from google.colab import drive
# Every dataset path used later in this notebook lives under this mount point.
drive.mount('/content/drive')


Mounted at /content/drive


# Define File Paths for All Encoding Types

In [None]:
# Locations of the pre-encoded CSV splits on the mounted Drive.
# One entry per encoding scheme; each entry maps a split name
# ("train" / "val" / "test") to the CSV file holding that split.
encoding_paths = {}

encoding_paths["Label"] = dict(
    train="/content/drive/My Drive/label-encoding/train_label.csv",
    val="/content/drive/My Drive/label-encoding/val_label.csv",
    test="/content/drive/My Drive/label-encoding/test_label.csv",
)

encoding_paths["Frequency"] = dict(
    train="/content/drive/My Drive/frequency-encoding/train_frequency.csv",
    val="/content/drive/My Drive/frequency-encoding/val_frequency.csv",
    test="/content/drive/My Drive/frequency-encoding/test_frequency.csv",
)

encoding_paths["Target"] = dict(
    train="/content/drive/My Drive/target-encoding/train_target.csv",
    val="/content/drive/My Drive/target-encoding/val_target.csv",
    test="/content/drive/My Drive/target-encoding/test_target.csv",
)

encoding_paths["Hybrid"] = dict(
    train="/content/drive/My Drive/hybrid-label-target-encoding/train_hybrid.csv",
    val="/content/drive/My Drive/hybrid-label-target-encoding/val_hybrid.csv",
    test="/content/drive/My Drive/hybrid-label-target-encoding/test_hybrid.csv",
)

encoding_paths["Label-Frequency"] = dict(
    train="/content/drive/My Drive/label-frequency-encoding/train_label_freq.csv",
    val="/content/drive/My Drive/label-frequency-encoding/val_label_freq.csv",
    test="/content/drive/My Drive/label-frequency-encoding/test_label_freq.csv",
)

# Name of the label column that is split off from the features in every CSV.
target = "HasDetections"


# Define the Neural Network (MLP - Multi-Layer Perceptron)

In [None]:
class NeuralNet(nn.Module):
    """One-hidden-layer perceptron that outputs a probability per input row.

    Args:
        input_size: Number of input features per sample.
        hidden_size: Width of the single hidden layer.
        activation_fn: ``"relu"`` selects ReLU; any other value selects Tanh.
    """

    def __init__(self, input_size, hidden_size, activation_fn):
        super().__init__()
        self.layer1 = nn.Linear(input_size, hidden_size)
        if activation_fn == "relu":
            self.activation = nn.ReLU()
        else:
            self.activation = nn.Tanh()
        # Single output unit: the network is used for binary classification.
        self.layer2 = nn.Linear(hidden_size, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        """Return sigmoid-squashed outputs with one column per input row."""
        hidden = self.layer1(x)
        hidden = self.activation(hidden)
        logits = self.layer2(hidden)
        return self.sigmoid(logits)


# Define Function to Train Neural Network

In [None]:
def _fit_minibatches(model, optimizer, criterion, features, labels, batch_size, epochs):
    """Train ``model`` in place for ``epochs`` passes over shuffled mini-batches."""
    n_samples = features.shape[0]
    model.train()
    for _ in range(epochs):
        # Draw a new ordering each epoch so batch composition varies between passes.
        order = torch.randperm(n_samples)
        for start in range(0, n_samples, batch_size):
            batch_idx = order[start:start + batch_size]
            optimizer.zero_grad()
            loss = criterion(model(features[batch_idx]), labels[batch_idx])
            loss.backward()
            optimizer.step()


def train_neural_network_cv(X_train, y_train, input_size, epochs=10, seed=42):
    """Grid-search ``NeuralNet`` hyperparameters with 3-fold stratified CV.

    Every combination of hidden size, learning rate, batch size, optimizer and
    activation is trained on each fold and scored by ROC AUC on the held-out
    part of that fold; the combination with the highest mean AUC wins.

    ``batch_size`` is applied as the mini-batch size during training, so each
    fit performs ``epochs * ceil(n / batch_size)`` optimizer steps. On large
    training sets the full search is therefore expensive.

    Args:
        X_train: 2-D array-like of (already scaled) features.
        y_train: 1-D array-like of binary labels (a pandas Series or an array).
        input_size: Number of feature columns, passed to ``NeuralNet``.
        epochs: Passes over the training fold per fit.
        seed: Value for ``torch.manual_seed`` so weight initialisation and
            batch shuffling are repeatable; ``None`` leaves the global torch
            RNG untouched.

    Returns:
        Tuple ``(grid_search_time, best_model, best_params, best_auc)`` where
        ``grid_search_time`` is wall-clock seconds, ``best_model`` is the model
        trained on the last fold of the winning combination, ``best_params`` is
        a dict of the winning hyperparameters and ``best_auc`` is its mean
        cross-validated AUC.
    """
    param_grid = {
        "hidden_size": [32, 64, 128],      # 3 values
        "learning_rate": [0.01, 0.001],    # 2 values
        "batch_size": [32, 64, 128],       # 3 values
        "optimizer": ["adam", "sgd"],      # 2 values
        "activation": ["relu", "tanh"],    # 2 values
    }

    if seed is not None:
        torch.manual_seed(seed)

    # Start from -inf so the first evaluated combination is always recorded.
    best_auc = float("-inf")
    best_model = None
    best_params = None
    start_grid_time = time.time()

    # Convert the data once instead of once per fold per combination.
    labels_np = np.asarray(y_train)
    features = torch.as_tensor(np.asarray(X_train), dtype=torch.float32)
    labels = torch.as_tensor(labels_np, dtype=torch.float32).view(-1, 1)

    # The split is deterministic (fixed random_state), so compute it once and
    # reuse the same folds for every combination.
    skf = StratifiedKFold(n_splits=3, shuffle=True, random_state=42)
    folds = [
        (
            torch.as_tensor(train_index, dtype=torch.long),
            torch.as_tensor(val_index, dtype=torch.long),
            labels_np[val_index],
        )
        for train_index, val_index in skf.split(X_train, y_train)
    ]

    # product() varies the last factor fastest, matching the order of the
    # equivalent nested loops (hidden size outermost, activation innermost).
    combinations = itertools.product(
        param_grid["hidden_size"],
        param_grid["learning_rate"],
        param_grid["batch_size"],
        param_grid["optimizer"],
        param_grid["activation"],
    )

    for hidden_size, lr, batch_size, optimizer_name, activation_name in combinations:
        fold_auc_scores = []

        for train_idx, val_idx, y_val_fold in folds:
            model = NeuralNet(input_size, hidden_size, activation_name)
            criterion = nn.BCELoss()
            if optimizer_name == "adam":
                optimizer = optim.Adam(model.parameters(), lr=lr)
            else:
                optimizer = optim.SGD(model.parameters(), lr=lr)

            _fit_minibatches(
                model,
                optimizer,
                criterion,
                features[train_idx],
                labels[train_idx],
                batch_size,
                epochs,
            )

            # Score the held-out part of the fold.
            model.eval()
            with torch.no_grad():
                y_pred_probs = model(features[val_idx]).numpy().ravel()
            fold_auc_scores.append(roc_auc_score(y_val_fold, y_pred_probs))

        mean_auc = np.mean(fold_auc_scores)

        if mean_auc > best_auc:
            best_auc = mean_auc
            # `model` is the network fitted on the last fold of this combination.
            best_model = model
            best_params = {
                "hidden_size": hidden_size,
                "learning_rate": lr,
                "batch_size": batch_size,
                "optimizer": optimizer_name,
                "activation": activation_name
            }

    end_grid_time = time.time()
    grid_search_time = end_grid_time - start_grid_time

    return grid_search_time, best_model, best_params, best_auc


# Run Experiments on All Encoding Types

In [None]:
# For each encoding: tune on the training CSV, continue training the selected
# model on the full training set, then score it on the separate validation CSV.
# The "test" paths are not read in this cell.
results = []

for encoding_name, paths in encoding_paths.items():
    print(f"\n Running Neural Network on {encoding_name} Encoding...")

    # Load training set
    train = pd.read_csv(paths["train"])

    # Extract features & target
    X_train, y_train = train.drop(columns=[target]), train[target]

    # Standardize training data; the fitted scaler is reused for validation below
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)

    # Train Neural Network with Cross-Validation
    grid_time, best_model, best_params, best_auc = train_neural_network_cv(X_train_scaled, y_train, X_train.shape[1])

    # Continue training the returned model on the full training set
    start_train_time = time.time()

    optimizer_cls = optim.Adam if best_params["optimizer"] == "adam" else optim.SGD
    optimizer = optimizer_cls(best_model.parameters(), lr=best_params["learning_rate"])
    criterion = nn.BCELoss()

    # Build the tensors once; they do not change between epochs.
    X_train_tensor = torch.tensor(X_train_scaled, dtype=torch.float32)
    y_train_tensor = torch.tensor(y_train.values, dtype=torch.float32).view(-1, 1)

    # Use the selected batch size as the mini-batch size for the final fit.
    n_samples = X_train_tensor.shape[0]
    batch_size = best_params["batch_size"]
    # Local generator: repeatable shuffling without reseeding the global torch RNG.
    shuffle_rng = torch.Generator().manual_seed(42)

    best_model.train()
    for epoch in range(10):
        order = torch.randperm(n_samples, generator=shuffle_rng)
        for start in range(0, n_samples, batch_size):
            batch_idx = order[start:start + batch_size]
            optimizer.zero_grad()
            outputs = best_model(X_train_tensor[batch_idx])
            loss = criterion(outputs, y_train_tensor[batch_idx])
            loss.backward()
            optimizer.step()

    end_train_time = time.time()
    train_time = end_train_time - start_train_time

    # Load real validation set
    val = pd.read_csv(paths["val"])
    X_val, y_val = val.drop(columns=[target]), val[target]

    # Standardize validation features using the same scaler
    X_val_scaled = scaler.transform(X_val)

    # Evaluate on real validation set
    best_model.eval()
    start_eval_time = time.time()
    with torch.no_grad():
        # ravel(): hand roc_auc_score a 1-D score vector rather than an (n, 1) column
        y_pred_probs = best_model(torch.tensor(X_val_scaled, dtype=torch.float32)).numpy().ravel()
    end_eval_time = time.time()

    eval_time = end_eval_time - start_eval_time

    # Calculate AUC on real validation set
    val_auc = roc_auc_score(y_val, y_pred_probs)

    # Save results
    results.append({
        "Encoding": encoding_name,
        "Grid Search Time (s)": grid_time,
        "Final Training Time (s)": train_time,
        "Evaluation Time (s)": eval_time,
        "AUC Score": val_auc
    })

# Convert to DataFrame
results_df = pd.DataFrame(results)

# Optional: Sort by AUC Score (best model on top)
results_df = results_df.sort_values(by="AUC Score", ascending=False)

# Display final results
print(results_df.to_string(index=False))



 Running Neural Network on Label Encoding...

 Running Neural Network on Frequency Encoding...

 Running Neural Network on Target Encoding...

 Running Neural Network on Hybrid Encoding...

 Running Neural Network on Label-Frequency Encoding...
       Encoding  Grid Search Time (s)  Final Training Time (s)  Evaluation Time (s)  AUC Score
         Target           4503.586350                52.092738             0.330945   0.692748
          Label           4491.598618                51.429377             0.413242   0.676236
         Hybrid           4482.196109                50.338474             0.290280   0.675878
Label-Frequency           4504.433619                50.651973             0.270206   0.673184
      Frequency           4517.495373                51.097507             0.274519   0.673013
