In [None]:
import os
import shutil
import random
import math
import itertools
import pickle
import numpy as np
import pandas as pd

import scipy
import scipy.io
import scipy.io.arff

import sklearn
import sklearn.preprocessing
import sklearn.model_selection
import sklearn.metrics
import sklearn.cluster
import sklearn.discriminant_analysis
import sklearn.svm
import sklearn.ensemble
import sklearn.naive_bayes

import imblearn
import imblearn.over_sampling

import torch

# Single seed shared by every random number generator used in this notebook
seed = 1234

# Seed Python, NumPy and PyTorch (CPU and all CUDA devices) with the same value
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed_all):
  seed_fn(seed)

# Disable cuDNN auto-tuning and request deterministic kernels
torch.backends.cudnn.benchmark = False
torch.backends.cudnn.deterministic = True

In [None]:
# Load the dataset here.
# The next cell builds X and Y itself, so this cell has to define the two names it reads:
#   data    - the full table, features plus the target column; it is accessed through
#             `data.drop(outcome, axis = 1).values` and `data[outcome].values`
#             (i.e. a pandas DataFrame or an object with the same interface)
#   outcome - the label of the target column inside `data`
data = ...
outcome = ...

In [None]:
# Prepare feature matrix and target vector
X = data.drop(outcome, axis = 1).values
Y = data[outcome].values

# Encode the categorical target labels to integers
encoder = sklearn.preprocessing.LabelEncoder()
Y = encoder.fit_transform(Y)

# Get unique classes and their counts
classes, counts = np.unique(Y, return_counts = True)

# Display the number of unique classes
print(f"Classes: {len(classes)}")

# Calculate the imbalance ratio (IR): size of the largest class over the smallest one
majc = np.max(counts)
minc = np.min(counts)
ir = majc / minc
print(f"IR: {ir:.2f}")

# Create a data frame to store the f1-scores for each evaluated method.
# The evaluation loops in this notebook write the columns Fold_1 ... Fold_10, so all ten
# are declared up front (previously Fold_10 only appeared through implicit enlargement).
f1_scores = pd.DataFrame(columns = [f"Fold_{i}" for i in range(1, 11)])

In [None]:
# Create a function for generating batches of pairs
def get_pair_batch(batch_size, X, Y):
  """Sample a batch of same-class pairs.

  For every slot in the batch a class is drawn uniformly at random, then two
  samples of that class are drawn (independently, so both may be the same row).

  Parameters
  ----------
  batch_size : int
    Number of pairs to generate.
  X : array of shape (n_samples, n_features)
    Feature matrix; rows are copied into float32 output buffers.
  Y : array-like of length n_samples
    Class label of every row of X.

  Returns
  -------
  list
    [first, second, classes]: two float32 arrays of shape
    (batch_size, n_features) holding the paired samples, and the array of the
    class drawn for each pair.
  """
  Y = np.asarray(Y)

  # Draw only from labels that actually occur in Y. Drawing from 0..max(Y) would
  # fail below whenever a label value in that range has no samples. For labels
  # 0..n-1 that all occur, this is the same array and hence the same random draw.
  labels = np.unique(Y)
  classes = np.random.choice(labels, size = batch_size)

  # Get the number of features in X
  n_features = X.shape[1]

  # Buffers for both sides of each pair
  first = np.zeros((batch_size, n_features), dtype = np.float32)
  second = np.zeros((batch_size, n_features), dtype = np.float32)

  for i in range(batch_size):

    # Get indices of all samples that belong to the chosen class
    choices = np.where(Y == classes[i])[0]

    # Randomly select two samples of the same class
    idx_A = np.random.choice(choices)
    idx_B = np.random.choice(choices)

    # Save the samples to the pair buffers
    first[i] = X[idx_A]
    second[i] = X[idx_B]

  # The classes are returned so the loss can tell positive from negative pairs
  return [first, second, classes]

In [None]:
# PyTorch data utils
def batch_to_torch(pair_batch, device = None):
  """Convert a (x1, x2, y) pair batch into PyTorch tensors.

  Parameters
  ----------
  pair_batch : sequence of three array-likes
    The two sides of the pairs and their class labels.
  device : torch.device or str, optional
    Where to place the tensors. When None the tensors are created on
    PyTorch's default device.

  Returns
  -------
  tuple
    (x1, x2, y) as torch tensors.
  """
  # Unpack the pair batch into individual variables
  x1, x2, y = pair_batch

  # Always convert: previously the arrays were returned untouched when no device
  # was given, so the caller received NumPy arrays instead of tensors.
  x1 = torch.as_tensor(x1, device = device)
  x2 = torch.as_tensor(x2, device = device)
  y = torch.as_tensor(y, device = device)

  return (x1, x2, y)

In [None]:
# NT-Xent loss function
class NTXentLoss(torch.nn.Module):
  """Supervised NT-Xent loss over a square matrix of pairwise scores.

  For every row r the loss is

    -log( sum_{c in P(r)} exp(s[r, c] / T)  /  sum_{c != r} exp(s[r, c] / T) )

  where P(r) are the columns sharing the label of r (r itself excluded) and T
  is the temperature. The result is the mean over rows.
  """
  def __init__(self, temperature = 0.5):
    super(NTXentLoss, self).__init__()
    self.temperature = temperature

  def forward(self, diss, y):
    """Compute the loss.

    Parameters
    ----------
    diss : tensor of shape (size, size)
      Pairwise scores; entries are exponentiated, so larger values count as
      stronger matches.
    y : 1-D tensor of length size / 2
      Labels of one half of the batch; they are duplicated to label all rows.

    Returns
    -------
    Scalar tensor with the mean loss.
    """
    size = diss.shape[0]

    # Both halves of the batch carry the same labels
    y = torch.cat([y, y], dim = 0)

    # Self-comparisons are excluded from numerator and denominator alike
    self_mask = torch.eye(size, dtype = torch.bool, device = diss.device)

    # Mask for positive samples: same label, different position
    pos_mask = (y.unsqueeze(0) == y.unsqueeze(1)) & ~self_mask

    logits = diss / self.temperature
    neg_inf = float("-inf")

    # log of the numerator / denominator sums. logsumexp is used instead of
    # log(sum(exp(.))) so that large scores do not overflow to inf / inf = nan.
    log_nominator = torch.logsumexp(logits.masked_fill(~pos_mask, neg_inf), dim = 1)
    log_denominator = torch.logsumexp(logits.masked_fill(self_mask, neg_inf), dim = 1)

    # -log(nominator / denominator), averaged over rows
    loss = torch.mean(log_denominator - log_nominator)

    return loss

# Projection head
class ProjectionHead(torch.nn.Module):
  """Feed-forward network that scores a pair of inputs from their difference.

  The network is a stack of Linear -> ReLU -> Dropout groups, one per entry of
  `layers`, followed by a Linear layer with a single output unit.

  Parameters
  ----------
  inpt : int
    Number of input features.
  layers : sequence of int
    Width of every hidden layer, in order.
  dropout_rate : float
    Dropout probability used after every hidden layer.
  """
  def __init__(self, inpt, layers, dropout_rate = 0.3):
    super().__init__()

    # Widths seen by consecutive Linear layers: the input first, then each hidden width
    widths = [inpt, *layers]

    stack = []
    for fan_in, fan_out in zip(widths[:-1], widths[1:]):
      stack += [
        torch.nn.Linear(fan_in, fan_out),
        torch.nn.ReLU(),
        torch.nn.Dropout(dropout_rate)
      ]

    # Single-unit output layer on top of the last hidden width
    stack.append(torch.nn.Linear(layers[-1], 1))

    # Wrap everything into one sequential model
    self.projection_head = torch.nn.Sequential(*stack)

  def forward(self, x1, x2):
    """Return the network output for the element-wise difference x1 - x2."""
    delta = x1 - x2
    return self.projection_head(delta)

# Contrastive model
class ContrastiveModel(torch.nn.Module):
  """Wrapper around ProjectionHead with separate training and inference paths.

  In training mode the two input batches are stacked and every row is compared
  with every row, giving a square score matrix. In evaluation mode the inputs
  are compared row by row exactly as given.
  """
  def __init__(self, inpt, layers = [128, 64, 32]):
    super().__init__()
    self.projection_head = ProjectionHead(inpt, layers)

  def forward(self, x1, x2):
    # Inference: score the supplied pairs as they are
    if not self.training:
      return (self.projection_head(x1, x2))

    # Training: build all (row, row) combinations of the stacked batch
    n_rows = 2 * x1.shape[0]
    stacked = torch.cat([x1, x2])

    # `left` cycles through the stacked rows, `right` holds each row n_rows times in a row,
    # so position r * n_rows + c compares stacked[c] against stacked[r]
    left = stacked.tile((n_rows, 1))
    right = stacked.repeat_interleave(n_rows, dim = 0)

    scores = self.projection_head(left, right)

    # One row of the matrix per stacked sample
    return (torch.reshape(scores, (n_rows, -1)))

In [None]:
def train_model(X_train, Y_train, model_id, temperature = 0.5, batch_size = 32,
                learning_rate = 1e-3, layers = [128, 64, 32], iterations = 10000):
  """Train a ContrastiveModel, or restore it from disk when a checkpoint exists.

  Parameters
  ----------
  X_train : array of shape (n_samples, n_features)
    Training features.
  Y_train : array-like of length n_samples
    Training labels.
  model_id : str
    Identifier used to build the checkpoint path "models/model_<model_id>.pth".
  temperature : float
    Temperature of the NT-Xent loss.
  batch_size : int
    Number of pairs sampled per iteration.
  learning_rate : float
    Adam learning rate.
  layers : sequence of int
    Hidden layer widths of the projection head.
  iterations : int
    Number of optimisation steps when training from scratch.

  Returns
  -------
  ContrastiveModel
    The model in evaluation mode with all parameters frozen.
  """
  # Define the model filename based on model_id
  model_filename = f"models/model_{model_id}.pth"
  print(model_filename)

  # Define computation device
  device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

  inpt_features = X_train.shape[1] # Number of input features
  contr_model = ContrastiveModel(inpt_features, layers = layers)

  if os.path.isfile(model_filename):
    # Load pre-trained model. map_location lets a checkpoint written on a GPU
    # machine be restored on a CPU-only one (and vice versa).
    contr_model.load_state_dict(torch.load(model_filename, map_location = device))
    contr_model.to(device)

  else:
    # Train a new model
    contr_model.to(device)

    contr_loss = NTXentLoss(temperature)
    optimizer = torch.optim.Adam(contr_model.parameters(), lr = learning_rate)
    contr_model.train()

    # The running loss is averaged and reset every `report_every` iterations
    report_every = 1000
    train_loss = 0
    for i in range(iterations):

      # Generate a batch of pairs and move to device
      x1, x2, y = batch_to_torch(get_pair_batch(batch_size, X_train, Y_train), device)

      # Zero the gradients
      optimizer.zero_grad()

      # Forward pass
      diss = contr_model(x1, x2)

      # Compute loss and perform a backward pass
      loss = contr_loss(diss, y)
      loss.backward()
      optimizer.step()

      # Update and display the training loss
      train_loss += loss.item()
      steps_in_window = (i + 1) % report_every
      if steps_in_window:
        # Inside a window: overwrite the same console line with the running mean
        print(f"\r{i}: Train loss: {train_loss / steps_in_window:.4f}", end = "\r")
      else:
        # Window complete: print its mean on its own line and start a new window
        print(f"{i + 1}: Train loss: {train_loss / report_every:.4f}")
        train_loss = 0

    # Save the trained model; create the target directory first so that the
    # training effort is not lost to a missing folder
    os.makedirs(os.path.dirname(model_filename), exist_ok = True)
    torch.save(contr_model.state_dict(), model_filename)

  # Freeze the model parameters
  for param in contr_model.parameters():
    param.requires_grad = False

  # Set to evaluation mode
  contr_model.eval()

  return contr_model

In [None]:
# Hold out 30% of the data as the test set.
# The same test samples are used for every method and every training subset.
(X_train_full, X_test_full,
 Y_train_full, Y_test) = sklearn.model_selection.train_test_split(
  X, Y, test_size = 0.3, random_state = seed, stratify = Y)

# Per-subset containers, keyed by the subset index 1..10
X_train, Y_train, X_test = {}, {}, {}

# Seed for reproducibility
np.random.seed(seed)

# Number of samples available for training
total_samples = X_train_full.shape[0]

# Subset i holds (i * 10)% of the training data
for i in range(1, 11):

  if i == 10:
    # The last subset is the complete training set, no sampling required
    X_train[i], Y_train[i] = X_train_full, Y_train_full
  else:
    # Stratified sample with the requested number of rows
    n_samples = math.ceil(i / 10 * total_samples)
    subset = sklearn.model_selection.train_test_split(
      X_train_full, Y_train_full,
      train_size = n_samples,
      random_state = seed,
      stratify = Y_train_full)
    X_train[i], Y_train[i] = subset[0], subset[2]

  # Standardize with statistics of the training subset only, then apply the
  # same transformation to the shared test set
  scaler = sklearn.preprocessing.StandardScaler().fit(X_train[i])
  X_train[i] = scaler.transform(X_train[i])
  X_test[i] = scaler.transform(X_test_full)

# Initialize dictionaries for the augmented dataset
X_train_smote, Y_train_smote = {}, {}

# Perform SMOTE oversampling for each train dataset
for i in range(1, 11):

  # SMOTE looks up k_neighbors neighbours inside each oversampled class, so a class
  # needs at least k_neighbors + 1 samples. The small subsets of an imbalanced dataset
  # can fall below that, hence k is capped by the smallest class (default stays 5).
  smallest_class = int(np.min(np.unique(Y_train[i], return_counts = True)[1]))
  k_neighbors = max(1, min(5, smallest_class - 1))

  sampl = imblearn.over_sampling.SMOTE(random_state = seed, k_neighbors = k_neighbors)
  X_train_smote[i], Y_train_smote[i] = sampl.fit_resample(X_train[i], Y_train[i])

In [None]:
# Trained contrastive models, keyed by the training-subset index
models = {}

# Hyperparameters shared by all subsets
train_settings = dict(
  temperature = 0.5,
  layers = [512, 256, 128],
  iterations = 2000,
  batch_size = 128,
  learning_rate = 1e-4
)

# One model per SMOTE-augmented training subset
for i in range(1, 11):

  # The identifier determines the checkpoint file used by train_model
  model_id = f"k-{i}"

  models[i] = train_model(X_train_smote[i], Y_train_smote[i], model_id = model_id, **train_settings)

In [None]:
# Prototype selection

# Initialize variables
n_prototypes = 5  # K-means centroids kept per class
prototypes_filename = "cache/prototypes.pkl"

# Load prototypes if they already exist
if os.path.isfile(prototypes_filename):
  # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook
  with open(prototypes_filename, "rb") as f:
    prototypes = pickle.load(f)

else:
  # Initialize dictionary to store prototypes
  prototypes = {}

  # Loop through different train datasets to compute prototypes
  for i in range(1, 11):
    print(f"K-fold: {i}")

    # Size of the prototype matrix for this subset
    n_classes = len(np.unique(Y_train_smote[i]))
    n_features = X_train_smote[i].shape[1]
    total_prototypes = n_classes * n_prototypes

    # Rows [c * n_prototypes, (c + 1) * n_prototypes) hold the prototypes of class c
    prototypes[i] = np.zeros((total_prototypes, n_features), dtype = np.float32)

    # Compute prototypes for each class
    for c in range(n_classes):

      # Extract features corresponding to the current class
      X_embedding = X_train_smote[i][Y_train_smote[i] == c]
      start, end = c * n_prototypes, (c + 1) * n_prototypes

      # Compute K-means prototypes
      kmeans = sklearn.cluster.KMeans(n_clusters = n_prototypes, init = "k-means++", random_state = 1234, n_init = "auto").fit(X_embedding)
      centroids = kmeans.cluster_centers_
      prototypes[i][start:end, :] = centroids

  # Save the computed prototypes; create the cache directory first so the
  # clustering work is not lost to a missing folder
  os.makedirs(os.path.dirname(prototypes_filename), exist_ok = True)
  with open(prototypes_filename, "wb") as f:
    pickle.dump(prototypes, f)

In [None]:
# Dissimilarity space

# Initialize variables
diss_space_filename = "cache/dissimilarity_space.pkl"

# Load precomputed dissimilarity space if exists
if os.path.isfile(diss_space_filename):
  # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook
  with open(diss_space_filename, "rb") as f:
    diss_space = pickle.load(f)

else:
  # Initialize dictionary to store the dissimilarity space matrix
  diss_space = {}

  # Loop through different train datasets to compute the dissimilarity space matrix
  for i in range(1, 11):

    # Initialize dictionary to store dissimilarity measures
    diss_space[i] = {}

    # Insert a middle axis so that subtracting the prototype matrix broadcasts to
    # (samples, prototypes, features)
    enc_train = X_train[i][:, np.newaxis, :]
    enc_test = X_test[i][:, np.newaxis, :]

    # Euclidean distance of every sample to every prototype: (samples, prototypes)
    diss_space[i]["train"] = np.linalg.norm(enc_train - prototypes[i], axis = 2)
    diss_space[i]["test"] = np.linalg.norm(enc_test - prototypes[i], axis = 2)

  # Save the generated dissimilarity space; make sure the cache directory exists first
  os.makedirs(os.path.dirname(diss_space_filename), exist_ok = True)
  with open(diss_space_filename, "wb") as f:
    pickle.dump(diss_space, f)

# Classify using the dissimilarity space
np.random.seed(seed)

# RBF-SVM grid explored by the cross-validated search
# NOTE(review): 0.10 and 0.1 are the same value, so that candidate is fitted twice -
# confirm whether 0.01 was intended
svm_parameters = [{
  'kernel': ['rbf'],
  'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
  'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]
}]

# One evaluation per training subset
for i in range(1, 11):
  space = diss_space[i]

  # Candidate classifiers, evaluated in this order
  classifiers = [
    sklearn.model_selection.GridSearchCV(sklearn.svm.SVC(), svm_parameters, cv = 5, scoring = 'f1_macro', verbose = 0),
    sklearn.ensemble.RandomForestClassifier(),
    sklearn.naive_bayes.GaussianNB()
  ]

  # Keep the highest macro F1 reached on the test set by any candidate
  best_f1 = 0
  for clf in classifiers:
    Y_pred = clf.fit(space["train"], Y_train[i]).predict(space["test"])
    best_f1 = max(best_f1, sklearn.metrics.f1_score(Y_test, Y_pred, average = "macro"))

  # Add to the f1 data frame
  f1_scores.loc["Space", f"Fold_{i}"] = best_f1

  print(f"{i} -> {best_f1:.2f}")

In [None]:
# Contrastive dissimilarity space

# Initialize variables
contr_diss_space_filename = "cache/contr_dissimilarity_space.pkl"

# Load precomputed contrastive dissimilarity space if exists
if os.path.isfile(contr_diss_space_filename):
  # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook
  with open(contr_diss_space_filename, "rb") as f:
    contr_diss_space = pickle.load(f)

else:
  # Initialize dictionary to store the contr. dissimilarity space matrix
  contr_diss_space = {}

  # Loop through each fold
  for i in range(1, 11):
    contr_diss_space[i] = {}

    # Run on whichever device holds the model instead of assuming a GPU is present
    device = next(models[i].parameters()).device

    # The pairwise (evaluation) path of the model is required here
    models[i].eval()

    for data_type, data_set in zip(["train", "test"], [X_train, X_test]):
      # Convert data to float32 and fetch the prototypes
      x = np.float32(data_set[i])
      y = prototypes[i]

      # Get dimensions for reshaping
      n_enc = x.shape[0]
      n_prot = y.shape[0]

      # Line up every sample with every prototype: sample 0 against all
      # prototypes, then sample 1 against all prototypes, and so on
      x = np.repeat(x, n_prot, axis = 0)
      y = np.tile(y, [n_enc, 1])

      # Convert to PyTorch tensors and move to the model's device
      x = torch.from_numpy(x).to(device)
      y = torch.from_numpy(y).to(device)

      # Compute dissimilarities; no gradients are needed for feature extraction
      with torch.no_grad():
        diss = models[i](x, y)

      # One row per sample, one column per prototype
      diss = diss.cpu().numpy().reshape(n_enc, n_prot)

      # Store dissimilarities
      contr_diss_space[i][data_type] = diss

  # Save the generated contrastive dissimilarity space; make sure the cache directory exists first
  os.makedirs(os.path.dirname(contr_diss_space_filename), exist_ok = True)
  with open(contr_diss_space_filename, "wb") as f:
    pickle.dump(contr_diss_space, f, -1)

# Classify using the contrastive dissimilarity space
np.random.seed(seed)

# RBF-SVM grid explored by the cross-validated search
# NOTE(review): 0.10 and 0.1 are the same value, so that candidate is fitted twice -
# confirm whether 0.01 was intended
svm_parameters = [{
  'kernel': ['rbf'],
  'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
  'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]
}]

# One evaluation per training subset
for i in range(1, 11):
  space = contr_diss_space[i]

  # Candidate classifiers, evaluated in this order
  classifiers = [
    sklearn.model_selection.GridSearchCV(sklearn.svm.SVC(), svm_parameters, cv = 5, scoring = 'f1_macro', verbose = 0),
    sklearn.ensemble.RandomForestClassifier(),
    sklearn.naive_bayes.GaussianNB()
  ]

  # Keep the highest macro F1 reached on the test set by any candidate
  best_f1 = 0
  for clf in classifiers:
    Y_pred = clf.fit(space["train"], Y_train[i]).predict(space["test"])
    best_f1 = max(best_f1, sklearn.metrics.f1_score(Y_test, Y_pred, average = "macro"))

  # Add to the f1 data frame
  f1_scores.loc["Contr. space", f"Fold_{i}"] = best_f1

  print(f"{i} -> {best_f1:.2f}")

In [None]:
# Dissimilarity vector

# The vectors are rebuilt on every run rather than cached on disk because of their size
diss_vector = {}

# Loop through each fold
for i in range(1, 11):

  # Label of every prototype: each class repeated once per prototype
  prot_Y = np.repeat(np.unique(Y_train[i]), n_prototypes)

  # Number of features per sample, used to flatten the pair tensors below
  n_features = X_train[i].shape[1]

  # Leading axis added so the prototypes broadcast against (samples, 1, features)
  local_kmeans = prototypes[i][np.newaxis, :, :]

  fold = {}

  # Pair labels: True when sample and prototype share the class, False otherwise.
  # Pairs are ordered sample by sample, each sample against all prototypes.
  fold["Y_train"] = np.repeat(Y_train[i], len(prot_Y)) == np.tile(prot_Y, len(Y_train[i]))
  fold["Y_test"] = np.repeat(Y_test, len(prot_Y)) == np.tile(prot_Y, len(Y_test))

  # Pair features: absolute per-feature difference between sample and prototype,
  # flattened to one row per (sample, prototype) pair
  fold["X_train"] = np.abs(X_train[i][:, np.newaxis, :] - local_kmeans).reshape(-1, n_features)
  fold["X_test"] = np.abs(X_test[i][:, np.newaxis, :] - local_kmeans).reshape(-1, n_features)

  diss_vector[i] = fold


# Classify using the dissimilarity vectors
np.random.seed(seed)

# NOTE(review): 0.10 and 0.1 are the same value, so that candidate is fitted twice -
# confirm whether 0.01 was intended
svm_parameters = [
  {'kernel': ['rbf'],
   'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
   'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]}]

# True: average the pair probabilities of a class over its prototypes;
# False: take the class of the single most probable prototype
avg_prototypes = True

# Loop through different train datasets
for i in range(1, 11):

  # Initialize best F1 score
  best_f1 = 0

  # List of classifiers to be used
  classifiers = [
    sklearn.model_selection.GridSearchCV(sklearn.svm.SVC(probability = True), svm_parameters, cv = 5, scoring = 'f1_macro', verbose = 0),
    sklearn.ensemble.RandomForestClassifier(),
    sklearn.naive_bayes.GaussianNB()
  ]

  for clf in classifiers:

    # Fit the model on (sample, prototype) pairs labelled similar / dissimilar
    clf.fit(diss_vector[i]["X_train"], diss_vector[i]["Y_train"])

    # Class probabilities for every test pair. predict_proba is required here:
    # predict returns a 1-D label array, on which the column selection below fails.
    Y_pred = clf.predict_proba(diss_vector[i]["X_test"])

    # Keep the probability of "similar" (second column) and arrange it as one row
    # per test sample, one column per prototype
    Y_pred = np.reshape(Y_pred[:, 1], (Y_test.shape[0], -1))

    # Avg. the prediction for all prototypes
    if avg_prototypes:
      Y_pred = np.reshape(Y_pred, (Y_test.shape[0], -1, n_prototypes))
      Y_pred = np.mean(Y_pred, axis = -1)
      Y_pred = np.argmax(Y_pred, axis = 1)
    # Get the maximum probability as the prediction
    else:
      Y_pred = np.argmax(Y_pred, axis = 1) // n_prototypes

    # Calculate F1 score
    f1 = sklearn.metrics.f1_score(Y_test, Y_pred, average = "macro")

    # Update best F1 score (this step was missing, so 0 was always recorded)
    best_f1 = max(best_f1, f1)

  # Add to the f1 data frame
  f1_scores.loc["Vector", f"Fold_{i}"] = best_f1

  print(f"{i} -> {best_f1:.2f}")

In [None]:
# Contrastive dissimilarity vector

# Initialize variables
contr_diss_vector_filename = "cache/contr_dissimilarity_vector.pkl"

# Load precomputed contrastive dissimilarity vectors if they exist
if os.path.isfile(contr_diss_vector_filename):
  # NOTE: unpickling can execute arbitrary code; only load cache files written by this notebook
  with open(contr_diss_vector_filename, "rb") as f:
    contr_diss_vector = pickle.load(f)

else:

  # Initialize dictionary to store the contr. dissimilarity vectors
  contr_diss_vector = {}

  # Number of patches (stochastic forward passes) per prototype
  number_patches = 50

  # Loop through each fold
  for i in range(1, 11):
    contr_diss_vector[i] = {"X_train": [], "X_test": []}

    # Generate a list of unique classes and replicate it for each prototype
    prot_Y = np.repeat(np.unique(Y_train[i]), n_prototypes)

    # Pair labels: True when sample and prototype share the class, False otherwise
    contr_diss_vector[i]["Y_train"] = np.transpose([np.repeat(Y_train[i], len(prot_Y)), np.tile(prot_Y, len(Y_train[i]))])
    contr_diss_vector[i]["Y_train"] = contr_diss_vector[i]["Y_train"][:,0] == contr_diss_vector[i]["Y_train"][:,1]

    contr_diss_vector[i]["Y_test"] = np.transpose([np.repeat(Y_test, len(prot_Y)), np.tile(prot_Y, len(Y_test))])
    contr_diss_vector[i]["Y_test"] = contr_diss_vector[i]["Y_test"][:,0] == contr_diss_vector[i]["Y_test"][:,1]

    # Run on whichever device holds the model instead of assuming a GPU is present
    device = next(models[i].parameters()).device

    # Enable training mode to activate dropout
    # This way the same input generates slightly different outputs that we treat as "augmentations"
    models[i].train()

    # Number of prototypes
    number_prototypes = prototypes[i].shape[0]

    # Every prototype repeated once per patch. It is the same for all samples of this
    # subset, so it is built once here rather than once per sample.
    local_prototypes = np.repeat(prototypes[i], number_patches, axis = 0)
    local_prototypes = torch.from_numpy(local_prototypes).to(device)

    # Calculate dissimilarity vectors for both training and testing data
    for key, dataset in {"X_train": X_train[i], "X_test": X_test[i]}.items():

      # Loop through each data point in the dataset
      for idx in range(dataset.shape[0]):

        # Repeat the sample so it lines up with every (prototype, patch) row
        local_patches = torch.from_numpy(np.float32(dataset[idx])).to(device)
        local_patches = torch.tile(local_patches, [number_prototypes * number_patches, 1])

        # Score the pairs with the projection head. A dedicated local name is used so the
        # global `diss_vector` dictionary built by the previous cell is not overwritten.
        with torch.no_grad():
          pair_diss = models[i].projection_head(local_patches, local_prototypes)

        # One row per prototype, one column per patch
        pair_diss = pair_diss.cpu().numpy().reshape(number_prototypes, number_patches)

        contr_diss_vector[i][key].append(pair_diss)

    # Reshape to match the labels
    contr_diss_vector[i]["X_train"] = np.reshape(contr_diss_vector[i]["X_train"], (len(Y_train[i]) * number_prototypes, number_patches))
    contr_diss_vector[i]["X_test"] = np.reshape(contr_diss_vector[i]["X_test"], (len(Y_test) * number_prototypes, number_patches))

    # Switch back to evaluation mode
    models[i].eval()

  # Save the generated contrastive dissimilarity vectors; make sure the cache directory exists first
  os.makedirs(os.path.dirname(contr_diss_vector_filename), exist_ok = True)
  with open(contr_diss_vector_filename, "wb") as f:
    pickle.dump(contr_diss_vector, f, -1)

# Classify using the contrastive dissimilarity vectors
np.random.seed(seed)

# RBF-SVM grid explored by the cross-validated search
# NOTE(review): 0.10 and 0.1 are the same value, so that candidate is fitted twice -
# confirm whether 0.01 was intended
svm_parameters = [{
  'kernel': ['rbf'],
  'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
  'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]
}]

# True: average the pair probabilities of a class over its prototypes;
# False: take the class of the single most probable prototype
avg_prototypes = True

# One evaluation per training subset
for i in range(1, 11):
  vectors = contr_diss_vector[i]
  n_test = Y_test.shape[0]

  # Candidate classifiers, evaluated in this order
  classifiers = [
    sklearn.model_selection.GridSearchCV(sklearn.svm.SVC(probability = True), svm_parameters, cv = 3, scoring = 'f1_macro', verbose = 0),
    sklearn.ensemble.RandomForestClassifier(),
    sklearn.naive_bayes.GaussianNB()
  ]

  # Keep the highest macro F1 reached on the test set by any candidate
  best_f1 = 0
  for clf in classifiers:

    # Train on (sample, prototype) pairs labelled similar / dissimilar
    clf.fit(vectors["X_train"], vectors["Y_train"])

    # Probability of "similar" (second column), one row per test sample and
    # one column per prototype
    pair_proba = clf.predict_proba(vectors["X_test"])[:, 1]
    Y_pred = np.reshape(pair_proba, (n_test, -1))

    if not avg_prototypes:
      # Class owning the single most probable prototype
      Y_pred = np.argmax(Y_pred, axis = 1) // n_prototypes
    else:
      # Mean probability over the prototypes of each class, then the best class
      per_class = np.reshape(Y_pred, (n_test, -1, n_prototypes))
      Y_pred = np.argmax(np.mean(per_class, axis = -1), axis = 1)

    best_f1 = max(best_f1, sklearn.metrics.f1_score(Y_test, Y_pred, average = "macro"))

  # Add to the f1 data frame
  f1_scores.loc["Contr. vector", f"Fold_{i}"] = best_f1

  print(f"{i} -> {best_f1:.2f}")

In [None]:
# SVM Classification
np.random.seed(seed)

# NOTE(review): 0.10 and 0.1 are the same value, so that candidate is fitted twice -
# confirm whether 0.01 was intended
svm_parameters = [
  {'kernel': ['rbf'],
   'gamma': [1, 1e-1, 1e-2, 1e-3, 1e-4, 1e-5],
   'C': [0.001, 0.10, 0.1, 10, 25, 50, 100, 1000]}]

for i in range(1, 11):

  # The same procedure runs on the original subset first and then on its
  # SMOTE-augmented counterpart; only the training data and the result row differ
  variants = [
    ("SVM", X_train[i], Y_train[i]),
    ("SVM (smote)", X_train_smote[i], Y_train_smote[i])
  ]

  for row_name, X_fit, Y_fit in variants:

    # Grid search with cross-validation over a fresh SVM classifier
    clf = sklearn.model_selection.GridSearchCV(sklearn.svm.SVC(), svm_parameters, cv = 5, scoring = 'f1_macro', verbose = 0)

    # Fit the model
    clf.fit(X_fit, Y_fit)

    # Make predictions using the best estimator found by the search
    Y_pred = clf.best_estimator_.predict(X_test[i])

    f1 = sklearn.metrics.f1_score(Y_test, Y_pred, average = "macro")

    # Add to the f1 data frame
    f1_scores.loc[row_name, f"Fold_{i}"] = f1

    print(f"{i} -> {f1:.2f}")

In [None]:
# Random Forest Classification
np.random.seed(1234)

for i in range(1, 11):

  # The same procedure runs on the original subset first and then on its
  # SMOTE-augmented counterpart; only the training data and the result row differ
  variants = [
    ("RF", X_train[i], Y_train[i]),
    ("RF (smote)", X_train_smote[i], Y_train_smote[i])
  ]

  for row_name, X_fit, Y_fit in variants:

    # Initialize Random Forest classifier
    clf = sklearn.ensemble.RandomForestClassifier()

    # Fit the model
    clf.fit(X_fit, Y_fit)

    # Make predictions on the shared test set
    Y_pred = clf.predict(X_test[i])

    f1 = sklearn.metrics.f1_score(Y_test, Y_pred, average = "macro")

    # Add to the f1 data frame
    f1_scores.loc[row_name, f"Fold_{i}"] = f1

    print(f"{i} -> {f1:.2f}")

In [None]:
# Naive Bayes Classification
np.random.seed(1234)

for i in range(1, 11):

  # The same procedure runs on the original subset first and then on its
  # SMOTE-augmented counterpart; only the training data and the result row differ
  variants = [
    ("NB", X_train[i], Y_train[i]),
    ("NB (smote)", X_train_smote[i], Y_train_smote[i])
  ]

  for row_name, X_fit, Y_fit in variants:

    # Initialize Gaussian Naive Bayes classifier
    clf = sklearn.naive_bayes.GaussianNB()

    # Fit the model
    clf.fit(X_fit, Y_fit)

    # Make predictions on the shared test set
    Y_pred = clf.predict(X_test[i])

    f1 = sklearn.metrics.f1_score(Y_test, Y_pred, average = "macro")

    # Add to the f1 data frame
    f1_scores.loc[row_name, f"Fold_{i}"] = f1

    print(f"{i} -> {f1:.2f}")

In [None]:
# Report, for every fold, the highest F1 score and the method(s) that reached it
f1_scores = f1_scores.astype(float).round(2)
for fold in (f"Fold_{i}" for i in range(1, 11)):
  column = f1_scores[fold]
  best_f1 = column.max()

  # Every method tied at the maximum (after rounding) is listed
  max_rows = column[column == best_f1].index.tolist()
  print(f"For {fold}, the max F1 score is {best_f1:.2f}, achieved by: {max_rows}")

In [None]:
# Summary rows: per-fold maximum over the three traditional classifiers,
# without and with SMOTE
summary_rows = (
  ("Traditional", ["SVM", "RF", "NB"]),
  ("Traditional (smote)", ["SVM (smote)", "RF (smote)", "NB (smote)"])
)
for summary_name, row_names in summary_rows:
  f1_scores.loc[summary_name] = f1_scores.loc[row_names].max()

# Render every row of the table as a LaTeX tabular line and print it
f1_tbl = [
  row.name + ' & ' + ' & '.join(row.astype(str)) + r' \\ \hline'
  for _, row in f1_scores.iterrows()
]
for row in f1_tbl:
  print(row)

In [None]:
# Silhouette score of the training samples in the contrastive dissimilarity space,
# one value per training subset
sil_scores = [
  sklearn.metrics.silhouette_score(contr_diss_space[i]["train"], Y_train[i])
  for i in range(1, 11)
]

# Mean and standard deviation across the subsets, rounded to two decimals
avg_sil = round(np.mean(sil_scores), 2)
sd_sil = round(np.std(sil_scores), 2)

print(f"{avg_sil:.2f} ({sd_sil:.2f})")