# ECE 8803 - MaxViT Implementation

Installing Libraries

In [3]:
!pip install datasets timm

Collecting datasets
  Downloading datasets-3.5.0-py3-none-any.whl.metadata (19 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Downloading xxhash-3.5.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess<0.70.17 (from datasets)
  Downloading multiprocess-0.70.16-py311-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.12.0,>=2023.1.0 (from fsspec[http]<=2024.12.0,>=2023.1.0->datasets)
  Downloading fsspec-2024.12.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch->timm)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch->timm)
  Downloa

In [2]:
# Fetch the "biomarker_detection" configuration of the OLIVES dataset from the
# Hugging Face Hub, keeping the downloaded files under a local cache directory.
from datasets import load_dataset

ds = load_dataset(
    "gOLIVES/OLIVES_Dataset",
    "biomarker_detection",
    cache_dir="./scratch/huggingface_datasets",
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/32 [00:00<?, ?it/s]

Loading dataset shards:   0%|          | 0/38 [00:00<?, ?it/s]

Image Pre-Processing

In [3]:
import numpy as np
from PIL import Image

# Define a function to adjust the contrast and brightness of a PIL image
def adjust_image(image, contrast_factor=1.2, brightness_offset=30):
    """
    Apply a linear contrast/brightness change to an image.

    Each pixel value p is mapped to ``p * contrast_factor + brightness_offset``.
    The result is limited to the 8-bit range [0, 255] and cast to uint8
    (the cast discards any fractional part).

    Parameters:
        image (PIL.Image.Image): Input image, expected in mode 'L' (grayscale).
            Anything ``np.array`` can convert is accepted.
        contrast_factor (float): Gain applied to every pixel.
        brightness_offset (float): Constant added after the gain.

    Returns:
        np.ndarray: The adjusted pixels, dtype uint8, same shape as the input.
    """
    # Work in float32 so the arithmetic cannot wrap around like uint8 would
    pixels = np.array(image).astype(np.float32)

    # Gain first, then offset
    scaled = pixels * contrast_factor
    shifted = scaled + brightness_offset

    # Saturate to the displayable range and go back to 8-bit
    return np.clip(shifted, 0, 255).astype(np.uint8)


Training split (~95% of overall dataset size)

In [4]:
import itertools
import pandas as pd
import numpy as np
from IPython.display import display

# Materialise the training split as a pandas DataFrame
df = pd.DataFrame(list(ds["train"]))

# Keep only the rows in which all six biomarker labels (B1 - B6) are present,
# so that the images and labels extracted below share the same index.
df_clean = df.dropna(subset=["B1", "B2", "B3", "B4", "B5", "B6"])

# Targets: the six biomarker columns
y = df_clean[["B1", "B2", "B3", "B4", "B5", "B6"]]

# Features: every entry of the 'Image' column, contrast/brightness-adjusted
# and returned by adjust_image as a uint8 NumPy array
X = df_clean["Image"].apply(
    lambda img: adjust_image(img, contrast_factor=1.2, brightness_offset=20)
)

# Preview the first few cleaned samples
print("Cleaned Feature (X) sample (as NumPy arrays):", X.head(), sep="\n")
print("\nCleaned Target (y) sample:", y.head(), sep="\n")

# Sanity-check the label value range
print("Minimum value in y:", y.to_numpy().min())
print("Maximum value in y:", y.to_numpy().max())

# Number of images and number of label rows (should be equal)
print(len(X), len(y), sep="\n")

# Sample counts noted during experiments, written as "<count> at <size>"
# (presumably rows kept after dropna vs. rows loaded - TODO confirm):
# 6468 at 30000
# 8428 at 40000
# 11613 at 50000
# 17591 at 78822

Cleaned Feature (X) sample (as NumPy arrays):
0    [[20, 39, 29, 20, 23, 22, 27, 36, 28, 20, 20, ...
1    [[20, 39, 29, 20, 23, 22, 27, 36, 28, 20, 20, ...
2    [[24, 21, 21, 34, 48, 24, 20, 20, 28, 51, 33, ...
3    [[24, 21, 21, 34, 48, 24, 20, 20, 28, 51, 33, ...
4    [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, ...
Name: Image, dtype: object

Cleaned Target (y) sample:
    B1   B2   B3   B4   B5   B6
0  1.0  0.0  1.0  1.0  0.0  1.0
1  1.0  0.0  1.0  1.0  0.0  1.0
2  1.0  0.0  1.0  1.0  0.0  0.0
3  1.0  0.0  1.0  1.0  0.0  0.0
4  1.0  0.0  1.0  0.0  0.0  0.0
Minimum value in y: 0.0
Maximum value in y: 1.0
17591
17591


Test split (~5% of overall dataset size)

In [5]:
import itertools
import pandas as pd
import numpy as np
from IPython.display import display

# Materialise the test split as a pandas DataFrame
df_test = pd.DataFrame(list(ds["test"]))

# Keep only the rows in which all six biomarker labels (B1 - B6) are present
df_test_clean = df_test.dropna(subset=["B1", "B2", "B3", "B4", "B5", "B6"])

# Targets: the six biomarker columns
y_test = df_test_clean[["B1", "B2", "B3", "B4", "B5", "B6"]]

# Features: every entry of the 'Image' column, run through adjust_image with
# the same contrast/brightness settings as the training split, so each entry
# becomes a uint8 NumPy array
X_test = df_test_clean["Image"].apply(
    lambda img: adjust_image(img, contrast_factor=1.2, brightness_offset=20)
)

# Preview the first few processed images and their targets
print("Test Feature (X_test) sample (as NumPy arrays):", X_test.head(), sep="\n")
print("\nTest Target (y_test) sample:", y_test.head(), sep="\n")

# Display the range of values in y_test
print("\nRange of test targets:")
print("Minimum value in y:", y_test.to_numpy().min())
print("Maximum value in y:", y_test.to_numpy().max())


Test Feature (X_test) sample (as NumPy arrays):
0    [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, ...
1    [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, ...
2    [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, ...
3    [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, ...
4    [[20, 20, 20, 20, 20, 20, 20, 20, 20, 20, 20, ...
Name: Image, dtype: object

Test Target (y_test) sample:
    B1   B2   B3   B4   B5   B6
0  0.0  0.0  1.0  1.0  0.0  0.0
1  0.0  0.0  1.0  0.0  0.0  0.0
2  0.0  0.0  1.0  0.0  0.0  0.0
3  0.0  0.0  1.0  0.0  0.0  0.0
4  0.0  0.0  1.0  0.0  0.0  0.0

Range of test targets:
Minimum value in y: 0.0
Maximum value in y: 1.0


Image Transformations for MaxViT

In [None]:
from torchvision import transforms

# Preprocessing applied to every PIL image before it is fed to the network:
# resize to a fixed 384x384, convert to a tensor, then normalise each of the
# three channels with mean 0.5 and std 0.5.
maxvit_transform = transforms.Compose(
    [
        transforms.Resize((384, 384)),
        transforms.ToTensor(),
        transforms.Normalize(mean=[0.5] * 3, std=[0.5] * 3),
    ]
)

Dataset class definition

In [None]:
import numpy as np
import torch
from torch.utils.data import Dataset, DataLoader

# Define a custom dataset class to manage image data and labels for biomarker detection
class BiomarkerDataset(Dataset):
    """
    Map-style dataset pairing pre-processed images with their biomarker labels.

    Parameters:
        X (pd.Series): One image per entry; each entry must support
            ``.astype(np.uint8)`` (e.g. a NumPy array).
        y (pd.DataFrame): One row of numeric labels per image.
        transform (callable, optional): Applied to the RGB PIL image before it
            is returned. If None, the PIL image itself is returned.

    Raises:
        ValueError: If X and y do not contain the same number of samples.
    """

    def __init__(self, X, y, transform=None):
        # Fail early: a length mismatch would otherwise either silently ignore
        # trailing images or surface as an IndexError in the middle of iteration.
        if len(X) != len(y):
            raise ValueError(
                f"X and y must have the same number of samples "
                f"(got {len(X)} and {len(y)})"
            )

        # Drop the original (post-dropna, possibly gapped) index so that
        # positional lookups in __getitem__ line up for X and y.
        self.X = X.reset_index(drop=True)
        self.y = y.reset_index(drop=True)
        self.transform = transform

    # Return the total number of samples in the dataset
    def __len__(self):
        return len(self.y)

    # Retrieve a specific image and its label using its index
    def __getitem__(self, idx):

        # Get the image data and convert it to a 3-channel PIL image
        img = self.X.iloc[idx]
        img = Image.fromarray(img.astype(np.uint8)).convert("RGB")

        # Apply the transformation, if one was supplied
        if self.transform is not None:
            img = self.transform(img)

        # Label row as a float32 tensor
        label = torch.from_numpy(self.y.iloc[idx].to_numpy(dtype=np.float32))

        # Return the image and its label
        return img, label

Display all the available variations of the MaxViT architecture

In [4]:
import timm
# Collect the names of all timm models matching the '*maxvit*' wildcard and print them.
# NOTE(review): the variable is called "pretrained_...", but no `pretrained=`
# filter is passed to list_models here - confirm whether the intent was to list
# every architecture or only those with pretrained weights available.
pretrained_maxvit_models = timm.list_models('*maxvit*')
print(pretrained_maxvit_models)

['maxvit_base_tf_224', 'maxvit_base_tf_384', 'maxvit_base_tf_512', 'maxvit_large_tf_224', 'maxvit_large_tf_384', 'maxvit_large_tf_512', 'maxvit_nano_rw_256', 'maxvit_pico_rw_256', 'maxvit_rmlp_base_rw_224', 'maxvit_rmlp_base_rw_384', 'maxvit_rmlp_nano_rw_256', 'maxvit_rmlp_pico_rw_256', 'maxvit_rmlp_small_rw_224', 'maxvit_rmlp_small_rw_256', 'maxvit_rmlp_tiny_rw_256', 'maxvit_small_tf_224', 'maxvit_small_tf_384', 'maxvit_small_tf_512', 'maxvit_tiny_pm_256', 'maxvit_tiny_rw_224', 'maxvit_tiny_rw_256', 'maxvit_tiny_tf_224', 'maxvit_tiny_tf_384', 'maxvit_tiny_tf_512', 'maxvit_xlarge_tf_224', 'maxvit_xlarge_tf_384', 'maxvit_xlarge_tf_512']


Loading a pretrained MaxViT network

In [9]:
import timm
import torch
import torch.nn as nn
import torch.optim as optim

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print(device)

# Build 'maxvit_tiny_tf_384' with pretrained weights and a 6-output
# classification head, then move it to the selected device.
# The last expression is left bare so the notebook displays the model summary.
model = timm.create_model('maxvit_tiny_tf_384', pretrained=True, num_classes=6)
model.to(device)

cuda


MaxxVit(
  (stem): Stem(
    (conv1): Conv2dSame(3, 64, kernel_size=(3, 3), stride=(2, 2))
    (norm1): BatchNormAct2d(
      64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
      (drop): Identity()
      (act): GELUTanh()
    )
    (conv2): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
  )
  (stages): Sequential(
    (0): MaxxVitStage(
      (blocks): Sequential(
        (0): MaxxVitBlock(
          (conv): MbConvBlock(
            (shortcut): Downsample2d(
              (pool): AvgPool2dSame(kernel_size=(2, 2), stride=(2, 2), padding=(0, 0))
              (expand): Identity()
            )
            (pre_norm): BatchNormAct2d(
              64, eps=0.001, momentum=0.1, affine=True, track_running_stats=True
              (drop): Identity()
              (act): Identity()
            )
            (down): Identity()
            (conv1_1x1): Conv2d(64, 256, kernel_size=(1, 1), stride=(1, 1), bias=False)
            (norm1): BatchNormAct2d(
      

Initial Evaluation

In [None]:
# Baseline: score the model on the test set before any fine-tuning
from sklearn.metrics import f1_score
import numpy as np

# NOTE(review): `testloader` is not created in any cell visible in this
# notebook - it must already exist in the kernel for this cell to run.
model.eval()
batch_preds, batch_labels = [], []

# No gradients are needed while evaluating
with torch.no_grad():
    for inputs, targets in testloader:
        inputs = inputs.to(device)
        # Expand single-channel batches to three channels
        if inputs.shape[1] == 1:
            inputs = inputs.repeat(1, 3, 1, 1)
        targets = targets.to(device).float()

        # Logits -> probabilities -> boolean decisions at a 0.5 threshold
        logits = model(inputs)
        decisions = torch.sigmoid(logits) > 0.5

        batch_preds.append(decisions.cpu().numpy())
        batch_labels.append(targets.cpu().numpy())

# Stack the per-batch arrays into (num_samples, num_labels) matrices
all_preds = np.concatenate(batch_preds, axis=0)
all_labels = np.concatenate(batch_labels, axis=0)

# Macro average over labels, plus the individual per-label scores
macro_f1_before = f1_score(all_labels, all_preds, average='macro')
per_class_f1_before = f1_score(all_labels, all_preds, average=None)

print(f"Pre-training Test macro-F1: {macro_f1_before:.4f}")
for i, f1c in enumerate(per_class_f1_before, 1):
    print(f"  Class B{i} F1: {f1c:.4f}")



#------maxvit_tiny_tf_384 output------
#Pre-training Test macro-F1: 0.2066
#  Class B1 F1: 0.4669
#  Class B2 F1: 0.1401
#  Class B3 F1: 0.1890
#  Class B4 F1: 0.1411
#  Class B5 F1: 0.0861
#  Class B6 F1: 0.2166

#------maxvit_base_tf_384 output------
#Pre-training Test macro-F1: 0.1881
#  Class B1 F1: 0.4518
#  Class B2 F1: 0.0178
#  Class B3 F1: 0.2916
#  Class B4 F1: 0.3038
#  Class B5 F1: 0.0639
#  Class B6 F1: 0.0000

Pre-training Test macro-F1: 0.2693
  Class B1 F1: 0.4563
  Class B2 F1: 0.2599
  Class B3 F1: 0.6097
  Class B4 F1: 0.2860
  Class B5 F1: 0.0000
  Class B6 F1: 0.0037


Training Loop

In [None]:
'''import timm
import torch
import torch.nn as nn
import torch.optim as optim

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = timm.create_model('maxvit_tiny_tf_384', pretrained=True, num_classes=6)
model.to(device)'''

# Loss on raw logits against the 0/1 label vectors; Adam over every model parameter
criterion = nn.BCEWithLogitsLoss()
optimizer = optim.Adam(model.parameters(), lr=1e-4)

# --- Experimenting with data augmentation techniques when training data
# --- to increase diversity and robustness
# Data augmentation
'''maxvit_transform = transforms.Compose([
    transforms.Resize((384, 384)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1),
    transforms.ToTensor(),
    transforms.Normalize(
        mean=[0.5, 0.5, 0.5],
        std=[0.5, 0.5, 0.5],
    ),
])'''

# Fine-tuning loop
# NOTE(review): `trainloader` is not created in any cell visible in this
# notebook - it must already exist in the kernel for this cell to run.
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    loss_sum = 0.0  # sum of per-sample losses seen in this epoch

    for images, labels in trainloader:
        images, labels = images.to(device), labels.to(device)

        # Standard step: clear gradients, forward, backward, update
        optimizer.zero_grad()
        batch_loss = criterion(model(images), labels)
        batch_loss.backward()
        optimizer.step()

        # Weight the batch-mean loss by the batch size so that the epoch
        # figure below is an average per sample
        loss_sum += batch_loss.item() * images.shape[0]

    epoch_loss = loss_sum / len(trainloader.dataset)
    print(f"[Epoch {epoch+1}/{num_epochs}] Loss: {epoch_loss:.4f}")

[Epoch 1/10] Loss: 0.2716
[Epoch 2/10] Loss: 0.1266
[Epoch 3/10] Loss: 0.0595
[Epoch 4/10] Loss: 0.0283
[Epoch 5/10] Loss: 0.0204
[Epoch 6/10] Loss: 0.0211
[Epoch 7/10] Loss: 0.0135
[Epoch 8/10] Loss: 0.0173
[Epoch 9/10] Loss: 0.0122
[Epoch 10/10] Loss: 0.0095


In [12]:
# Score the fine-tuned model on the test set

# NOTE(review): relies on `testloader`, `f1_score`, `np`, `model` and `device`
# already existing in the kernel from earlier cells.
model.eval()
batch_preds, batch_labels = [], []

# No gradients are needed while evaluating
with torch.no_grad():
    for inputs, targets in testloader:
        inputs = inputs.to(device)
        # Expand single-channel batches to three channels
        if inputs.shape[1] == 1:
            inputs = inputs.repeat(1, 3, 1, 1)
        targets = targets.to(device).float()

        # Logits -> probabilities -> boolean decisions at a 0.5 threshold
        logits = model(inputs)
        decisions = torch.sigmoid(logits) > 0.5

        batch_preds.append(decisions.cpu().numpy())
        batch_labels.append(targets.cpu().numpy())

# Stack the per-batch arrays into (num_samples, num_labels) matrices
all_preds = np.concatenate(batch_preds, axis=0)
all_labels = np.concatenate(batch_labels, axis=0)

# Macro average over labels, plus the individual per-label scores
macro_f1_after = f1_score(all_labels, all_preds, average='macro')
per_class_f1_after = f1_score(all_labels, all_preds, average=None)

print(f"Post-training Test macro-F1: {macro_f1_after:.4f}")
for i, f1c in enumerate(per_class_f1_after, 1):
    print(f"  Class B{i} F1: {f1c:.4f}")

# macro-F1 simple cnn: 0.3939
# macro-F1 at maxvit_tiny_rw_224: 0.6235
# macro-F1 at maxvit_tiny_tf_384: 0.6662 at 40k samples

Post-training Test macro-F1: 0.6468
  Class B1 F1: 0.7202
  Class B2 F1: 0.6170
  Class B3 F1: 0.8193
  Class B4 F1: 0.5792
  Class B5 F1: 0.4875
  Class B6 F1: 0.6574
