install unet

In [3]:
!pip install segmentation-models-pytorch

Collecting segmentation-models-pytorch
  Downloading segmentation_models_pytorch-0.5.0-py3-none-any.whl.metadata (17 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.4.127 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cuda_nvrtc_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-runtime-cu12==12.4.127 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cuda_runtime_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.5 kB)
Collecting nvidia-cuda-cupti-cu12==12.4.127 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cuda_cupti_cu12-12.4.127-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cudnn-cu12==9.1.0.70 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cudnn_cu12-9.1.0.70-py3-none-manylinux2014_x86_64.whl.metadata (1.6 kB)
Collecting nvidia-cublas-cu12==12.4.5.8 (from torch>=1.8->segmentation-models-pytorch)
  Downloading nvidia_cublas_cu12-12.4.5.8-

Imports

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from torchvision import transforms
import segmentation_models_pytorch as smp
from PIL import Image, ImageOps
import numpy as np
from sklearn.model_selection import train_test_split
import os
import matplotlib.pyplot as plt

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


model parameters

In [27]:
image_size = 512
epochs = 100
batch_size = 2
loss = 'loss'
lr = 0.0001

image and mask location

In [None]:
# need to build this dataset with ~20 images
image_folder = 'chair_person_dataset/images'
mask_folder = 'chair_person_dataset/labels_as_images'


Segmentation Dataset Class

In [22]:
class SegmentationDataset(Dataset):
    def __init__(self, image_mask_dict, common_transform=None, image_transform=None, mask_transform=None):
        self.image_mask_dict = image_mask_dict
        self.common_transform = common_transform
        self.image_transform = image_transform
        self.mask_transform = mask_transform

    def __len__(self):
        return len(self.image_mask_dict)

    def __getitem__(self, idx):
        img_name = list(self.image_mask_dict.keys())[idx]
        img, mask = self.image_mask_dict[img_name]
        if self.common_transform:
            img = self.common_transform(img)
            mask = self.common_transform(mask)

        if self.image_transform:
            img = self.image_transform(img)
        if self.mask_transform:
            mask = self.mask_transform(mask)

        return img, mask


Transforms for data augmentation

In [28]:
image_size_tuple = (image_size,) * 2

common_transform = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5), # flips on medial axis for L -> R etc.
    transforms.RandomRotation(degrees=15), # accounts for angle differences between images
    transforms.Resize(image_size_tuple), # make sure all the same size
])

image_transform = transforms.Compose([
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.1), # account for lighting changes
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])

mask_transform = transforms.Compose([
    transforms.ToTensor()
])

test_transform = transforms.Compose([
    transforms.Resize(image_size_tuple),
    transforms.ToTensor()
])


Creation of the image, mask dictionary

In [25]:
image_mask_dict = {}

for img_name in os.listdir(image_folder):

  if not img_name.lower().endswith((".png", ".jpg", ".JPEG", ".jpeg")):
        continue

  img_path = os.path.join(image_folder, img_name)
  mask_name = os.path.splitext(img_name)[0] + ".png"
  print(mask_name)
  mask_path = os.path.join(mask_folder, mask_name)

  if not os.path.exists(mask_path):
        print(f"Mask not found for {img_name}, skipping.")
        continue

  img = Image.open(img_path)
  img = ImageOps.exif_transpose(img)
  mask = Image.open(mask_path)

  image_mask_dict[img_name] = (img, mask)

  if img.size != mask.size:
    print(f"mask and image size don't match for {img_name}")


  print(f"{img_name} - Image size: {img.size}, {mask_name} - Mask size: {mask.size}")

sub-001.png
sub-001.JPEG - Image size: (3024, 4032), sub-001.png - Mask size: (3024, 4032)
sub-002.png
sub-002.JPEG - Image size: (3024, 4032), sub-002.png - Mask size: (3024, 4032)
sub-003.png
sub-003.JPEG - Image size: (3024, 4032), sub-003.png - Mask size: (3024, 4032)
sub-004.png
sub-004.JPEG - Image size: (3024, 4032), sub-004.png - Mask size: (3024, 4032)
sub-005.png
sub-005.jpg - Image size: (4284, 5712), sub-005.png - Mask size: (4284, 5712)
sub-006.png
sub-006.jpeg - Image size: (4284, 5712), sub-006.png - Mask size: (4284, 5712)
sub-007.png
sub-007.jpeg - Image size: (4284, 5712), sub-007.png - Mask size: (4284, 5712)
sub-008.png
sub-008.jpeg - Image size: (4284, 5712), sub-008.png - Mask size: (4284, 5712)


Split int train and val

In [29]:
items = list(image_mask_dict.items())

# 80-20 split
train_items, val_items = train_test_split(items, test_size=0.2, random_state=42)

train_dict = dict(train_items)
val_dict = dict(val_items)

creation of dataset with the segmentation class

In [30]:
train_dataset = SegmentationDataset(
    image_mask_dict=train_dict,
    common_transform=common_transform,
    image_transform=image_transform,
    mask_transform=mask_transform
)

val_dataset = SegmentationDataset(
    image_mask_dict=val_dict,
    common_transform=common_transform,
    image_transform=image_transform,
    mask_transform=mask_transform
)

train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False)

Define the basic unet model

In [31]:
model = smp.Unet(
    encoder_name="resnet34",
    encoder_weights="imagenet",
    in_channels=3,  # for rgb
    classes=1,  # outputs binary mask
)

The secret `HF_TOKEN` does not exist in your Colab secrets.
To authenticate with the Hugging Face Hub, create a token in your settings tab (https://huggingface.co/settings/tokens), set it as secret in your Google Colab and restart your session.
You will be able to reuse this secret in all of your notebooks.
Please note that authentication is recommended but still optional to access public models or datasets.


config.json:   0%|          | 0.00/156 [00:00<?, ?B/s]

model.safetensors:   0%|          | 0.00/87.3M [00:00<?, ?B/s]

loss, optimizer, etc.

In [32]:
if loss == 'loss':
  criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
best_val_loss = float('inf')
best_model = None

In [33]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu") # I used a T4 gpu on colab

# move model to gpu
model = model.to(device)

training and validation loop

In [34]:
for epoch in range(epochs):
    model.train()
    running_loss = 0.0

    # training
    for images, masks in train_loader:
        images = images.to(device)
        masks = masks.to(device)

        outputs = model(images)
        loss = criterion(outputs, masks)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

    avg_train_loss = running_loss / len(train_loader)
    print(f"Epoch {epoch+1}, Train Loss: {avg_train_loss:.4f}")

    # validation
    model.eval()
    val_loss = 0.0

    with torch.no_grad():
        for val_images, val_masks in val_loader:
            val_images = val_images.cuda()
            val_masks = val_masks.cuda()

            val_outputs = model(val_images)
            loss = criterion(val_outputs, val_masks)
            val_loss += loss.item()

    avg_val_loss = val_loss / len(val_loader)
    print(f"Epoch {epoch+1}, Val Loss: {avg_val_loss:.4f}\n")

    # compare with the best loss
    if avg_val_loss < best_val_loss:
        best_val_loss = avg_val_loss
        best_model = model.state_dict()
        print("val loss improved, saving the model...")

# save the best model
if best_model:
    torch.save(best_model, 'best_model.pth')
    print("saving the best model")


Epoch 1, Train Loss: 0.6873
Epoch 1, Val Loss: 0.7454

val loss improved, saving the model...
Epoch 2, Train Loss: 0.6185
Epoch 2, Val Loss: 0.6913

val loss improved, saving the model...
Epoch 3, Train Loss: 0.5606
Epoch 3, Val Loss: 0.6544

val loss improved, saving the model...
Epoch 4, Train Loss: 0.5388
Epoch 4, Val Loss: 0.6021

val loss improved, saving the model...
Epoch 5, Train Loss: 0.4826
Epoch 5, Val Loss: 0.6284

Epoch 6, Train Loss: 0.4548
Epoch 6, Val Loss: 0.6061

Epoch 7, Train Loss: 0.4263
Epoch 7, Val Loss: 0.5625

val loss improved, saving the model...
Epoch 8, Train Loss: 0.4138
Epoch 8, Val Loss: 0.5173

val loss improved, saving the model...
Epoch 9, Train Loss: 0.4085
Epoch 9, Val Loss: 0.5209

Epoch 10, Train Loss: 0.3780
Epoch 10, Val Loss: 0.4310

val loss improved, saving the model...
Epoch 11, Train Loss: 0.4146
Epoch 11, Val Loss: 0.4016

val loss improved, saving the model...
Epoch 12, Train Loss: 0.3466
Epoch 12, Val Loss: 0.3749

val loss improved, sav

Test the model and visualizate outputs

In [38]:

def segment_person(image):
    model.eval()

    # apply resizing and create a tensor, move to gpu
    image_tensor = test_transform(image).unsqueeze(0).to(device)

    with torch.no_grad():
        output = model(image_tensor)

    # produce binary mask, move back to cpu
    output_mask = torch.sigmoid(output).squeeze().cpu().numpy() > 0.5

    return output_mask

test_image = Image.open("")
segmentation_mask = segment_person(test_image)

plt.figure(figsize=(10,5))
plt.subplot(1, 2, 1)
plt.title("Original Image")
plt.imshow(test_image)

plt.subplot(1, 2, 2)
plt.title("Predicted Mask")
plt.imshow(segmentation_mask, cmap="gray")
plt.show()


AttributeError: 'str' object has no attribute 'read'