# 10. Approaching Image Classification and Segmentation



In [1]:
import os
from dotenv import load_dotenv
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import torch
import torch.functional as F
import torch.nn as nn
from PIL import Image, ImageFile
from sklearn import ensemble, metrics, model_selection
from tqdm import tqdm

# Sample ratio: fraction of the training CSV kept when sub-sampling.
M = 0.2

# Switch plotting to seaborn's default theme.
sns.set_theme()

# Read settings from a local .env file, if one is present.
load_dotenv()


The first example is detecting pneumothorax from a given X-ray image. It is a skewed dataset, meaning there are far more negative samples than positive ones.

In [2]:
# def create_dataset(training_df, image_dir):
#     images = []
#     targets = []

#     for index, row in tqdm(
#         training_df.iterrows(), total=len(training_df), desc="Processing images..."
#     ):
#         image_id = row["ImageId"]
#         image_path = os.path.join(image_dir, image_id)
#         image = Image.open(image_path + ".png")
#         image = image.resize((256, 256), resample=Image.BILINEAR)
#         image = np.array(image)
#         image = image.ravel()
#         images.append(image)
#         targets.append(int(row["target"]))

#     images = np.array(images)
#     targets = np.array(targets)
#     print(images.shape)
#     return images, targets

# csv_path = "data/train.csv"
# image_path = "data/train_png"

# df = pd.read_csv(csv_path)
# df["kfold"] = -1
# df = df.sample(frac=M, random_state=42).reset_index(drop=True)
# y = df.target.values

We load the dataset into a NumPy array by traversing the image directory; however, we limit the sample size, because loading every image would consume too much memory.

In [3]:
# # Takes a while, run once
# try:
#     print(f"Dataset loaded: {X_train.shape}")
# except:
#     X_train, y_train = create_dataset(df, image_path)


In [4]:
# random_forrest = ensemble.RandomForestClassifier(n_jobs=-1)
# model_selection.cross_val_score(random_forrest, X_train, y_train, cv=2, n_jobs=-1, scoring="roc_auc")

Now we move on to neural network models.

In [5]:
import torch
import torch.nn as nn
import torch.functional as F


class AlexNet(nn.Module):
    """AlexNet-style CNN: five convolutions, three max-pools, three linear layers.

    The layer sizes are chosen for a ``(bs, 3, 227, 227)`` input, which makes
    the flattened feature map entering ``fc1`` hold 256 * 6 * 6 = 9216 values.
    ``forward`` returns softmax probabilities of shape ``(bs, 1000)``.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(
            in_channels=3, out_channels=96, kernel_size=11, stride=4, padding=0
        )
        self.pool1 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv2 = nn.Conv2d(
            in_channels=96, out_channels=256, kernel_size=5, stride=1, padding=2
        )
        self.pool2 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.conv3 = nn.Conv2d(
            in_channels=256, out_channels=384, kernel_size=3, stride=1, padding=1
        )
        self.conv4 = nn.Conv2d(
            in_channels=384, out_channels=384, kernel_size=3, stride=1, padding=1
        )
        self.conv5 = nn.Conv2d(
            in_channels=384, out_channels=256, kernel_size=3, stride=1, padding=1
        )
        self.pool3 = nn.MaxPool2d(kernel_size=3, stride=2)
        self.fc1 = nn.Linear(in_features=9216, out_features=4096)
        self.dropout1 = nn.Dropout(0.5)
        self.fc2 = nn.Linear(in_features=4096, out_features=4096)
        self.dropout2 = nn.Dropout(0.5)
        self.fc3 = nn.Linear(in_features=4096, out_features=1000)

    def forward(self, image):
        """Return class probabilities for a batch of images.

        Args:
            image: tensor of shape ``(bs, 3, 227, 227)``.

        Returns:
            Tensor of shape ``(bs, 1000)`` whose rows sum to 1.
        """
        # ReLU lives in torch.nn.functional. It is reached through ``nn`` here
        # because ``torch.functional`` (a different module) has no ``relu``.
        relu = nn.functional.relu

        # original size: (bs, 3, 227, 227)
        bs = image.size(0)
        x = relu(self.conv1(image))  # size: (bs, 96, 55, 55)
        x = self.pool1(x)  # size: (bs, 96, 27, 27)
        x = relu(self.conv2(x))  # size: (bs, 256, 27, 27)
        x = self.pool2(x)  # size: (bs, 256, 13, 13)
        x = relu(self.conv3(x))  # size: (bs, 384, 13, 13)
        x = relu(self.conv4(x))  # size: (bs, 384, 13, 13)
        x = relu(self.conv5(x))  # size: (bs, 256, 13, 13)
        x = self.pool3(x)  # size: (bs, 256, 6, 6)
        x = x.view(bs, -1)  # size: (bs, 9216)
        x = relu(self.fc1(x))  # size: (bs, 4096)
        # dropout does not change size; it is used for regularization.
        # p=0.5 means each activation is zeroed with probability 0.5
        # while the module is in training mode.
        x = self.dropout1(x)  # size: (bs, 4096)
        x = relu(self.fc2(x))  # size: (bs, 4096)
        x = self.dropout2(x)  # size: (bs, 4096)
        # NOTE(review): a ReLU directly before softmax clamps negative logits
        # to zero; kept to preserve the existing outputs -- confirm intent.
        x = relu(self.fc3(x))  # size: (bs, 1000)
        # 1000 is number of classes in ImageNet Dataset
        # softmax is an activation function that converts
        # linear output to probabilities that add up to 1
        x = torch.softmax(x, dim=1)
        return x


dataset.py

In [6]:
import torch
import numpy as np
from PIL import Image
from PIL import ImageFile

ImageFile.LOAD_TRUNCATED_IMAGES = True


class ClassificationDataset:
    """Indexable dataset that loads one image from disk per lookup.

    Args:
        image_paths: sequence of image file paths.
        targets: sequence of labels aligned index-by-index with ``image_paths``.
        resize: optional ``(height, width)`` pair; ``None`` keeps the image's
            original size.
        augmentations: optional callable invoked as ``augmentations(image=array)``
            that returns a mapping holding the transformed array under ``"image"``.
    """

    def __init__(self, image_paths, targets, resize=None, augmentations=None):
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize
        self.augmentations = augmentations

    def __len__(self):
        """Return the number of samples (one per image path)."""
        return len(self.image_paths)

    def __getitem__(self, item):
        """Load sample ``item`` and return it as a dict of tensors.

        Returns:
            ``{"image": float tensor in channel-first layout,
               "targets": long tensor built from ``self.targets[item]``}``.
        """
        # The context manager closes the file handle deterministically;
        # convert() returns a separate, fully decoded RGB image.
        with Image.open(self.image_paths[item]) as source:
            image = source.convert("RGB")
        targets = self.targets[item]

        if self.resize is not None:
            # PIL expects (width, height) while ``resize`` is (height, width).
            image = image.resize(
                (self.resize[1], self.resize[0]), resample=Image.BILINEAR
            )

        image = np.array(image)

        if self.augmentations is not None:
            augmented = self.augmentations(image=image)
            image = augmented["image"]

        # Reorder axes from HWC (PIL/NumPy layout) to CHW (PyTorch layout).
        image = np.transpose(image, (2, 0, 1)).astype(np.float32)

        return {
            "image": torch.tensor(image, dtype=torch.float),
            "targets": torch.tensor(targets, dtype=torch.long),
        }



engine.py

In [7]:
import torch
import torch.nn as nn
from tqdm import tqdm


def train(data_loader, model, optimizer, device):
    """Run one optimisation pass over every batch in ``data_loader``.

    Args:
        data_loader: iterable of dicts with ``"image"`` and ``"targets"`` tensors.
        model: module to optimise; it is switched to training mode.
        optimizer: optimizer whose gradients are reset and stepped per batch.
        device: device the batch tensors are moved to.

    Returns:
        None. The model's parameters are updated in place.
    """
    criterion = nn.BCEWithLogitsLoss()
    model.train()

    for batch in data_loader:
        images = batch["image"].to(device, dtype=torch.float)
        labels = batch["targets"].to(device, dtype=torch.float)

        optimizer.zero_grad()
        logits = model(images)

        # Targets become a column vector before being compared with the
        # model output.
        batch_loss = criterion(logits, labels.view(-1, 1))
        batch_loss.backward()
        optimizer.step()
        # A learning-rate scheduler step would go here.


def evaluate(data_loader, model, device):
    """Collect model outputs and targets over ``data_loader`` without gradients.

    Args:
        data_loader: iterable of dicts with ``"image"`` and ``"targets"`` tensors.
        model: module to evaluate; it is switched to eval mode.
        device: device the batch tensors are moved to.

    Returns:
        ``(outputs, targets)`` as plain Python lists, accumulated batch by
        batch in loader order.
    """
    model.eval()
    collected_outputs, collected_targets = [], []

    with torch.no_grad():
        for batch in data_loader:
            images = batch["image"].to(device, dtype=torch.float)
            labels = batch["targets"].to(device, dtype=torch.float)

            predictions = model(images)

            # Move to host memory and convert to nested Python lists.
            collected_targets += labels.detach().cpu().numpy().tolist()
            collected_outputs += predictions.detach().cpu().numpy().tolist()

    return collected_outputs, collected_targets


model.py

In [8]:
import torch.nn as nn
import pretrainedmodels


def get_model(pretrained):
    """Build an AlexNet from ``pretrainedmodels`` with a single-output head.

    Args:
        pretrained: truthy to request the ``"imagenet"`` weights, falsy to
            pass ``pretrained=None`` to the model constructor.

    Returns:
        The model with ``last_linear`` replaced by a
        BatchNorm -> Dropout -> Linear -> ReLU -> BatchNorm -> Dropout -> Linear
        stack that maps 4096 features to 1 output.
    """
    weights = "imagenet" if pretrained else None
    model = pretrainedmodels.__dict__["alexnet"](pretrained=weights)

    head_layers = [
        nn.BatchNorm1d(4096),
        nn.Dropout(p=0.25),
        nn.Linear(in_features=4096, out_features=2048),
        nn.ReLU(),
        nn.BatchNorm1d(2048, eps=1e-05, momentum=0.1),
        nn.Dropout(p=0.5),
        nn.Linear(in_features=2048, out_features=1),
    ]
    model.last_linear = nn.Sequential(*head_layers)
    return model

train.py

In [9]:
import os

import albumentations
import numpy as np
import pandas as pd
import torch
from sklearn import metrics
from sklearn.model_selection import train_test_split

# Release cached GPU memory left over from earlier runs in this kernel.
torch.cuda.empty_cache()

data_path = "data"
device = "cuda" if torch.cuda.is_available() else "cpu"

EPOCHS = 1
# Work on a 5% sample of the training CSV; the fixed seed keeps the sampled
# rows (and therefore the reported metric) reproducible between runs.
df = (
    pd.read_csv(os.path.join(data_path, "train.csv"))
    .sample(frac=0.05, random_state=42)
    .reset_index(drop=True)
)
images = df.ImageId.values.tolist()

images = [os.path.join(data_path, "train_png", i + ".png") for i in images]

# image example and length
print(images[0], len(images))

targets = df.target.values
model = get_model(pretrained=True)
model.to(device)

# Per-channel mean and standard deviation passed to Normalize below.
mean = (0.485, 0.456, 0.406)
std = (0.229, 0.224, 0.225)

aug = albumentations.Compose(
    [albumentations.Normalize(mean, std, max_pixel_value=255.0, always_apply=True)]
)
train_images, valid_images, train_targets, valid_targets = train_test_split(
    images, targets, stratify=targets, random_state=42
)

train_dataset = ClassificationDataset(
    image_paths=train_images,
    targets=train_targets,
    resize=(227, 227),
    augmentations=aug,
)
# drop_last avoids a trailing 1-sample batch, which the BatchNorm1d layers in
# the model head reject while in training mode.
train_loader = torch.utils.data.DataLoader(
    train_dataset, batch_size=2, shuffle=True, num_workers=4, drop_last=True
)

valid_dataset = ClassificationDataset(
    image_paths=valid_images,
    targets=valid_targets,
    resize=(227, 227),
    augmentations=aug,
)
valid_loader = torch.utils.data.DataLoader(
    valid_dataset, batch_size=2, shuffle=False, num_workers=4
)

optimizer = torch.optim.Adam(model.parameters(), lr=5e-4)


for epoch in range(EPOCHS):
    train(train_loader, model, optimizer, device=device)
    # Separate name so the split's ``valid_targets`` array is not overwritten.
    predictions, epoch_targets = evaluate(valid_loader, model, device=device)
    # evaluate() yields one single-element list per sample; flatten to 1-D scores.
    roc_auc = metrics.roc_auc_score(epoch_targets, np.ravel(predictions))
    print(f"Epoch={epoch}, Valid ROC AUC={roc_auc}")


data/train_png/1.2.276.0.7230010.3.1.4.8323329.10773.1517875225.585811.png 32


RuntimeError: Input type (torch.cuda.HalfTensor) and weight type (torch.cuda.FloatTensor) should be the same

In [None]:
# Number of image paths held in `images` (built in the training cell).
len(images)

21