In [3]:
import os
import h5py
import imageio.v3 as iio
import numpy as np

### Define data parsing class: PhotoData

In [38]:
from typing import Tuple


class PhotoData(object):

    SRC_FOLDER = 'images/'

    LBL_FOLDER = 'labels/'

    HDF5_FILE = 'all_images.h5'

    N = 600

    def indextofilename(index) -> str:
        return f"{index:04d}.jpg"
    
    def indextolabelname(index) -> Tuple[str]:
        ind = f"{index:04d}"
        return f"{ind}_person.png", f"{ind}_clothes.png"


    def loadimages(self):
        images = np.ndarray((PhotoData.N, 600, 400, 3))
        for i in range(PhotoData.N):
            img_url = os.path.join(PhotoData.SRC_FOLDER, PhotoData.indextofilename(i + 1))
            im = iio.imread(img_url)
            images[i] = im
        return images
    
    def loadlabels(self):
        p_labels = np.ndarray((PhotoData.N, 600, 400), dtype=bool)
        c_labels = np.ndarray((PhotoData.N, 600, 400), dtype=np.int8)

        for i in range(PhotoData.N):
            p_name, c_name = PhotoData.indextolabelname(i+1)
            p_url = os.path.join(PhotoData.LBL_FOLDER, p_name)
            c_url = os.path.join(PhotoData.LBL_FOLDER, c_name)
            p_im = iio.imread(p_url)
            c_im = iio.imread(c_url)
            p_labels[i] = p_im[:,:,0] > 0
            # https://stackoverflow.com/questions/15635025/how-to-map-false-color-image-to-specific-labels-assigned-for-each-color
            r_channel = c_im[:, :, 0] > 0
            g_channel = c_im[:, :, 1] > 0
            b_channel = c_im[:, :, 2] > 0
            labels = (b_channel << 2) + (g_channel << 1) + r_channel
            c_labels[i] = labels
        return p_labels, c_labels
    
    def parsedata(self):
        images = self.loadimages()
        p_labels, c_labels = self.loadlabels()
        return images, p_labels, c_labels

    def storeh5(self, images, p_labels, c_labels):
        file = h5py.File(PhotoData.HDF5_FILE, 'w')
        file.create_dataset('images', np.shape(images), h5py.h5t.STD_U8BE, data=images)
        file.create_dataset('person_labels', np.shape(p_labels), h5py.h5t.STD_U8BE, data=p_labels)
        file.create_dataset('clothing_labels', np.shape(c_labels), h5py.h5t.STD_U8BE, data=c_labels)
        file.close()

    def loadh5(self):
        images, p_labels, c_labels = [], [], []
        file = h5py.File(PhotoData.HDF5_FILE, 'r+')
        images = np.array(file['images']).astype('uint8')
        p_labels = np.array(file['person_labels']).astype('bool')
        c_labels = np.array(file['clothing_labels']).astype('uint8')
        return images, p_labels, c_labels

## Process and Store data
parse and store the data in h5

In [53]:
import math

data = PhotoData()
i, p, c = data.parsedata()

In [54]:
# subdivide images for ViT
def subdivideimage(imgs: np.ndarray, suppixsize=16):
    """Pads the images and subdivides for number of pixels required"""
    N = imgs.shape[0]
    H = imgs.shape[1]
    W = imgs.shape[2]
    rows = math.ceil(H / suppixsize)
    # cols = math.ceil(W / suppixsize)

    has_color = len(imgs.shape) > 3

    dims = (N, 1, W, 3) if has_color else (N, 1, W,)
    for i in range(suppixsize - (H % suppixsize)):
        imgs = np.append(imgs, np.zeros(dims), axis=1)
    new_H = rows * suppixsize
    dims = (N, new_H, 1, 3) if has_color else (N, new_H, 1,)
    for i in range(suppixsize - (W % suppixsize)):
        imgs = np.append(imgs, np.zeros(dims), axis=2)
    # return imgs
    # https://stackoverflow.com/questions/16856788/slice-2d-array-into-smaller-2d-arrays
    swap_dim_1 = [N, rows, suppixsize, -1, suppixsize]
    swap_dim_2 = [N, -1, suppixsize, suppixsize]
    if has_color:
        swap_dim_1.append(3)
        swap_dim_2.append(3)
    return imgs.reshape(swap_dim_1).swapaxes(2,3).reshape(swap_dim_2)

images = subdivideimage(i)
p_labels = subdivideimage(p)
c_labels = subdivideimage(c)

In [55]:
# reshape images
def reshapeimages(images):
    N = images.shape[0]
    M = images.shape[1]
    return images.reshape((N, M, -1))

images = reshapeimages(images)

In [56]:
# produce [2] labels
def processpersonlabels(p_labels):
    N = p_labels.shape[0]
    M = p_labels.shape[1]
    total = p_labels.shape[2] ** 2
    person_arr = (np.sum(p_labels.reshape((N, M, -1)), axis=2) / total).reshape((N, M, 1))
    background_arr = np.ones((N, M, 1)) - person_arr
    return np.append(background_arr, person_arr, axis=2)
pp = processpersonlabels(p_labels=p_labels)
pp.shape

(600, 988, 2)

In [57]:
# produce [7] labels
def processclotheslabels(c_labels):
    N = c_labels.shape[0]
    M = c_labels.shape[1]
    total = c_labels.shape[2] ** 2
    flattened = c_labels.reshape((N, M, -1))
    cc = np.zeros((N, M, 0))
    for i in range(7):
        x = (np.sum(flattened == i, axis=2) / total).reshape((N, M, 1))
        cc = np.append(cc, x, axis=2)
    return cc

cc = processclotheslabels(c_labels)
cc.shape

(600, 988, 7)

In [58]:
data.storeh5(images, pp, cc)

## Load data

In [59]:
data = PhotoData()
images, p_labels, c_labels = data.loadh5()

In [60]:
images.shape

(600, 988, 768)

# Model
Define the model, will be writing a vision transformer (ViT, https://arxiv.org/abs/2010.11929v2).

In [64]:
import torch
from torch import nn

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

encoder_layer = nn.TransformerEncoderLayer(d_model=768, nhead=12)
transformer_encoder = nn.TransformerEncoder(encoder_layer, num_layers=12).to(device)



OutOfMemoryError: CUDA out of memory. Tried to allocate 86.00 MiB. GPU 0 has a total capacity of 12.00 GiB of which 925.00 MiB is free. Of the allocated memory 9.92 GiB is allocated by PyTorch, and 138.84 MiB is reserved by PyTorch but unallocated. If reserved but unallocated memory is large try setting PYTORCH_CUDA_ALLOC_CONF=expandable_segments:True to avoid fragmentation.  See documentation for Memory Management  (https://pytorch.org/docs/stable/notes/cuda.html#environment-variables)

In [None]:
src = torch.rand(10, 968, 768, device=device)
out = transformer_encoder(src)
out