In [74]:
import os
import cv2
import random
import shutil
import numpy as np

from tqdm import tqdm
from torch.utils.data import Dataset
from torch.utils.data import DataLoader

import torch
import torch.nn as nn
import torch.nn.functional as F

In [75]:
# generates test image
def generate_image(character_to_put_on, size=300, x=60, y=240, rand_RGB_value=0, rand_xy_value=5):
    """Draw one black character on a light, solid-colour square image.

    Args:
        character_to_put_on (str): text written onto the image.
        size (int, optional): side length of the square image in pixels. Defaults to 300.
        x (int, optional): x coordinate of the text origin handed to cv2.putText. Defaults to 60.
        y (int, optional): y coordinate of the text origin handed to cv2.putText. Defaults to 240.
        rand_RGB_value (int, optional): largest random shift (in either direction)
            applied independently to each background channel. Defaults to 0.
        rand_xy_value (int, optional): largest random shift (in either direction)
            applied independently to x and y. Defaults to 5.

    Returns:
        numpy.ndarray: uint8 array of shape (size, size, 3) containing the drawn character.
    """
    base_colour = (220, 245, 245)
    # Clamp every shifted channel to the uint8 range: without this, a shift
    # larger than 10 pushes 245 past 255 and the value no longer fits in uint8.
    bg = tuple(
        min(255, max(0, channel + random.randint(-rand_RGB_value, rand_RGB_value)))
        for channel in base_colour
    )
    background = np.full((size, size, 3), bg, dtype=np.uint8)

    # Jitter the text origin so repeated calls do not produce identical images.
    origin = (x + random.randint(-rand_xy_value, rand_xy_value),
              y + random.randint(-rand_xy_value, rand_xy_value))

    # Font scale (9) and stroke thickness (45) are fixed; they do not adapt to `size`.
    background = cv2.putText(background, character_to_put_on, origin,
                             cv2.FONT_HERSHEY_SIMPLEX, 9, (0, 0, 0), 45, cv2.LINE_AA)

    return background

# generates test dataset
def generate_test_dataset(path, count):
    """Fill a folder with generated single-digit images.

    Each file is named ``<index><digit>.png``, so the last character before
    the extension is the digit drawn on the image.

    Args:
        path (str): folder to write the images into; created if missing.
        count (int): number of images to generate.

    Raises:
        OSError: if an image could not be written.
    """
    os.makedirs(path, exist_ok=True)
    for i in range(count):
        character = random.choice("0123456789")
        image = generate_image(character)
        target = os.path.join(path, str(i) + character + ".png")
        # cv2.imwrite signals failure through its boolean result; ignoring it
        # would silently leave the dataset with fewer images than requested.
        if not cv2.imwrite(target, image):
            raise OSError(f"could not write image to {target!r}")

# Folder (relative to the working directory) that receives the generated images.
test_path = "here"
# Remove the output of any earlier run before regenerating. ignore_errors keeps
# the cleanup best-effort (for instance when the folder does not exist yet)
# without a bare `except:` that would also swallow KeyboardInterrupt/SystemExit.
shutil.rmtree(test_path, ignore_errors=True)
generate_test_dataset(test_path, 100)

In [77]:
class ImageDataset(Dataset):
    """Dataset that serves every file of a folder as a channel-first float image.

    Every directory entry is treated as an image; no filtering by extension
    is done. Samples are float32 arrays scaled by 1/255 with the channel axis
    moved to the front. Channel order is whatever cv2.imread returns.
    """

    def __init__(self, root_dir):
        """Index the files found in ``root_dir``.

        Args:
            root_dir (str): folder containing the image files.
        """
        self.root_dir = root_dir
        # os.listdir gives entries in arbitrary order; sorting keeps the
        # index -> file mapping stable between runs and machines.
        self.x = sorted(os.listdir(self.root_dir))
        self.num_samples = len(self.x)

    def __len__(self):
        """Return the number of indexed files."""
        return self.num_samples

    def __getitem__(self, idx):
        """Load one image.

        Args:
            idx (int): position in the sorted file listing.

        Returns:
            numpy.ndarray: float32 array with the channel axis first.

        Raises:
            OSError: if the file could not be decoded as an image.
        """
        image_name = self.x[idx]
        image_path = os.path.join(self.root_dir, image_name)

        image = cv2.imread(image_path)
        # cv2.imread returns None instead of raising when it cannot read a file.
        if image is None:
            raise OSError(f"could not read image {image_path!r}")

        image = image.astype(np.float32) / 255
        # (H, W, C) as loaded -> (C, H, W)
        image = np.moveaxis(image, 2, 0)

        return image

In [78]:
class PowerOf2sAtLeast256(nn.Module):
    """Convolutional encoder/decoder mapping a 3-channel batch to a 3-channel batch.

    The encoder has five stages; the first four end in a max-pool that also
    returns its indices. The decoder mirrors them with transposed
    convolutions and ``MaxUnpool2d`` layers that consume those indices in
    reverse order. Judging by the class name it is meant for inputs whose
    side is a power of two of at least 256 (assumption, not enforced here).
    """

    def __init__(self):
        super().__init__()

        # Hyper-parameter sets shared by the layers below (all kernels are 3x3).
        down = dict(kernel_size=3, stride=2, padding=1)                      # stride-2 Conv2d
        same = dict(kernel_size=3, stride=1, padding='same')                 # stride-1 Conv2d
        flat_t = dict(kernel_size=3, stride=1, padding=1, output_padding=0)  # stride-1 ConvTranspose2d
        up_t = dict(kernel_size=3, stride=2, padding=1, output_padding=1)    # stride-2 ConvTranspose2d
        pool = dict(kernel_size=2, stride=2, return_indices=True)
        unpool = dict(kernel_size=2, stride=2)

        # ------------------------------ encoder ------------------------------ #
        self.encoder1 = nn.Sequential(
            nn.Conv2d(3, 16, **down),
            nn.ReLU(inplace=True),

            nn.Conv2d(16, 32, **down),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),

            nn.Conv2d(32, 64, **down),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(**pool),
        )

        self.encoder2 = nn.Sequential(
            nn.Conv2d(64, 128, **down),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.2),
            nn.MaxPool2d(**pool),
        )

        self.encoder3 = nn.Sequential(
            nn.Conv2d(128, 128, **same),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.3),
            nn.MaxPool2d(**pool),
        )

        self.encoder4 = nn.Sequential(
            nn.Conv2d(128, 128, **same),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.4),
            nn.MaxPool2d(**pool),
        )

        # Bottleneck: the only encoder stage without pooling.
        self.encoder5 = nn.Sequential(
            nn.Conv2d(128, 256, **same),
            nn.ReLU(inplace=True),
        )

        # ------------------------------ decoder ------------------------------ #
        self.decoder6 = nn.Sequential(
            nn.ConvTranspose2d(256, 128, **flat_t),
            nn.ReLU(inplace=True),
        )

        self.unpool7 = nn.MaxUnpool2d(**unpool)
        self.decoder7 = nn.Sequential(
            nn.ConvTranspose2d(128, 128, **flat_t),
            nn.ReLU(inplace=True),
        )

        self.unpool8 = nn.MaxUnpool2d(**unpool)
        self.decoder8 = nn.Sequential(
            nn.ConvTranspose2d(128, 128, **flat_t),
            nn.ReLU(inplace=True),
        )

        self.unpool9 = nn.MaxUnpool2d(**unpool)
        self.decoder9 = nn.Sequential(
            nn.ConvTranspose2d(128, 64, **up_t),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.4),
        )

        self.unpool10 = nn.MaxUnpool2d(**unpool)
        self.decoder10 = nn.Sequential(
            nn.ConvTranspose2d(64, 32, **up_t),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.3),

            nn.ConvTranspose2d(32, 16, **up_t),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.2),

            nn.ConvTranspose2d(16, 3, **up_t),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Encode ``x`` and decode it again.

        Args:
            x (torch.Tensor): batch with 3 channels (first conv has in_channels=3).

        Returns:
            torch.Tensor: output of the last decoder stage (3 channels, after ReLU).
        """
        # Encoder: every pooled stage hands back (features, pooling indices).
        x, pooled_at_1 = self.encoder1(x)
        x, pooled_at_2 = self.encoder2(x)
        x, pooled_at_3 = self.encoder3(x)
        x, pooled_at_4 = self.encoder4(x)
        x = self.encoder5(x)

        # Decoder: undo the pools from the innermost one outwards.
        x = self.decoder6(x)
        x = self.decoder7(self.unpool7(x, pooled_at_4))
        x = self.decoder8(self.unpool8(x, pooled_at_3))
        x = self.decoder9(self.unpool9(x, pooled_at_2))
        return self.decoder10(self.unpool10(x, pooled_at_1))

In [79]:
class PowerOf2sAtMost128(nn.Module):
    """Smaller convolutional encoder/decoder for 3-channel image batches.

    Three encoder stages (the first two end in an index-returning max-pool)
    are mirrored by three decoder stages that use ``MaxUnpool2d`` with the
    stored indices. Judging by the class name it is meant for inputs whose
    side is a power of two of at most 128 (assumption, not enforced here).
    """

    def __init__(self):
        super().__init__()

        # Hyper-parameter sets shared by the layers below (all kernels are 3x3).
        down = dict(kernel_size=3, stride=2, padding=1)                      # stride-2 Conv2d
        same = dict(kernel_size=3, stride=1, padding='same')                 # stride-1 Conv2d
        flat_t = dict(kernel_size=3, stride=1, padding=1, output_padding=0)  # stride-1 ConvTranspose2d
        up_t = dict(kernel_size=3, stride=2, padding=1, output_padding=1)    # stride-2 ConvTranspose2d
        pool = dict(kernel_size=2, stride=2, return_indices=True)
        unpool = dict(kernel_size=2, stride=2)

        # ------------------------------ encoder ------------------------------ #
        self.encoder1 = nn.Sequential(
            nn.Conv2d(3, 16, **down),
            nn.ReLU(inplace=True),

            nn.Conv2d(16, 32, **down),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),

            nn.Conv2d(32, 64, **down),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
            nn.MaxPool2d(**pool),
        )

        self.encoder2 = nn.Sequential(
            nn.Conv2d(64, 128, **same),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.2),
            nn.MaxPool2d(**pool),
        )

        # Bottleneck: the only encoder stage without pooling.
        self.encoder3 = nn.Sequential(
            nn.Conv2d(128, 256, **same),
            nn.ReLU(inplace=True),
        )

        # ------------------------------ decoder ------------------------------ #
        self.decoder4 = nn.Sequential(
            nn.ConvTranspose2d(256, 128, **flat_t),
            nn.ReLU(inplace=True),
        )

        self.unpool5 = nn.MaxUnpool2d(**unpool)
        self.decoder5 = nn.Sequential(
            nn.ConvTranspose2d(128, 64, **flat_t),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.4),
        )

        self.unpool6 = nn.MaxUnpool2d(**unpool)
        self.decoder6 = nn.Sequential(
            nn.ConvTranspose2d(64, 32, **up_t),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.3),

            nn.ConvTranspose2d(32, 16, **up_t),
            nn.ReLU(inplace=True),
            nn.Dropout2d(0.2),

            nn.ConvTranspose2d(16, 3, **up_t),
            nn.ReLU(inplace=True),
        )

    def forward(self, x):
        """Encode ``x`` and decode it again.

        Args:
            x (torch.Tensor): batch with 3 channels (first conv has in_channels=3).

        Returns:
            torch.Tensor: output of the last decoder stage (3 channels, after ReLU).
        """
        # Encoder: the pooled stages hand back (features, pooling indices).
        x, pooled_at_1 = self.encoder1(x)
        x, pooled_at_2 = self.encoder2(x)
        x = self.encoder3(x)

        # Decoder: undo the pools from the innermost one outwards.
        x = self.decoder4(x)
        x = self.decoder5(self.unpool5(x, pooled_at_2))
        return self.decoder6(self.unpool6(x, pooled_at_1))

In [80]:
# Report the number of trainable parameters of a freshly built instance of each model.
param_reports = (
    ("Total parameter count of PowerOf2sAtMost128:", PowerOf2sAtMost128),
    ("Total parameter count of PowerOf2sAtLeast256:", PowerOf2sAtLeast256),
)
for label, model_cls in param_reports:
    trainable = sum(weight.numel() for weight in model_cls().parameters() if weight.requires_grad)
    print(label, trainable)

Total parameter count of PowerOf2sAtMost128: 785155
Total parameter count of PowerOf2sAtLeast256: 1375491


In [83]:
# Push a random batch through each model and report whether the output shape
# equals the input shape. After the loop, `model_input`, `model` and
# `model_output` refer to the last entry (the 256x256 case).
shape_checks = (
    ("PowerOf2sAtMost128 input:", PowerOf2sAtMost128, 128),
    ("PowerOf2sAtLeast256 input:", PowerOf2sAtLeast256, 256),
)
for label, model_cls, side in shape_checks:
    model_input = torch.randn(4, 3, side, side)
    model = model_cls()
    # No gradients are needed for a pure shape check.
    with torch.no_grad():
        model_output = model(model_input)

    shapes_match = model_input.shape == model_output.shape
    print(label, model_input.shape, "output:", model_output.shape, shapes_match)

PowerOf2sAtMost128 input: torch.Size([4, 3, 128, 128]) output: torch.Size([4, 3, 128, 128]) True
PowerOf2sAtLeast256 input: torch.Size([4, 3, 256, 256]) output: torch.Size([4, 3, 256, 256]) True
