Start by importing packages and data

In [1]:
import torch
import os
import csv
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.utils.data import sampler
from PIL import Image 
import torchvision.datasets as dset
import torchvision.transforms as T
import random
import numpy as np
import matplotlib.pyplot as plt
import statistics
import gc

In [36]:
USE_GPU = True

dtype = torch.float32 # we will be using float throughout this tutorial

# if USE_GPU and torch.cuda.is_available():
#     device = torch.device('cuda')
# else:
device = torch.device('cpu')

print('using device:', device)

using device: cpu


In [3]:
# Data ingest

# Get all labels imported and indexed, with all whitespaces stripped out
labels_list = []
dataset = []
#import dataset to empty arrays of data and label
for (root, dirs, files) in os.walk("crop/"):
    labels_list = dirs
    for dire in dirs:
        curr_dir = root + dire
        for (rt,di,fi) in os.walk(curr_dir):
            for img in fi:
                data = {}
                data["image"] = T.functional.pil_to_tensor(Image.open(curr_dir+"/"+img))#.to(torch.float32)
                data["label"] = labels_list.index(dire)
                _,h,w = data["image"].shape
                if (h <= 784 and w <= 896 and h >= 160 and w >= 160):
                    dataset.append(data)
                # for each file in the sub directory
                # append the image and corresponding label to dataset
    break
random.shuffle(dataset)

In [4]:
print(len(dataset))

6583


In [5]:
heights = []
widths = []
for i in dataset:
    c,h,w = i["image"].shape
    heights.append(h)
    widths.append(w)

#plt.bar(4000,heights)
#plt.show()
print (statistics.mean(heights))
print (statistics.median(heights))
print (statistics.multimode(heights))
print (statistics.mean(widths))
print (statistics.median(widths))
print (statistics.multimode(widths))
print (max(heights))
print (max(widths))

297.358347258089
256
[191]
533.786723378399
537
[679]
784
896


In [6]:
heights.sort(reverse=True)
print(heights[500:510])

[529, 529, 529, 529, 529, 528, 528, 527, 527, 527]


# height greater than 1504 or widths greater than 3104 is discarded

In [7]:
max_height = 784

max_width  = 896

In [8]:

dtype = torch.float32

In [9]:
class Patchwork (nn.Module):
    def __init__ (self, patch_size = 16):
        super().__init__()
        self.patch_size = patch_size
        self.num_patches = (max_height * max_width) // (self.patch_size ** 2)
        self.patch_dim = (self.patch_size ** 2) * 3
        self.linear_size = self.patch_dim // 2
        self.linear_layer = nn.Linear(self.patch_dim,self.linear_size,dtype=torch.float16)
        self.positional_embedding = self.get_pos_embed()
        # self.positional_embedding.requires_grad=False
    
    def get_pos_embed(self):
        positional_embedding = []
        for i in range(self.num_patches+1):
            positional_embedding_curr = []
            for j in range(self.linear_size//2):
                j *= 2
                positional_embedding_curr.append(np.sin(i/(10000 ** (j/self.patch_dim))))
                j += 1
                positional_embedding_curr.append(np.cos(i/(10000 ** (j/self.patch_dim))))
            if (self.patch_dim % 2):
                j = self.patch_dim - 1
                positional_embedding_curr.append(np.sin(i/(10000 ** (j/self.patch_dim))))
            positional_embedding.append(torch.as_tensor(positional_embedding_curr,dtype=dtype))
        return torch.stack(positional_embedding).to(dtype)


    def forward(self,image):
        image = image.to(torch.float16)
        c,h,w = image.shape

        #size standardization
        patch_height = max_height - h
        patch_width = max_width - w

        patch_bottom = patch_height // 2
        if (patch_height % 2):
            patch_top = patch_bottom + 1
        else:
            patch_top = patch_bottom

        patch_right = patch_width // 2

        if (patch_width % 2):
            patch_left = patch_right + 1
        else:
            patch_left = patch_right
            

        padding = torch.nn.ZeroPad2d((patch_left,patch_right,patch_top,patch_bottom))
        image = padding(image)
        
        patches = []
            
        for i in range(self.num_patches):
            x_coor = i // (max_width // self.patch_size)
            y_coor = i - x_coor * (max_width // self.patch_size)
            patch = image[:, x_coor * self.patch_size: (x_coor + 1) * self.patch_size,y_coor * self.patch_size:(y_coor+1)*self.patch_size]
            patches.append(patch.flatten())
            
        patches = torch.stack(patches)
        patches = self.linear_layer(patches)
        
        classification_token = nn.Parameter(torch.rand(self.linear_size,dtype=dtype))
        patches = torch.vstack((classification_token,patches))
        # print(patches.shape)
        patches = patches + self.positional_embedding

        return patches.to(dtype)

In [10]:
patch = Patchwork()
patched_dataset = []
for i in range(100):
    img = dataset[i]["image"]
    patched_dataset.append(patch(img).to(dtype))
    if (not i % 100):
        print (i)
patched_dataset=torch.stack(patched_dataset)


0


In [11]:
patched_dataset.shape

torch.Size([100, 2745, 384])

In [40]:
class ViT (nn.Module):
    def __init__(self, k=16,hidden_dim=384,out_dim=46):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layerNorm1 = nn.LayerNorm(hidden_dim)
        self.msa = nn.MultiheadAttention(384,k)
        self.layerNorm2 = nn.LayerNorm(hidden_dim)
        self.mlp = nn.Sequential(
            nn.Linear(hidden_dim,hidden_dim*4),
            nn.GELU(),
            nn.Linear(hidden_dim*4,hidden_dim)
        )
        self.output = nn.Sequential(
            nn.Linear(hidden_dim,out_dim),
            nn.Softmax(dim=-1)
        )
    
    def forward(self,x):
        x_1 = x.to(torch.float32)
        x = self.layerNorm1(x.to(torch.float32))
        x_prime = []
        for i in x:
            xp = self.msa(i,i,i,need_weights=False)
            x_prime.append(xp[0])
        x = torch.stack(x_prime)
        x = x + x_1
        x_2 = x
        x = self.layerNorm2(x)
        x = self.mlp(x)
        x = x + x_2

        return self.output(x)[:,0]

        #LN, then MSA, + ,LN, MLP, +
        #D is the constant latent vector size

In [45]:
def training(model, optimizer,epochs=5):
    model = model.to(device=device)
    y = torch.from_numpy(np.array([dataset[i]["label"] for i in range(100)]))
    # for epoch in range(epochs):
    #     model.train()
    #     print(epoch)
    #     x = patched_dataset.to(device=device,dtype=dtype)
    #     y = y.to(device=device,dtype=y.dtype)

    #     scores = model(x)

    #     loss = F.cross_entropy(scores, y)
    #     optimizer.zero_grad()
    #     loss.backward()
    #     optimizer.step()
    num_correct = 0
    num_samples = 0
    # model.eval()  # set model to evaluation mode
    with torch.no_grad():
        x = patched_dataset.to(device=device, dtype=dtype)  # move to device, e.g. GPU
        y = y.to(device=device, dtype=torch.long)
        scores = model(x)
        _, preds = scores.max(1)
        num_correct += (preds == y).sum()
        num_samples += preds.size(0)
        acc = float(num_correct) / num_samples
        print('Got %d / %d correct (%.2f)' % (num_correct, num_samples, 100 * acc))
    return acc



In [46]:
import torch.nn.functional as F
model = ViT()
optimizer = torch.optim.Adam(model.parameters(),lr=0.01)
training(model,optimizer)



Got 2 / 100 correct (2.00)


0.02