In [1]:
import torch
from torch import nn
from torch.utils.data import DataLoader
import math
import einops
import numpy as np
from PIL import Image
import torchvision
import pandas as pd

from utils import PlaneSet2Neurons, configurate_xy_tensors

from transformers import ViTFeatureExtractor, ViTForImageClassification
from IPython.display import display

# Seed every RNG this notebook draws from: NumPy (random train/test mask) and
# PyTorch (layer initialisation, DataLoader shuffling). torch.manual_seed is
# kept last so the cell still displays the returned Generator.
SEED = 42
np.random.seed(SEED)
torch.manual_seed(SEED)

<torch._C.Generator at 0x2673156bd50>

##### Load model
---

In [2]:
# The preprocessing configuration and the model weights are both fetched from
# the same pretrained checkpoint.
checkpoint_name = 'google/vit-base-patch16-224'
feature_extractor = ViTFeatureExtractor.from_pretrained(checkpoint_name)
model = ViTForImageClassification.from_pretrained(checkpoint_name)

In [3]:
# Number of scalar weights that receive gradients (requires_grad=True).
trainable_sizes = [
    np.prod(p.size())
    for p in model.parameters()
    if p.requires_grad
]
params = sum(trainable_sizes)

print("Trainable parameters amount: {:n}".format(params))

Trainable parameters amount: 86 567 656


##### Replace the latest feed-forward layer
---

In [4]:
# Swap the pretrained classification head for a fresh 2-output linear layer.
# The input width is read from the existing head instead of being hard-coded,
# so the cell keeps working for checkpoints with a different hidden size.
print(model.classifier)
model.classifier = nn.Linear(model.classifier.in_features, 2)
print(model.classifier)

Linear(in_features=768, out_features=1000, bias=True)
Linear(in_features=768, out_features=2, bias=True)


##### Fine tune the model
---

In [5]:
# hyper-parameters
# --------------------------
num_epochs = 2
batch_size = 5

# model / optimisation
# --------------------------
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
# Move the model first, then build the optimizer over the moved parameters.
model.to(device=device)

distance = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

# data
# --------------------------
csv_path = r"../train"
images_path = r"../avia-train/"
with open(csv_path, "r") as file:
    data = pd.read_csv(file)

# Random ~80/20 row split. A dedicated seeded generator makes the mask identical
# on every run of this cell, independent of the global NumPy RNG state.
split_rng = np.random.default_rng(42)
msk = split_rng.random(len(data)) < 0.8
train_df = data[msk]
test_df = data[~msk]

train_dataset = PlaneSet2Neurons(images_path, train_df)
test_dataset = PlaneSet2Neurons(images_path, test_df)
train = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
# Held-out data does not need a random order; keep it deterministic.
test  = DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

In [6]:
def convert_batch_to_pil(batch):
    """Turn every element of ``batch`` into a PIL image.

    Each item obtained by iterating over ``batch`` is passed through
    ``torchvision.transforms.ToPILImage``; the converted images are returned
    as a list in iteration order.
    """
    to_pil = torchvision.transforms.ToPILImage()
    return list(map(to_pil, batch))

In [None]:
loss_dict = {}  # epoch number (1-based) -> mean training loss of that epoch

# from_pretrained() hands the model back in eval mode; switch to training mode
# so that train-time layers (e.g. dropout) are active while fine-tuning.
model.train()

for epoch in range(num_epochs):
    loss_train_accumulator = []

    for x, y in train:
        optimizer.zero_grad()

        x, y = configurate_xy_tensors(x, y)
        # The feature extractor is fed PIL images, so unpack the batch first.
        x = convert_batch_to_pil(x)
        x = feature_extractor(images=x, return_tensors="pt")

        # Use the device selected in the setup cell rather than a hard-coded
        # .cuda(), which raises on machines without a GPU.
        y_hat = model(x.pixel_values.to(device))
        logits = y_hat.logits

        # Flatten the target to (batch, h*w) so it lines up with the logits.
        # It is moved to `device` as well in case configurate_xy_tensors left
        # it elsewhere (a no-op when it is already there).
        y = einops.rearrange(y, "b h w -> b (h w)").to(device)
        loss = distance(logits, y)

        loss.backward()
        optimizer.step()
        loss_train_accumulator.append(loss.item())

        # Drop references to this batch's tensors so they can be freed early.
        del x, y, y_hat, logits, loss

    epoch_loss = np.mean(loss_train_accumulator)
    loss_dict[epoch+1] = epoch_loss

    print('epoch [{}/{}], loss: {:.5f}'.format(epoch+1, num_epochs, epoch_loss))

# Run a single image through the model (TODO: move this logic properly into the training loop)

In [None]:
# Sample image used by the single-image walkthrough cells.
path = "../avia-train/0a3e36d7-877b-49a1-85dd-ce1e2d018460.png"
# PIL's image module is imported as `Image`; the name `img` is never defined.
Image.open(path)

In [None]:
# Preprocess the sample image into model inputs (PyTorch tensors).
# PIL's image module is imported as `Image`; the name `img` is never defined.
inputs = feature_extractor(images = np.array(Image.open(path)), return_tensors="pt")
inputs["pixel_values"].shape

In [None]:
# Shape of the raw image array before preprocessing.
# PIL's image module is imported as `Image`; the name `img` is never defined.
np.array(Image.open(path)).shape

In [None]:
# Rebuild a viewable 8-bit HWC image from the preprocessed tensor.
# When the feature extractor normalises its output as (x - mean) / std, the
# values are no longer in [0, 1]; multiplying by 255 and casting to uint8 would
# wrap the negative ones. Undo the normalisation before converting.
chw = inputs["pixel_values"].squeeze(0).numpy()
if getattr(feature_extractor, "do_normalize", False):
    # reshape(-1, 1, 1) broadcasts per-channel (or scalar) statistics over H and W.
    mean = np.asarray(feature_extractor.image_mean, dtype=chw.dtype).reshape(-1, 1, 1)
    std = np.asarray(feature_extractor.image_std, dtype=chw.dtype).reshape(-1, 1, 1)
    chw = chw * std + mean

tmp = einops.rearrange(
    (np.clip(chw, 0.0, 1.0) * 255).round().astype(np.uint8),
    "c h w -> h w c"
)

tmp.shape

In [None]:
# Display the preprocessed image.
# PIL's image module is imported as `Image`; the name `img` is never defined.
Image.fromarray(
    tmp
)

In [None]:
# Inference only: use eval mode, skip autograd bookkeeping, and feed the inputs
# from whichever device the model's weights currently live on (the setup cell
# moves the model to `device`, while `inputs` was created without any device
# placement). A new dict is built so `inputs` itself is left untouched.
model.eval()
model_device = next(model.parameters()).device
with torch.no_grad():
    outputs = model(**{name: tensor.to(model_device) for name, tensor in inputs.items()})
logits = outputs.logits

In [None]:
# Forward pass for inference: eval mode, no autograd bookkeeping, and inputs
# copied to the device that currently holds the model's weights (`inputs`
# itself is left untouched).
model.eval()
model_device = next(model.parameters()).device
with torch.no_grad():
    outputs = model(**{name: tensor.to(model_device) for name, tensor in inputs.items()})

In [None]:
# Shape of the first entry of the model output
# (presumably the logits, since no labels were passed — TODO confirm).
outputs[0].shape

In [None]:
# Keep the logits tensor from the model output for the cells below.
logits = outputs.logits

In [None]:
# Index of the largest logit along the last axis, as a plain Python number.
predicted_class_idx = torch.argmax(logits, dim=-1).item()

In [None]:
# The label map in model.config belongs to the pretrained checkpoint. After the
# classification head has been replaced, its entries no longer describe the
# outputs, so only use it when its size matches the number of logits; otherwise
# report the bare index.
id2label = model.config.id2label
if len(id2label) == logits.shape[-1]:
    print("Predicted class:", id2label[predicted_class_idx])
else:
    print("Predicted class:", predicted_class_idx)

##### My implementation of vanilla Transformer [not needed for now]
---

In [None]:
class SelfAttention(nn.Module):
    """Single-head scaled dot-product self-attention.

    Computes ``softmax(Q K^T / sqrt(dim_out)) V``, where Q, K and V come from
    one fused linear projection of the input.

    Args:
        dim_in: size of the last dimension of the input.
        dim_out: size of each query/key/value vector and of the output's
            last dimension.
    """

    def __init__(self, dim_in, dim_out):
        super().__init__()

        # Scaling factor sqrt(d_k) applied to the attention scores.
        self.sqrt_v = math.sqrt(dim_out)
        # One projection yields Q, K and V side by side along the last axis.
        self.to_qkv = nn.Linear(dim_in, 3*dim_out)

    def forward(self, x):
        """Apply self-attention over the second-to-last axis of ``x``.

        Args:
            x: tensor of shape ``(..., seq_len, dim_in)``; a plain
                ``(seq_len, dim_in)`` matrix works as before.

        Returns:
            Tensor of shape ``(..., seq_len, dim_out)``.
        """
        # Split the fused projection into three equal chunks: Q, K, V.
        q, k, v = self.to_qkv(x).chunk(3, dim=-1)

        # Scale the scores *before* the softmax, then normalise each query's
        # weights over the keys so they sum to one.
        scores = q @ k.transpose(-2, -1) / self.sqrt_v
        return torch.softmax(scores, dim=-1) @ v

In [None]:
class MultiHeadAttention(nn.Module):
    """Several independent ``SelfAttention`` heads followed by a linear mix.

    Every head receives the same input; the head outputs are concatenated
    along the last axis and projected back to ``dim_in`` features.

    Args:
        dim_in: size of the last dimension of the input (and of the output).
        dim_out: output width of each individual head.
        heads_num: number of attention heads.
    """

    def __init__(self, dim_in, dim_out, heads_num):
        super().__init__()

        self.heads_num = heads_num
        # nn.ModuleList (not a plain list) registers the heads as sub-modules,
        # so their weights appear in .parameters() / state_dict() and follow
        # the module on .to(device).
        self.heads = nn.ModuleList(
            SelfAttention(dim_in, dim_out) for _ in range(heads_num)
        )
        # Output projection applied to the concatenated head outputs.
        self.epic_w = nn.Linear(heads_num * dim_out, dim_in)

    def forward(self, x):
        """Run every head on ``x`` and merge their outputs.

        Args:
            x: tensor whose last dimension has size ``dim_in``.

        Returns:
            Tensor with the same leading dimensions as the head outputs and a
            last dimension of size ``dim_in``.
        """
        # Concatenate head outputs feature-wise, in head order.
        outs = torch.cat([head(x) for head in self.heads], dim=-1)
        x = self.epic_w(outs)

        return x

In [None]:
# Toy input: a 3 x 6 matrix of uniform samples from [0, 1).
x = torch.rand(3, 6)
x

In [None]:
# Two heads of width 4 over 6 input features.
mha = MultiHeadAttention(dim_in=6, dim_out=4, heads_num=2)

In [None]:
# Forward the toy input through the multi-head attention module.
a = mha(x)

In [None]:
# Display the attention output.
a