In [None]:
!pip install einops



In [None]:
import torch
from torch import nn, optim
import torch.nn.functional as F
from einops import rearrange, repeat
from einops.layers.torch import Rearrange
from torch.utils.data import DataLoader
from torchvision import datasets, transforms
from torch.optim.lr_scheduler import CosineAnnealingLR

In [None]:
def pair(t):
    """Return ``t`` unchanged when it is a tuple; otherwise return ``(t, t)``.

    Lets callers accept either a single size or an explicit
    ``(height, width)`` tuple.
    """
    if isinstance(t, tuple):
        return t
    return (t, t)

In [None]:
class PreNorm(nn.Module):
    """Apply ``nn.LayerNorm`` to the input before handing it to ``fn``.

    Args:
        dim: Size given to ``nn.LayerNorm``.
        fn: Module (or any callable) invoked on the normalised tensor.
    """

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        """Normalise ``x``, then call ``fn`` with it and any extra keyword arguments."""
        normed = self.norm(x)
        return self.fn(normed, **kwargs)

In [None]:
class LRLinearSuper(nn.Module):
    """Low-rank factorised linear layer computing ``U(VT(x))``.

    The dense ``in_channel -> out_channel`` map is replaced by two smaller
    linear layers joined through a bottleneck of width ``rank``:

    * ``VT``: ``in_channel -> rank`` (no bias)
    * ``U``:  ``rank -> out_channel`` (carries the bias, if any)

    where ``rank = max(1, round(min(in_channel, out_channel) * sample_ratio))``.

    Args:
        in_channel: Size of the last dimension of the input.
        out_channel: Size of the last dimension of the output.
        bias: Whether ``U`` has a bias term.
        fused: If True, ``forward`` first multiplies the two factor matrices
            into one ``(out_channel, in_channel)`` weight and applies a single
            ``F.linear``; otherwise the two layers are applied in sequence.
        sample_ratio: Fraction of ``min(in_channel, out_channel)`` kept as the
            bottleneck width. Must be positive.

    Raises:
        ValueError: If ``sample_ratio`` is not positive.
    """

    def __init__(self, in_channel, out_channel, bias=True, fused=False, sample_ratio=1.0):
        super().__init__()
        if sample_ratio <= 0:
            raise ValueError(f"sample_ratio must be positive, got {sample_ratio}")
        self.bias = bias
        self.fused = fused
        self.sample_ratio = sample_ratio
        self.num_components = min(in_channel, out_channel)
        # Compute the bottleneck width once so both factors are guaranteed to
        # agree, and never let it round down to 0: a zero-width bottleneck
        # would make the layer output nothing but the bias.
        self.rank = max(1, int(round(self.num_components * sample_ratio)))
        self.VT = nn.Linear(in_channel, self.rank, bias=False)
        self.U = nn.Linear(self.rank, out_channel, bias=bias)

    def forward(self, x):
        """Apply the factorised map to the last dimension of ``x``."""
        if self.fused:
            # (out, rank) @ (rank, in) -> (out, in) dense weight.
            weight = self.U.weight @ self.VT.weight
            # ``self.U.bias`` is None when the layer was built without bias,
            # which F.linear accepts directly.
            return F.linear(x, weight, self.U.bias)
        return self.U(self.VT(x))


In [None]:
class FeedForward(nn.Module):
    """MLP block built from two fused low-rank linear layers.

    Layer order inside ``self.net``: low-rank linear (``dim -> hidden_dim``),
    GELU, dropout, low-rank linear (``hidden_dim -> dim``), dropout.

    Args:
        dim: Input and output feature size.
        hidden_dim: Feature size between the two linear layers.
        dropout: Probability used by both dropout layers.
        ratio: ``sample_ratio`` forwarded to both ``LRLinearSuper`` layers.
    """

    def __init__(self, dim, hidden_dim, dropout=0., ratio=0.5):
        super().__init__()
        expand = LRLinearSuper(dim, hidden_dim, fused=True, sample_ratio=ratio)
        activation = nn.GELU()
        hidden_drop = nn.Dropout(dropout)
        project = LRLinearSuper(hidden_dim, dim, fused=True, sample_ratio=ratio)
        output_drop = nn.Dropout(dropout)
        # Positional order is kept so the sub-module indices (net.0 ... net.4)
        # stay stable.
        self.net = nn.Sequential(expand, activation, hidden_drop, project, output_drop)

    def forward(self, x):
        """Run ``x`` through the sequential stack."""
        return self.net(x)

In [None]:
class Attention(nn.Module):
    """Multi-head scaled dot-product self-attention with low-rank projections.

    Args:
        dim: Feature size of the input and of the output.
        heads: Number of attention heads.
        dim_head: Feature size per head; the joint width is ``heads * dim_head``.
        dropout: Dropout probability applied after the output projection.
        ratio: ``sample_ratio`` forwarded to the ``LRLinearSuper`` projections.
    """

    def __init__(self, dim, heads=8, dim_head=64, dropout=0., ratio=0.5):
        super().__init__()
        inner_dim = heads * dim_head
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.attend = nn.Softmax(dim=-1)
        # A single projection produces queries, keys and values side by side.
        self.to_qkv = LRLinearSuper(dim, 3 * inner_dim, fused=True, sample_ratio=ratio)
        out_proj = LRLinearSuper(inner_dim, dim, fused=True, sample_ratio=ratio)
        self.to_out = nn.Sequential(out_proj, nn.Dropout(dropout))

    def forward(self, x):
        """Attend over the second-to-last axis of ``x`` and project back to ``dim``."""
        # Split the joint projection into three equal chunks, then move the
        # head axis in front of the token axis: (b, n, h*d) -> (b, h, n, d).
        q, k, v = (
            rearrange(part, 'b n (h d) -> b h n d', h=self.heads)
            for part in self.to_qkv(x).chunk(3, dim=-1)
        )
        scores = (q @ k.transpose(-1, -2)) * self.scale
        weights = self.attend(scores)
        mixed = weights @ v
        # Merge the heads back into one feature axis.
        merged = rearrange(mixed, 'b h n d -> b n (h d)')
        return self.to_out(merged)

In [None]:
class Transformer(nn.Module):
    """Stack of ``depth`` pre-norm blocks, each attention followed by feed-forward.

    Args:
        dim: Feature size of the token embeddings.
        depth: Number of (attention, feed-forward) pairs.
        heads: Number of attention heads per block.
        dim_head: Feature size per attention head.
        mlp_dim: Hidden size of each feed-forward block.
        dropout: Dropout probability forwarded to both sub-blocks.
    """

    def __init__(self, dim, depth, heads, dim_head, mlp_dim, dropout=0.):
        super().__init__()
        blocks = []
        for _ in range(depth):
            attention = PreNorm(dim, Attention(dim, heads=heads, dim_head=dim_head, dropout=dropout))
            feed_forward = PreNorm(dim, FeedForward(dim, mlp_dim, dropout=dropout))
            blocks.append(nn.ModuleList([attention, feed_forward]))
        self.layers = nn.ModuleList(blocks)

    def forward(self, x):
        """Apply every block in order, with a residual connection around each sub-block."""
        for attention, feed_forward in self.layers:
            residual = x
            x = attention(x) + residual
            residual = x
            x = feed_forward(x) + residual
        return x

In [None]:
class ViT(nn.Module):
    """Vision Transformer classifier.

    The image is cut into non-overlapping patches, each patch is flattened and
    linearly embedded, a learnable class token is prepended, learned positional
    embeddings are added, and the sequence is run through ``Transformer``. The
    pooled representation is fed to a LayerNorm + Linear classification head.

    Args:
        image_size: Image size as an int or a ``(height, width)`` tuple.
        patch_size: Patch size as an int or a ``(height, width)`` tuple; must
            divide the image size exactly.
        num_classes: Number of output logits.
        dim: Token embedding size.
        depth: Number of transformer blocks.
        heads: Number of attention heads.
        mlp_dim: Hidden size of the feed-forward blocks.
        pool: ``'cls'`` to use the class token, ``'mean'`` to average over all
            tokens (the class token included).
        channels: Number of image channels.
        dim_head: Feature size per attention head.
        dropout: Dropout probability inside the transformer.
        emb_dropout: Dropout probability applied to the embedded sequence.

    Raises:
        AssertionError: If the image size is not divisible by the patch size,
            or if ``pool`` is neither ``'cls'`` nor ``'mean'``.
    """

    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, pool='cls', channels=3, dim_head=64, dropout=0., emb_dropout=0.):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'
        # Without this check an unknown pool value would skip pooling in
        # forward() and the head would silently emit one prediction per token.
        assert pool in {'cls', 'mean'}, 'pool type must be either cls (cls token) or mean (mean pooling)'

        num_patches = (image_height // patch_height) * (image_width // patch_width)
        patch_dim = channels * patch_height * patch_width
        self.pool = pool

        self.to_patch_embedding = nn.Sequential(
            # (b, c, H, W) -> (b, num_patches, patch_dim)
            Rearrange('b c (h p1) (w p2) -> b (h w) (p1 p2 c)', p1=patch_height, p2=patch_width),
            nn.Linear(patch_dim, dim),
        )

        # One positional slot per patch plus one for the class token.
        self.pos_embedding = nn.Parameter(torch.randn(1, num_patches + 1, dim))
        self.cls_token = nn.Parameter(torch.randn(1, 1, dim))
        self.dropout = nn.Dropout(emb_dropout)

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim, dropout)

        self.to_latent = nn.Identity()

        self.mlp_head = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, num_classes)
        )

    def forward(self, img):
        """Return class logits for a batch of images."""
        x = self.to_patch_embedding(img)
        b, n, _ = x.shape

        # Copy the single learned class token once per batch element and put
        # it at position 0 of every sequence.
        cls_tokens = repeat(self.cls_token, '() n d -> b n d', b=b)
        x = torch.cat((cls_tokens, x), dim=1)
        x += self.pos_embedding[:, :(n + 1)]
        x = self.dropout(x)

        x = self.transformer(x)

        if self.pool == 'mean':
            x = x.mean(dim=1)
        else:
            # pool == 'cls' (validated in __init__): keep only the class token.
            x = x[:, 0]

        x = self.to_latent(x)
        return self.mlp_head(x)


In [None]:
# Per-channel mean / std handed to Normalize for both splits
# (values commonly quoted for CIFAR-10).
CIFAR10_MEAN = [0.4914, 0.4822, 0.4465]
CIFAR10_STD = [0.2023, 0.1994, 0.2010]

# Training pipeline: random crop and flip are augmentation and must only be
# applied to the training split.
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD)
])

# Evaluation pipeline: deterministic, so test accuracy is measured on the
# unmodified images and is repeatable from run to run.
test_transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=CIFAR10_MEAN, std=CIFAR10_STD)
])

train_dataset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform)
test_dataset = datasets.CIFAR10(root='./data', train=False, download=True, transform=test_transform)

train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)


Files already downloaded and verified
Files already downloaded and verified


In [None]:
def compute_accuracy(outputs, labels):
    """Return the fraction of rows whose arg-max over dim 1 equals the label.

    Args:
        outputs: Score tensor; predictions are taken as the index of the
            maximum along dimension 1.
        labels: Tensor of target class indices, one per row of ``outputs``.

    Returns:
        A Python float: number of matching predictions divided by
        ``labels.size(0)``.
    """
    predicted = torch.max(outputs, 1).indices
    num_correct = (predicted == labels).float().sum().item()
    num_samples = labels.size(0)
    return num_correct / num_samples

In [None]:
# Single source of truth for the run length: it drives both the epoch loop and
# the cosine schedule, so the learning rate anneals over exactly the epochs
# that are actually trained.
NUM_EPOCHS = 10
SEED = 42

# Seed before the model is built so weight initialisation and data shuffling
# start from a known state.
torch.manual_seed(SEED)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViT(
    image_size=32,
    patch_size=4,
    num_classes=10,
    dim=512,
    depth=6,
    heads=8,
    mlp_dim=512,
    dim_head=64,
    dropout=0.1,
    emb_dropout=0.1
).to(device)

criterion = nn.CrossEntropyLoss()
# NOTE(review): the run recorded with this notebook plateaus near 22% training
# accuracy; lr=0.003 may be too high for Adam on this model -- worth confirming
# with a smaller value.
optimizer = torch.optim.Adam(model.parameters(), lr=0.003)
scheduler = CosineAnnealingLR(optimizer, T_max=NUM_EPOCHS)

# Training loop with accuracy, gradient clipping, and LR scheduler
for epoch in range(NUM_EPOCHS):
    model.train()
    train_loss = 0.0
    train_correct = 0
    train_seen = 0
    for imgs, labels in train_loader:
        imgs, labels = imgs.to(device), labels.to(device)

        optimizer.zero_grad()
        outputs = model(imgs)
        loss = criterion(outputs, labels)
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        optimizer.step()

        # Accumulate per-sample totals: the last batch can be smaller than the
        # others, so averaging per-batch means would over-weight it.
        batch_size = labels.size(0)
        train_loss += loss.item() * batch_size
        train_correct += (outputs.argmax(1) == labels).sum().item()
        train_seen += batch_size

    # The scheduler is stepped once per epoch, matching T_max=NUM_EPOCHS.
    scheduler.step()

    avg_train_loss = train_loss / train_seen
    avg_train_accuracy = train_correct / train_seen
    print(f'Epoch {epoch+1}, Loss: {avg_train_loss:.4f}, Accuracy: {avg_train_accuracy:.4f}')

# Testing loop for accuracy after training
model.eval()
test_correct = 0
test_seen = 0
with torch.no_grad():
    for imgs, labels in test_loader:
        imgs, labels = imgs.to(device), labels.to(device)
        outputs = model(imgs)
        test_correct += (outputs.argmax(1) == labels).sum().item()
        test_seen += labels.size(0)

avg_test_accuracy = test_correct / test_seen
print(f'Test Accuracy: {avg_test_accuracy:.4f}')

Epoch 1, Loss: 2.2885, Accuracy: 0.1385
Epoch 2, Loss: 2.1111, Accuracy: 0.1931
Epoch 3, Loss: 2.0879, Accuracy: 0.1971
Epoch 4, Loss: 2.1000, Accuracy: 0.1940
Epoch 5, Loss: 2.1084, Accuracy: 0.1899
Epoch 6, Loss: 2.0698, Accuracy: 0.2036
Epoch 7, Loss: 2.0625, Accuracy: 0.2110
Epoch 8, Loss: 2.0526, Accuracy: 0.2109
Epoch 9, Loss: 2.0328, Accuracy: 0.2193
Epoch 10, Loss: 2.0264, Accuracy: 0.2238


KeyboardInterrupt: 