In [1]:
!pip install einops
!pip install utils



In [None]:
!nvcc --version


nvcc: NVIDIA (R) Cuda compiler driver
Copyright (c) 2005-2022 NVIDIA Corporation
Built on Wed_Sep_21_10:33:58_PDT_2022
Cuda compilation tools, release 11.8, V11.8.89
Build cuda_11.8.r11.8/compiler.31833905_0


In [2]:
from google.colab import drive
drive.mount('/content/drive')

import os
os.chdir('/content/drive/My Drive/TTIC DL Final Project')

%run SimCLR_Functions.ipynb

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


  set_matplotlib_formats('svg', 'pdf') # For export
INFO:lightning_fabric.utilities.seed:Seed set to 42


Device: cuda:0
Number of workers: 2


<Figure size 640x480 with 0 Axes>

INFO:pytorch_lightning.utilities.rank_zero:GPU available: True (cuda), used: True
INFO:pytorch_lightning.utilities.rank_zero:TPU available: False, using: 0 TPU cores
INFO:pytorch_lightning.utilities.rank_zero:IPU available: False, using: 0 IPUs
INFO:pytorch_lightning.utilities.rank_zero:HPU available: False, using: 0 HPUs


Found pretrained model at ../saved_models/tutorial17/SimCLR.ckpt, loading...


INFO:pytorch_lightning.utilities.migration.utils:Lightning automatically upgraded your loaded checkpoint from v1.3.4 to v2.1.2. To apply the upgrade to your files permanently, run `python -m pytorch_lightning.utilities.upgrade_checkpoint ../saved_models/tutorial17/SimCLR.ckpt`


In [3]:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms, datasets
from einops import rearrange
from einops.layers.torch import Rearrange
from einops import rearrange
from einops.layers.torch import Rearrange

# helpers

def pair(t):
    return t if isinstance(t, tuple) else (t, t)

def posemb_sincos_2d(h, w, dim, temperature: int = 10000, dtype = torch.float32):
    y, x = torch.meshgrid(torch.arange(h), torch.arange(w), indexing="ij")
    assert (dim % 4) == 0, "feature dimension must be multiple of 4 for sincos emb"
    omega = torch.arange(dim // 4) / (dim // 4 - 1)
    omega = 1.0 / (temperature ** omega)

    y = y.flatten()[:, None] * omega[None, :]
    x = x.flatten()[:, None] * omega[None, :]
    pe = torch.cat((x.sin(), x.cos(), y.sin(), y.cos()), dim=1)
    return pe.type(dtype)

# classes

class FeedForward(nn.Module):
    def __init__(self, dim, hidden_dim):
        super().__init__()
        self.net = nn.Sequential(
            nn.LayerNorm(dim),
            nn.Linear(dim, hidden_dim),
            nn.GELU(),
            nn.Linear(hidden_dim, dim),
        )
    def forward(self, x):
        return self.net(x)

class Attention(nn.Module):
    def __init__(self, dim, heads = 8, dim_head = 64):
        super().__init__()
        inner_dim = dim_head *  heads
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.norm = nn.LayerNorm(dim)

        self.attend = nn.Softmax(dim = -1)

        self.to_qkv = nn.Linear(dim, inner_dim * 3, bias = False)
        self.to_out = nn.Linear(inner_dim, dim, bias = False)

    def forward(self, x):
        x = self.norm(x)

        qkv = self.to_qkv(x).chunk(3, dim = -1)
        q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = self.heads), qkv)

        dots = torch.matmul(q, k.transpose(-1, -2)) * self.scale

        attn = self.attend(dots)

        out = torch.matmul(attn, v)
        out = rearrange(out, 'b h n d -> b n (h d)')
        return self.to_out(out)

class Transformer(nn.Module):
    def __init__(self, dim, depth, heads, dim_head, mlp_dim):
        super().__init__()
        self.norm = nn.LayerNorm(dim)
        self.layers = nn.ModuleList([])
        for _ in range(depth):
            self.layers.append(nn.ModuleList([
                Attention(dim, heads = heads, dim_head = dim_head),
                FeedForward(dim, mlp_dim)
            ]))
    def forward(self, x):
        for attn, ff in self.layers:
            x = attn(x) + x
            x = ff(x) + x
        return self.norm(x)

class SimCLRViT(nn.Module):
  # Added simclr_encoder
    def __init__(self, simclr_model, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)
        self.simclr_model = simclr_model
        self.num_patches = (image_size // patch_size) ** 2

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        patch_dim = channels * patch_height * patch_width

        self.patching = nn.Sequential(
            nn.Linear(128//self.num_patches, 256)
        )

        self.pos_embedding = posemb_sincos_2d(
            h = image_height // patch_height,
            w = image_width // patch_width,
            dim = dim,
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.pool = "mean"
        self.to_latent = nn.Identity()

        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        device = img.device

        # Apply simclr encoder
        x = self.simclr_model.convnet(img).view(-1, 128, 1)
        b, _, _ = x.shape

        # Segment the encoding as a "patching" operation
        x = x.view(b, self.num_patches, -1)
        x = self.patching(x)

        x = x.view(-1, self.num_patches, 256)
        x += self.pos_embedding.to(device, dtype=x.dtype)

        x = self.transformer(x)
        x = x.mean(dim = 1)

        x = self.to_latent(x)
        return self.linear_head(x)

In [None]:
class SimpleViTDrop(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)
        self.num_patches = (image_size // patch_size) ** 2

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        patch_dim = channels * patch_height * patch_width

        self.to_patch_embedding = nn.Sequential(
            Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        self.pos_embedding = posemb_sincos_2d(
            h = image_height // patch_height,
            w = image_width // patch_width,
            dim = dim,
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.pool = "mean"
        self.to_latent = nn.Identity()

        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        device = img.device
        # img is [b, 3, 32, 32]
        x = self.to_patch_embedding(img) # Now [b, num_patches, 256]

        perm = torch.randperm(self.num_patches)
        # Select num_patches // 2 patches for each image in the batch
        x = x[:, perm[:self.num_patches // 2], :]

        y = self.pos_embedding.to(device, dtype=x.dtype)[perm[:self.num_patches // 2], :]
        x += y
        x = self.transformer(x)

        x = x.mean(dim = 1)

        x = self.to_latent(x)
        return self.linear_head(x)

In [7]:
class SimpleViT(nn.Module):
    def __init__(self, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)
        self.num_patches = (image_size // patch_size) ** 2

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        patch_dim = channels * patch_height * patch_width

        self.to_patch_embedding = nn.Sequential(
            Rearrange("b c (h p1) (w p2) -> b (h w) (p1 p2 c)", p1 = patch_height, p2 = patch_width),
            nn.LayerNorm(patch_dim),
            nn.Linear(patch_dim, dim),
            nn.LayerNorm(dim),
        )

        self.pos_embedding = posemb_sincos_2d(
            h = image_height // patch_height,
            w = image_width // patch_width,
            dim = dim,
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.pool = "mean"
        self.to_latent = nn.Identity()

        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        device = img.device
        # img is [b, 3, 32, 32]
        x = self.to_patch_embedding(img) # Now [b, num_patches, 256]

        y = self.pos_embedding.to(device, dtype=x.dtype)
        x += y
        x = self.transformer(x)

        x = x.mean(dim = 1)

        x = self.to_latent(x)
        return self.linear_head(x)

In [None]:
class SimCLRViT2(nn.Module):
    def __init__(self, simclr_model, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)

        self.patch_size = patch_size
        self.channels = channels
        self.simclr_model = simclr_model

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        patch_dim = channels * patch_height * patch_width

        self.to_patches = nn.Sequential(
            Rearrange("b c (h p1) (w p2) -> b (h w) c p1 p2", p1 = patch_height, p2 = patch_width),
            # h x w == number of total patches, c == channels, b == number of batches
            nn.LayerNorm(normalized_shape=(self.channels, self.patch_size, self.patch_size), elementwise_affine=True)
            )

        self.increase_dim = nn.Sequential(
            nn.Linear(in_features=128, out_features=256),
            nn.LayerNorm(256), # added
        )

        self.pos_embedding = posemb_sincos_2d(
            h = image_height // patch_height,
            w = image_width // patch_width,
            dim = dim,
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.pool = "mean"
        self.to_latent = nn.Identity()

        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        device = img.device # img has shape [b, 3, 32, 32], b = 64
        patches = self.to_patches(img) # .view(-1, 16, 3, 8, 8)  # Shape: [batch_size, num_patches, patch_dim]
        # print(f"patches after Rearrange has shape {patches.shape}")

        # Apply simclr_model.convnet() to each patch
        processed_patches = []
        for patch in patches.unbind(dim=1):
            processed_patch = self.simclr_model.convnet(patch)
            processed_patches.append(processed_patch)

        processed_patches = torch.stack(processed_patches, dim=1) # Shape: [batch_size, num_patches, processed_patch_dim]
        processed_patches = self.increase_dim(processed_patches)
        processed_patches += self.pos_embedding.to(device, dtype=processed_patches.dtype)

        x = self.transformer(processed_patches)
        x = x.mean(dim=1)
        x = self.to_latent(x)
        return self.linear_head(x)


In [None]:
class SimCLRViT2Drop(nn.Module):
    def __init__(self, simclr_model, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)
        self.num_patches = (image_size // patch_size) ** 2

        self.patch_size = patch_size
        self.channels = channels
        self.simclr_model = simclr_model

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        patch_dim = channels * patch_height * patch_width

        self.to_patches = nn.Sequential(
            Rearrange("b c (h p1) (w p2) -> b (h w) c p1 p2", p1 = patch_height, p2 = patch_width),
            # h x w == number of total patches, c == channels, b == number of batches
            nn.LayerNorm(normalized_shape=(self.channels, self.patch_size, self.patch_size), elementwise_affine=True)
            )

        self.increase_dim = nn.Sequential(
            nn.Linear(in_features=128, out_features=256),
            nn.LayerNorm(256), # added
        )

        self.pos_embedding = posemb_sincos_2d(
            h = image_height // patch_height,
            w = image_width // patch_width,
            dim = dim,
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.pool = "mean"
        self.to_latent = nn.Identity()

        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        device = img.device # img has shape [b, 3, 32, 32], b = 64
        patches = self.to_patches(img) # .view(-1, 16, 3, 8, 8)  # Shape: [batch_size, num_patches, patch_dim]
        # print(f"patches after Rearrange has shape {patches.shape}")

        # Apply simclr_model.convnet() to each patch
        processed_patches = []
        for patch in patches.unbind(dim=1):
            processed_patch = self.simclr_model.convnet(patch)
            processed_patches.append(processed_patch)

        processed_patches = torch.stack(processed_patches, dim=1) # Shape: [batch_size, num_patches, processed_patch_dim]
        processed_patches = self.increase_dim(processed_patches)
        processed_patches += self.pos_embedding.to(device, dtype=processed_patches.dtype)
        perm = torch.randperm(self.num_patches)

        # Select num_patches // 2 patches for each image in the batch
        selected_patches = processed_patches[:, perm[:self.num_patches // 4], :]

        x = self.transformer(processed_patches)
        x = x.mean(dim=1)
        x = self.to_latent(x)
        return self.linear_head(x)


In [None]:
class SimCLRViTDrop(nn.Module):
  # Added simclr_encoder
    def __init__(self, simclr_model, *, image_size, patch_size, num_classes, dim, depth, heads, mlp_dim, channels = 3, dim_head = 64):
        super().__init__()
        image_height, image_width = pair(image_size)
        patch_height, patch_width = pair(patch_size)
        self.simclr_model = simclr_model
        self.num_patches = (image_size // patch_size) ** 2

        assert image_height % patch_height == 0 and image_width % patch_width == 0, 'Image dimensions must be divisible by the patch size.'

        patch_dim = channels * patch_height * patch_width

        self.patching = nn.Sequential(
            nn.Linear(128//self.num_patches, 256)
        )

        self.pos_embedding = posemb_sincos_2d(
            h = image_height // patch_height,
            w = image_width // patch_width,
            dim = dim,
        )

        self.transformer = Transformer(dim, depth, heads, dim_head, mlp_dim)

        self.pool = "mean"
        self.to_latent = nn.Identity()

        self.linear_head = nn.Linear(dim, num_classes)

    def forward(self, img):
        device = img.device

        # Apply simclr encoder
        x = self.simclr_model.convnet(img).view(-1, 128, 1)
        b, _, _ = x.shape

        # Segment the encoding as a "patching" operation
        x = x.view(b, self.num_patches, -1)
        x = self.patching(x)

        x = x.view(-1, self.num_patches, 256)
        perm = torch.randperm(self.num_patches)
        x = x[:, perm[:self.num_patches // 4], :]
        # print(f"x post perm is {x.shape}")
        # Do not add position embedding
        x = self.transformer(x)
        x = x.mean(dim = 1)

        x = self.to_latent(x)
        return self.linear_head(x)

In [4]:
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

train_dataset = datasets.STL10(root='./data', split='train', download=True, transform=transform)
test_dataset = datasets.STL10(root='./data', split='test', download=True, transform=transform)

# Create data loaders
train_loader = DataLoader(train_dataset, batch_size=64, shuffle=True, num_workers=4)
test_loader = DataLoader(test_dataset, batch_size=64, shuffle=False, num_workers=4)

Files already downloaded and verified
Files already downloaded and verified




In [None]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define data transformations
transform = transforms.Compose([
    transforms.Resize((32, 32)),  # Resize to CIFAR-10 image size
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),  # Normalize for CIFAR-10
])

# Instantiate the SimCLRViT model
v = SimCLRViT(
    simclr_model=simclr_model,
    image_size=32,
    patch_size=4,  # Adjust patch size for smaller images
    num_classes=10,  # Number of STL10 classes
    dim=256,  # Adjust the dimensionality of the model
    depth=6,
    heads=8,
    mlp_dim=512
).to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(v.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    v.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = v(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Testing
v.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = v(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Epoch [1/10], Loss: 1.4333
Epoch [2/10], Loss: 1.6067
Epoch [3/10], Loss: 0.9650
Epoch [4/10], Loss: 2.4216
Epoch [5/10], Loss: 1.3538
Epoch [6/10], Loss: 0.3398
Epoch [7/10], Loss: 0.5787
Epoch [8/10], Loss: 1.5520
Epoch [9/10], Loss: 1.2613
Epoch [10/10], Loss: 1.7118
Test Accuracy: 59.15%


In [13]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define data transformations
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Instantiate the SimpleViT model
v = SimpleViT(
    image_size= 32, # 32, 256
    patch_size=16, # 4, 32  # Adjust patch size for smaller images
    num_classes=10,  # 1000 Number of STL10 classes
    dim=256,# 256,  # 256, 1024 Adjust the dimensionality of the model
    depth=6, #6
    heads=8, # 8,16
    mlp_dim=512 # 512,2048
).to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(v.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    v.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = v(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Testing
v.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = v(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Epoch [1/10], Loss: 1.6393
Epoch [2/10], Loss: 1.9787
Epoch [3/10], Loss: 2.2211
Epoch [4/10], Loss: 2.6683
Epoch [5/10], Loss: 1.8880
Epoch [6/10], Loss: 1.5890
Epoch [7/10], Loss: 1.5190
Epoch [8/10], Loss: 1.1467
Epoch [9/10], Loss: 1.4744
Epoch [10/10], Loss: 2.1445
Test Accuracy: 35.59%


In [None]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define data transformations
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Instantiate the SimCLRViT2 model
v = SimCLRViT2(
    simclr_model=simclr_model,
    image_size= 32, # 32, 256
    patch_size=32, # 4, 16  # Adjust patch size for smaller images
    num_classes=10,  # 1000 Number of STL10 classes
    dim=256,# 256,  # 256, 1024 Adjust the dimensionality of the model
    depth=6, #6
    heads=8, # 8,16
    mlp_dim=512 # 512,2048
).to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(v.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    v.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = v(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Testing
v.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = v(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


patch_height = 32, patch_width = 32
img_height = 32, img_width = 32




Epoch [1/10], Loss: 1.7429
Epoch [2/10], Loss: 1.2007
Epoch [3/10], Loss: 0.8865
Epoch [4/10], Loss: 1.8348
Epoch [5/10], Loss: 1.7632
Epoch [6/10], Loss: 0.8601
Epoch [7/10], Loss: 1.0887
Epoch [8/10], Loss: 1.3899
Epoch [9/10], Loss: 2.0633
Epoch [10/10], Loss: 2.5338
Test Accuracy: 58.93%


In [None]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define data transformations
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Instantiate the SimCLRViT2 model
v = SimCLRViT2Drop(
    simclr_model=simclr_model,
    image_size= 32, # 32, 256
    patch_size=16, # 4, 16  # Adjust patch size for smaller images
    num_classes=10,  # 1000 Number of STL10 classes
    dim=256,# 256,  # 256, 1024 Adjust the dimensionality of the model
    depth=6, #6
    heads=8, # 8,16
    mlp_dim=512 # 512,2048
).to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(v.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    v.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = v(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Testing
v.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = v(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Epoch [1/10], Loss: 3.3061
Epoch [2/10], Loss: 0.9389
Epoch [3/10], Loss: 2.9403
Epoch [4/10], Loss: 0.5847
Epoch [5/10], Loss: 1.4566
Epoch [6/10], Loss: 1.8415
Epoch [7/10], Loss: 2.0606
Epoch [8/10], Loss: 0.8756
Epoch [9/10], Loss: 2.4057
Epoch [10/10], Loss: 2.5448
Test Accuracy: 47.91%


In [None]:
# Set device
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Define data transformations
transform = transforms.Compose([
    transforms.Resize((32, 32)),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
])

# Instantiate the SimCLRViT2 model
v = SimCLRViTDrop(
    simclr_model=simclr_model,
    image_size= 32, # 32, 256
    patch_size=16, # 4, 16  # Adjust patch size for smaller images
    num_classes=10,  # 1000 Number of STL10 classes
    dim=256,# 256,  # 256, 1024 Adjust the dimensionality of the model
    depth=6, #6
    heads=8, # 8,16
    mlp_dim=512 # 512,2048
).to(device)

# Define optimizer and loss function
optimizer = torch.optim.Adam(v.parameters(), lr=3e-4)
criterion = nn.CrossEntropyLoss()

# Training loop
num_epochs = 10

for epoch in range(num_epochs):
    v.train()
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = v(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
    print(f'Epoch [{epoch + 1}/{num_epochs}], Loss: {loss.item():.4f}')

# Testing
v.eval()
correct = 0
total = 0

with torch.no_grad():
    for images, labels in test_loader:
        images, labels = images.to(device), labels.to(device)
        outputs = v(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

accuracy = correct / total
print(f'Test Accuracy: {accuracy * 100:.2f}%')


Epoch [1/10], Loss: 0.3675
Epoch [2/10], Loss: 1.0045
Epoch [3/10], Loss: 0.1432
Epoch [4/10], Loss: 3.6719
Epoch [5/10], Loss: 0.6380
Epoch [6/10], Loss: 1.6379
Epoch [7/10], Loss: 2.0503
Epoch [8/10], Loss: 0.1290
Epoch [9/10], Loss: 0.2755
Epoch [10/10], Loss: 0.8163
Test Accuracy: 61.08%
