In [1]:
import torch
torch.cuda.is_available(), torch.version.cuda, torch.cuda.get_device_name(0)


(True, '12.1', 'NVIDIA GeForce RTX 3060 Laptop GPU')

In [3]:
!pip install timm --quiet


In [2]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, random_split
from torchvision import datasets, transforms
import timm
from tqdm import tqdm
import os
from pathlib import Path
from PIL import Image


  from .autonotebook import tqdm as notebook_tqdm


In [3]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

DATA_DIR = "C:/Users/asus/Desktop/PV dataset/SPHERE DATASET/SPHERE DATASET/SPHERE - LABELLED DATASET/CNN_Data_Arranged/Original"  # <-- change to your dataset path
IMAGE_SIZE = 518
BATCH_SIZE = 2       # fits in 6GB VRAM
VAL_SPLIT = 0.2
NUM_WORKERS = 2


Device: cuda


In [4]:
train_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.ColorJitter(0.1,0.1,0.1),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225]),
])

val_transform = transforms.Compose([
    transforms.Resize((IMAGE_SIZE, IMAGE_SIZE)),
    transforms.ToTensor(),
    transforms.Normalize([0.485,0.456,0.406], [0.229,0.224,0.225]),
])

dataset = datasets.ImageFolder(DATA_DIR, transform=train_transform)

train_size = int(len(dataset)*(1-VAL_SPLIT))
val_size = len(dataset) - train_size
train_dataset, val_dataset = random_split(dataset, [train_size, val_size])

# Set validation transform
val_dataset.dataset.transform = val_transform

train_loader = DataLoader(train_dataset, batch_size=BATCH_SIZE, shuffle=True, num_workers=NUM_WORKERS)
val_loader = DataLoader(val_dataset, batch_size=BATCH_SIZE, shuffle=False, num_workers=NUM_WORKERS)

class_names = dataset.classes
print("Classes:", class_names)


Classes: ['broken', 'clean', 'dirty']


In [5]:
import timm
import torch

device = "cuda" if torch.cuda.is_available() else "cpu"

model = timm.create_model(
    "vit_base_patch14_dinov2.lvd142m",  # exactly as listed
    pretrained=True,
    num_classes=len(class_names)
).to(device)
print(model)

VisionTransformer(
  (patch_embed): PatchEmbed(
    (proj): Conv2d(3, 768, kernel_size=(14, 14), stride=(14, 14))
    (norm): Identity()
  )
  (pos_drop): Dropout(p=0.0, inplace=False)
  (patch_drop): Identity()
  (norm_pre): Identity()
  (blocks): Sequential(
    (0): Block(
      (norm1): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (attn): Attention(
        (qkv): Linear(in_features=768, out_features=2304, bias=True)
        (q_norm): Identity()
        (k_norm): Identity()
        (attn_drop): Dropout(p=0.0, inplace=False)
        (norm): Identity()
        (proj): Linear(in_features=768, out_features=768, bias=True)
        (proj_drop): Dropout(p=0.0, inplace=False)
      )
      (ls1): LayerScale()
      (drop_path1): Identity()
      (norm2): LayerNorm((768,), eps=1e-06, elementwise_affine=True)
      (mlp): Mlp(
        (fc1): Linear(in_features=768, out_features=3072, bias=True)
        (act): GELU(approximate='none')
        (drop1): Dropout(p=0.0, inplace=Fal

In [6]:
EPOCHS=10
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.AdamW(model.parameters(), lr=3e-5, weight_decay=0.01)
scheduler = torch.optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=EPOCHS)


In [7]:
def validate(model, loader):
    model.eval()
    correct = 0
    total = 0
    loss_sum = 0

    with torch.no_grad():
        for x, y in loader:
            x, y = x.to(device), y.to(device)
            outputs = model(x)
            loss = criterion(outputs, y)
            loss_sum += loss.item() * x.size(0)
            preds = outputs.argmax(1)
            correct += (preds == y).sum().item()
            total += y.size(0)

    return loss_sum / total, correct / total


In [8]:
torch.cuda.empty_cache()


In [9]:
best_acc = 0

for epoch in range(1, EPOCHS+1):
    model.train()
    loop = tqdm(train_loader, total=len(train_loader), desc=f"Epoch {epoch}")

    for x, y in loop:
        x, y = x.to(device), y.to(device)
        optimizer.zero_grad()
        outputs = model(x)
        loss = criterion(outputs, y)
        loss.backward()
        optimizer.step()
        loop.set_postfix(loss=loss.item())

    val_loss, val_acc = validate(model, val_loader)
    scheduler.step()

    print(f"Val Loss: {val_loss:.4f} | Val Acc: {val_acc:.4f}")

    if val_acc > best_acc:
        best_acc = val_acc
        torch.save(model.state_dict(), "best_vit_dinov2_b14.pth")
        print("Saved new best model.")


Epoch 1: 100%|█████████████████████████████████████████████████████████| 2432/2432 [20:46<00:00,  1.95it/s, loss=0.475]


Val Loss: 0.5839 | Val Acc: 0.7344
Saved new best model.


Epoch 2: 100%|█████████████████████████████████████████████████████████| 2432/2432 [21:27<00:00,  1.89it/s, loss=0.344]


Val Loss: 0.5740 | Val Acc: 0.7294


Epoch 3: 100%|██████████████████████████████████████████████████████████| 2432/2432 [20:15<00:00,  2.00it/s, loss=1.18]


Val Loss: 0.5667 | Val Acc: 0.7558
Saved new best model.


Epoch 4: 100%|█████████████████████████████████████████████████████████| 2432/2432 [19:53<00:00,  2.04it/s, loss=0.623]


Val Loss: 0.5088 | Val Acc: 0.7804
Saved new best model.


Epoch 5: 100%|██████████████████████████████████████████████████████████| 2432/2432 [19:51<00:00,  2.04it/s, loss=0.18]


Val Loss: 0.5209 | Val Acc: 0.7730


Epoch 6: 100%|████████████████████████████████████████████████████████| 2432/2432 [19:51<00:00,  2.04it/s, loss=0.0535]


Val Loss: 0.4283 | Val Acc: 0.8314
Saved new best model.


Epoch 7: 100%|██████████████████████████████████████████████████████████| 2432/2432 [19:51<00:00,  2.04it/s, loss=2.63]


Val Loss: 0.3530 | Val Acc: 0.8503
Saved new best model.


Epoch 8: 100%|████████████████████████████████████████████████████████| 2432/2432 [19:53<00:00,  2.04it/s, loss=0.0207]


Val Loss: 0.3755 | Val Acc: 0.8470


Epoch 9: 100%|████████████████████████████████████████████████████████| 2432/2432 [19:55<00:00,  2.03it/s, loss=0.0139]


Val Loss: 0.3562 | Val Acc: 0.8602
Saved new best model.


Epoch 10: 100%|██████████████████████████████████████████████████████| 2432/2432 [20:20<00:00,  1.99it/s, loss=0.00223]


Val Loss: 0.3701 | Val Acc: 0.8635
Saved new best model.


In [11]:
def predict(img_path):
    img = Image.open(img_path).convert("RGB")
    t = val_transform(img).unsqueeze(0).to(device)
    logits = model(t)
    pred = logits.argmax(1).item()
    return class_names[pred]

# Example usage:
print(predict("C:/Users/asus/Desktop/PV dataset/SPHERE DATASET/SPHERE DATASET/SPHERE - LABELLED DATASET/CNN_Data_Arranged/Original/clean/0b28c6ad-ADP110500117_2_Clean_99_1631.png"))


clean
