In [1]:
import os
import torch
import torchvision.transforms as transforms
from torchvision import datasets, models
from torch.utils.data import DataLoader, random_split, Dataset
import torch.nn as nn
import torch.optim as optim
from torch.optim.lr_scheduler import OneCycleLR
from sklearn.metrics import f1_score
import pandas as pd
from PIL import Image

In [2]:
# Load Pretrained Vision Transformer (ViT)
class ViTClassifier(nn.Module):
    def __init__(self, num_classes=10):
        super(ViTClassifier, self).__init__()
        self.model = models.vit_b_16(pretrained=True)
        in_features = self.model.heads.head.in_features
        self.model.heads.head = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(in_features, 512),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(512, num_classes)
        )

    def forward(self, x):
        return self.model(x)

In [3]:
from torchvision import transforms, datasets
from torch.utils.data import DataLoader, random_split

# Define transformations for training images (data augmentation included)
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(224, scale=(0.9, 1.0)),  # Random crop with slight scaling
    transforms.RandomHorizontalFlip(),  # Flip images horizontally with a probability of 0.5
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet stats
])

# Define transformations for validation images (no augmentation, just resizing and normalization)
val_transforms = transforms.Compose([
    transforms.Resize(256),  # Resize to 256 pixels on the shorter side
    transforms.CenterCrop(224),  # Crop the center to 224x224
    transforms.ToTensor(),  # Convert image to tensor
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])  # Normalize using ImageNet stats
])

# Define the directory containing training images
train_data_dir = '/kaggle/input/deep-learning-practice-week-9-image-c-lassifica/train'

# Load the dataset using ImageFolder (applies train_transforms initially)
full_dataset = datasets.ImageFolder(root=train_data_dir, transform=train_transforms)

# Split dataset into training (90%) and validation (10%)
train_size = int(0.9 * len(full_dataset))
val_size = len(full_dataset) - train_size
train_dataset, val_dataset = random_split(full_dataset, [train_size, val_size])

# Override the validation dataset's transform (ensures correct preprocessing for validation)
val_dataset.dataset.transform = val_transforms  # Potential issue: This modifies the original dataset’s transform

# Define batch size for DataLoader
batch_size = 64

# Create DataLoader for training (shuffle enabled for randomness)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, num_workers=4)

# Create DataLoader for validation (shuffle disabled to maintain order)
val_loader = DataLoader(val_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [4]:
# Initializing the training process
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = ViTClassifier(num_classes=10).to(device)

criterion = nn.CrossEntropyLoss()
optimizer = optim.AdamW(model.parameters(), lr=3e-4, weight_decay=1e-4)
scheduler = OneCycleLR(optimizer, max_lr=3e-4, epochs=10, steps_per_epoch=len(train_loader))

Downloading: "https://download.pytorch.org/models/vit_b_16-c867db91.pth" to /root/.cache/torch/hub/checkpoints/vit_b_16-c867db91.pth
100%|██████████| 330M/330M [00:02<00:00, 150MB/s]  


In [5]:
# Training loop
num_epochs = 10
for epoch in range(num_epochs):
    model.train()
    running_loss = 0.0
    print(f"Starting Epoch {epoch+1}/{num_epochs}")
    
    for i, (images, labels) in enumerate(train_loader):
        images, labels = images.to(device), labels.to(device)
        optimizer.zero_grad()
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()
        running_loss += loss.item() * images.size(0)
        if (i+1) % 10 == 0:
            print(f"Epoch {epoch+1}, Step {i+1}/{len(train_loader)}")
    
    epoch_loss = running_loss / len(train_loader.dataset)
    scheduler.step()
    
    model.eval()
    running_val_loss = 0.0
    val_preds, val_labels = [], []
    with torch.no_grad():
        for images, labels in val_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_val_loss += loss.item() * images.size(0)
            _, preds = torch.max(outputs, 1)
            val_preds.extend(preds.cpu().numpy())
            val_labels.extend(labels.cpu().numpy())
    
    epoch_val_loss = running_val_loss / len(val_loader.dataset)
    f1 = f1_score(val_labels, val_preds, average='macro')
    print(f"Epoch {epoch+1} completed. Train Loss: {epoch_loss:.4f}, Val Loss: {epoch_val_loss:.4f}, Macro F1: {f1:.4f}\n")

Starting Epoch 1/10
Epoch 1, Step 10/141
Epoch 1, Step 20/141
Epoch 1, Step 30/141
Epoch 1, Step 40/141
Epoch 1, Step 50/141
Epoch 1, Step 60/141
Epoch 1, Step 70/141
Epoch 1, Step 80/141
Epoch 1, Step 90/141
Epoch 1, Step 100/141
Epoch 1, Step 110/141
Epoch 1, Step 120/141
Epoch 1, Step 130/141
Epoch 1, Step 140/141
Epoch 1 completed. Train Loss: 1.8572, Val Loss: 1.1658, Macro F1: 0.8005

Starting Epoch 2/10
Epoch 2, Step 10/141
Epoch 2, Step 20/141
Epoch 2, Step 30/141
Epoch 2, Step 40/141
Epoch 2, Step 50/141
Epoch 2, Step 60/141
Epoch 2, Step 70/141
Epoch 2, Step 80/141
Epoch 2, Step 90/141
Epoch 2, Step 100/141
Epoch 2, Step 110/141
Epoch 2, Step 120/141
Epoch 2, Step 130/141
Epoch 2, Step 140/141
Epoch 2 completed. Train Loss: 0.8421, Val Loss: 0.5965, Macro F1: 0.8503

Starting Epoch 3/10
Epoch 3, Step 10/141
Epoch 3, Step 20/141
Epoch 3, Step 30/141
Epoch 3, Step 40/141
Epoch 3, Step 50/141
Epoch 3, Step 60/141
Epoch 3, Step 70/141
Epoch 3, Step 80/141
Epoch 3, Step 90/141
Epo

In [6]:
# Test dataset and submission generation
class TestDataset(Dataset):
    def __init__(self, test_dir, transform=None):
        self.test_dir = test_dir
        self.transform = transform
        self.image_files = sorted([f for f in os.listdir(test_dir) if f.endswith('.jpg')])
    
    def __len__(self):
        return len(self.image_files)
    
    def __getitem__(self, idx):
        img_name = self.image_files[idx]
        img_path = os.path.join(self.test_dir, img_name)
        image = Image.open(img_path).convert('RGB')
        if self.transform:
            image = self.transform(image)
        image_id = os.path.splitext(img_name)[0]
        return image, image_id

test_data_dir = '/kaggle/input/deep-learning-practice-week-9-image-c-lassifica/test'
test_transforms = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(mean=[0.485, 0.456, 0.406], std=[0.229, 0.224, 0.225])
])
test_dataset = TestDataset(test_data_dir, transform=test_transforms)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=4)

In [7]:
# Inference
model.eval()
predictions = []
image_ids = []

with torch.no_grad():
    for images, ids in test_loader:
        images = images.to(device)
        outputs = model(images)
        _, preds = torch.max(outputs, 1)
        predictions.extend(preds.cpu().numpy())
        image_ids.extend(ids)

submission = pd.DataFrame({'Image_ID': image_ids, 'Label': predictions})
submission.to_csv('submission.csv', index=False)
print("Submission file created!")

Submission file created!


In [8]:
df = pd.read_csv('/kaggle/working/submission.csv')
df

Unnamed: 0,Image_ID,Label
0,Image_0001,0
1,Image_0002,7
2,Image_0003,5
3,Image_0004,1
4,Image_0005,4
...,...,...
1995,Image_1996,8
1996,Image_1997,8
1997,Image_1998,8
1998,Image_1999,7
