In [17]:
import random
import math
import pandas as pd
import numpy as np
from sklearn.utils import shuffle
import torch
from torch import cuda
from torch.utils.data import Dataset, DataLoader
from torch import nn
import torch.optim as optim
from torchvision import transforms, models
from tqdm import tqdm
from torch.optim.lr_scheduler import CosineAnnealingLR, SequentialLR, LinearLR
from PIL import Image
from torchvision.transforms import AutoAugment, AutoAugmentPolicy, RandAugment

In [18]:
device = torch.device("cuda" if cuda.is_available() else "cpu")
device

device(type='cuda')

In [19]:
!ls /kaggle/input/food-101/food-101/food-101/

images	license_agreement.txt  meta  README.txt


In [20]:
with open("/kaggle/input/food-101/food-101/food-101/meta/classes.txt", 'r') as f:
    classes = f.read().splitlines()

In [21]:
class Label_encoder:
    def __init__(self, labels):
        self.labels = {label: idx for idx, label in enumerate(labels)}
    def get_label(self, idx):
        return list(self.labels.keys())[idx]
    def get_idx(self, label):
        return self.labels.get(label)

encoder = Label_encoder(classes)

In [22]:
class Food101(Dataset):
    def __init__(self, dataframe, transform=None):
        self.dataframe = dataframe.reset_index(drop=True)
        self.transform = transform

    def __len__(self):
        return len(self.dataframe)

    def __getitem__(self, idx):
        img_name = self.dataframe.path.iloc[idx]
        image = Image.open(img_name)
        if image.mode != 'RGB':
            image = image.convert('RGB')

        label = encoder.get_idx(self.dataframe.label.iloc[idx])

        if self.transform:
            image = self.transform(image)

        return image, label

In [23]:
def prep_df(path: str) -> pd.DataFrame:
    with open(path, 'r') as f:
        names = f.read().splitlines()
    img_path = "/kaggle/input/food-101/food-101/food-101/images/"
    labels = [n.split('/')[0] for n in names]
    full_paths = [img_path + n + ".jpg" for n in names]
    df = pd.DataFrame({'label': labels, 'path': full_paths})
    df = shuffle(df).reset_index(drop=True)
    return df

In [24]:
train_imgs = prep_df('/kaggle/input/food-101/food-101/food-101/meta/train.txt')
test_imgs = prep_df('/kaggle/input/food-101/food-101/food-101/meta/test.txt')

In [25]:
img_size = 224
batch_size = 32
base_lr = 3e-4
backbone_lr = 1e-5
weight_decay = 0.05
total_epochs = 30
warmup_epochs = 3
mixup_prob = 0.5
mixup_alpha = 0.8
cutmix_alpha = 1.0
label_smoothing = 0.1


mean = [0.485, 0.456, 0.406]
std  = [0.229, 0.224, 0.225]

In [26]:
train_transforms = transforms.Compose([
    transforms.RandomResizedCrop(img_size, scale=(0.7, 1.0)),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(0.3, 0.3, 0.2, 0.05),
    
    RandAugment(num_ops=2, magnitude=9), 
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

test_transforms = transforms.Compose([
    transforms.Resize(int(img_size*256/224)),
    transforms.CenterCrop(img_size),
    transforms.ToTensor(),
    transforms.Normalize(mean, std),
])

In [27]:
trainset = Food101(train_imgs, transform=train_transforms)
trainloader = DataLoader(trainset, batch_size=batch_size, shuffle=True, pin_memory=True, num_workers=4)

testset = Food101(test_imgs, transform=test_transforms)
testloader = DataLoader(testset, batch_size=batch_size, shuffle=False, pin_memory=True, num_workers=4)


In [28]:
model = models.vit_b_16(weights=models.ViT_B_16_Weights.IMAGENET1K_V1)

# Freeze everything first
for param in model.parameters():
    param.requires_grad = False

# Replace head (classifier)
num_classes = len(classes)
in_features = model.heads.head.in_features
model.heads.head = nn.Sequential(
        nn.Linear(in_features, 512),
        nn.ReLU(),
        nn.Dropout(0.4),
        nn.Linear(512, num_classes)
    )

In [29]:
unfreeze_last_n = 4
layers = model.encoder.layers
if unfreeze_last_n > 0:
    for layer in layers[-unfreeze_last_n:]:
        for p in layer.parameters():
            p.requires_grad = True
for p in model.encoder.ln.parameters():
        p.requires_grad = True
        

# Move model to device
model = model.to(device)

In [30]:
head_params = list(model.heads.parameters())
head_param_ids = {id(p) for p in head_params}

encoder_params = [p for p in model.parameters() if p.requires_grad and id(p) not in head_param_ids]

optimizer = optim.AdamW([
    {'params': head_params, 'lr': base_lr},
    {'params': encoder_params, 'lr': base_lr * 0.1}
], weight_decay=1e-4)
# Criterion (we will use mix loss combination for mixup/cutmix)
criterion = nn.CrossEntropyLoss(label_smoothing=label_smoothing)

# Scheduler: linear warmup then cosine
scheduler = SequentialLR(
    optimizer,
    schedulers=[
        LinearLR(optimizer, start_factor=0.1, total_iters=warmup_epochs),
        CosineAnnealingLR(optimizer, T_max=max(1, total_epochs - warmup_epochs))
    ],
    milestones=[warmup_epochs]
)

In [31]:
def rand_bbox(size, lam):
    W = size[2]
    H = size[3]
    cut_rat = np.sqrt(1. - lam)
    cut_w = int(W * cut_rat)
    cut_h = int(H * cut_rat)
    # uniform
    cx = np.random.randint(W)
    cy = np.random.randint(H)
    bbx1 = np.clip(cx - cut_w // 2, 0, W)
    bby1 = np.clip(cy - cut_h // 2, 0, H)
    bbx2 = np.clip(cx + cut_w // 2, 0, W)
    bby2 = np.clip(cy + cut_h // 2, 0, H)
    return bbx1, bby1, bbx2, bby2

def mixup_data(x, y, alpha=1.0):
    if alpha <= 0:
        return x, y, y, 1.0
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(device)
    mixed_x = lam * x + (1 - lam) * x[index, :]
    y_a, y_b = y, y[index]
    return mixed_x, y_a, y_b, lam

def cutmix_data(x, y, alpha=1.0):
    if alpha <= 0:
        return x, y, y, 1.0
    lam = np.random.beta(alpha, alpha)
    batch_size = x.size()[0]
    index = torch.randperm(batch_size).to(device)
    y_a, y_b = y, y[index]
    bbx1, bby1, bbx2, bby2 = rand_bbox(x.size(), lam)
    x[:, :, bbx1:bbx2, bby1:bby2] = x[index, :, bbx1:bbx2, bby1:bby2]
    # adjust lambda to the exact area ratio
    lam = 1 - ((bbx2 - bbx1) * (bby2 - bby1) / (x.size()[-1] * x.size()[-2]))
    return x, y_a, y_b, lam


In [37]:
scaler = torch.amp.GradScaler(device='cuda')
best_acc = 0.0

checkpoint = torch.load("/kaggle/input/model-checkpoint/model_vit_epoch_8.pth", map_location=device)
model.load_state_dict(checkpoint['model_state_dict'])
optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
scheduler.load_state_dict(checkpoint['scheduler_state_dict'])
scaler.load_state_dict(checkpoint['scaler_state_dict'])

start_epoch = checkpoint['epoch'] + 1
print("Checkpoint loaded. Continue from epoch:", start_epoch)

Checkpoint loaded. Continue from epoch: 9


In [38]:
print("Starting training...\n")

for epoch in range(start_epoch, total_epochs):
    model.train()
    running_loss = 0.0
    correct_train = 0
    total_train = 0
    loop = tqdm(trainloader, desc=f"Training Epoch {epoch + 1}", leave=False)

    for inputs, targets in loop:
        inputs, targets = inputs.to(device), targets.to(device)

        # Randomly apply mixup or cutmix
        do_mix = random.random() < mixup_prob
        if do_mix:
            if random.random() < 0.5:
                inputs, targets_a, targets_b, lam = mixup_data(inputs, targets, alpha=mixup_alpha)
            else:
                inputs, targets_a, targets_b, lam = cutmix_data(inputs, targets, alpha=cutmix_alpha)
        else:
            targets_a, targets_b, lam = targets, targets, 1.0

        optimizer.zero_grad()
        with torch.cuda.amp.autocast(enabled=(device.type == 'cuda')):
            outputs = model(inputs)
            if lam != 1.0:
                loss = lam * criterion(outputs, targets_a) + (1 - lam) * criterion(outputs, targets_b)
            else:
                loss = criterion(outputs, targets)

        scaler.scale(loss).backward()
        # gradient clipping helps stability
        scaler.unscale_(optimizer)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=1.0)
        scaler.step(optimizer)
        scaler.update()

        running_loss += loss.item()
        _, predicted = torch.max(outputs.detach(), 1)
        
        total_train += targets.size(0)
        
        correct_train += (predicted == targets).sum().item()

        loop.set_postfix(loss=loss.item())

    # scheduler step per epoch
    scheduler.step()

    train_accuracy = 100. * correct_train / total_train
    avg_loss = running_loss / len(trainloader)
    print(f"Epoch {epoch+1}: Train Loss = {avg_loss:.4f}, Accuracy = {train_accuracy:.2f}%")

    # -------- validation --------
    model.eval()
    correct_test = 0
    total_test = 0
    with torch.no_grad():
        for inputs, targets in tqdm(testloader, desc=f"Testing Epoch {epoch + 1}", leave=False):
            inputs, targets = inputs.to(device), targets.to(device)
            with torch.amp.autocast(device_type='cuda'):
                outputs = model(inputs)
            _, predicted = torch.max(outputs, 1)
            total_test += targets.size(0)
            correct_test += predicted.eq(targets).sum().item()

    test_accuracy = 100. * correct_test / total_test
    print(f"Test Accuracy after Epoch {epoch + 1}: {test_accuracy:.2f}%")

    # save best
    if test_accuracy > best_acc:
        best_acc = test_accuracy
        print(f"Best model saved with accuracy: {best_acc:.2f}%")
        checkpoint = {
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
            'scheduler_state_dict': scheduler.state_dict(),
            'scaler_state_dict': scaler.state_dict(),
            'rng_state': torch.get_rng_state(),
        }
        torch.save(checkpoint, f'model_vit_epoch_{epoch}.pth')


Starting training...



  with torch.cuda.amp.autocast(enabled=(device.type == 'cuda')):
                                                                                  

Epoch 10: Train Loss = 1.8501, Accuracy = 68.61%


                                                                   

Test Accuracy after Epoch 10: 85.60%
Best model saved with accuracy: 85.60%


                                                                                  

Epoch 11: Train Loss = 1.8065, Accuracy = 68.96%


                                                                   

Test Accuracy after Epoch 11: 85.79%
Best model saved with accuracy: 85.79%


                                                                                  

Epoch 12: Train Loss = 1.7768, Accuracy = 71.07%


                                                                   

Test Accuracy after Epoch 12: 85.88%
Best model saved with accuracy: 85.88%


                                                                                  

Epoch 13: Train Loss = 1.7847, Accuracy = 69.96%


                                                                   

Test Accuracy after Epoch 13: 85.87%


                                                                                  

Epoch 14: Train Loss = 1.7176, Accuracy = 71.72%


                                                                   

Test Accuracy after Epoch 14: 86.21%
Best model saved with accuracy: 86.21%


                                                                                  

Epoch 15: Train Loss = 1.6904, Accuracy = 72.51%


                                                                   

Test Accuracy after Epoch 15: 86.00%


                                                                                  

Epoch 16: Train Loss = 1.6888, Accuracy = 72.10%


                                                                   

Test Accuracy after Epoch 16: 86.40%
Best model saved with accuracy: 86.40%


                                                                                  

Epoch 17: Train Loss = 1.6595, Accuracy = 73.99%


                                                                   

Test Accuracy after Epoch 17: 86.29%


                                                                                  

Epoch 18: Train Loss = 1.6427, Accuracy = 74.19%


                                                                   

Test Accuracy after Epoch 18: 86.36%


                                                                                  

Epoch 19: Train Loss = 1.6440, Accuracy = 74.26%


                                                                   

Test Accuracy after Epoch 19: 86.17%


                                                                                  

Epoch 20: Train Loss = 1.6373, Accuracy = 73.78%


                                                                   

Test Accuracy after Epoch 20: 86.18%


                                                                                  

Epoch 21: Train Loss = 1.5746, Accuracy = 76.66%


                                                                   

Test Accuracy after Epoch 21: 86.30%


                                                                                  

KeyboardInterrupt: 

In [40]:
torch.save(model.state_dict(), f'vit_food_101.pth')