In [None]:
import torch
import torch.nn as nn
import torchvision.models as models
import torch.optim as optim
from torch.utils.data import DataLoader, random_split
import torchvision
import torchvision.transforms as transforms
from torchvision.models import resnet34
import torchvision.datasets
import numpy as np

# Report whether PyTorch can see a CUDA device before any training starts.
has_cuda = torch.cuda.is_available()
print(has_cuda)

True


In [None]:
# Optimisation settings read by the training and evaluation cells below.
batch_size, num_epochs = 64, 10
lr = 0.1
num_classes = 10

# Run on the GPU when PyTorch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')

In [None]:
# Normalisation statistics shared by BOTH pipelines. The training pipeline
# previously used std (0.2023, 0.1994, 0.2010) while the evaluation pipeline
# used (0.2470, 0.2435, 0.2616), so evaluation inputs were scaled differently
# from what the network was trained on. A single pair keeps them in sync.
# NOTE(review): the std kept here is the one the evaluation pipeline already
# used; confirm it is the intended per-channel statistic for this dataset.
CIFAR10_MEAN = (0.4914, 0.4822, 0.4465)
CIFAR10_STD = (0.2470, 0.2435, 0.2616)

# Training pipeline: geometric and photometric augmentation on the PIL image,
# then tensor conversion, normalisation and random erasing on the tensor.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.RandomRotation(10),
    transforms.RandomApply([
        transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2)
    ], p=0.7),
    transforms.RandomAffine(degrees=0, translate=(0.1, 0.1)),
    # Same probability as RandomApply([RandomGrayscale(p=1.0)], p=0.25),
    # expressed directly.
    transforms.RandomGrayscale(p=0.25),
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
    # RandomErasing operates on tensors, so it must come after ToTensor.
    transforms.RandomErasing(p=0.25, scale=(0.02, 0.1))
])

# Evaluation pipeline: no augmentation, identical normalisation to training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize(CIFAR10_MEAN, CIFAR10_STD),
])

In [None]:
# CIFAR-10 splits; the archive is downloaded into ./data when it is missing.
trainset = torchvision.datasets.CIFAR10(
    root='./data',
    train=True,
    transform=transform_train,
    download=True,
)
testset = torchvision.datasets.CIFAR10(
    root='./data',
    train=False,
    transform=transform_test,
    download=True,
)

# Sizes of the train/validation partition that train_model() draws from trainset.
val_ratio = 0.1
num_train_images = len(trainset)
train_size = int((1 - val_ratio) * num_train_images)
val_size = num_train_images - train_size

100%|██████████| 170M/170M [00:04<00:00, 42.0MB/s]


In [None]:
class EarlyStopping:
    """Track the best validation loss and flag when it stops improving.

    A loss counts as an improvement only when it is lower than the best loss
    seen so far by more than ``min_delta``. On every improvement a CPU copy of
    the model's ``state_dict`` is kept; after ``patience`` consecutive
    non-improving calls ``early_stop`` is set to ``True``.
    """

    def __init__(self, patience=10, min_delta=0.0):
        self.patience, self.min_delta = patience, min_delta
        self.best_loss = float('inf')
        self.best_model_state = None
        self.counter = 0
        self.early_stop = False

    def __call__(self, val_loss, model):
        """Record one validation result and update the stopping state."""
        improved = val_loss < self.best_loss - self.min_delta
        if not improved:
            self.counter += 1
            if self.counter >= self.patience:
                self.early_stop = True
            return

        self.best_loss = val_loss
        self.counter = 0
        # Detached CPU clones so later optimiser steps cannot alter the snapshot.
        snapshot = {}
        for name, tensor in model.state_dict().items():
            snapshot[name] = tensor.cpu().clone()
        self.best_model_state = snapshot

    def restore_best_model(self, model):
        """Load the best recorded weights into ``model``; no-op if none were recorded."""
        if self.best_model_state is None:
            return
        model.load_state_dict(self.best_model_state)


In [None]:
def train_model(restart_seed=None):
    """Train one ResNet-34 on a fresh random train/validation split of ``trainset``.

    Reads the notebook-level settings ``batch_size``, ``num_epochs``, ``lr``,
    ``num_classes``, ``device``, ``train_size``, ``val_size``, ``trainset`` and
    ``transform_test`` at call time.

    Args:
        restart_seed: optional seed passed to ``torch.manual_seed`` before the
            split is drawn and the model is initialised.

    Returns:
        The trained model, in eval mode, carrying the weights that achieved the
        best validation loss recorded by the early stopper.
    """
    if restart_seed is not None:
        torch.manual_seed(restart_seed)

    train_subset, val_split = random_split(trainset, [train_size, val_size])

    # The split above inherits trainset's augmenting transform. Validation loss
    # drives early stopping and checkpoint selection, so it is measured on the
    # same images but through the deterministic evaluation transform.
    eval_view = torchvision.datasets.CIFAR10(root=trainset.root, train=True,
                                             download=False, transform=transform_test)
    val_subset = torch.utils.data.Subset(eval_view, val_split.indices)

    trainloader = DataLoader(train_subset, batch_size=batch_size, shuffle=True, num_workers=4, pin_memory=True)
    valloader = DataLoader(val_subset, batch_size=batch_size, shuffle=False, num_workers=4, pin_memory=True)

    # ResNet-34 with a 3x3 stride-1 stem and no max-pool, plus a num_classes-way head.
    model = resnet34(weights=None)
    model.fc = nn.Linear(model.fc.in_features, num_classes)
    model.conv1 = nn.Conv2d(3, 64, kernel_size=3, stride=1, padding=1, bias=False)
    model.maxpool = nn.Identity()
    model = model.to(device)

    optimizer = optim.SGD(model.parameters(), lr=lr, momentum=0.9, weight_decay=5e-4)
    scheduler = optim.lr_scheduler.CosineAnnealingLR(optimizer, T_max=num_epochs)
    criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
    early_stopper = EarlyStopping(patience=10, min_delta=0.001)

    # Mixed precision only when actually running on CUDA; on CPU both the
    # autocast context and the scaler are disabled instead of hard-coding "cuda".
    use_amp = device.type == "cuda"
    scaler = torch.amp.GradScaler(device.type, enabled=use_amp)

    for epoch in range(num_epochs):
        model.train()
        for inputs, targets in trainloader:
            inputs, targets = inputs.to(device), targets.to(device)

            optimizer.zero_grad()
            with torch.amp.autocast(device.type, enabled=use_amp):
                outputs = model(inputs)
                loss = criterion(outputs, targets)

            scaler.scale(loss).backward()
            scaler.step(optimizer)
            scaler.update()

        model.eval()
        val_loss = 0.0
        with torch.no_grad():
            for inputs, targets in valloader:
                inputs, targets = inputs.to(device), targets.to(device)
                outputs = model(inputs)
                loss = criterion(outputs, targets)
                val_loss += loss.item()

        # Mean of the per-batch losses.
        val_loss /= len(valloader)
        print(f"Epoch {epoch+1}/{num_epochs} - Val Loss: {val_loss:.4f}")

        early_stopper(val_loss, model)
        if early_stopper.early_stop:
            print("Early stopping triggered.")
            break

        scheduler.step()

    # Always return the best checkpoint, not only when early stopping fired;
    # otherwise a run that completes every epoch returns last-epoch weights.
    early_stopper.restore_best_model(model)
    model.eval()
    return model

In [None]:
# Random search over the learning rate, sampled log-uniformly in [1e-2, 1e-1).
# train_model() reads the notebook-level `lr` and `num_epochs`, so both are
# assigned here before each call.
cnt = 10
num_epochs = 5

# Dedicated seeded generator so the sampled learning rates are the same on
# every Restart & Run All instead of depending on the global NumPy RNG state.
LR_SEARCH_SEED = 0
lr_rng = np.random.default_rng(LR_SEARCH_SEED)

for i in range(cnt):
    lr = 10**lr_rng.uniform(-2, -1)
    print(f"\nTrial {i+1}/{cnt} — Testing learning rate: {lr:.6f}")
    train_model(restart_seed=100 + i)


Trial 1/10 — Testing learning rate: 0.034092
Epoch 1/5 - Val Loss: 1.7926
Epoch 2/5 - Val Loss: 1.5885
Epoch 3/5 - Val Loss: 1.4899
Epoch 4/5 - Val Loss: 1.1568
Epoch 5/5 - Val Loss: 1.0573

Trial 2/10 — Testing learning rate: 0.021645
Epoch 1/5 - Val Loss: 6.7041
Epoch 2/5 - Val Loss: 1.4343
Epoch 3/5 - Val Loss: 1.3255
Epoch 4/5 - Val Loss: 1.1984
Epoch 5/5 - Val Loss: 1.0743

Trial 3/10 — Testing learning rate: 0.015041
Epoch 1/5 - Val Loss: 1.8403
Epoch 2/5 - Val Loss: 1.5076
Epoch 3/5 - Val Loss: 1.3079
Epoch 4/5 - Val Loss: 1.1724
Epoch 5/5 - Val Loss: 1.0846

Trial 4/10 — Testing learning rate: 0.044454
Epoch 1/5 - Val Loss: 1.7491
Epoch 2/5 - Val Loss: 4.8845
Epoch 3/5 - Val Loss: 1.4124
Epoch 4/5 - Val Loss: 1.1544
Epoch 5/5 - Val Loss: 1.0681

Trial 5/10 — Testing learning rate: 0.018177
Epoch 1/5 - Val Loss: 1.6589
Epoch 2/5 - Val Loss: 1.5767
Epoch 3/5 - Val Loss: 1.3023
Epoch 4/5 - Val Loss: 1.1474
Epoch 5/5 - Val Loss: 1.0380

Trial 6/10 — Testing learning rate: 0.053361

In [None]:
# Final run: train num_models independent networks, one seed per member.
# train_model() picks up num_epochs and lr from the notebook namespace.
num_models = 10
num_epochs = 100
lr = 0.018

ensemble = []
for member, seed in enumerate(range(42, 42 + num_models), start=1):
    print(f"Training model {member}/{num_models}")
    model = train_model(restart_seed=seed)
    ensemble.append(model)

Training model 1/10
Epoch 1/100 - Val Loss: 1.7871
Epoch 2/100 - Val Loss: 1.4514
Epoch 3/100 - Val Loss: 1.3698
Epoch 4/100 - Val Loss: 1.2614
Epoch 5/100 - Val Loss: 1.1942
Epoch 6/100 - Val Loss: 1.2296
Epoch 7/100 - Val Loss: 1.0914
Epoch 8/100 - Val Loss: 1.0355
Epoch 9/100 - Val Loss: 1.0522
Epoch 10/100 - Val Loss: 0.9980
Epoch 11/100 - Val Loss: 0.9733
Epoch 12/100 - Val Loss: 0.9999
Epoch 13/100 - Val Loss: 0.9564
Epoch 14/100 - Val Loss: 1.0041
Epoch 15/100 - Val Loss: 0.9585
Epoch 16/100 - Val Loss: 0.9957
Epoch 17/100 - Val Loss: 0.9652
Epoch 18/100 - Val Loss: 0.9055
Epoch 19/100 - Val Loss: 0.9327
Epoch 20/100 - Val Loss: 0.8772
Epoch 21/100 - Val Loss: 0.9095
Epoch 22/100 - Val Loss: 0.8635
Epoch 23/100 - Val Loss: 0.8667
Epoch 24/100 - Val Loss: 0.8936
Epoch 25/100 - Val Loss: 0.8817
Epoch 26/100 - Val Loss: 0.8879
Epoch 27/100 - Val Loss: 0.8898
Epoch 28/100 - Val Loss: 0.8744
Epoch 29/100 - Val Loss: 0.8362
Epoch 30/100 - Val Loss: 0.9332
Epoch 31/100 - Val Loss: 0.84

In [None]:
def evaluate_ensemble(loader, models=None):
    """Average the raw outputs of every ensemble member over ``loader``.

    Args:
        loader: iterable yielding ``(inputs, target)`` pairs; the targets are
            ignored.
        models: optional sequence of modules to average. Defaults to the
            notebook-level ``ensemble`` list.

    Returns:
        A CPU tensor holding the per-batch mean outputs concatenated along
        dimension 0, in loader order.

    Raises:
        ValueError: if there are no models to average.
    """
    members = list(ensemble if models is None else models)
    if not members:
        raise ValueError("evaluate_ensemble needs at least one trained model")

    # Inputs are sent to wherever the first member's weights live.
    first_param = next(members[0].parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device("cpu")

    # Inference must not run in training mode (e.g. batch-norm would use and
    # update batch statistics).
    for member in members:
        member.eval()

    predictions = []
    with torch.no_grad():
        for inputs, _ in loader:
            inputs = inputs.to(target_device)
            # Divide by the number of models actually averaged, not by a
            # separately maintained global count that may be out of sync.
            mean_output = sum(member(inputs) for member in members) / len(members)
            predictions.append(mean_output.cpu())
    return torch.cat(predictions)

In [None]:
# Score the averaged ensemble outputs on the held-out test split.
test_loader = DataLoader(testset, batch_size=batch_size * 2)
preds = evaluate_ensemble(test_loader)

# The loader is not shuffled, so predictions line up with testset.targets.
test_labels = torch.tensor(testset.targets)
acc = preds.argmax(dim=1).eq(test_labels).float().mean()
print(f"Ensemble Accuracy: {acc.item():.4f}")

Ensemble Accuracy: 0.9509
