In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import torchvision
from torchvision import datasets, transforms
from torch.utils.data import DataLoader
from tqdm import tqdm

# All tensors and models in this notebook are placed on this device (pinned to CPU here).
device = torch.device("cpu")
print(f"Using device: {device}")

# Training-time pipeline: random 32x32 crop after 4-pixel padding and a random
# horizontal flip for augmentation, then tensor conversion and per-channel
# normalization.
# NOTE(review): the mean/std triples are presumably the CIFAR-10 training-set
# channel statistics - confirm against the source they were taken from.
transform_train = transforms.Compose([
    transforms.RandomCrop(32, padding=4),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# Evaluation-time pipeline: no augmentation, same normalization constants as training.
transform_test = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.2023, 0.1994, 0.2010)),
])

# CIFAR-10 training split, stored under ./data (download=True fetches it when needed).
# Batches of 64, reshuffled by the loader; num_workers=0 keeps loading in the main process.
trainset = datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
trainloader = DataLoader(trainset, batch_size=64, shuffle=True, num_workers=0)

# CIFAR-10 test split, iterated in a fixed order (shuffle=False).
testset = datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
testloader = DataLoader(testset, batch_size=64, shuffle=False, num_workers=0)

class SimpleCNN(nn.Module):
    """Small two-stage convolutional classifier producing 10 logits.

    Each stage is a 3x3 convolution (padding 1), a ReLU and a 2x2 max-pool
    with stride 2. The flattened feature size is fixed at 64 * 8 * 8, which
    matches 3-channel 32x32 inputs (two pools halve 32 -> 16 -> 8). A
    128-unit hidden layer with dropout (p=0.5) feeds the 10-way output layer.
    """

    def __init__(self):
        super().__init__()
        # Attribute names and the registration order of the parameterized
        # layers are kept stable: state_dict keys and optimizer parameter
        # indices in saved checkpoints depend on them.
        self.conv1 = nn.Conv2d(in_channels=3, out_channels=32, kernel_size=3, padding=1)
        self.pool = nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
        self.conv2 = nn.Conv2d(in_channels=32, out_channels=64, kernel_size=3, padding=1)
        self.fc1 = nn.Linear(in_features=64 * 8 * 8, out_features=128)
        self.dropout = nn.Dropout(p=0.5)
        self.fc2 = nn.Linear(in_features=128, out_features=10)

    def forward(self, x):
        """Map a batch of images to a batch of 10 unnormalized class scores."""
        stage1 = self.pool(F.relu(self.conv1(x)))
        stage2 = self.pool(F.relu(self.conv2(stage1)))
        # Collapse channel and spatial dimensions into one feature vector per sample.
        flat = stage2.view(-1, 64 * 8 * 8)
        hidden = self.dropout(F.relu(self.fc1(flat)))
        return self.fc2(hidden)

class RDropLoss(nn.Module):
    """R-Drop training objective for two forward passes over the same batch.

    The loss is the mean cross-entropy of both passes plus ``alpha`` times
    the symmetric KL divergence between their predicted distributions.

    Args:
        alpha: Weight applied to the symmetric KL term.
    """

    def __init__(self, alpha):
        super(RDropLoss, self).__init__()
        self.cross_entropy = nn.CrossEntropyLoss()
        self.alpha = alpha

    def compute_kl_loss(self, p, q):
        """Return the symmetric KL divergence between two sets of logits.

        Args:
            p: Logits of the first pass, classes on the last dimension.
            q: Logits of the second pass, same shape as ``p``.

        Returns:
            Scalar tensor: the average of both KL directions, each summed
            over all elements and divided by the size of the first
            dimension (``reduction='batchmean'``).
        """
        p_log_prob = F.log_softmax(p, dim=-1)
        q_log_prob = F.log_softmax(q, dim=-1)
        # Both arguments stay in log space (log_target=True). Passing softmax
        # probabilities as the target lets entries underflow to exactly 0 for
        # confident predictions, and the log-of-target term (or its gradient)
        # can then become non-finite and poison the whole update.
        kl_first = F.kl_div(p_log_prob, q_log_prob, reduction='batchmean', log_target=True)
        kl_second = F.kl_div(q_log_prob, p_log_prob, reduction='batchmean', log_target=True)
        return (kl_first + kl_second) / 2

    def forward(self, p, q, labels):
        """Combine the supervised and consistency terms.

        Args:
            p: Logits from the first forward pass.
            q: Logits from the second forward pass.
            labels: Target class indices accepted by ``nn.CrossEntropyLoss``.

        Returns:
            Scalar tensor ``mean_ce + alpha * symmetric_kl``.
        """
        ce_loss = (self.cross_entropy(p, labels) + self.cross_entropy(q, labels)) / 2
        kl_loss = self.compute_kl_loss(p, q)
        return ce_loss + self.alpha * kl_loss

def train_baseline(model, trainloader, criterion, optimizer, epoch):
    """Train ``model`` for one epoch with a single forward pass per batch.

    Args:
        model: Network to optimize; switched to training mode here.
        trainloader: Iterable of ``(inputs, labels)`` batches.
        criterion: Loss called as ``criterion(outputs, labels)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, shown one-based in the progress bar.

    Prints the summed batch loss divided by the number of batches.
    """
    model.train()
    epoch_loss = 0.0
    progress = tqdm(trainloader, desc=f"Epoch {epoch+1} [Baseline Training]")
    for inputs, labels in progress:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        loss = criterion(model(inputs), labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    mean_loss = epoch_loss / len(trainloader)
    print(f"Loss: {mean_loss:.3f}")

def train_rdrop(model, trainloader, criterion_rdrop, optimizer, epoch):
    """Train ``model`` for one epoch with two forward passes per batch.

    The same inputs are fed through the model twice while it is in training
    mode, and both outputs are handed to the R-Drop criterion.

    Args:
        model: Network to optimize; switched to training mode here.
        trainloader: Iterable of ``(inputs, labels)`` batches.
        criterion_rdrop: Loss called as ``criterion_rdrop(out_1, out_2, labels)``.
        optimizer: Optimizer stepped once per batch.
        epoch: Zero-based epoch index, shown one-based in the progress bar.

    Prints the summed batch loss divided by the number of batches.
    """
    model.train()
    epoch_loss = 0.0
    progress = tqdm(trainloader, desc=f"Epoch {epoch+1} [R-Drop Training]")
    for inputs, labels in progress:
        inputs = inputs.to(device)
        labels = labels.to(device)

        optimizer.zero_grad()
        # Two passes over identical inputs; the model is in train mode, so
        # its dropout layer is active for each pass.
        first_pass = model(inputs)
        second_pass = model(inputs)
        loss = criterion_rdrop(first_pass, second_pass, labels)
        loss.backward()
        optimizer.step()

        epoch_loss += loss.item()
    mean_loss = epoch_loss / len(trainloader)
    print(f"Loss: {mean_loss:.3f}")

def evaluate_model(model, testloader):
    """Measure top-1 accuracy of ``model`` over ``testloader``.

    The model is switched to evaluation mode (and left in it) and gradients
    are disabled for the whole pass.

    Args:
        model: Network returning one score per class for each sample.
        testloader: Iterable of ``(images, labels)`` batches.

    Returns:
        Accuracy as a percentage in the range 0-100; the value is also printed.
    """
    model.eval()
    n_correct = 0
    n_seen = 0
    with torch.no_grad():
        for images, labels in testloader:
            images = images.to(device)
            labels = labels.to(device)
            # Predicted class = index of the largest score along dim 1.
            predicted = model(images).argmax(dim=1)
            n_seen += labels.size(0)
            n_correct += (predicted == labels).sum().item()
    accuracy = 100 * n_correct / n_seen
    print(f'Accuracy: {accuracy:.2f} %')
    return accuracy

def run_training(num_epochs, learning_rate, alpha=None):
    """Train a fresh or resumed SimpleCNN and checkpoint it after every epoch.

    Args:
        num_epochs: Number of epochs to run in this call, in addition to any
            epochs already recorded in a loaded checkpoint.
        learning_rate: Learning rate for the newly created Adam optimizer.
            When a checkpoint is loaded, the restored optimizer state
            (including its stored hyperparameters) replaces this value.
        alpha: ``None`` trains the baseline with plain cross-entropy; any
            other value trains with ``RDropLoss(alpha=alpha)`` and uses an
            alpha-specific checkpoint file.

    Returns:
        None. Progress is printed and the checkpoint file is written as a
        side effect.
    """
    model = SimpleCNN().to(device)
    optimizer = optim.Adam(model.parameters(), lr=learning_rate)
    start_epoch = 0

    if alpha is not None:
        model_type = f"R-Drop Model {alpha}"
        criterion = RDropLoss(alpha=alpha)
        train_function = train_rdrop
        model_save_path = f'rdrop_model_{alpha}_checkpoint.pth'
    else:
        model_type = "Baseline Model"
        criterion = nn.CrossEntropyLoss()
        train_function = train_baseline
        model_save_path = 'baseline_model_checkpoint.pth'

    print(f"--- Starting {model_type} Training ---")

    try:
        # map_location lets a checkpoint saved on another device load here.
        checkpoint = torch.load(model_save_path, map_location=device)
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        start_epoch = checkpoint['epoch'] + 1
        # Report the epoch one-based, consistent with the progress bar and save messages.
        print(f"Loaded {model_type} from checkpoint. Resuming training from epoch {start_epoch + 1}.")
    except FileNotFoundError:
        print(f"No saved {model_type} found. Starting training from scratch.")
    except Exception as e:
        print(f"Error loading {model_type} checkpoint: {e}. Starting training from scratch.")
        # The failure may have happened after part of the state was already
        # loaded; rebuild both objects so "from scratch" is actually true.
        model = SimpleCNN().to(device)
        optimizer = optim.Adam(model.parameters(), lr=learning_rate)
        start_epoch = 0

    for epoch in range(start_epoch, start_epoch + num_epochs):
        train_function(model, trainloader, criterion, optimizer, epoch)

        # A diverged run (NaN/inf weights) cannot recover; stop before it
        # overwrites the last usable checkpoint and wastes further epochs.
        if not all(torch.isfinite(param).all().item() for param in model.parameters()):
            print(f"{model_type} has non-finite parameters after epoch {epoch+1}; "
                  f"stopping without overwriting {model_save_path}.")
            break

        evaluate_model(model, testloader)
        torch.save({
            'epoch': epoch,
            'model_state_dict': model.state_dict(),
            'optimizer_state_dict': optimizer.state_dict(),
        }, model_save_path)
        print(f"{model_type} saved at epoch {epoch+1} to {model_save_path}")

    print(f"\n--- Finished {model_type} Training ---\n")

Using device: cpu


100%|██████████| 170M/170M [00:01<00:00, 105MB/s]


In [None]:
# Baseline Training
# Runs 10 more epochs. run_training resumes from baseline_model_checkpoint.pth when
# that file exists; the recorded output below resumed after 20 saved epochs and
# finished at epoch 30.
run_training(num_epochs=10, learning_rate=0.001)

--- Starting Baseline Model Training ---
Loaded Baseline Model from checkpoint. Resuming training from epoch 20.


Epoch 21 [Baseline Training]: 100%|██████████| 782/782 [01:40<00:00,  7.77it/s]


Loss: 0.909
Accuracy: 74.38 %
Baseline Model saved at epoch 21 to baseline_model_checkpoint.pth


Epoch 22 [Baseline Training]: 100%|██████████| 782/782 [01:40<00:00,  7.77it/s]


Loss: 0.893
Accuracy: 73.88 %
Baseline Model saved at epoch 22 to baseline_model_checkpoint.pth


Epoch 23 [Baseline Training]: 100%|██████████| 782/782 [01:41<00:00,  7.70it/s]


Loss: 0.893
Accuracy: 74.41 %
Baseline Model saved at epoch 23 to baseline_model_checkpoint.pth


Epoch 24 [Baseline Training]: 100%|██████████| 782/782 [01:40<00:00,  7.79it/s]


Loss: 0.890
Accuracy: 74.61 %
Baseline Model saved at epoch 24 to baseline_model_checkpoint.pth


Epoch 25 [Baseline Training]: 100%|██████████| 782/782 [01:40<00:00,  7.76it/s]


Loss: 0.888
Accuracy: 75.73 %
Baseline Model saved at epoch 25 to baseline_model_checkpoint.pth


Epoch 26 [Baseline Training]: 100%|██████████| 782/782 [01:40<00:00,  7.81it/s]


Loss: 0.881
Accuracy: 75.59 %
Baseline Model saved at epoch 26 to baseline_model_checkpoint.pth


Epoch 27 [Baseline Training]: 100%|██████████| 782/782 [01:40<00:00,  7.77it/s]


Loss: 0.872
Accuracy: 74.47 %
Baseline Model saved at epoch 27 to baseline_model_checkpoint.pth


Epoch 28 [Baseline Training]: 100%|██████████| 782/782 [01:41<00:00,  7.74it/s]


Loss: 0.864
Accuracy: 75.91 %
Baseline Model saved at epoch 28 to baseline_model_checkpoint.pth


Epoch 29 [Baseline Training]: 100%|██████████| 782/782 [01:40<00:00,  7.77it/s]


Loss: 0.870
Accuracy: 75.65 %
Baseline Model saved at epoch 29 to baseline_model_checkpoint.pth


Epoch 30 [Baseline Training]: 100%|██████████| 782/782 [01:40<00:00,  7.75it/s]


Loss: 0.858
Accuracy: 76.02 %
Baseline Model saved at epoch 30 to baseline_model_checkpoint.pth

--- Finished Baseline Model Training ---



In [None]:
# R-Drop Training (alpha=0.4)
# 30 epochs with the KL term weighted by alpha=0.4; the checkpoint is written to
# rdrop_model_0.4_checkpoint.pth after every epoch.
run_training(num_epochs=30, learning_rate=0.001, alpha=0.4)

--- Starting R-Drop Model 0.4 Training ---
No saved R-Drop Model 0.4 found. Starting training from scratch.


Epoch 1 [R-Drop Training]: 100%|██████████| 782/782 [02:52<00:00,  4.53it/s]


Loss: 1.705
Accuracy: 53.89 %
R-Drop Model 0.4 saved at epoch 1 to rdrop_model_0.4_checkpoint.pth


Epoch 2 [R-Drop Training]: 100%|██████████| 782/782 [02:54<00:00,  4.47it/s]


Loss: 1.450
Accuracy: 58.71 %
R-Drop Model 0.4 saved at epoch 2 to rdrop_model_0.4_checkpoint.pth


Epoch 3 [R-Drop Training]: 100%|██████████| 782/782 [02:53<00:00,  4.51it/s]


Loss: 1.353
Accuracy: 62.32 %
R-Drop Model 0.4 saved at epoch 3 to rdrop_model_0.4_checkpoint.pth


Epoch 4 [R-Drop Training]: 100%|██████████| 782/782 [02:52<00:00,  4.53it/s]


Loss: 1.299
Accuracy: 63.72 %
R-Drop Model 0.4 saved at epoch 4 to rdrop_model_0.4_checkpoint.pth


Epoch 5 [R-Drop Training]: 100%|██████████| 782/782 [02:53<00:00,  4.51it/s]


Loss: 1.245
Accuracy: 65.04 %
R-Drop Model 0.4 saved at epoch 5 to rdrop_model_0.4_checkpoint.pth


Epoch 6 [R-Drop Training]: 100%|██████████| 782/782 [02:54<00:00,  4.48it/s]


Loss: 1.214
Accuracy: 66.90 %
R-Drop Model 0.4 saved at epoch 6 to rdrop_model_0.4_checkpoint.pth


Epoch 7 [R-Drop Training]: 100%|██████████| 782/782 [02:51<00:00,  4.55it/s]


Loss: 1.187
Accuracy: 67.67 %
R-Drop Model 0.4 saved at epoch 7 to rdrop_model_0.4_checkpoint.pth


Epoch 8 [R-Drop Training]: 100%|██████████| 782/782 [02:51<00:00,  4.56it/s]


Loss: 1.161
Accuracy: 67.12 %
R-Drop Model 0.4 saved at epoch 8 to rdrop_model_0.4_checkpoint.pth


Epoch 9 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.58it/s]


Loss: 1.139
Accuracy: 69.06 %
R-Drop Model 0.4 saved at epoch 9 to rdrop_model_0.4_checkpoint.pth


Epoch 10 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.57it/s]


Loss: 1.121
Accuracy: 70.31 %
R-Drop Model 0.4 saved at epoch 10 to rdrop_model_0.4_checkpoint.pth


Epoch 11 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.59it/s]


Loss: 1.101
Accuracy: 69.25 %
R-Drop Model 0.4 saved at epoch 11 to rdrop_model_0.4_checkpoint.pth


Epoch 12 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.59it/s]


Loss: 1.082
Accuracy: 71.75 %
R-Drop Model 0.4 saved at epoch 12 to rdrop_model_0.4_checkpoint.pth


Epoch 13 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.59it/s]


Loss: 1.067
Accuracy: 71.91 %
R-Drop Model 0.4 saved at epoch 13 to rdrop_model_0.4_checkpoint.pth


Epoch 14 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 1.054
Accuracy: 72.53 %
R-Drop Model 0.4 saved at epoch 14 to rdrop_model_0.4_checkpoint.pth


Epoch 15 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.60it/s]


Loss: 1.030
Accuracy: 72.62 %
R-Drop Model 0.4 saved at epoch 15 to rdrop_model_0.4_checkpoint.pth


Epoch 16 [R-Drop Training]: 100%|██████████| 782/782 [02:51<00:00,  4.56it/s]


Loss: 1.024
Accuracy: 72.69 %
R-Drop Model 0.4 saved at epoch 16 to rdrop_model_0.4_checkpoint.pth


Epoch 17 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 1.013
Accuracy: 72.95 %
R-Drop Model 0.4 saved at epoch 17 to rdrop_model_0.4_checkpoint.pth


Epoch 18 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 1.005
Accuracy: 71.36 %
R-Drop Model 0.4 saved at epoch 18 to rdrop_model_0.4_checkpoint.pth


Epoch 19 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 0.994
Accuracy: 73.64 %
R-Drop Model 0.4 saved at epoch 19 to rdrop_model_0.4_checkpoint.pth


Epoch 20 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 0.987
Accuracy: 73.46 %
R-Drop Model 0.4 saved at epoch 20 to rdrop_model_0.4_checkpoint.pth


Epoch 21 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 0.980
Accuracy: 73.84 %
R-Drop Model 0.4 saved at epoch 21 to rdrop_model_0.4_checkpoint.pth


Epoch 22 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 0.973
Accuracy: 74.16 %
R-Drop Model 0.4 saved at epoch 22 to rdrop_model_0.4_checkpoint.pth


Epoch 23 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.62it/s]


Loss: 0.962
Accuracy: 74.56 %
R-Drop Model 0.4 saved at epoch 23 to rdrop_model_0.4_checkpoint.pth


Epoch 24 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 0.959
Accuracy: 74.79 %
R-Drop Model 0.4 saved at epoch 24 to rdrop_model_0.4_checkpoint.pth


Epoch 25 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.63it/s]


Loss: 0.949
Accuracy: 74.74 %
R-Drop Model 0.4 saved at epoch 25 to rdrop_model_0.4_checkpoint.pth


Epoch 26 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.59it/s]


Loss: 0.943
Accuracy: 74.06 %
R-Drop Model 0.4 saved at epoch 26 to rdrop_model_0.4_checkpoint.pth


Epoch 27 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 0.941
Accuracy: 74.91 %
R-Drop Model 0.4 saved at epoch 27 to rdrop_model_0.4_checkpoint.pth


Epoch 28 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 0.934
Accuracy: 76.16 %
R-Drop Model 0.4 saved at epoch 28 to rdrop_model_0.4_checkpoint.pth


Epoch 29 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.58it/s]


Loss: 0.930
Accuracy: 74.59 %
R-Drop Model 0.4 saved at epoch 29 to rdrop_model_0.4_checkpoint.pth


Epoch 30 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 0.928
Accuracy: 75.36 %
R-Drop Model 0.4 saved at epoch 30 to rdrop_model_0.4_checkpoint.pth

--- Finished R-Drop Model 0.4 Training ---



In [None]:
# R-Drop Training (alpha=0.6)
# 30 epochs with the KL term weighted by alpha=0.6; the checkpoint is written to
# rdrop_model_0.6_checkpoint.pth after every epoch.
run_training(num_epochs=30, learning_rate=0.001, alpha=0.6)

--- Starting R-Drop Model 0.6 Training ---
No saved R-Drop Model 0.6 found. Starting training from scratch.


Epoch 1 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.58it/s]


Loss: 1.782
Accuracy: 50.83 %
R-Drop Model 0.6 saved at epoch 1 to rdrop_model_0.6_checkpoint.pth


Epoch 2 [R-Drop Training]: 100%|██████████| 782/782 [02:52<00:00,  4.52it/s]


Loss: 1.546
Accuracy: 56.20 %
R-Drop Model 0.6 saved at epoch 2 to rdrop_model_0.6_checkpoint.pth


Epoch 3 [R-Drop Training]: 100%|██████████| 782/782 [02:51<00:00,  4.56it/s]


Loss: 1.456
Accuracy: 59.64 %
R-Drop Model 0.6 saved at epoch 3 to rdrop_model_0.6_checkpoint.pth


Epoch 4 [R-Drop Training]: 100%|██████████| 782/782 [02:52<00:00,  4.54it/s]


Loss: 1.388
Accuracy: 63.53 %
R-Drop Model 0.6 saved at epoch 4 to rdrop_model_0.6_checkpoint.pth


Epoch 5 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.58it/s]


Loss: 1.343
Accuracy: 63.57 %
R-Drop Model 0.6 saved at epoch 5 to rdrop_model_0.6_checkpoint.pth


Epoch 6 [R-Drop Training]: 100%|██████████| 782/782 [02:52<00:00,  4.55it/s]


Loss: 1.315
Accuracy: 65.55 %
R-Drop Model 0.6 saved at epoch 6 to rdrop_model_0.6_checkpoint.pth


Epoch 7 [R-Drop Training]: 100%|██████████| 782/782 [02:51<00:00,  4.57it/s]


Loss: 1.288
Accuracy: 65.50 %
R-Drop Model 0.6 saved at epoch 7 to rdrop_model_0.6_checkpoint.pth


Epoch 8 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 1.272
Accuracy: 67.11 %
R-Drop Model 0.6 saved at epoch 8 to rdrop_model_0.6_checkpoint.pth


Epoch 9 [R-Drop Training]: 100%|██████████| 782/782 [02:51<00:00,  4.56it/s]


Loss: 1.232
Accuracy: 67.25 %
R-Drop Model 0.6 saved at epoch 9 to rdrop_model_0.6_checkpoint.pth


Epoch 10 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.59it/s]


Loss: 1.209
Accuracy: 68.23 %
R-Drop Model 0.6 saved at epoch 10 to rdrop_model_0.6_checkpoint.pth


Epoch 11 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.62it/s]


Loss: 1.190
Accuracy: 67.53 %
R-Drop Model 0.6 saved at epoch 11 to rdrop_model_0.6_checkpoint.pth


Epoch 12 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.62it/s]


Loss: 1.181
Accuracy: 70.37 %
R-Drop Model 0.6 saved at epoch 12 to rdrop_model_0.6_checkpoint.pth


Epoch 13 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.63it/s]


Loss: 1.158
Accuracy: 70.36 %
R-Drop Model 0.6 saved at epoch 13 to rdrop_model_0.6_checkpoint.pth


Epoch 14 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 1.152
Accuracy: 70.09 %
R-Drop Model 0.6 saved at epoch 14 to rdrop_model_0.6_checkpoint.pth


Epoch 15 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.61it/s]


Loss: 1.142
Accuracy: 70.29 %
R-Drop Model 0.6 saved at epoch 15 to rdrop_model_0.6_checkpoint.pth


Epoch 16 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.60it/s]


Loss: 1.128
Accuracy: 70.96 %
R-Drop Model 0.6 saved at epoch 16 to rdrop_model_0.6_checkpoint.pth


Epoch 17 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 1.110
Accuracy: 71.41 %
R-Drop Model 0.6 saved at epoch 17 to rdrop_model_0.6_checkpoint.pth


Epoch 18 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.60it/s]


Loss: 1.102
Accuracy: 72.11 %
R-Drop Model 0.6 saved at epoch 18 to rdrop_model_0.6_checkpoint.pth


Epoch 19 [R-Drop Training]: 100%|██████████| 782/782 [02:50<00:00,  4.59it/s]


Loss: 1.097
Accuracy: 71.12 %
R-Drop Model 0.6 saved at epoch 19 to rdrop_model_0.6_checkpoint.pth


Epoch 20 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 1.085
Accuracy: 72.52 %
R-Drop Model 0.6 saved at epoch 20 to rdrop_model_0.6_checkpoint.pth


Epoch 21 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.66it/s]


Loss: 1.077
Accuracy: 72.39 %
R-Drop Model 0.6 saved at epoch 21 to rdrop_model_0.6_checkpoint.pth


Epoch 22 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.63it/s]


Loss: 1.073
Accuracy: 72.37 %
R-Drop Model 0.6 saved at epoch 22 to rdrop_model_0.6_checkpoint.pth


Epoch 23 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.65it/s]


Loss: 1.068
Accuracy: 72.99 %
R-Drop Model 0.6 saved at epoch 23 to rdrop_model_0.6_checkpoint.pth


Epoch 24 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.67it/s]


Loss: 1.049
Accuracy: 73.53 %
R-Drop Model 0.6 saved at epoch 24 to rdrop_model_0.6_checkpoint.pth


Epoch 25 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.69it/s]


Loss: 1.055
Accuracy: 72.74 %
R-Drop Model 0.6 saved at epoch 25 to rdrop_model_0.6_checkpoint.pth


Epoch 26 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.68it/s]


Loss: 1.043
Accuracy: 72.80 %
R-Drop Model 0.6 saved at epoch 26 to rdrop_model_0.6_checkpoint.pth


Epoch 27 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.69it/s]


Loss: 1.042
Accuracy: 72.94 %
R-Drop Model 0.6 saved at epoch 27 to rdrop_model_0.6_checkpoint.pth


Epoch 28 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.69it/s]


Loss: 1.033
Accuracy: 73.08 %
R-Drop Model 0.6 saved at epoch 28 to rdrop_model_0.6_checkpoint.pth


Epoch 29 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.69it/s]


Loss: 1.033
Accuracy: 73.06 %
R-Drop Model 0.6 saved at epoch 29 to rdrop_model_0.6_checkpoint.pth


Epoch 30 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.70it/s]


Loss: 1.021
Accuracy: 72.94 %
R-Drop Model 0.6 saved at epoch 30 to rdrop_model_0.6_checkpoint.pth

--- Finished R-Drop Model 0.6 Training ---



In [None]:
# R-Drop Training (alpha=0.8)
# 30 epochs with the KL term weighted by alpha=0.8; the checkpoint is written to
# rdrop_model_0.8_checkpoint.pth after every epoch.
# NOTE: in the recorded output below the loss becomes NaN from epoch 26 onward and
# accuracy drops to 10.00 %, yet those epochs were still saved to the checkpoint.
run_training(num_epochs=30, learning_rate=0.001, alpha=0.8)

--- Starting R-Drop Model 0.8 Training ---
No saved R-Drop Model 0.8 found. Starting training from scratch.


Epoch 1 [R-Drop Training]: 100%|██████████| 782/782 [02:45<00:00,  4.72it/s]


Loss: 1.743
Accuracy: 51.10 %
R-Drop Model 0.8 saved at epoch 1 to rdrop_model_0.8_checkpoint.pth


Epoch 2 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.65it/s]


Loss: 1.501
Accuracy: 58.40 %
R-Drop Model 0.8 saved at epoch 2 to rdrop_model_0.8_checkpoint.pth


Epoch 3 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.65it/s]


Loss: 1.405
Accuracy: 61.18 %
R-Drop Model 0.8 saved at epoch 3 to rdrop_model_0.8_checkpoint.pth


Epoch 4 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.68it/s]


Loss: 1.346
Accuracy: 63.58 %
R-Drop Model 0.8 saved at epoch 4 to rdrop_model_0.8_checkpoint.pth


Epoch 5 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.70it/s]


Loss: 1.314
Accuracy: 63.78 %
R-Drop Model 0.8 saved at epoch 5 to rdrop_model_0.8_checkpoint.pth


Epoch 6 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.70it/s]


Loss: 1.278
Accuracy: 65.76 %
R-Drop Model 0.8 saved at epoch 6 to rdrop_model_0.8_checkpoint.pth


Epoch 7 [R-Drop Training]: 100%|██████████| 782/782 [02:45<00:00,  4.71it/s]


Loss: 1.257
Accuracy: 66.72 %
R-Drop Model 0.8 saved at epoch 7 to rdrop_model_0.8_checkpoint.pth


Epoch 8 [R-Drop Training]: 100%|██████████| 782/782 [02:45<00:00,  4.71it/s]


Loss: 1.230
Accuracy: 67.39 %
R-Drop Model 0.8 saved at epoch 8 to rdrop_model_0.8_checkpoint.pth


Epoch 9 [R-Drop Training]: 100%|██████████| 782/782 [02:45<00:00,  4.72it/s]


Loss: 1.213
Accuracy: 67.66 %
R-Drop Model 0.8 saved at epoch 9 to rdrop_model_0.8_checkpoint.pth


Epoch 10 [R-Drop Training]: 100%|██████████| 782/782 [02:45<00:00,  4.72it/s]


Loss: 1.194
Accuracy: 68.70 %
R-Drop Model 0.8 saved at epoch 10 to rdrop_model_0.8_checkpoint.pth


Epoch 11 [R-Drop Training]: 100%|██████████| 782/782 [02:45<00:00,  4.72it/s]


Loss: 1.175
Accuracy: 69.00 %
R-Drop Model 0.8 saved at epoch 11 to rdrop_model_0.8_checkpoint.pth


Epoch 12 [R-Drop Training]: 100%|██████████| 782/782 [02:45<00:00,  4.73it/s]


Loss: 1.172
Accuracy: 69.73 %
R-Drop Model 0.8 saved at epoch 12 to rdrop_model_0.8_checkpoint.pth


Epoch 13 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.69it/s]


Loss: 1.148
Accuracy: 69.49 %
R-Drop Model 0.8 saved at epoch 13 to rdrop_model_0.8_checkpoint.pth


Epoch 14 [R-Drop Training]: 100%|██████████| 782/782 [02:46<00:00,  4.70it/s]


Loss: 1.135
Accuracy: 70.33 %
R-Drop Model 0.8 saved at epoch 14 to rdrop_model_0.8_checkpoint.pth


Epoch 15 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.66it/s]


Loss: 1.129
Accuracy: 69.14 %
R-Drop Model 0.8 saved at epoch 15 to rdrop_model_0.8_checkpoint.pth


Epoch 16 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.64it/s]


Loss: 1.110
Accuracy: 70.15 %
R-Drop Model 0.8 saved at epoch 16 to rdrop_model_0.8_checkpoint.pth


Epoch 17 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 1.102
Accuracy: 71.45 %
R-Drop Model 0.8 saved at epoch 17 to rdrop_model_0.8_checkpoint.pth


Epoch 18 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 1.091
Accuracy: 70.77 %
R-Drop Model 0.8 saved at epoch 18 to rdrop_model_0.8_checkpoint.pth


Epoch 19 [R-Drop Training]: 100%|██████████| 782/782 [02:49<00:00,  4.62it/s]


Loss: 1.088
Accuracy: 70.66 %
R-Drop Model 0.8 saved at epoch 19 to rdrop_model_0.8_checkpoint.pth


Epoch 20 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.63it/s]


Loss: 1.078
Accuracy: 71.64 %
R-Drop Model 0.8 saved at epoch 20 to rdrop_model_0.8_checkpoint.pth


Epoch 21 [R-Drop Training]: 100%|██████████| 782/782 [02:48<00:00,  4.65it/s]


Loss: 1.073
Accuracy: 71.30 %
R-Drop Model 0.8 saved at epoch 21 to rdrop_model_0.8_checkpoint.pth


Epoch 22 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.66it/s]


Loss: 1.064
Accuracy: 71.45 %
R-Drop Model 0.8 saved at epoch 22 to rdrop_model_0.8_checkpoint.pth


Epoch 23 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.66it/s]


Loss: 1.058
Accuracy: 71.69 %
R-Drop Model 0.8 saved at epoch 23 to rdrop_model_0.8_checkpoint.pth


Epoch 24 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.66it/s]


Loss: 1.049
Accuracy: 73.00 %
R-Drop Model 0.8 saved at epoch 24 to rdrop_model_0.8_checkpoint.pth


Epoch 25 [R-Drop Training]: 100%|██████████| 782/782 [02:47<00:00,  4.67it/s]


Loss: 1.045
Accuracy: 73.83 %
R-Drop Model 0.8 saved at epoch 25 to rdrop_model_0.8_checkpoint.pth


Epoch 26 [R-Drop Training]: 100%|██████████| 782/782 [02:44<00:00,  4.76it/s]


Loss: nan
Accuracy: 10.00 %
R-Drop Model 0.8 saved at epoch 26 to rdrop_model_0.8_checkpoint.pth


Epoch 27 [R-Drop Training]: 100%|██████████| 782/782 [02:42<00:00,  4.81it/s]


Loss: nan
Accuracy: 10.00 %
R-Drop Model 0.8 saved at epoch 27 to rdrop_model_0.8_checkpoint.pth


Epoch 28 [R-Drop Training]: 100%|██████████| 782/782 [02:41<00:00,  4.84it/s]


Loss: nan
Accuracy: 10.00 %
R-Drop Model 0.8 saved at epoch 28 to rdrop_model_0.8_checkpoint.pth


Epoch 29 [R-Drop Training]: 100%|██████████| 782/782 [02:41<00:00,  4.83it/s]


Loss: nan
Accuracy: 10.00 %
R-Drop Model 0.8 saved at epoch 29 to rdrop_model_0.8_checkpoint.pth


Epoch 30 [R-Drop Training]: 100%|██████████| 782/782 [02:41<00:00,  4.84it/s]


Loss: nan
Accuracy: 10.00 %
R-Drop Model 0.8 saved at epoch 30 to rdrop_model_0.8_checkpoint.pth

--- Finished R-Drop Model 0.8 Training ---

