In [1]:
import torch
import torchvision
import torchvision.transforms as transforms
from torch.utils.data import DataLoader
import torch.optim as optim
import torch.nn as nn
import torch.optim.lr_scheduler as lr_scheduler
from torchvision.models import vgg11_bn

In [2]:
torch.cuda.empty_cache()

In [3]:
transform_train = transforms.Compose([
    transforms.Resize(144),
    transforms.CenterCrop(128),
    transforms.RandomHorizontalFlip(),
    transforms.ColorJitter(brightness=0.2, contrast=0.2, saturation=0.2, hue=0.2),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)), 
])
transform_test = transforms.Compose([
    transforms.Resize(128),
    transforms.ToTensor(),
    transforms.Normalize((0.4914, 0.4822, 0.4465), (0.247, 0.243, 0.261)),
])

In [4]:
train_set = torchvision.datasets.CIFAR10(root='./data', train=True, download=True, transform=transform_train)
train_loader = DataLoader(dataset=train_set , batch_size=256 ,shuffle=True)

test_set = torchvision.datasets.CIFAR10(root='./data', train=False, download=True, transform=transform_test)
test_loader = torch.utils.data.DataLoader(test_set, batch_size=256, shuffle=False, num_workers=2)


Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to ./data/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [00:02<00:00, 84215324.15it/s]


Extracting ./data/cifar-10-python.tar.gz to ./data
Files already downloaded and verified


In [5]:
from torchvision import datasets
from torch.utils.data import DataLoader, random_split

val_size = 5000  
test_size = len(test_set) - val_size

val_set, test_set = random_split(test_set, [val_size, test_size])

val_loader = DataLoader(dataset=val_set, batch_size=128, shuffle=False)
test_loader = DataLoader(dataset=test_set, batch_size=128, shuffle=False)

print(f"Training dataset size: {len(train_set)}")
print(f"Validation dataset size: {len(val_set)}")
print(f"Test dataset size: {len(test_set)}")

Training dataset size: 50000
Validation dataset size: 5000
Test dataset size: 5000


In [6]:
train_set[0][0].shape

torch.Size([3, 128, 128])

In [7]:
# Load Pretrained vgg11_bn
model = vgg11_bn(pretrained=True)


Downloading: "https://download.pytorch.org/models/vgg11_bn-6002323d.pth" to /root/.cache/torch/hub/checkpoints/vgg11_bn-6002323d.pth
100%|██████████| 507M/507M [00:02<00:00, 222MB/s]  


In [8]:
model

VGG(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (4): Conv2d(64, 128, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (5): BatchNorm2d(128, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (6): ReLU(inplace=True)
    (7): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
    (8): Conv2d(128, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (10): ReLU(inplace=True)
    (11): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (12): BatchNorm2d(256, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (13): ReLU(inplace=True)
    (14): MaxPool2d(ke

In [9]:
features = nn.Sequential(*list(model.features.children())[:28])

classifier = nn.Sequential(
    nn.AdaptiveAvgPool2d(output_size=(3, 3)),
    nn.Flatten(),
    nn.Linear(4608, 512),
    nn.BatchNorm1d(512),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(512, 512),  
    nn.BatchNorm1d(512),
    nn.ReLU(),
    nn.Dropout(p=0.5),
    nn.Linear(512, 10)
)

model = nn.Sequential(
    features,
    classifier
)

In [15]:
x = torch.randn(256,3,128,128)
model(x).shape

torch.Size([256, 10])

In [16]:
print(f"Available GPUs: {torch.cuda.device_count()}")


Available GPUs: 2


In [19]:
# Training settings
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

if torch.cuda.device_count() > 1:
    print(f"Using {torch.cuda.device_count()} GPUs!")
    model = torch.nn.DataParallel(model)

model = model.to(device)
print(torch.cuda.is_available())

criterion = nn.CrossEntropyLoss(label_smoothing=0.1)
optimizer = optim.SGD(model.parameters(), lr=0.01,momentum=0.9 ,weight_decay = 5e-4 )
scheduler = lr_scheduler.StepLR(optimizer, step_size = 10, gamma = 0.1)

Using 2 GPUs!
True


In [20]:
# Training Loop
num_epochs = 400

for epoch in range(num_epochs):
    # Training Phase
    model.train()  # Set the model to training mode
    running_train_loss = 0.0
    correct_train = 0
    total_train = 0
    
    for images, labels in train_loader:
        images, labels = images.to(device), labels.to(device)
        
        optimizer.zero_grad()  # Zero the gradients
        outputs = model(images)  # Forward pass
        loss = criterion(outputs, labels)  # Compute loss
        loss.backward()  # Backpropagation
        optimizer.step()  # Update model parameters
        
        running_train_loss += loss.item() 
        
        # Track training accuracy
        _, predicted = outputs.max(1)
        total_train += labels.size(0)
        correct_train += predicted.eq(labels).sum().item()
    
    train_loss = running_train_loss / len(train_loader)
    train_accuracy = 100 * correct_train / total_train

    # Validation Phase
    model.eval()  
    running_val_loss = 0.0
    correct_val = 0
    total_val = 0
    
    with torch.cuda.amp.autocast(): 
        for images, labels in test_loader:
            images, labels = images.to(device), labels.to(device)
            outputs = model(images)  
            loss = criterion(outputs, labels)  
            running_val_loss += loss.item()
            
            _, predicted = outputs.max(1)
            total_val += labels.size(0)
            correct_val += predicted.eq(labels).sum().item()
    
    val_loss = running_val_loss / len(test_loader)
    val_accuracy = 100 * correct_val / total_val

    print(f'Epoch [{epoch + 1}/{num_epochs}]')
    print(f'Train Loss: {train_loss:.4f} | Train Accuracy: {train_accuracy:.2f}%')
    print(f'Val Loss: {val_loss:.4f} | Val Accuracy: {val_accuracy:.2f}%')
    
    scheduler.step()  


  with torch.cuda.device(device), torch.cuda.stream(stream), autocast(enabled=autocast_enabled):
  with torch.cuda.amp.autocast():  # No need to compute gradients during validation


Epoch [1/400]
Train Loss: 1.0832 | Train Accuracy: 76.56%
Val Loss: 0.7662 | Val Accuracy: 89.82%
Epoch [2/400]
Train Loss: 0.7940 | Train Accuracy: 89.77%
Val Loss: 0.6990 | Val Accuracy: 92.34%
Epoch [3/400]
Train Loss: 0.7224 | Train Accuracy: 92.66%
Val Loss: 0.6777 | Val Accuracy: 92.56%
Epoch [4/400]
Train Loss: 0.6807 | Train Accuracy: 94.11%
Val Loss: 0.6593 | Val Accuracy: 93.40%
Epoch [5/400]
Train Loss: 0.6478 | Train Accuracy: 95.44%
Val Loss: 0.6572 | Val Accuracy: 93.60%
Epoch [6/400]
Train Loss: 0.6209 | Train Accuracy: 96.56%
Val Loss: 0.6517 | Val Accuracy: 93.70%
Epoch [7/400]
Train Loss: 0.6048 | Train Accuracy: 97.19%
Val Loss: 0.6597 | Val Accuracy: 93.62%
Epoch [8/400]
Train Loss: 0.5875 | Train Accuracy: 97.84%
Val Loss: 0.6665 | Val Accuracy: 93.56%
Epoch [9/400]
Train Loss: 0.5773 | Train Accuracy: 98.20%
Val Loss: 0.6582 | Val Accuracy: 93.66%
Epoch [10/400]
Train Loss: 0.5698 | Train Accuracy: 98.47%
Val Loss: 0.6601 | Val Accuracy: 93.40%
Epoch [11/400]
Trai

KeyboardInterrupt: 

In [21]:
def check_accuracy(loader,model):
    num_correct = 0
    num_samples = 0
    model.eval()

    with torch.no_grad():
        for x,y in loader:
            x = x.to(device = device)
            y = y.to(device = device)

            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
    #model.train()
    return num_correct / num_samples

In [22]:
check_accuracy(test_loader,model)

tensor(0.9412, device='cuda:0')

In [24]:
        torch.save({
        'epoch': epoch + 1,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': loss,
        }, "model_checkpoint09412.pth")