## Advanced Applied Machine Learning HW 4

For this assignment you will use a gridsearch algorithm, such as the particle swarm or CSO to tune hyperparameters for a Pytorch neural network design, such as Alex Net, to create a data application for the CiFAR10  data set and yield good accuracy on the test set. For CiFAR10, good accuracy on the test set is over 84%.

In [9]:
#Much code taken from https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
#and https://pytorch.org/hub/pytorch_vision_alexnet/
#and class file: "Example AlexNet Design Pytorch.ipynb"
#and class file: "Grid_Search_Algorithms.ipynb"
#First, I'm just going to make sure the model runs and we get fairly good accuracy with default parameters.
import torch
import torchvision 
from torchvision import datasets
import torchvision.transforms as transforms
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np

In [10]:
# Get gpu, mps or cpu device for training.
device = (
    "cuda"
    if torch.cuda.is_available()
    else "mps"
    if torch.backends.mps.is_available()
    else "cpu"
)
print(f"Using {device} device")

Using cuda device


In [11]:
def get_train_valid_loader(data_dir,
                           batch_size,
                           augment,
                           random_seed,
                           valid_size=0.1,
                           shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.4914, 0.4822, 0.4465],
        std=[0.2023, 0.1994, 0.2010],
    )

    # define transforms
    valid_transform = transforms.Compose([
            transforms.Resize((227,227)),
            transforms.ToTensor(),
            normalize,
    ])
    if augment:
        train_transform = transforms.Compose([
            transforms.RandomCrop(32, padding=4),
            transforms.RandomHorizontalFlip(),
            transforms.ToTensor(),
            normalize,
        ])
    else:
        train_transform = transforms.Compose([
            transforms.Resize((227,227)),
            transforms.ToTensor(),
            normalize,
        ])

    # load the dataset
    train_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=train_transform,
    )

    valid_dataset = datasets.CIFAR10(
        root=data_dir, train=True,
        download=True, transform=valid_transform,
    )

    num_train = len(train_dataset)
    indices = list(range(num_train))
    split = int(np.floor(valid_size * num_train))

    if shuffle:
        np.random.seed(random_seed)
        np.random.shuffle(indices)

    train_idx, valid_idx = indices[split:], indices[:split]
    train_sampler = SubsetRandomSampler(train_idx)
    valid_sampler = SubsetRandomSampler(valid_idx)

    train_loader = torch.utils.data.DataLoader(
        train_dataset, batch_size=batch_size, sampler=train_sampler)

    valid_loader = torch.utils.data.DataLoader(
        valid_dataset, batch_size=batch_size, sampler=valid_sampler)

    return (train_loader, valid_loader)


def get_test_loader(data_dir,
                    batch_size,
                    shuffle=True):
    normalize = transforms.Normalize(
        mean=[0.485, 0.456, 0.406],
        std=[0.229, 0.224, 0.225],
    )

    # define transform
    transform = transforms.Compose([
        transforms.Resize((227,227)),
        transforms.ToTensor(),
        normalize,
    ])

    dataset = datasets.CIFAR10(
        root=data_dir, train=False,
        download=True, transform=transform,
    )

    data_loader = torch.utils.data.DataLoader(
        dataset, batch_size=batch_size, shuffle=shuffle
    )

    return data_loader


# CIFAR10 dataset
train_loader, valid_loader = get_train_valid_loader(data_dir = './data',                                      batch_size = 64,
                       augment = False,                             		     random_seed = 1)

test_loader = get_test_loader(data_dir = './data',
                              batch_size = 64)

Files already downloaded and verified
Files already downloaded and verified
Files already downloaded and verified


In [12]:
class AlexNet(nn.Module):
    def __init__(self, num_classes=10):
        super(AlexNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 96, kernel_size=11, stride=4, padding=0),
            nn.BatchNorm2d(96),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(96, 256, kernel_size=5, stride=1, padding=2),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.layer3 = nn.Sequential(
            nn.Conv2d(256, 384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU())
        self.layer4 = nn.Sequential(
            nn.Conv2d(384, 384, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(384),
            nn.ReLU())
        self.layer5 = nn.Sequential(
            nn.Conv2d(384, 256, kernel_size=3, stride=1, padding=1),
            nn.BatchNorm2d(256),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size = 3, stride = 2))
        self.fc = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(9216, 4096),
            nn.ReLU())
        self.fc1 = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU())
        self.fc2= nn.Sequential(
            nn.Linear(4096, num_classes))

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = self.layer3(out)
        out = self.layer4(out)
        out = self.layer5(out)
        out = out.reshape(out.size(0), -1)
        out = self.fc(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out

In [13]:
num_classes = 10
num_epochs = 20
batch_size = 40
learning_rate = 0.005

model = AlexNet(num_classes).to(device)


# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = 0.005, momentum = 0.9)


# Train the model
total_step = len(train_loader)

In [14]:
total_step = len(train_loader)

for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Move tensors to the configured device
        images = images.to(device)
        labels = labels.to(device)

        # Forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)

        # Backward and optimize
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))

Epoch [1/20], Step [704/704], Loss: 0.8856
Accuracy of the network on the 5000 validation images: 56.9 %
Epoch [2/20], Step [704/704], Loss: 0.3926
Accuracy of the network on the 5000 validation images: 66.74 %
Epoch [3/20], Step [704/704], Loss: 0.9596
Accuracy of the network on the 5000 validation images: 71.76 %
Epoch [4/20], Step [704/704], Loss: 0.8214
Accuracy of the network on the 5000 validation images: 73.3 %
Epoch [5/20], Step [704/704], Loss: 0.4678
Accuracy of the network on the 5000 validation images: 76.66 %
Epoch [6/20], Step [704/704], Loss: 0.5510
Accuracy of the network on the 5000 validation images: 76.7 %
Epoch [7/20], Step [704/704], Loss: 1.9526
Accuracy of the network on the 5000 validation images: 79.1 %
Epoch [8/20], Step [704/704], Loss: 0.3695
Accuracy of the network on the 5000 validation images: 79.58 %
Epoch [9/20], Step [704/704], Loss: 0.5063
Accuracy of the network on the 5000 validation images: 79.12 %
Epoch [10/20], Step [704/704], Loss: 0.6946
Accura

In [15]:
with torch.no_grad():
    correct = 0
    total = 0
    for images, labels in test_loader:
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()
        del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))


Accuracy of the network on the 10000 test images: 80.28 %


In [18]:
#Success! Now let's functionalize this process
def runNetWithHypers(train_loader,valid_loader,test_loader,num_classes,num_epochs,batch_size,learning_rate,weight_decay,momentum):
    model = AlexNet(num_classes).to(device)


    # Loss and optimizer
    criterion = nn.CrossEntropyLoss()
    optimizer = torch.optim.SGD(model.parameters(), lr=learning_rate, weight_decay = weight_decay, momentum = momentum)


    # Train the model
    total_step = len(train_loader)
    
    for epoch in range(num_epochs):
        for i, (images, labels) in enumerate(train_loader):
            # Move tensors to the configured device
            images = images.to(device)
            labels = labels.to(device)

            # Forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)

            # Backward and optimize
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        print ('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}'
                   .format(epoch+1, num_epochs, i+1, total_step, loss.item()))

    # Validation
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in valid_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

        print('Accuracy of the network on the {} validation images: {} %'.format(5000, 100 * correct / total))
        
    with torch.no_grad():
        correct = 0
        total = 0
        for images, labels in test_loader:
            images = images.to(device)
            labels = labels.to(device)
            outputs = model(images)
            _, predicted = torch.max(outputs.data, 1)
            total += labels.size(0)
            correct += (predicted == labels).sum().item()
            del images, labels, outputs

    print('Accuracy of the network on the {} test images: {} %'.format(10000, 100 * correct / total))
    return correct/total
    

In [None]:
#Now let's implement PSO to determine good hyperparameters that give us > 84% accuracy

In [None]:
num_classes=10
num_epochs=30
batch_size=40
learning_rate=0.005
weight_decay = 0.005
momentum = 0.9
accuracy = 0

# PSO algorithm. For now we're going to try just optimizing along learning rate. If that proves insufficient, we may expand
# to momentum, weight_decay, and batch_size

def particle_swarm_optimization(num_dimensions, num_particles, max_iter,i_min=-10,i_max=10,bounds=None,w=0.5,c1=0.25,c2=0.75):
    # Initialize the particles
    # This creates a data structure such as a dictionary
    if bounds is None:
        particles = [({'position': [np.random.uniform(i_min, i_max) for _ in range(num_dimensions)],
                    'velocity': [np.random.uniform(-1, 1) for _ in range(num_dimensions)],
                    'pbest': float('inf'),
                    'pbest_position': None})
                    for _ in range(num_particles)]
    else:
        particles = [({'position': [np.random.uniform(bounds[i][0], bounds[i][1]) for i in range(num_dimensions)],
                    'velocity': [np.random.uniform(-1, 1) for _ in range(num_dimensions)],
                    'pbest': float('inf'),
                    'pbest_position': None})
                    for _ in range(num_particles)]

    # Initialize global best
    gbest_value = float('inf')
    gbest_position = None

    for _ in range(max_iter):
        for particle in particles:
            position = particle['position']
            velocity = particle['velocity']
            # Calculate the current value
            current_value = runNetWithHypers(train_loader,valid_loader,test_loader,num_classes=num_classes,num_epochs=num_epochs,batch_size=batch_size,
                 learning_rate=position[0],weight_decay=position[1],momentum=position[2])
            print(position, current_value)

            # Update personal best
            if 1-current_value < particle['pbest']:
                particle['pbest'] = current_value
                particle['pbest_position'] = position.copy()

            # Update global best
            if 1-current_value < gbest_value:
                gbest_value = current_value
                gbest_position = position.copy()

            # Update particle's velocity and position
            for i in range(num_dimensions):
                r1, r2 = np.random.uniform(), np.random.uniform()
                velocity[i] = w * velocity[i] + c1*r1 * (particle['pbest_position'][i] - position[i]) + c2*r2 * (gbest_position[i] - position[i])
                position[i] += velocity[i]
                # legalize the values to the provided bounds
                if bounds is not None:
                    position[i] = np.clip(position[i],bounds[i][0],bounds[i][1])

    return gbest_position, gbest_value


particle_swarm_optimization(num_dimensions=3, num_particles=30, max_iter=10,i_min=-0.001,i_max=0.001,bounds=[(0.0004,0.04),(0.0001,0.0002),(0.3,0.5)],w=0.5,c1=0.25,c2=0.75)
    

Epoch [1/30], Step [704/704], Loss: 1.2881
Epoch [2/30], Step [704/704], Loss: 1.0304
Epoch [3/30], Step [704/704], Loss: 0.9337
Epoch [4/30], Step [704/704], Loss: 1.6509
Epoch [5/30], Step [704/704], Loss: 0.5721
Epoch [6/30], Step [704/704], Loss: 1.0838
Epoch [7/30], Step [704/704], Loss: 1.0250
Epoch [8/30], Step [704/704], Loss: 0.3377
Epoch [9/30], Step [704/704], Loss: 0.1971
Epoch [10/30], Step [704/704], Loss: 0.0226
Epoch [11/30], Step [704/704], Loss: 0.3743
Epoch [12/30], Step [704/704], Loss: 0.4387
Epoch [13/30], Step [704/704], Loss: 0.6110
Epoch [14/30], Step [704/704], Loss: 0.2393
Epoch [15/30], Step [704/704], Loss: 0.3760
Epoch [16/30], Step [704/704], Loss: 0.2641
Epoch [17/30], Step [704/704], Loss: 0.3431
Epoch [18/30], Step [704/704], Loss: 0.2858
Epoch [19/30], Step [704/704], Loss: 0.0271
Epoch [20/30], Step [704/704], Loss: 0.4657
Epoch [21/30], Step [704/704], Loss: 0.5817
Epoch [22/30], Step [704/704], Loss: 0.7168
Epoch [23/30], Step [704/704], Loss: 0.01

Success! With 30 training epochs in AlexNet, a learning rate of 0.011677, a weight decay of 0.000182, and a momentum of 0.438486, we have achieved > 84% accuracy on our test data! This took a little experimentation and fine-tuning of the PSO hyperparameters (increments, bounds, #of particles, etc), but it worked!