<a href="https://colab.research.google.com/github/mudit9/DeepLearningMP1/blob/main/Copy_of_DL_MP1_(1).ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
pip install -U "ray[tune]"



In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions.

    The main path is conv3x3 -> batch norm -> ReLU -> conv3x3 -> batch norm.
    Its result is added to a shortcut of the block input and passed through
    a final ReLU.  The shortcut is an empty ``nn.Sequential`` unless the
    block changes the stride or the number of channels, in which case it is
    a 1x1 convolution followed by batch norm.

    Args:
        in_planes: Number of input channels.
        planes: Number of output channels.
        stride: Stride of the first 3x3 convolution and of the 1x1 shortcut
            convolution.
    """

    def __init__(self, in_planes, planes, stride=1):
        super().__init__()

        # Main path: only the first convolution applies ``stride``.
        self.conv1 = nn.Conv2d(in_planes, planes, kernel_size=3,
                               stride=stride, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(planes)
        self.conv2 = nn.Conv2d(planes, planes, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn2 = nn.BatchNorm2d(planes)

        # Shortcut path: project only when the output shape would differ.
        needs_projection = stride != 1 or in_planes != planes
        if needs_projection:
            self.shortcut = nn.Sequential(
                nn.Conv2d(in_planes, planes, kernel_size=1,
                          stride=stride, bias=False),
                nn.BatchNorm2d(planes),
            )
        else:
            self.shortcut = nn.Sequential()

    def forward(self, x):
        """Return ``relu(main_path(x) + shortcut(x))``."""
        y = self.conv1(x)
        y = F.relu(self.bn1(y))
        y = self.conv2(y)
        y = self.bn2(y)
        y += self.shortcut(x)
        return F.relu(y)



class ResNet(nn.Module):
    """ResNet-style image classifier built from four stages of ``block``.

    A 3x3 stem convolution (3 -> 64 channels) with batch norm and ReLU is
    followed by four stages producing 64, 128, 256 and 512 channels.  Only
    the first stage is created with stride 2; the other three use stride 1.
    The result is average-pooled, flattened and fed to a linear layer with
    512 input features.

    Args:
        block: Callable invoked as ``block(in_planes, planes, stride)`` that
            returns the module for one residual block.
        num_blocks: Indexable with at least four entries; entry ``i`` is the
            number of blocks requested for stage ``i + 1``.
        num_classes: Number of outputs of the final linear layer.
    """

    def __init__(self, block, num_blocks, num_classes=10):
        super().__init__()
        # Running channel count; ``_make_layer`` reads and updates it.
        self.in_planes = 64

        # Stem.
        self.conv1 = nn.Conv2d(3, 64, kernel_size=3,
                               stride=1, padding=1, bias=False)
        self.bn1 = nn.BatchNorm2d(64)

        # Residual stages.
        self.layer1 = self._make_layer(block, 64, num_blocks[0], 2)
        self.layer2 = self._make_layer(block, 128, num_blocks[1], 1)
        self.layer3 = self._make_layer(block, 256, num_blocks[2], 1)
        self.layer4 = self._make_layer(block, 512, num_blocks[3], 1)

        # Classifier head.
        self.linear = nn.Linear(512, num_classes)

    def _make_layer(self, block, planes, num_blocks, stride):
        """Build one stage as an ``nn.Sequential`` of blocks.

        The first block maps ``self.in_planes`` to ``planes`` channels using
        ``stride``; each of the ``num_blocks - 1`` following blocks maps
        ``planes`` to ``planes`` with stride 1.  A stage always contains at
        least one block.  ``self.in_planes`` is left equal to ``planes``.
        """
        head = block(self.in_planes, planes, stride)
        self.in_planes = planes
        tail = [block(planes, planes, 1) for _ in range(num_blocks - 1)]
        return nn.Sequential(head, *tail)

    def forward(self, x):
        """Return the class scores for the image batch ``x``."""
        feats = F.relu(self.bn1(self.conv1(x)))
        for stage in (self.layer1, self.layer2, self.layer3, self.layer4):
            feats = stage(feats)
        # Square pooling window whose side equals the feature-map width.
        feats = F.avg_pool2d(feats, feats.size(3))
        flat = feats.view(feats.size(0), -1)
        return self.linear(flat)

def project1_model():
    """Build the project network: a ``ResNet`` of ``BasicBlock`` modules.

    One block is requested for each of the four stages; ``[2, 2, 2, 2]`` is
    the deeper layout that was tried previously.
    """
    blocks_per_stage = [1, 1, 1, 1]
    return ResNet(BasicBlock, blocks_per_stage)

In [None]:
import numpy as np
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import random_split
import torchvision
import torchvision.transforms as transforms
from ray import tune
from ray.tune import CLIReporter
from ray.tune.schedulers import ASHAScheduler

In [None]:
def train_cifar(config, checkpoint_dir=None, data_dir=None):
    """Train ``project1_model()`` on CIFAR-10 as a single Ray Tune trial.

    The training set returned by ``load_data`` is split 80/20 into training
    and validation subsets.  The network is trained for 25 epochs; after
    every epoch a checkpoint is written through ``tune.checkpoint_dir`` and
    the mean validation loss and the validation accuracy are sent to Tune
    with ``tune.report``.

    Args:
        config: Hyperparameters of the trial.  The keys read are
            ``"optimizer"`` (``"sgd"``, ``"adam"`` or ``"adagrad"``),
            ``"lr"`` and ``"batch_size"``.
        checkpoint_dir: Optional directory containing a ``"checkpoint"``
            file that holds a ``(model_state, optimizer_state)`` tuple to
            resume from.
        data_dir: Forwarded unchanged to ``load_data``.

    Raises:
        ValueError: If ``config["optimizer"]`` is not a supported name.
    """
    net = project1_model()
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if torch.cuda.device_count() > 1:
            net = nn.DataParallel(net)
    net.to(device)
    criterion = nn.CrossEntropyLoss()

    # Fail early on an unknown optimizer name instead of hitting a NameError
    # on ``optimizer`` further down.
    optimizer_name = config["optimizer"]
    if optimizer_name == 'sgd':
        optimizer = optim.SGD(net.parameters(), lr=config["lr"],
                              momentum=0.9, weight_decay=5e-4)
    elif optimizer_name == 'adam':
        optimizer = optim.Adam(net.parameters(), lr=config["lr"],
                               weight_decay=5e-4)
    elif optimizer_name == 'adagrad':
        optimizer = optim.Adagrad(net.parameters(), lr=config["lr"],
                                  weight_decay=5e-4)
    else:
        raise ValueError(
            "Unsupported optimizer: {!r}".format(optimizer_name))

    if checkpoint_dir:
        model_state, optimizer_state = torch.load(
            os.path.join(checkpoint_dir, "checkpoint"))
        net.load_state_dict(model_state)
        optimizer.load_state_dict(optimizer_state)

    trainset, testset = load_data(data_dir)

    # 80% of the training set is used for training, the rest for validation.
    test_abs = int(len(trainset) * 0.8)
    train_subset, val_subset = random_split(
        trainset, [test_abs, len(trainset) - test_abs])

    trainloader = torch.utils.data.DataLoader(
        train_subset,
        batch_size=int(config["batch_size"]),
        shuffle=True,
        num_workers=8)
    # Validation only aggregates metrics, so there is nothing to gain from
    # shuffling it.
    valloader = torch.utils.data.DataLoader(
        val_subset,
        batch_size=int(config["batch_size"]),
        shuffle=False,
        num_workers=8)
    print('here4')

    for epoch in range(25):  # loop over the dataset multiple times
        # The validation pass below switches to eval mode; switch back so
        # that BatchNorm trains on batch statistics again.
        net.train()
        running_loss = 0.0
        epoch_steps = 0
        for i, (inputs, labels) in enumerate(trainloader):
            inputs, labels = inputs.to(device), labels.to(device)

            # zero the parameter gradients
            optimizer.zero_grad()

            # forward + backward + optimize
            outputs = net(inputs)
            loss = criterion(outputs, labels)
            loss.backward()
            optimizer.step()

            # print statistics
            running_loss += loss.item()
            epoch_steps += 1
            if i % 2000 == 1999:  # print every 2000 mini-batches
                print("[%d, %5d] loss: %.3f" % (epoch + 1, i + 1,
                                                running_loss / epoch_steps))
                # Reset both accumulators so the next printed value is the
                # mean over the next window only.
                running_loss = 0.0
                epoch_steps = 0

        # Validation loss and accuracy.  Eval mode makes BatchNorm use its
        # running statistics and keeps validation data from updating them.
        net.eval()
        val_loss = 0.0
        val_steps = 0
        total = 0
        correct = 0
        with torch.no_grad():
            for inputs, labels in valloader:
                inputs, labels = inputs.to(device), labels.to(device)

                outputs = net(inputs)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()

                # ``item()`` yields a plain Python float for the report.
                val_loss += criterion(outputs, labels).item()
                val_steps += 1

        with tune.checkpoint_dir(epoch) as epoch_checkpoint_dir:
            path = os.path.join(epoch_checkpoint_dir, "checkpoint")
            torch.save((net.state_dict(), optimizer.state_dict()), path)

        tune.report(loss=(val_loss / val_steps), accuracy=correct / total)
    print("Finished Training")

In [None]:
def load_data(data_dir="./data"):
    """Return the CIFAR-10 training and test datasets.

    The training set is augmented with a random 32x32 crop (after 4 pixels
    of padding) and a random horizontal flip; the test set is only converted
    to tensors.  No normalization is applied to either set.

    Args:
        data_dir: Directory the datasets are downloaded to and read from.
            ``None`` is treated as ``"./data"``.

    Returns:
        A ``(trainset, testset)`` tuple of ``torchvision.datasets.CIFAR10``
        datasets.
    """
    # NOTE: first-time downloads into the same directory are not
    # synchronized here; populate ``data_dir`` once before starting several
    # processes that call this function at the same time.
    if data_dir is None:
        data_dir = "./data"

    transform_train = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
    ])

    transform_test = transforms.Compose([
        transforms.ToTensor(),
    ])

    trainset = torchvision.datasets.CIFAR10(
        root=data_dir, train=True, download=True, transform=transform_train)

    testset = torchvision.datasets.CIFAR10(
        root=data_dir, train=False, download=True, transform=transform_test)

    return trainset, testset

In [None]:
def test_accuracy(net, device="cpu"):
    """Return the fraction of CIFAR-10 test images that ``net`` classifies correctly.

    The network is switched to eval mode for the measurement, so BatchNorm
    layers use their running statistics instead of the statistics of each
    4-image batch.  The previous training/eval mode is restored afterwards.

    Args:
        net: Model to evaluate.  Inputs are moved to ``device``; the model
            itself is not moved.
        device: Device the input batches and labels are moved to.

    Returns:
        ``correct / total`` over the test set, as a float.
    """
    _, testset = load_data()

    testloader = torch.utils.data.DataLoader(
        testset, batch_size=4, shuffle=False, num_workers=2)

    was_training = net.training
    net.eval()
    correct = 0
    total = 0
    try:
        with torch.no_grad():
            for images, labels in testloader:
                images, labels = images.to(device), labels.to(device)
                outputs = net(images)
                _, predicted = torch.max(outputs, 1)
                total += labels.size(0)
                correct += (predicted == labels).sum().item()
    finally:
        # Leave the model in the mode the caller handed it over in.
        net.train(was_training)

    return correct / total

In [None]:
def mainfunc(num_samples=10, max_num_epochs=10, gpus_per_trial=1):
    """Run the Ray Tune hyperparameter search and evaluate the best trial.

    The search samples the optimizer (``adam``, ``sgd`` or ``adagrad``), a
    log-uniform learning rate in ``[1e-3, 1e-1]`` and the batch size
    (16, 64, 128 or 256), and uses an ASHA scheduler that minimizes the
    reported ``loss``.  The trial with the lowest last reported loss is
    then reloaded from its checkpoint and scored with ``test_accuracy``.

    Args:
        num_samples: Number of configurations to sample; passed to
            ``tune.run``.
        max_num_epochs: ``max_t`` of the ASHA scheduler.
        gpus_per_trial: GPUs requested for each trial.  When greater than 1
            (and CUDA is available) the reloaded model is wrapped in
            ``nn.DataParallel``.

    Raises:
        RuntimeError: If no trial reported a ``loss``.
    """
    data_dir = os.path.abspath("./data")
    # Download CIFAR-10 into data_dir once, up front; trials that read from
    # data_dir then find it already on disk, and so does the final test
    # evaluation.
    load_data(data_dir)

    config = {
        "optimizer": tune.choice(["adam", "sgd", "adagrad"]),
        "lr": tune.loguniform(1e-3, 1e-1),
        "batch_size": tune.choice([16, 64, 128, 256])
    }
    print('here')
    scheduler = ASHAScheduler(
        metric="loss",
        mode="min",
        max_t=max_num_epochs,
        grace_period=1,
        reduction_factor=2)
    reporter = CLIReporter(
        parameter_columns=["lr", "batch_size", "optimizer"],
        metric_columns=["loss", "accuracy", "training_iteration"])
    print('here2')

    result = tune.run(
        tune.with_parameters(train_cifar, data_dir=data_dir),
        resources_per_trial={"cpu": 2, "gpu": gpus_per_trial},
        config=config,
        num_samples=num_samples,
        scheduler=scheduler,
        progress_reporter=reporter)

    best_trial = result.get_best_trial("loss", "min", "last")
    if best_trial is None:
        # Without this check the code below would fail with an opaque
        # AttributeError on ``None``.
        raise RuntimeError("No trial reported a loss; cannot pick a best trial")
    print("Best trial config: {}".format(best_trial.config))
    print("Best trial final validation loss: {}".format(
        best_trial.last_result["loss"]))
    print("Best trial final validation accuracy: {}".format(
        best_trial.last_result["accuracy"]))

    best_trained_model = project1_model()
    device = "cpu"
    if torch.cuda.is_available():
        device = "cuda:0"
        if gpus_per_trial > 1:
            best_trained_model = nn.DataParallel(best_trained_model)
    best_trained_model.to(device)

    best_checkpoint_dir = best_trial.checkpoint.value
    # map_location lets a checkpoint saved from GPU tensors be loaded when
    # the evaluation device is the CPU.
    model_state, optimizer_state = torch.load(
        os.path.join(best_checkpoint_dir, "checkpoint"),
        map_location=device)
    best_trained_model.load_state_dict(model_state)

    test_acc = test_accuracy(best_trained_model, device)
    print("Best trial test set accuracy: {}".format(test_acc))

In [None]:
# Launch the search: 7 sampled configurations, a scheduler cap (max_t) of 25
# and one GPU requested per trial.
mainfunc(num_samples=7, max_num_epochs=25, gpus_per_trial=1)

here
here2
== Status ==
Current time: 2022-03-24 05:08:10 (running for 00:00:00.18)
Memory usage on this node: 2.5/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 0/2 CPUs, 0/1 GPUs, 0.0/7.03 GiB heap, 0.0/3.51 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/train_cifar_2022-03-24_05-08-09
Number of trials: 7/7 (7 PENDING)
+-------------------------+----------+-------+------------+--------------+-------------+
| Trial name              | status   | loc   |         lr |   batch_size | optimizer   |
|-------------------------+----------+-------+------------+--------------+-------------|
| train_cifar_65f1b_00000 | PENDING  |       | 0.00225529 |           64 | sgd         |
| train_cifar_65f1b_00001 | PENDING  |       | 0.0247427  |          256 | sgd         |
| train_cifar_65f1b_00002 | PENDING  |       | 0.00212884 |          128 | sgd   

[2m[36m(train_cifar pid=13062)[0m   0%|          | 0/170498071 [00:00<?, ?it/s]
  0%|          | 443392/170498071 [00:00<00:42, 4038385.27it/s]
  3%|▎         | 4438016/170498071 [00:00<00:06, 24340585.68it/s]
  9%|▉         | 15451136/170498071 [00:00<00:03, 44705537.24it/s]
 13%|█▎        | 22135808/170498071 [00:00<00:02, 52609639.44it/s]
 17%|█▋        | 28141568/170498071 [00:00<00:02, 55125975.34it/s]
 21%|██        | 34981888/170498071 [00:00<00:02, 59442395.05it/s]
 24%|██▍       | 41440256/170498071 [00:00<00:02, 61073632.96it/s]
 28%|██▊       | 48025600/170498071 [00:00<00:01, 62564151.78it/s]
 32%|███▏      | 54445056/170498071 [00:01<00:01, 63050470.24it/s]
 36%|███▌      | 61309952/170498071 [00:01<00:01, 64745765.61it/s]
 40%|████      | 68346880/170498071 [00:01<00:01, 66447232.49it/s]
 44%|████▍     | 75840512/170498071 [00:01<00:01, 69016034.33it/s]
 49%|████▊     | 82745344/170498071 [00:01<00:01, 66016699.93it/s]
 53%|█████▎    | 89704448/170498071 [00:01<00:01,

[2m[36m(train_cifar pid=13062)[0m Extracting ./data/cifar-10-python.tar.gz to ./data
== Status ==
Current time: 2022-03-24 05:08:20 (running for 00:00:10.30)
Memory usage on this node: 3.8/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: None | Iter 4.000: None | Iter 2.000: None | Iter 1.000: None
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.03 GiB heap, 0.0/3.51 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/train_cifar_2022-03-24_05-08-09
Number of trials: 7/7 (6 PENDING, 1 RUNNING)
+-------------------------+----------+------------------+------------+--------------+-------------+
| Trial name              | status   | loc              |         lr |   batch_size | optimizer   |
|-------------------------+----------+------------------+------------+--------------+-------------|
| train_cifar_65f1b_00000 | RUNNING  | 172.28.0.2:13062 | 0.00225529 |           64 | sgd         |
| train_cifar_65f1b_00001 | PENDIN

[2m[36m(train_cifar pid=13062)[0m   cpuset_checked))


[1;30;43mStreaming output truncated to the last 5000 lines.[0m

== Status ==
Current time: 2022-03-24 05:34:30 (running for 00:26:20.54)
Memory usage on this node: 4.0/12.7 GiB
Using AsyncHyperBand: num_stopped=0
Bracket: Iter 16.000: None | Iter 8.000: -0.6050067815431364 | Iter 4.000: -0.7700059138665534 | Iter 2.000: -1.0408213484059474 | Iter 1.000: -1.2569730198307403
Resources requested: 2.0/2 CPUs, 1.0/1 GPUs, 0.0/7.03 GiB heap, 0.0/3.51 GiB objects (0.0/1.0 accelerator_type:K80)
Result logdir: /root/ray_results/train_cifar_2022-03-24_05-08-09
Number of trials: 7/7 (6 PENDING, 1 RUNNING)
+-------------------------+----------+------------------+------------+--------------+-------------+---------+------------+----------------------+
| Trial name              | status   | loc              |         lr |   batch_size | optimizer   |    loss |   accuracy |   training_iteration |
|-------------------------+----------+------------------+------------+--------------+-------------+-----