
Approximate Operations and Baseline Training Harness
---

Based on operations defined in: https://github.com/adelmanm/approx

[Faster Neural Network Training with Approximate Tensor Operations](https://arxiv.org/abs/1805.08079) by Menachem Adelman, Kfir Y. Levy, Ido Hakimi, Mark Silberstein



In [None]:
!pip install tensorboardX
!pip install torch==1.7 torchvision==0.8

To run experiments with approximate tensor operations, the custom operations cloned below must first be compiled on a machine with a CUDA-enabled GPU.

In [None]:
!git clone https://github.com/adelmanm/approx
!cd approx/src/pytorch/cpp
!python setup.py install
!cd ..

In [None]:
# Default Python Libraries
import os
import random
import time

# Required imports
import numpy as np
import matplotlib.pyplot as plt
from tqdm import trange
import tqdm

# PyTorch imports
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader, random_split, SubsetRandomSampler
import torch.optim as optim

# Torchvision
from torchvision.datasets import CIFAR10, MNIST
from torchvision import transforms, datasets
import torchvision

#approx imports
from approx_mul_pytorch import approx_Conv2d
# tensorboard
from tensorboardX import SummaryWriter
from datetime import datetime

def seed_everything(seed: int) -> None:
    """Seed every random number generator used by this notebook.

    Seeds Python's ``random`` module, NumPy and PyTorch (CPU and all CUDA
    devices) and configures cuDNN for reproducible runs.

    Args:
        seed: Value used to seed all generators.
    """
    random.seed(seed)
    # Exported for any subprocess started later; the hash seed of the
    # already-running interpreter is not changed by this assignment.
    os.environ['PYTHONHASHSEED'] = str(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    # Seed every visible GPU, not only the current one.
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Benchmark mode auto-tunes the convolution algorithm and may select a
    # different one from run to run, which defeats the deterministic flag.
    torch.backends.cudnn.benchmark = False


seed_everything(9999)

In [None]:
def _init_logger(optimizer):
    """Create the output directories for one run and open a TensorBoard writer.

    Args:
        optimizer: Label of the run; it becomes one component of the run
            directory path.

    Returns:
        A tuple ``(d, path, writer)`` where ``d`` is the timestamp string of
        the run, ``path`` is the run directory and ``writer`` is a
        ``SummaryWriter`` logging into ``<path>/log``.
    """
    d = datetime.now().strftime('%Y-%m-%d~%H:%M:%S')
    path = f'/content/drive/MyDrive/Kappa/baseline/{optimizer}/{d}'
    print(path)

    ckpt_dir = f'{path}/ckpt'
    save_tbx_log = f'{path}/log'
    # Create both directories independently and tolerate ones that already
    # exist, so a partially created run directory does not raise.
    os.makedirs(ckpt_dir, exist_ok=True)
    os.makedirs(save_tbx_log, exist_ok=True)

    writer = SummaryWriter(save_tbx_log)
    return d, path, writer


def train(model, device, train_loader, optimizer, epoch, writer):
    """Train ``model`` for one epoch and log the mean loss and the accuracy.

    Args:
        model: Network whose output is fed to ``F.nll_loss``.
        device: Device the batches are moved to.
        train_loader: Iterable of ``(data, target)`` batches supporting ``len()``.
        optimizer: Optimizer that updates the parameters of ``model``.
        epoch: Step value attached to the logged scalars.
        writer: Object exposing ``add_scalar(tag, value, step)``.
    """
    model.train()
    num_correct = 0
    num_samples = 0
    running_loss = 0.0
    for data, target in train_loader:
        data, target = data.to(device), target.to(device)

        optimizer.zero_grad()
        output = model(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        running_loss += loss.item()

        pred = output.argmax(dim=1)
        num_correct += pred.eq(target).sum().item()
        # Count what was actually seen instead of assuming a fixed set size.
        num_samples += target.size(0)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    mean_loss = running_loss / max(len(train_loader), 1)
    accuracy = 100.0 * num_correct / max(num_samples, 1)
    writer.add_scalar('Train/train_loss', mean_loss, epoch)
    writer.add_scalar('Train/train_acc', accuracy, epoch)
        

def validate(model, device, validation_loader, epoch, writer, start_time=None):
    """Evaluate ``model`` on the validation batches and log loss and accuracy.

    Both metrics are averaged over the number of samples actually yielded by
    ``validation_loader``. This matters when the loader draws a subset of a
    larger dataset through a sampler, where ``len(loader.dataset)`` would be
    the size of the whole dataset.

    Args:
        model: Network whose output is fed to ``F.nll_loss``.
        device: Device the batches are moved to.
        validation_loader: Iterable of ``(data, target)`` batches.
        epoch: Step value attached to the logged scalars.
        writer: Object exposing ``add_scalar(tag, value, step)``.
        start_time: Reference ``time.time()`` value for the elapsed-time
            print-out. When omitted, the module-level ``t_start`` is used if
            it exists, otherwise the current time.

    Returns:
        Validation accuracy in percent.
    """
    model.eval()
    total_loss = 0.0
    correct = 0
    num_samples = 0
    with torch.no_grad():
        for data, target in validation_loader:
            data, target = data.to(device), target.to(device)
            output = model(data)
            total_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1)
            correct += pred.eq(target).sum().item()
            num_samples += target.size(0)

    # max(..., 1) keeps an empty loader from raising ZeroDivisionError.
    denominator = max(num_samples, 1)
    validation_loss = total_loss / denominator
    accuracy = 100. * correct / denominator

    if start_time is None:
        start_time = globals().get('t_start', time.time())
    print('Epoch: {}, \nValidation set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%) elapsed_time: {:.2f} sec\n'.format(
        epoch, validation_loss, correct, num_samples, accuracy, time.time() - start_time))

    writer.add_scalar("Validation/Val Acc", accuracy, epoch)
    writer.add_scalar("Validation/Val loss", validation_loss, epoch)

    return accuracy
    
    
def test(model, device, test_loader, writer):
    """Evaluate ``model`` on the test loader, print a summary and log accuracy.

    The summed negative log-likelihood and the number of correct predictions
    are accumulated over all batches and then normalised by
    ``len(test_loader.dataset)``. The accuracy (in percent) is written to
    ``writer`` under the tag ``'Test/Test Acc'`` with step ``1``.
    """
    model.eval()
    loss_sum = 0
    hits = 0
    with torch.no_grad():
        for inputs, labels in test_loader:
            inputs = inputs.to(device)
            labels = labels.to(device)
            log_probs = model(inputs)
            loss_sum += F.nll_loss(log_probs, labels, reduction='sum').item()
            _, predicted = log_probs.max(1, keepdim=True)
            hits += predicted.eq(labels.view_as(predicted)).sum().item()

    num_samples = len(test_loader.dataset)
    mean_loss = loss_sum / num_samples
    accuracy = 100. * hits / num_samples
    print('\nTest set: Average loss: {:.4f}, Accuracy: {}/{} ({:.2f}%)\n'.format(
        mean_loss, hits, num_samples, accuracy))

    writer.add_scalar('Test/Test Acc', accuracy, 1)


In [None]:
# Prefer the GPU when one is visible to PyTorch, otherwise run on the CPU.
use_cuda = torch.cuda.is_available()
device = torch.device("cuda") if use_cuda else torch.device("cpu")
print(f"using device {device}")

# Mini-batch size used when building the data loaders.
batch_size = 128

# Extra DataLoader keyword arguments; only populated for CUDA runs.
kwargs = {}
if use_cuda:
    kwargs = {'num_workers': 1, 'pin_memory': True}

using device cuda


In [None]:
# cifar10
def load_cifar10():
    """Build the CIFAR-10 train, validation and test data loaders.

    The 10000-image validation subset is drawn at random (without
    replacement) from the official training split; the remaining images form
    the training subset. Training images are augmented with random crops and
    horizontal flips. Validation and test images are only converted to
    tensors and normalised.

    Side effects:
        Sets the module-level globals ``train_size`` and ``validation_size``
        and prints both sizes. Reads the module-level ``batch_size`` and
        ``kwargs``.

    Returns:
        Tuple ``(train_loader, validation_loader, test_loader)``.
    """
    global train_size, validation_size

    mean = (0.4914, 0.4822, 0.4465)
    std = (0.2023, 0.1994, 0.2010)
    train_transform = transforms.Compose([
        transforms.RandomCrop(32, padding=4),
        transforms.RandomHorizontalFlip(),
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])
    eval_transform = transforms.Compose([
        transforms.ToTensor(),
        transforms.Normalize(mean, std),
    ])

    trainset = datasets.CIFAR10(root='../data', train=True,
                                download=True, transform=train_transform)
    # Second view of the same training split without augmentation, so that
    # validation metrics are not computed on randomly cropped/flipped images.
    valset = datasets.CIFAR10(root='../data', train=True,
                              download=True, transform=eval_transform)
    testset = datasets.CIFAR10(root='../data', train=False,
                               download=True, transform=eval_transform)

    validation_size = 10000  # define the split size
    train_size = len(trainset) - validation_size
    print('training set size: {} samples'.format(train_size))
    print('validation set size: {} samples'.format(validation_size))

    # Random, non-contiguous split of the training indices.
    indices = np.arange(len(trainset))
    validation_idx = np.random.choice(indices, size=validation_size, replace=False)
    train_idx = np.setdiff1d(indices, validation_idx)

    # SubsetRandomSampler draws the given indices in random order, without
    # replacement, so each loader only ever sees its own part of the split.
    train_sampler = SubsetRandomSampler(train_idx.tolist())
    validation_sampler = SubsetRandomSampler(validation_idx.tolist())

    train_loader = torch.utils.data.DataLoader(
        dataset=trainset,
        batch_size=batch_size, sampler=train_sampler, **kwargs)
    validation_loader = torch.utils.data.DataLoader(
        dataset=valset,
        batch_size=batch_size, sampler=validation_sampler, **kwargs)
    # Evaluation sums over the whole test set, so shuffling is unnecessary.
    test_loader = torch.utils.data.DataLoader(testset, batch_size=32,
                                              shuffle=False, num_workers=2)

    return train_loader, validation_loader, test_loader

In [None]:
### Based on https://github.com/pytorch/vision/blob/main/torchvision/models/resnet.py

import torch
from torch import Tensor
import torch.nn as nn
from typing import Type, Any, Callable, Union, List, Optional



def conv3x3(in_planes: int, out_planes: int, stride: int = 1, groups: int = 1, dilation: int = 1,
            approx: bool = False) -> nn.Module:
    """3x3 convolution with padding.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Convolution stride.
        groups: Number of convolution groups.
        dilation: Dilation factor; also used as the padding.
        approx: When true, build an ``approx_Conv2d`` (with ``minimal_k=10``
            and ``sample_ratio=0.5``) instead of a regular ``nn.Conv2d``.

    Returns:
        The bias-free convolution layer.
    """
    conv_kwargs = dict(kernel_size=3, stride=stride, padding=dilation,
                       groups=groups, bias=False, dilation=dilation)
    if approx:
        return approx_Conv2d(in_planes, out_planes, minimal_k=10, sample_ratio=0.5, **conv_kwargs)
    return nn.Conv2d(in_planes, out_planes, **conv_kwargs)

def conv1x1(in_planes: int, out_planes: int, stride: int = 1, approx: bool = False) -> nn.Module:
    """1x1 convolution.

    Args:
        in_planes: Number of input channels.
        out_planes: Number of output channels.
        stride: Convolution stride.
        approx: When true, build an ``approx_Conv2d`` (with ``minimal_k=10``
            and ``sample_ratio=0.5``) instead of a regular ``nn.Conv2d``.

    Returns:
        The bias-free convolution layer.
    """
    # The two branches used to be swapped, so approx=True produced the exact
    # layer and approx=False the approximate one (unlike conv3x3).
    if approx:
        return approx_Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False,
                             minimal_k=10, sample_ratio=0.5)
    return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)


class BasicBlock(nn.Module):
    """Residual block made of two 3x3 convolutions, each followed by a norm layer.

    The block input (optionally passed through ``downsample``) is added to the
    output of the second norm layer before the final ReLU. ``approx`` is
    forwarded to ``conv3x3`` for both convolutions.
    """

    expansion: int = 1

    def __init__(
        self,
        inplanes: int,
        planes: int,
        stride: int = 1,
        downsample: Optional[nn.Module] = None,
        groups: int = 1,
        base_width: int = 64,
        dilation: int = 1,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        approx = False
    ) -> None:
        super().__init__()
        make_norm = nn.BatchNorm2d if norm_layer is None else norm_layer
        if not (groups == 1 and base_width == 64):
            raise ValueError('BasicBlock only supports groups=1 and base_width=64')
        if dilation > 1:
            raise NotImplementedError("Dilation > 1 not supported in BasicBlock")

        # First stage: carries the stride, so it is the one that reduces the
        # spatial size (together with ``downsample``) when stride != 1.
        self.conv1 = conv3x3(inplanes, planes, stride, approx = approx)
        self.bn1 = make_norm(planes)
        self.relu = nn.ReLU(inplace=True)
        # Second stage: stride 1, same channel count.
        self.conv2 = conv3x3(planes, planes, approx = approx)
        self.bn2 = make_norm(planes)
        self.downsample = downsample
        self.stride = stride

    def forward(self, x: Tensor) -> Tensor:
        """Return ``relu(bn2(conv2(relu(bn1(conv1(x))))) + shortcut)``."""
        residual = self.relu(self.bn1(self.conv1(x)))
        residual = self.bn2(self.conv2(residual))

        # Shortcut branch is evaluated after the main branch, as before.
        shortcut = x if self.downsample is None else self.downsample(x)

        residual += shortcut
        return self.relu(residual)


class ResNet(nn.Module):
    """ResNet built from ``block`` with optional approximate convolutions.

    The network is a 7x7 stride-2 stem convolution, max pooling, four stages
    (``layer1`` .. ``layer4`` with 64/128/256/512 planes), global average
    pooling and a linear classifier. ``layer1`` and ``layer2`` always use
    exact convolutions; ``approx`` controls ``layer3`` and ``layer4``.

    Args:
        block: Residual block class used in every stage.
        layers: Number of blocks in each of the four stages.
        num_classes: Output size of the final linear layer.
        zero_init_residual: If true, zero-initialise the weight of the last
            norm layer of every ``BasicBlock``.
        groups: Forwarded to every block.
        width_per_group: Forwarded to every block as ``base_width``.
        replace_stride_with_dilation: Three flags, one per strided stage
            (``layer2`` .. ``layer4``), replacing the stride by dilation.
        norm_layer: Factory for normalisation layers; defaults to
            ``nn.BatchNorm2d``.
        approx: Whether ``layer3`` and ``layer4`` use approximate
            convolutions. ``None`` (the default) keeps the layout this class
            always produced before the flag was honoured, i.e. approximate
            convolutions in both stages; ``True`` does the same; ``False``
            builds exact convolutions everywhere.

    Raises:
        ValueError: If ``replace_stride_with_dilation`` does not have exactly
            three elements.
    """

    def __init__(
        self,
        block: Type[BasicBlock],
        layers: List[int],
        num_classes: int = 1000,
        zero_init_residual: bool = False,
        groups: int = 1,
        width_per_group: int = 64,
        replace_stride_with_dilation: Optional[List[bool]] = None,
        norm_layer: Optional[Callable[..., nn.Module]] = None,
        approx: Optional[bool] = None
    ) -> None:
        super().__init__()
        if norm_layer is None:
            norm_layer = nn.BatchNorm2d
        self._norm_layer = norm_layer

        self.inplanes = 64
        self.dilation = 1
        if replace_stride_with_dilation is None:
            # each element in the tuple indicates if we should replace
            # the 2x2 stride with a dilated convolution instead
            replace_stride_with_dilation = [False, False, False]
        if len(replace_stride_with_dilation) != 3:
            raise ValueError("replace_stride_with_dilation should be None "
                             "or a 3-element tuple, got {}".format(replace_stride_with_dilation))
        self.groups = groups
        self.base_width = width_per_group

        # None means "not specified": keep approximating the two deepest
        # stages so that callers that never passed the flag are unaffected.
        approx_deep = True if approx is None else bool(approx)

        self.conv1 = nn.Conv2d(3, self.inplanes, kernel_size=7, stride=2, padding=3,
                               bias=False)
        self.bn1 = norm_layer(self.inplanes)
        self.relu = nn.ReLU(inplace=True)
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, padding=1)
        self.layer1 = self._make_layer(block, 64, layers[0], approx=False)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       dilate=replace_stride_with_dilation[0], approx=False)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       dilate=replace_stride_with_dilation[1], approx=approx_deep)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       dilate=replace_stride_with_dilation[2], approx=approx_deep)
        self.avgpool = nn.AdaptiveAvgPool2d((1, 1))
        self.fc = nn.Linear(512 * block.expansion, num_classes)

        # NOTE(review): approximate convolutions are only re-initialised here
        # if approx_Conv2d is a subclass of nn.Conv2d - confirm.
        for m in self.modules():
            if isinstance(m, nn.Conv2d):
                nn.init.kaiming_normal_(m.weight, mode='fan_out', nonlinearity='relu')
            elif isinstance(m, (nn.BatchNorm2d, nn.GroupNorm)):
                nn.init.constant_(m.weight, 1)
                nn.init.constant_(m.bias, 0)

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        if zero_init_residual:
            for m in self.modules():
                if isinstance(m, BasicBlock):
                    nn.init.constant_(m.bn2.weight, 0)  # type: ignore[arg-type]

    def _make_layer(self, block: Type[BasicBlock], planes: int, blocks: int,
                    stride: int = 1, dilate: bool = False, approx: bool = False) -> nn.Sequential:
        """Build one stage of ``blocks`` residual blocks with ``planes`` planes.

        Only the first block receives ``stride`` and, when the shape changes,
        a 1x1-convolution + norm ``downsample`` branch. ``approx`` is
        forwarded to every block and to the downsample convolution. Updates
        ``self.inplanes`` (and ``self.dilation`` when ``dilate`` is set).
        """
        norm_layer = self._norm_layer
        downsample = None
        previous_dilation = self.dilation
        if dilate:
            # Trade the stride for dilation: keep the resolution, widen the
            # receptive field instead.
            self.dilation *= stride
            stride = 1
        if stride != 1 or self.inplanes != planes * block.expansion:
            downsample = nn.Sequential(
                conv1x1(self.inplanes, planes * block.expansion, stride, approx=approx),
                norm_layer(planes * block.expansion),
            )

        layers = []
        layers.append(block(self.inplanes, planes, stride, downsample, self.groups,
                            self.base_width, previous_dilation, norm_layer, approx=approx))
        self.inplanes = planes * block.expansion
        for _ in range(1, blocks):
            layers.append(block(self.inplanes, planes, groups=self.groups,
                                base_width=self.base_width, dilation=self.dilation,
                                norm_layer=norm_layer, approx=approx))

        return nn.Sequential(*layers)

    def _forward_impl(self, x: Tensor) -> Tensor:
        """Run stem, the four stages, pooling and the classifier on ``x``."""
        x = self.conv1(x)
        x = self.bn1(x)
        x = self.relu(x)
        x = self.maxpool(x)

        x = self.layer1(x)
        x = self.layer2(x)
        x = self.layer3(x)
        x = self.layer4(x)

        x = self.avgpool(x)
        x = torch.flatten(x, 1)
        x = self.fc(x)

        return x

    def forward(self, x: Tensor) -> Tensor:
        """Forward pass; delegates to ``_forward_impl``."""
        return self._forward_impl(x)


def _resnet(
    arch: str,
    block: Type[BasicBlock],
    layers: List[int],
    pretrained: bool,
    progress: bool,
    approx = False,
    **kwargs: Any
) -> ResNet:
    """Instantiate a ``ResNet`` from a block type and per-stage depths.

    Args:
        arch: Architecture name. Unused: no pretrained weights are loaded.
        block: Residual block class.
        layers: Number of blocks per stage.
        pretrained: Unused; kept for signature compatibility.
        progress: Unused; kept for signature compatibility.
        approx: Forwarded to ``ResNet`` as its ``approx`` argument.
        **kwargs: Additional keyword arguments forwarded to ``ResNet``.

    Returns:
        The freshly initialised model.
    """
    # The flag used to be dropped here, so ResNet never saw the caller's choice.
    return ResNet(block, layers, approx=approx, **kwargs)


def resnet18(pretrained: bool = False, progress: bool = True, approx = False, **kwargs: Any) -> ResNet:
    r"""Build the ResNet-18 layout (two ``BasicBlock``\ s in each of four stages).

    Architecture from `"Deep Residual Learning for Image Recognition"
    <https://arxiv.org/pdf/1512.03385.pdf>`_.

    Args:
        pretrained (bool): Passed through to ``_resnet``.
        progress (bool): Passed through to ``_resnet``.
        approx: Passed through to ``_resnet`` as ``approx``.
        **kwargs: Passed through to ``_resnet``.
    """
    stage_depths = [2, 2, 2, 2]
    return _resnet('resnet18', BasicBlock, stage_depths, pretrained, progress,
                   approx=approx, **kwargs)


In [None]:
# `import tqdm` on its own is not guaranteed to load the `tqdm.notebook`
# submodule, so import it explicitly before `tqdm.notebook.tqdm` is used below.
import tqdm.notebook

start_epoch = 1

max_epoch = 40
train_loader, validation_loader, test_loader = load_cifar10()

approx = False # Toggle based on experiment

dataset = 'cifar10'

# Optimizer name -> factory building that optimizer for a parameter iterable.
optimizer_factories = {
    'SGD': lambda params: torch.optim.SGD(params, lr=1e-2),
    'SGD_mom': lambda params: torch.optim.SGD(params, lr=1e-2, momentum=0.99, nesterov=True),
    'Adam': lambda params: torch.optim.Adam(params, lr=1e-2),
    'Adadelta': lambda params: torch.optim.Adadelta(params, lr=1e-2),
    'RMSProp': lambda params: torch.optim.RMSprop(params, lr=1e-2),
    'Adagrad': lambda params: torch.optim.Adagrad(params, lr=1e-2),
}

optimizers = ['SGD', 'SGD_mom', 'Adam', 'Adadelta', 'RMSProp', 'Adagrad']
# The loop variable is not called `optim`, which would overwrite the
# `torch.optim` alias imported at the top of the notebook.
for optim_name in optimizers:
    name = optim_name
    d, path, writer = _init_logger(f'{dataset}_{name}')

    if approx:
        model = resnet18(approx=True)
    else:
        model = torchvision.models.resnet18(False)
    # Replace the stem and head: 3x3 stride-1 first convolution, no max
    # pooling, and a 10-way classifier that outputs log-probabilities (the
    # train/validate/test helpers use F.nll_loss).
    model.conv1 = nn.Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    model.maxpool = nn.Identity()
    model.fc = nn.Sequential(nn.Linear(512, 10), nn.LogSoftmax(dim=1))
    model.to(device)
    print(optim_name)

    optimizer = optimizer_factories[optim_name](model.parameters())

    # Reset per optimizer: otherwise a later run that never beats an earlier
    # run's best accuracy saves no checkpoint and torch.load below fails.
    best_val_acc = 0

    # Module-level start time; validate() reports elapsed time relative to it.
    t_start = time.time()
    for epoch in tqdm.notebook.tqdm(range(max_epoch)):
        train(model, device, train_loader, optimizer, epoch, writer)
        val_acc = validate(model, device, validation_loader, epoch, writer)
        print(val_acc)
        if val_acc >= best_val_acc:
            best_val_acc = val_acc
            print('best_val_acc:', best_val_acc)
            state = {
                'epoch': epoch,
                'best_val_acc': best_val_acc,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
            }
            torch.save(state, f'{path}/ckpt/{d}.pth')

    t_end = time.time()
    print('Total training time: {:.2f} sec'.format(t_end-t_start))

    # Evaluate the checkpoint with the best validation accuracy of this run.
    model.load_state_dict(torch.load(f'{path}/ckpt/{d}.pth')['state_dict'])
    model.eval()
    test(model, device, test_loader, writer)
    writer.close()