## Optimizing Training Pipeline for Plant Pathology in Pytorch

To improve training performance in any deep learning pipeline, PyTorch suggests a few small additions to the existing code.
These additions are explained well in the [Performance Tuning Guide](https://pytorch.org/tutorials/recipes/recipes/tuning_guide.html). In this notebook you will try out several of these features hands-on, with a focus on GPU computing.

Importing required libraries

In [None]:
import os
import json
import time
import random
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models

In [None]:
######################################################
# Mixed Precision with Apex and Monitoring with Wandb
import wandb
from apex import amp
from apex.optimizers import FusedAdam
######################################################

### Login to [Wandb](https://wandb.ai/home) 

Save your API key once you have logged in.

In [None]:
##################
wandb.login()
#####################

### Set GPU Device if multiple

In [None]:
!nvidia-smi

In [None]:
##############################################################
# Pin the CUDA device numbering to the PCI bus order and expose only GPU 0
# to this process.
os.environ.update({
    "CUDA_DEVICE_ORDER": "PCI_BUS_ID",
    "CUDA_VISIBLE_DEVICES": "0",
})
##############################################################

### Use device for `cuda` or `cpu` based on availability

```python
cuda = torch.device('cuda')     # Default CUDA device
cuda0 = torch.device('cuda:0')  # GPU 0 
cuda2 = torch.device('cuda:2')  # GPU 2 (these are 0-indexed)


x = torch.tensor([1., 2.], device=cuda0)
# x.device is device(type='cuda', index=0)
y = torch.tensor([1., 2.]).cuda()
# y.device is device(type='cuda', index=0)

with torch.cuda.device(1):
    # allocates a tensor on GPU 1
    a = torch.tensor([1., 2.], device=cuda)

    # transfers a tensor from CPU to GPU 1
    b = torch.tensor([1., 2.]).cuda()
    # a.device and b.device are device(type='cuda', index=1)

    # You can also use ``Tensor.to`` to transfer a tensor:
    b2 = torch.tensor([1., 2.]).to(device=cuda)
    # b.device and b2.device are device(type='cuda', index=1)

    c = a + b
    # c.device is device(type='cuda', index=1)

    z = x + y
    # z.device is device(type='cuda', index=0)

    # even within a context, you can specify the device
    # (or give a GPU index to the .cuda call)
    d = torch.randn(2, device=cuda2)
    e = torch.randn(2).to(cuda2)
    f = torch.randn(2).cuda(cuda2)
    # d.device, e.device, and f.device are all device(type='cuda', index=2)
```

In [None]:
####################################################################
# Prefer the GPU when CUDA is usable, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device
####################################################################

Create a Model Dir

In [None]:
# Create the checkpoint directory; exist_ok=True makes re-running the cell a
# no-op, while still raising if "./saved" exists but is not a directory.
os.makedirs("./saved", exist_ok=True)

Necessary and Performance-Tuning Hyperparameters

In [None]:
# All hyperparameters in one mapping so they can be handed to wandb as-is.
config = {
    # Necessary
    "TRAIN_CSV": "../data/train.csv",
    "TEST_CSV": "../data/test.csv",
    "IMAGE_PATH": "../data/images",
    "VOCAB": "labels.json",
    "saved_path": "./saved/resnet18.pt",
    "lr": 0.001,
    "EPOCHS": 10,
    "BATCH_SIZE": 32,
    "IMAGE_SIZE": 224,
    "TRAIN_VALID_SPLIT": 0.2,
    ####################################################################
    # For performance tuning
    "device": device,
    "SEED": 42,
    "pin_memory": True,
    "num_workers": 8,
    "USE_AMP": True,
    "channels_last": True,
}
####################################################################

### Initiate a Wandb Project

In [None]:
#Initiate the Project and Entity
wandb.init(project="pytorch-lab", config=config)
# access all HPs through wandb.config, so logging matches execution!
config = wandb.config

### Reproducibility: Mandate for Constant Results

In [None]:
# Seed every random number generator the pipeline may touch with the same value:
#   random.seed           - Python's own RNG (custom operators, shuffles)
#   np.random.seed        - the global NumPy RNG used by many libraries
#   torch.manual_seed     - torch's RNG
#   torch.cuda.manual_seed - torch's RNG for the current CUDA device
for seed_fn in (random.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_fn(config.SEED)

### Create tensors directly on the target device

Instead of calling `torch.rand(size).cuda()` to generate a random tensor, produce the output directly on the target device: `torch.rand(size, device=torch.device('cuda'))`.

This is applicable to all functions which create new tensors and accept device argument: `torch.rand(), torch.zeros(), torch.full()` and similar.

In [None]:
%%timeit
torch.randn((64, 3, 1280, 720)).to(device)

In [None]:
%%timeit
torch.randn((64, 3, 1280, 720), device=device)

cuDNN convolution benchmarking: picks the fastest convolution algorithm for the current input shapes, but may lose reproducibility if set to True

In [None]:
# Researchers who need reproducible runs: set this to False.
# Developers who want maximum throughput: set this to True.
# The flag is `benchmark` (singular); assigning to a misspelled name such as
# `benchmarks` only creates an unused attribute and leaves the autotuner off.
torch.backends.cudnn.benchmark = True

For reproducibility, make cuDNN choose a deterministic alternative algorithm

In [None]:
torch.backends.cudnn.deterministic = True

### Enabling TF32 on Ampere GPUs

In [None]:
# The flag below controls whether to allow TF32 on matmul. This flag defaults to True.
torch.backends.cuda.matmul.allow_tf32 = True

# The flag below controls whether to allow TF32 on cuDNN. This flag defaults to True.
torch.backends.cudnn.allow_tf32 = True

### Data Manipulation (Can be written Separately too)

In [None]:
# Load the train/test tables and the label vocabulary (label name -> class index).
train_df = pd.read_csv(config.TRAIN_CSV)
test_df = pd.read_csv(config.TEST_CSV)
# Context manager so the vocabulary file handle is closed right after parsing.
with open(config.VOCAB, encoding="utf-8") as vocab_file:
    vocab = json.load(vocab_file)

# All image ids (train followed by test) as a plain list.
# Series.append was removed in pandas 2.0; pd.concat is the supported equivalent.
df_fnames = pd.concat([train_df["image_id"], test_df["image_id"]], ignore_index=True).tolist()
def create_fname(path,extension):
    """Build a converter from a bare file name to a full file path.

    Args:
        path: Directory the files live in.
        extension: Suffix (including the dot) appended after joining.

    Returns:
        A one-argument callable mapping ``fname`` to
        ``os.path.join(path, fname) + extension``.
    """
    return lambda fname: os.path.join(path, fname) + extension

# Turn bare image ids into full .jpg paths in both tables.
jpeg_extension_creator = create_fname(config.IMAGE_PATH,".jpg")
for frame in (train_df, test_df):
    frame["image_id"] = frame["image_id"].apply(jpeg_extension_creator)

# Collapse the one-hot label columns into a single integer "label" column,
# using the class index stored in the vocabulary for each label name.
for label, class_index in vocab.items():
    is_this_class = train_df[label] == 1
    train_df.loc[is_this_class, "label"] = class_index
train_df["label"] = train_df["label"].astype(int)

Data Split: Train and Val

In [None]:
# Hold out a fraction of the labelled rows for validation. The split uses a
# fixed random_state so it is the same on every run.
split_parts = train_test_split(
    train_df["image_id"],
    train_df["label"],
    test_size=config.TRAIN_VALID_SPLIT,
    random_state=0,
)
train_df_X, valid_df_X, train_df_y, valid_df_y = split_parts

In [None]:
# Re-assemble each split into a two-column table and persist it next to the data.
train_df_split = pd.DataFrame({"image_id": train_df_X, "label": train_df_y})
valid_df_split = pd.DataFrame({"image_id": valid_df_X, "label": valid_df_y})

for split_frame, csv_path in (
    (train_df_split, "../data/train_split.csv"),
    (valid_df_split, "../data/val_split.csv"),
):
    split_frame.to_csv(csv_path, sep=',', index=False)

In [None]:
# Report how many inputs and labels ended up in each split.
n_train_inputs, n_valid_inputs = len(train_df_X), len(valid_df_X)
n_train_labels, n_valid_labels = len(train_df_y), len(valid_df_y)
print("Number of train input samples is {}".format(n_train_inputs))
print("Number of valid input samples is {}".format(n_valid_inputs))
print("Number of train output samples is {}".format(n_train_labels))
print("Number of valid output samples is {}".format(n_valid_labels))

In [None]:
# Inspect the raw pixel dtype of the first training image.
# Use positional .iloc[0]: after the shuffled split the Series keeps its
# original row labels, so label 0 may not exist in the training split.
np.array(Image.open(train_df_X.iloc[0])).dtype

```
--> Image_File_Path (String) 
--> Image.open(File_Path) 
--> np.array(Image.open(File_Path))
--> Images [0-255] uint8 
--> [0-1]; float32 
--> (x - Mean_training_dataset) / Std_training_dataset
```

Apply Data Transforms (Augmentations + Processing)

In [None]:
# Shared pieces of both pipelines: the square target size and the
# per-channel normalisation statistics.
image_hw = (config.IMAGE_SIZE, config.IMAGE_SIZE)
channel_mean = [0.485, 0.456, 0.406]
channel_std = [0.229, 0.224, 0.225]

# Training: random crop + random flip for augmentation, then tensor + normalise.
train_steps = [
    transforms.RandomResizedCrop(image_hw),
    transforms.RandomHorizontalFlip(),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]
# Validation: deterministic resize only, then tensor + normalise.
val_steps = [
    transforms.Resize(image_hw),
    transforms.ToTensor(),
    transforms.Normalize(channel_mean, channel_std),
]

data_transforms = {
    'train': transforms.Compose(train_steps),
    'val': transforms.Compose(val_steps),
}

Custom Dataset and Dataloader for Plant Pathology Images

In [None]:
class PlantPathologyDataset(Dataset):
    """Map-style dataset yielding ``(image, label)`` pairs from file paths.

    Args:
        x: Image file paths; indexed positionally with ``.iloc``.
        y: Labels aligned with ``x``; indexed positionally with ``.iloc``.
        vocab: Label vocabulary; stored on the instance, not used internally.
        transforms: Optional callable applied to the loaded PIL image.
    """

    def __init__(self,x,y,vocab,transforms):
        self.x = x # File Path in CSV
        self.y = y # Label in CSV
        self.vocab = vocab # Dictionary
        self.transforms = transforms

    def __len__(self):
        """Return the number of samples."""
        return len(self.x)

    def __getitem__(self,idx): #File Name --> Preprocessed 3-D Tensor
        """Load the image at position ``idx`` and return ``(image, label)``."""
        fname = self.x.iloc[idx]
        label = self.y.iloc[idx]
        # Open inside a context manager so the file handle is released as soon
        # as the pixels are decoded, and force 3-channel RGB so grayscale,
        # palette or RGBA files cannot break the 3-channel normalisation.
        with Image.open(fname) as raw_image:
            image = raw_image.convert("RGB")

        if self.transforms:
            image = self.transforms(image)

        return image, label #[3,224,224], [0-3]

In [None]:
# One dataset per split, each paired with its own transform pipeline.
train_ds = PlantPathologyDataset(
    x=train_df_X,
    y=train_df_y,
    vocab=vocab,
    transforms=data_transforms["train"],
)
valid_ds = PlantPathologyDataset(
    x=valid_df_X,
    y=valid_df_y,
    vocab=vocab,
    transforms=data_transforms["val"],
)

Optimizers:
Gradient Descent:-
    a. Stochastic Gradient Descent: bs = 1; with 'n' examples there are 'n / 1' steps (batches) per epoch
    b. Mini-Batch Gradient Descent: bs = 32; with 'n' examples there are 'n / 32' steps (batches) per epoch
    c. Full-Batch Gradient Descent: bs = total_number_of_samples; there is 1 step (batch) per epoch

In [None]:
len(train_ds)

In [None]:
train_ds[0][0].shape #3,224,224

In [None]:
# Number of iterations (batches) per epoch, derived from the actual dataset
# and batch size instead of hard-coded numbers. The last, partial batch still
# counts as one iteration, hence the ceiling.
import math
math.ceil(len(train_ds) / config.BATCH_SIZE)

### GPU Optimizations in Dataloader 

`torch.utils.data.DataLoader` supports asynchronous data loading and data augmentation in separate worker subprocesses. The default setting for DataLoader is `num_workers=0`, which means that the data loading is synchronous and done in the main process. As a result the main training process has to wait for the data to be available to continue the execution.

Setting `num_workers > 0` enables asynchronous data loading and overlap between the training and data loading. `num_workers` should be tuned depending on the workload, CPU, GPU, and location of training data.

DataLoader accepts `pin_memory` argument, which defaults to `False`. When using a GPU it’s better to set `pin_memory=True`, this instructs DataLoader to use pinned memory and enables faster and asynchronous memory copy from the host (CPU) to the GPU.

- Set `pin_memory=True` 
- Set `num_workers=8`

In [None]:
##########################################################
# Loader settings shared by both splits: batch size plus the GPU-oriented
# options (worker subprocesses and pinned host memory).
loader_options = {
    "batch_size": config.BATCH_SIZE,
    "num_workers": config.num_workers,
    "pin_memory": config.pin_memory,
}
# Only the training data is shuffled; validation keeps a fixed order.
train_dl = DataLoader(train_ds, shuffle=True, **loader_options)
valid_dl = DataLoader(valid_ds, shuffle=False, **loader_options)
############################################################

In [None]:
len(train_dl)

Load Model : Pretrained from torchvision model zoo or Saved model

In [None]:
# Start from the pretrained ResNet-18 backbone.
model = models.resnet18(pretrained=True)

# Replace the final fully connected layer with a small 4-class head:
# Linear -> ReLU -> Dropout -> Linear.
num_ftrs = model.fc.in_features
classifier_layers = [
    nn.Linear(num_ftrs, 512),
    nn.ReLU(),
    nn.Dropout(p=0.3),
    nn.Linear(512, 4),
]
model.fc = nn.Sequential(*classifier_layers)

### Enable `channels_last` memory format for computer vision models

PyTorch 1.5 introduced support for channels_last memory format for convolutional networks. This format is meant to be used in conjunction with Automatic Mixed Precision (AMP) to further accelerate convolutional neural networks with Tensor Cores.

Support for channels_last is experimental, but it’s expected to work for standard computer vision models (e.g. ResNet-50, SSD). To convert models to channels_last format follow Channels Last Memory Format Tutorial. The tutorial includes a section on converting existing models.

Use `memory_format=torch.channels_last` after model initialization and input tensor

In [None]:
# Move the model to the target device; when enabled, also switch its weights
# to the channels_last memory format in the same call.
move_options = {"memory_format": torch.channels_last} if config.channels_last else {}
model = model.to(config.device, **move_options)

In [None]:
## BackPropagation & Optimization
## W_new = W_old - LR * W_gradient ; Gradient Descent Optimization Formulation

### Apex for Fused Optimizer and Automatic Mixed Precision(AMP)

FusedAdam does following
- Fusion of the Adam update’s elementwise operations

- A multi-tensor apply launch that batches the elementwise updates applied to all the model’s parameters into one or a few kernel launches.

and

Mixed precision leverages Tensor Cores and offers up to 3x overall speedup on Volta and newer GPU architectures. To use Tensor Cores AMP should be enabled and matrix/tensor dimensions should satisfy requirements for calling kernels that use Tensor Cores.

Visit for More info on [Mixed Precision Training](https://docs.nvidia.com/deeplearning/performance/mixed-precision-training/index.html)


`amp.initialize` wraps the model and optimizer for mixed precision training. Note that the model must already be on the correct GPU before calling `amp.initialize`. The opt_level goes from `O0`, which uses all floats, through `O3`, which uses half-precision throughout. `O1` and `O2` are different degrees of mixed-precision, the details of which can be found in the Apex [documentation](https://nvidia.github.io/apex/).

In [None]:
# Plain Adam without AMP; with AMP use Apex's fused Adam and let
# amp.initialize wrap both the model and the optimizer.
if not config.USE_AMP:
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
else:
    optimizer = FusedAdam(model.parameters(), config.lr)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")  # O0/O1/O2

CrossEntropyLoss = LogSoftmax (final activation that normalizes the raw output scores of the FC layer into log-probabilities) + Negative Log Likelihood (NLL) Loss

In [None]:
# Loss Function
criterion = nn.CrossEntropyLoss()

### Training Pipeline Starts

Host to GPU copies are much faster when they originate from pinned (page-locked) memory. CPU tensors and storages expose a `pin_memory()` method, that returns a copy of the object, with data put in a pinned region.

Also, once you pin a tensor or storage, you can use asynchronous GPU copies. Just pass an additional `non_blocking=True` argument to a `to()` or a `cuda()` call. This can be used to overlap data transfers with computation.

You can make the DataLoader return batches placed in pinned memory by passing `pin_memory=True` to its constructor.

---

Also, Mixed-precision training requires that the loss is scaled in order to prevent the gradients from underflowing. Apex does this automatically.

In [None]:
def _move_batch_to_device(x, y):
    """Copy one (images, labels) batch to ``config.device``.

    Images are converted to channels_last when ``config.channels_last`` is set.
    ``non_blocking=True`` lets the copy overlap with computation when the
    DataLoader delivers pinned memory; otherwise it behaves like a normal copy.
    """
    if config.channels_last:
        x = x.to(config.device, memory_format=torch.channels_last, non_blocking=True)
    else:
        x = x.to(config.device, non_blocking=True)
    y = y.to(config.device, non_blocking=True)
    return x, y


def train_model(model,criterion,optimizer,num_epochs=10):
    """Train ``model``, validate after every epoch, and save the final weights.

    Reads the notebook-level ``train_dl``, ``valid_dl`` and ``config`` objects
    and reports metrics to wandb (training loss through ``train_log``).

    Args:
        model: Network to optimise; expected to already be on ``config.device``.
        criterion: Loss callable taking ``(logits, labels)``.
        optimizer: Optimizer over ``model``'s parameters (AMP-wrapped when
            ``config.USE_AMP`` is set).
        num_epochs: Number of passes over ``train_dl``.

    Returns:
        None. The state dict is written to ``config.saved_path``.
    """
    # Let wandb record gradients and weights of the model during training.
    wandb.watch(model, criterion, log="all", log_freq=10)

    since = time.time()
    batch_ct = 0      # batches processed so far, across epochs
    example_ct = 0    # examples processed so far; used as the wandb step
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # ---------------- Training ----------------
        model.train()
        for x, y in train_dl:
            x, y = _move_batch_to_device(x, y)

            # NOTE(review): optimizer.zero_grad(set_to_none=True) would skip the
            # memset of every gradient, but confirm that the Apex-wrapped
            # optimizer accepts that argument before switching.
            optimizer.zero_grad()

            train_logits = model(x)  # images -> per-class scores
            train_loss = criterion(train_logits, y)

            if config.USE_AMP:
                # Backpropagate the scaled loss so small gradients do not
                # underflow in half precision.
                with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                train_loss.backward()

            optimizer.step()  # W_new = W_old - LR * W_gradient
            example_ct += len(x)
            batch_ct += 1

            # Report every 25th batch. Log the true (unscaled) loss: the scaled
            # loss is multiplied by AMP's loss scale and is not comparable
            # between steps or with the validation loss.
            if batch_ct % 25 == 0:
                train_log(train_loss.item(), example_ct, epoch)

        # ---------------- Validation ----------------
        model.eval()
        running_loss = 0.0
        running_corrects = 0
        total = 0
        # No gradients are needed for validation/inference.
        with torch.no_grad():
            for x, y in valid_dl:
                x, y = _move_batch_to_device(x, y)
                valid_logits = model(x)
                valid_preds = valid_logits.argmax(dim=1)
                valid_loss = criterion(valid_logits, y)
                running_loss += valid_loss.item() * x.size(0)
                running_corrects += (valid_preds == y).sum().item()
                total += y.size(0)

        # Average over the samples actually seen; guard against an empty loader.
        epoch_loss = running_loss / total if total else float("nan")
        epoch_acc = running_corrects / total if total else float("nan")

        # Log validation metrics once per epoch as plain floats, on the same
        # example-count step axis that train_log uses.
        wandb.log(
            {"epoch": epoch, "val_loss": epoch_loss, "test_accuracy": epoch_acc},
            step=example_ct,
        )
        print("Validation Loss is {}".format(epoch_loss))
        print("Validation Accuracy is {}".format(epoch_acc))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    torch.save(model.state_dict(), config.saved_path)

In [None]:
def train_log(loss, example_ct, epoch):
    """Send the current training loss to wandb and echo it to stdout.

    Args:
        loss: Loss value; anything ``float()`` accepts.
        example_ct: Number of examples seen so far; used as the wandb step.
        epoch: Current epoch index, logged alongside the loss.
    """
    loss = float(loss)
    metrics = {"epoch": epoch, "loss": loss}
    # where the magic happens
    wandb.log(metrics, step=example_ct)
    padded_count = str(example_ct).zfill(5)
    print(f"Loss after " + padded_count + f" examples: {loss:.3f}")

In [None]:
train_model(model, criterion, optimizer, num_epochs=config.EPOCHS)

## Thank You