In [None]:
import os
import json
import time
import argparse
import random
import numpy as np
import pandas as pd
from PIL import Image
from sklearn.model_selection import train_test_split

In [None]:
import torch
import torch.nn as nn
import torch.optim as optim
from torchvision import transforms
from torch.utils.data import Dataset, DataLoader
import torchvision.models as models
from torchvision import datasets

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
!pip install wandb

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting wandb
  Downloading wandb-0.12.21-py2.py3-none-any.whl (1.8 MB)
[K     |████████████████████████████████| 1.8 MB 51.9 MB/s eta 0:00:01
[?25hCollecting promise<3,>=2.0
  Downloading promise-2.3.tar.gz (19 kB)
Collecting shortuuid>=0.5.0
  Downloading shortuuid-1.0.9-py3-none-any.whl (9.4 kB)
Collecting docker-pycreds>=0.4.0
  Downloading docker_pycreds-0.4.0-py2.py3-none-any.whl (9.0 kB)
Collecting pathtools
  Downloading pathtools-0.1.2.tar.gz (11 kB)
Collecting setproctitle
  Downloading setproctitle-1.2.3-cp38-cp38-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_17_x86_64.manylinux2014_x86_64.whl (29 kB)
Collecting sentry-sdk>=1.0.0
  Downloading sentry_sdk-1.8.0-py2.py3-none-any.whl (153 kB)
[K     |████████████████████████████████| 153 kB 63.9 MB/s eta 0:00:01
Collecting GitPython>=1.0.0
  Downloading GitPython-3.1.27-py3-none-any.whl (181 kB)
[K     |████████████████████████████████| 181 kB 

In [None]:
# Mixed Precision with Apex and Monitoring with Wandb
import wandb
from apex import amp
from apex.optimizers import FusedAdam

In [None]:
# FOR DISTRIBUTED: (can also use torch.nn.parallel.DistributedDataParallel instead)
from apex.parallel import DistributedDataParallel

In [None]:
# Authenticate this session with Weights & Biases; in the recorded run this
# prompted interactively for an API key.
wandb.login()

[34m[1mwandb[0m: Logging into wandb.ai. (Learn how to deploy a W&B server locally: https://wandb.me/wandb-server)
[34m[1mwandb[0m: You can find your API key in your browser here: https://wandb.ai/authorize
[34m[1mwandb[0m: Paste an API key from your profile and hit enter, or press ctrl+c to quit:

  ········································


[34m[1mwandb[0m: Appending key for api.wandb.ai to your netrc file: /root/.netrc


True

In [None]:
# Use the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Create the output directory; exist_ok=True makes the cell safe to re-run
# without a try/except around FileExistsError.
os.makedirs("./saved", exist_ok=True)

In [None]:
parser = argparse.ArgumentParser()
# FOR DISTRIBUTED:  Parse for the local_rank argument, which will be supplied
# automatically by torch.distributed.launch.
parser.add_argument("--local_rank", default=0, type=int)
args = parser.parse_args()


usage: ipykernel_launcher.py [-h] [--local_rank LOCAL_RANK]
ipykernel_launcher.py: error: unrecognized arguments: -f /root/.local/share/jupyter/runtime/kernel-e3fe5e3a-514c-4b6f-870e-59b6f97ed728.json


SystemExit: 2

In [None]:
!pip install opendatasets

Looking in indexes: https://pypi.org/simple, https://pypi.ngc.nvidia.com
Collecting opendatasets
  Downloading opendatasets-0.1.22-py3-none-any.whl (15 kB)
Collecting kaggle
  Downloading kaggle-1.5.12.tar.gz (58 kB)
[K     |████████████████████████████████| 58 kB 47.6 MB/s eta 0:00:01
Building wheels for collected packages: kaggle
  Building wheel for kaggle (setup.py) ... [?25ldone
[?25h  Created wheel for kaggle: filename=kaggle-1.5.12-py3-none-any.whl size=73051 sha256=2244fefa19538b8d14b36df1525835324e5620bb4dedd3f253555380a07b7c2f
  Stored in directory: /tmp/pip-ephem-wheel-cache-8azjkl6k/wheels/29/da/11/144cc25aebdaeb4931b231e25fd34b394e6a5725cbb2f50106
Successfully built kaggle
Installing collected packages: kaggle, opendatasets
Successfully installed kaggle-1.5.12 opendatasets-0.1.22


In [None]:
# opendatasets downloads the Kaggle dataset; in the recorded run the call
# prompted for a Kaggle username and key.
import opendatasets as od
# The recorded run downloaded into ./tuberculosis-tb-chest-xray-dataset.
# NOTE(review): the ImageFolder cell reads from ./data/database/ instead;
# confirm the data is moved or extracted there before that cell runs.
od.download("https://www.kaggle.com/datasets/tawsifurrahman/tuberculosis-tb-chest-xray-dataset")

Please provide your Kaggle credentials to download this dataset. Learn more: http://bit.ly/kaggle-creds
Your Kaggle username:

  wnagesh


Your Kaggle Key:

  ································


Downloading tuberculosis-tb-chest-xray-dataset.zip to ./tuberculosis-tb-chest-xray-dataset


100%|██████████| 663M/663M [00:20<00:00, 34.6MB/s] 





In [None]:
# Print the working directory; the relative paths used in this notebook
# ("./saved", "./data/database/", ...) resolve against it.
!pwd

/workspace/aitrainingandinference


In [None]:
# Preprocessing applied to every image: resize to 224x224, convert to a
# tensor, then normalize each channel with the mean/std values below.
transform = transforms.Compose(
    [
        transforms.Resize((224, 224)),
        transforms.ToTensor(),
        transforms.Normalize(
            mean=[0.485, 0.456, 0.406],
            std=[0.229, 0.224, 0.225],
        ),
    ]
)

In [None]:
# Build an ImageFolder dataset rooted at ./data/database/, applying `transform`
# to each loaded image.
# NOTE(review): the variable says COVID-19, but the download cell fetches a
# tuberculosis chest X-ray dataset — presumably this is that data; confirm the path.
covid_19_dataset = datasets.ImageFolder('./data/database/',transform=transform)

In [None]:
# Hyperparameters and runtime switches for this run; passed to wandb.init().
config = {
    "saved_path": "./resnet50_ddp.pt",
    "SEED": 42,
    "lr": 0.001,
    "EPOCHS": 10,
    "BATCH_SIZE": 32,
    "IMAGE_SIZE": 224,
    "TRAIN_VALID_SPLIT": 0.2,
    "device": device,
    "pin_memory": False,
    "num_workers": 8,
    "USE_AMP": False,
    "channels_last": True,
    "distributed": False,
    "world_size": 4,
}

In [None]:
# Start a W&B run in project "Apex-predator", group "DDP", with the
# hyperparameter dict attached (no entity is passed here).
wandb.init(project="Apex-predator", config=config,  group="DDP")
# Rebind `config` to wandb.config so later cells read hyperparameters through
# it (config.lr, config.EPOCHS, ...) and the logged values match what ran.
config = wandb.config

In [None]:
# FOR DISTRIBUTED: everything below is skipped unless config.distributed is set.
if config.distributed:
    # Bind this process to the GPU named by its local rank.
    torch.cuda.set_device(args.local_rank)
    # torch.distributed.launch supplies the rendezvous details through
    # environment variables, which is why init_method must be "env://".
    torch.distributed.init_process_group(init_method="env://", backend="nccl")

In [None]:
# Seed every RNG the notebook draws from (Python, NumPy, PyTorch CPU and CUDA)
# so the index shuffles and weight initialisation repeat across runs.
random.seed(config.SEED)
np.random.seed(config.SEED)
torch.manual_seed(config.SEED)
torch.cuda.manual_seed_all(config.SEED)  # every visible GPU, not only the current one
# The cuDNN flag is `benchmark` (singular); the previous `benchmarks` spelling
# never reached it, so the autotuner was not actually enabled.
# NOTE: autotuning helps with fixed-size inputs but may select different
# kernels between runs; set this to False (and cudnn.deterministic = True)
# if bit-exact reproducibility matters more than speed.
torch.backends.cudnn.benchmark = True

In [None]:
from torch.utils.data import Dataset, DataLoader
from torch.utils.data.sampler import SubsetRandomSampler
import numpy as np

In [None]:

# NOTE(review): batch_size is not read by the DataLoader cell, which uses
# config.BATCH_SIZE — confirm whether this value is still needed.
batch_size = 4
# fraction of the full dataset held out as the test split
test_size = 0.2
# fraction of the remaining training indices held out for validation
valid_size = 0.1

In [None]:
# Test split: shuffle every dataset index, then take the first `test_size`
# fraction as test indices and keep the remainder for training.
num_data = len(covid_19_dataset)
indices_data = list(range(num_data))
np.random.shuffle(indices_data)
split_tt = int(np.floor(num_data * test_size))
test_idx = indices_data[:split_tt]
train_idx = indices_data[split_tt:]

In [None]:
# Validation split, carved out of the training indices.
# `indices_train` holds shuffled *positions* into `train_idx`, so each position
# is mapped back through `train_idx` to get a real dataset index. Using the raw
# positions (0..num_train-1) as dataset indices overlaps the test split and,
# because ImageFolder orders samples by class, skews the class balance.
num_train = len(train_idx)
indices_train = list(range(num_train))
np.random.shuffle(indices_train)
split_tv = int(np.floor(valid_size * num_train))
valid_idx = [train_idx[pos] for pos in indices_train[:split_tv]]
train_new_idx = [train_idx[pos] for pos in indices_train[split_tv:]]

In [None]:
# One SubsetRandomSampler per split; each is built from that split's index list.
train_sampler = SubsetRandomSampler(train_new_idx)
test_sampler = SubsetRandomSampler(test_idx)
valid_sampler = SubsetRandomSampler(valid_idx)

In [None]:
# DataLoaders over the same dataset; each sampler restricts its loader to one
# split. All loaders share batch size, worker count and pinned host memory.
# `shuffle` is left at its default (False) because a sampler is supplied.
loader_kwargs = dict(batch_size=config.BATCH_SIZE,
                     num_workers=config.num_workers,
                     pin_memory=True)
train_dl = DataLoader(covid_19_dataset, sampler=train_sampler, **loader_kwargs)
# Validation now draws from valid_sampler. It previously used test_sampler,
# which left valid_sampler unused and spent the test split on per-epoch checks.
valid_dl = DataLoader(covid_19_dataset, sampler=valid_sampler, **loader_kwargs)
# Test split kept in its own loader for a final evaluation after training.
test_dl = DataLoader(covid_19_dataset, sampler=test_sampler, **loader_kwargs)

In [None]:
#For Resnet50
# Load torchvision's ResNet-50 with pretrained=True.
# NOTE(review): newer torchvision releases replace `pretrained=` with
# `weights=` — confirm against the installed version.
model = models.resnet50(pretrained=True)
#For VGG16
#model = models.vgg16(pretrained=True)
#or download model from "https://download.pytorch.org/models/resnet50-0676ba61.pth"

In [None]:
# Alternative: build the network without pretrained weights and load a local checkpoint.
#model = models.resnet50(pretrained=False)
#model.load_state_dict(torch.load(config.pretrained_path))

# Replace the stock classifier with a small two-layer head: 512 hidden units,
# ReLU, dropout, and 2 output scores.
num_ftrs = model.fc.in_features
model.fc = nn.Sequential(
    nn.Linear(in_features=num_ftrs, out_features=512),
    nn.ReLU(),
    nn.Dropout(0.3),
    nn.Linear(in_features=512, out_features=2),
)


In [None]:
# Move the model to the target device, in channels-last (NHWC) memory layout
# when config.channels_last is set.
if not config.channels_last:
    model = model.to(config.device)
else:
    model = model.to(config.device, memory_format=torch.channels_last)

# Optimizer: plain Adam by default. With USE_AMP, use Apex FusedAdam and let
# amp.initialize patch model and optimizer at opt_level "O2" (others: O0/O1).
if not config.USE_AMP:
    optimizer = optim.Adam(model.parameters(), lr=config.lr)
else:
    optimizer = FusedAdam(model.parameters(), config.lr)
    model, optimizer = amp.initialize(model, optimizer, opt_level="O2")

# FOR DISTRIBUTED: wrap the model only after amp.initialize has run.
# apex.parallel.DistributedDataParallel(model) is the alternative; the torch
# wrapper used here needs the device ids spelled out.
if config.distributed:
    model = torch.nn.parallel.DistributedDataParallel(
        model,
        device_ids=[args.local_rank],
        output_device=args.local_rank,
    )

In [None]:
# Loss Function: cross-entropy, called in the training loop as criterion(logits, y).
criterion = nn.CrossEntropyLoss()

In [None]:
# NOTE: in the upstream NVIDIA Apex example, each process receives its own
# batch of "fake input data" and "fake target data" and reuses it over and over.
# The training loop in this notebook iterates over train_dl / valid_dl instead.
# https://github.com/NVIDIA/apex/tree/master/examples/imagenet provides a more
# realistic example of distributed data sampling for both training and validation.


In [None]:
def train_model(model,criterion,optimizer,num_epochs=10):
    """Train `model` on `train_dl`, evaluate on `valid_dl` each epoch, then save it.

    Relies on notebook-level objects: `config`, `train_dl`, `valid_dl`,
    `train_log`, `wandb` and, when `config.USE_AMP` is set, Apex `amp`.

    Args:
        model: network to optimise; called as `model(x)` and expected to
            return per-class scores (logits).
        criterion: loss callable invoked as `criterion(logits, targets)`.
        optimizer: optimizer already bound to `model`'s parameters.
        num_epochs: number of passes over `train_dl`.

    Returns:
        None. The final `model.state_dict()` is written to `config.saved_path`.
    """
    # Ask W&B to track the model's gradients and parameters every 10 steps.
    wandb.watch(model, criterion, log="all", log_freq=10)

    since = time.time()
    batch_ct = 0    # training batches processed so far, across epochs
    example_ct = 0  # training samples processed so far; doubles as the W&B step
    for epoch in range(num_epochs):
        print('Epoch {}/{}'.format(epoch, num_epochs - 1))
        print('-' * 10)

        # Training
        model.train()
        for x,y in train_dl:
            if config.channels_last:
                x = x.to(config.device, non_blocking=True, memory_format=torch.channels_last)
            else:
                x = x.to(config.device, non_blocking=True)
            y = y.to(config.device, non_blocking=True)
            optimizer.zero_grad()
            train_logits = model(x)
            train_loss = criterion(train_logits, y)

            if config.USE_AMP:
                # Backpropagate the scaled loss; the unscaled train_loss is
                # what gets reported, so logged values do not depend on the
                # current loss scale.
                with amp.scale_loss(train_loss, optimizer) as scaled_loss:
                    scaled_loss.backward()
            else:
                train_loss.backward()

            optimizer.step()
            example_ct += len(x)
            batch_ct += 1
            # Report metrics every 25th batch. batch_ct is already 1-based
            # here, so no extra +1 (which fired on batches 24, 49, ...).
            if batch_ct % 25 == 0:
                train_log(train_loss, example_ct, epoch)

        # Validation
        model.eval()
        running_loss = 0.0     # sum of per-sample losses
        running_corrects = 0   # number of correct predictions
        total = 0              # number of validation samples seen
        with torch.no_grad():
            for x,y in valid_dl:
                if config.channels_last:
                    x = x.to(config.device, non_blocking=True, memory_format=torch.channels_last)
                else:
                    x = x.to(config.device, non_blocking=True)
                y = y.to(config.device, non_blocking=True)
                valid_logits = model(x)
                _, valid_preds = torch.max(valid_logits, 1)
                valid_loss = criterion(valid_logits, y)
                batch_n = y.size(0)
                # Weight the batch loss by its size so the last, possibly
                # smaller, batch does not skew the epoch average.
                running_loss += valid_loss.item() * batch_n
                running_corrects += (valid_preds == y).sum().item()
                total += batch_n

        # Average over samples, not over batches: both accumulators are
        # per-sample sums, so dividing by len(valid_dl) inflated the results
        # by roughly the batch size (e.g. an "accuracy" of 26.0).
        if total:
            epoch_loss = running_loss / total
            epoch_acc = running_corrects / total
        else:
            # Empty validation loader: report NaN instead of crashing.
            epoch_loss = epoch_acc = float("nan")

        # Log once per epoch as plain floats. The "test_accuracy" key is kept
        # so existing W&B charts continue to receive data.
        wandb.log({"epoch": epoch,
                   "test_accuracy": epoch_acc,
                   "valid_loss": epoch_loss},
                  step=example_ct)
        print("Validation Loss is {}".format(epoch_loss))
        print("Validation Accuracy is {}".format(epoch_acc))

    time_elapsed = time.time() - since
    print('Training complete in {:.0f}m {:.0f}s'.format(
        time_elapsed // 60, time_elapsed % 60))

    torch.save(model.state_dict(), config.saved_path)

In [None]:
def train_log(loss, example_ct, epoch):
    """Send one training-loss sample to W&B and echo it to stdout.

    Args:
        loss: current loss; anything `float()` accepts.
        example_ct: number of examples seen so far, used as the W&B step.
        epoch: epoch index logged alongside the loss.
    """
    loss_value = float(loss)
    metrics = {"epoch": epoch, "loss": loss_value}
    wandb.log(metrics, step=example_ct)
    # Zero-pad the running example count to five characters for aligned output.
    padded_count = str(example_ct).zfill(5)
    print(f"Loss after {padded_count} examples: {loss_value:.3f}")
    

In [None]:
# Run the training/validation loop for config.EPOCHS epochs.
train_model(model, criterion, optimizer, num_epochs=config.EPOCHS)

Epoch 0/9
----------
Loss after 00768 examples: 0.000
Loss after 01568 examples: 0.000
Loss after 02368 examples: 0.000
Validation Loss is 324.08106570773657
Validation Accuracy is 26.037037037037035
Epoch 1/9
----------
Loss after 03152 examples: 0.000
Loss after 03952 examples: 0.000
Loss after 04752 examples: 0.000
Loss after 05552 examples: 0.000
Validation Loss is 325.24939106128835
Validation Accuracy is 26.037037037037035
Epoch 2/9
----------
Loss after 06336 examples: 0.000
Loss after 07136 examples: 0.000
Loss after 07936 examples: 0.000
Loss after 08736 examples: 0.000
Validation Loss is 324.7661372997143
Validation Accuracy is 26.037037037037035
Epoch 3/9
----------
Loss after 09520 examples: 0.000
Loss after 10320 examples: 0.000
Loss after 11120 examples: 0.000
Loss after 11920 examples: 0.000
Validation Loss is 322.6980387369792
Validation Accuracy is 26.037037037037035
Epoch 4/9
----------
Loss after 12704 examples: 0.000
Loss after 13504 examples: 0.000
Loss after 14304