# Notebook "Pet Breeds" using Data Parallel (DP)

- Author: [Pierre Guillou](https://www.linkedin.com/in/pierreguillou)
- Date: June 2020
- Sources:
  - original notebook from fastai v2: https://github.com/fastai/fastbook/blob/master/05_pet_breeds.ipynb
  - example of code with DP and DDP: https://github.com/fastai/fastai2/blob/master/nbs/examples/train_imagenette.py
  - doc about Data Parallel with fastai v2: https://dev.fast.ai/distributed#ParallelTrainer

## Initialization

In [1]:
from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *

import warnings
warnings.filterwarnings('ignore')

## Get data and dataloaders

In [2]:
# Fetch the dataset referenced by URLs.PETS and keep its local root folder.
path = untar_data(URLs.PETS)

# Register the dataset root as the base path (presumably so that paths are
# displayed relative to it -- confirm against the fastcore docs).
Path.BASE_PATH = path

pets = DataBlock(
    # Inputs are images, targets are categories.
    blocks=(ImageBlock, CategoryBlock),
    get_items=get_image_files,
    # Fixed seed so the train/validation split is the same on every run.
    splitter=RandomSplitter(seed=42),
    # The label is the part of the file name that precedes the trailing
    # "_<digits>.jpg". The dot is escaped so that only a literal ".jpg"
    # extension matches (an unescaped "." would accept any character).
    get_y=using_attr(RegexLabeller(r'(.+)_\d+\.jpg$'), 'name'),
    # Resize each item to 460 first, then augment per batch down to 224.
    item_tfms=Resize(460),
    batch_tfms=aug_transforms(size=224, min_scale=0.75),
)
dls = pets.dataloaders(path/"images")

## Training without Data Parallel 

In [3]:
# Baseline without Data Parallel: ResNet-34 learner tracking the error rate,
# switched to mixed precision via to_fp16().
learn = cnn_learner(dls, resnet34, metrics=error_rate)
learn = learn.to_fp16()

# Fine-tune for 2 epochs (the log below shows one preliminary epoch first).
learn.fine_tune(2)

epoch,train_loss,valid_loss,error_rate,time
0,1.539825,0.33029,0.103518,00:12


epoch,train_loss,valid_loss,error_rate,time
0,0.513095,0.318171,0.103518,00:15
1,0.324813,0.236592,0.069689,00:15


## Data Parallel training

### with PyTorch

In [4]:
# Same learner setup as the single-GPU run: ResNet-34, error-rate metric,
# mixed precision via to_fp16().
learn = cnn_learner(dls, resnet34, metrics=error_rate)
learn = learn.to_fp16()

# Wrap the model in PyTorch's DataParallel only when at least two CUDA devices
# are visible. With no device_ids argument all GPUs are used; to choose
# specific GPUs instead, pass them explicitly, e.g.
#   learn.model = nn.DataParallel(learn.model, device_ids=[0,1])
if torch.cuda.device_count() >= 2:
    learn.model = nn.DataParallel(learn.model)

learn.fine_tune(2)

epoch,train_loss,valid_loss,error_rate,time
0,1.702696,0.319656,0.109608,00:19


epoch,train_loss,valid_loss,error_rate,time
0,0.573543,0.277454,0.079838,00:18
1,0.398447,0.23903,0.072395,00:17


### with fastai v2

In [5]:
# Device selection.
#   gpu   -- index of a single GPU to pin training to, or None to use the
#            parallel context below.
#   n_gpu -- number of visible CUDA devices, or None when CUDA is unavailable.
gpu = None
n_gpu = None
if torch.cuda.is_available():
    if gpu is not None: torch.cuda.set_device(gpu)
    n_gpu = torch.cuda.device_count()

# Same learner setup as the previous runs.
learn = cnn_learner(dls, resnet34, metrics=error_rate).to_fp16()

if gpu is not None or n_gpu is None:
    # A specific GPU was requested, or there is no CUDA device: plain training.
    learn.fine_tune(2)
else:
    # Context-manager way of running dp/ddp; per the original note it also
    # handles the single-GPU base case.
    ctx = learn.parallel_ctx
    with ctx(gpu):
        print(f"Training in {ctx.__name__} context on GPU {list(range(n_gpu))}")
        learn.fine_tune(2)

Training in parallel_ctx context on GPU [0, 1]


epoch,train_loss,valid_loss,error_rate,time
0,1.621625,0.344931,0.104195,00:16


epoch,train_loss,valid_loss,error_rate,time
0,0.572673,0.317181,0.099459,00:18
1,0.394333,0.234659,0.073748,00:17


## END