## Distributed Training Versions of the fastai2 course-v4 Notebooks, Using `mpify`

Below are distributed training versions of the examples in the fastai2 course-v4 notebooks.

[01_intro.ipynb](#01intro)

[05_pet_breeds.ipynb](#05petbreeds)


### <a name="01intro"></a> 01_intro.ipynb

In [None]:
# to train cnn on multiple GPUs
from mpify import *

imports='''from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''

from fastai2.vision.all import *
path = untar_data(URLs.PETS)/'images'

def is_cat(x):
    """Return whether the first character of `x` is upper-case.

    Passed as `label_func` when building the PETS image loaders.
    """
    first_char = x[0]
    return first_char.isupper()
    
def train_cnn():
    """Fine-tune a resnet34 on the PETS images for 4 epochs and return the learner.

    Reads the notebook-level names `path` and `is_cat`; the training call runs
    inside `learn.distrib_ctx()`.
    """
    image_files = get_image_files(path)
    dls = ImageDataLoaders.from_name_func(
        path,
        image_files,
        valid_pct=0.2,
        seed=42,
        label_func=is_cat,
        item_tfms=Resize(224),
    )

    learn = cnn_learner(dls, resnet34, metrics=error_rate)
    with learn.distrib_ctx(): learn.fine_tune(4)

    return learn

# Number of GPUs to train on.
ngpus = 3

# `need` names the notebook-level objects that `train_cnn` reads (`path`, `is_cat`);
# presumably mpify makes them available in each worker process -- confirm against
# the mpify documentation.  The value returned by `train_cnn` is bound to `learn`.
learn = in_torchddp(ngpus, train_cnn, imports=imports, need="path is_cat")

In [None]:
# to train unet on multiple GPUs
from mpify import in_torchddp

imports='''from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''

from fastai2.vision.all import *
    
def train_unet():
    """Fine-tune a resnet34 U-Net learner on CAMVID_TINY for 20 epochs and return it."""
    path = untar_data(URLs.CAMVID_TINY)
    image_files = get_image_files(path/"images")

    def mask_for(o):
        # The label file sits under labels/ and is named <stem>_P<suffix>.
        return path/'labels'/f'{o.stem}_P{o.suffix}'

    class_codes = np.loadtxt(path/'codes.txt', dtype=str)
    dls = SegmentationDataLoaders.from_label_func(
        path, bs=8, fnames=image_files, label_func=mask_for, codes=class_codes,
    )

    learn = unet_learner(dls, resnet34)
    with learn.distrib_ctx():
        learn.fine_tune(20)
    return learn

# Number of GPUs to train on.
ngpus = 3
# `train_unet` defines its own `path`, so no `need=` is passed here.
learn = in_torchddp(ngpus, train_unet, imports=imports)
# Show up to 6 results from the learner returned by the distributed run.
learn.show_results(max_n=6, figsize=(7,8))

In [None]:
# To train text classifier on multiple GPUs

from mpify import in_torchddp

imports='''from utils import *
from fastai2.text.all import *
from fastai2.distributed import *
'''

def train_imdb_classifier():
    """Fine-tune an AWD_LSTM text classifier on IMDB for 4 epochs and return the learner.

    The training call runs inside `learn.distrib_ctx()`.  When more than one
    process is in use and torch is a 1.4.x release, `DistributedTrainer.fup` is
    switched on before training.
    """
    import os

    dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=96)
    learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

    # WORLD_SIZE is absent when this function is called outside a distributed
    # launcher; treat that as a single process instead of raising KeyError.
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    # NOTE(review): `fup` is presumably fastai2's find-unused-parameters switch
    # for DistributedDataParallel -- confirm against fastai2.distributed.
    if world_size > 1 and torch.__version__.startswith("1.4"):
        DistributedTrainer.fup = True

    with learn.distrib_ctx():
        learn.fine_tune(4, 1e-2)
    return learn

# To train on 3 GPUs with distributed data parallel; the learner returned by
# `train_imdb_classifier` is bound to `learn`.
learn = in_torchddp(3, train_imdb_classifier, imports=imports)

# Use the returned learner directly in the notebook to classify one review.
learn.predict("I really liked that movie!")

In [None]:
# To train a tabular model on multiple GPUs

from mpify import in_torchddp

imports='''from utils import *
from fastai2.tabular.all import *
from fastai2.distributed import *
'''

def train_tabular():
    """Train a tabular learner on ADULT_SAMPLE with a 3-epoch one-cycle fit; return it."""
    path = untar_data(URLs.ADULT_SAMPLE)

    categorical = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race']
    continuous = ['age', 'fnlwgt', 'education-num']
    preprocessing = [Categorify, FillMissing, Normalize]

    dls = TabularDataLoaders.from_csv(
        path/'adult.csv',
        path=path,
        y_names="salary",
        cat_names=categorical,
        cont_names=continuous,
        procs=preprocessing,
    )

    learn = tabular_learner(dls, metrics=accuracy)
    with learn.distrib_ctx(): learn.fit_one_cycle(3)
    return learn

# To train on 3 GPUs with distributed data parallel; the learner returned by
# `train_tabular` is bound to `learn`.
learn = in_torchddp(3, train_tabular, imports=imports)

In [None]:
# To train a collaborative-filtering model on multiple GPUs

from mpify import in_torchddp

imports='''from utils import *
from fastai2.collab import *
from fastai2.distributed import *
'''

def train_collab():
    """Fine-tune a collab learner on the ML_SAMPLE ratings for 40 epochs and return it."""
    ratings_csv = untar_data(URLs.ML_SAMPLE)/'ratings.csv'
    dls = CollabDataLoaders.from_csv(ratings_csv)

    learn = collab_learner(dls, y_range=(0.5,5.5))
    with learn.distrib_ctx(): learn.fine_tune(40)
    return learn

# To train on 3 GPUs with distributed data parallel; the learner returned by
# `train_collab` is bound to `learn`.
learn = in_torchddp(3, train_collab, imports=imports)
# Display results from the returned learner in the notebook.
learn.show_results()

### <a name="05petbreeds"></a> 05_pet_breeds.ipynb

In [None]:
from mpify import in_torchddp
ngpus = 3
imports='''
from utils import *
from fastai2.data import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''

def fine_tune(learn:Learner, nepochs, *args, **kwargs):
    """Call `learn.fine_tune(nepochs, *args, **kwargs)` inside `learn.distrib_ctx()`.

    Returns the same `learn` object that was passed in.
    """
    with learn.distrib_ctx():
        learn.fine_tune(nepochs, *args, **kwargs)
    return learn
    
def one_cycle(learn:Learner, nepochs, *args, **kwargs):
    """Call `learn.fit_one_cycle(nepochs, *args, **kwargs)` inside `learn.distrib_ctx()`.

    Returns the same `learn` object that was passed in.
    """
    with learn.distrib_ctx():
        learn.fit_one_cycle(nepochs, *args, **kwargs)
    return learn


def trainer(train_fn, nepochs, *args, load:str=None, **kwargs):
    """Build a resnet34 learner on the PETS images and hand it to `train_fn`.

    train_fn: callable invoked as `train_fn(learn, nepochs, *args, **kwargs)`;
              whatever it returns is returned from here.
    nepochs:  forwarded to `train_fn`.
    load:     when truthy, passed to `learn.load()` before `train_fn` runs.
    """
    images_dir = untar_data(URLs.PETS)/"images"

    # Labels come from the `name` attribute of each item, via the regex below.
    label_from_name = using_attr(RegexLabeller(r'(.+)_\d+.jpg$'), 'name')
    pets = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_items=get_image_files,
        splitter=RandomSplitter(seed=42),
        get_y=label_from_name,
        item_tfms=Resize(460),
        batch_tfms=aug_transforms(size=224, min_scale=0.75),
    )
    learn = cnn_learner(pets.dataloaders(images_dir), resnet34, metrics=error_rate)

    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    return train_fn(learn, nepochs, *args, **kwargs)

In [None]:
# Distributed version of `learn.fine_tune(1, base_lr=0.1)`.  The arguments after
# `trainer` (`fine_tune, 1, base_lr=0.1`) are presumably forwarded to `trainer()`
# -- confirm against the mpify documentation.  `need="fine_tune"` names the
# notebook-level helper that `trainer()` is given as `train_fn`.
learn = in_torchddp(ngpus, trainer, fine_tune, 1, base_lr=0.1, imports=imports, need="fine_tune")

In [None]:
# Run the learning-rate finder in the notebook on the learner returned by the
# distributed run; its two return values are bound to `lr_min` and `lr_steep`.
lr_min,lr_steep = learn.lr_find()

In [None]:
# Distributed equivalent of: learn.fine_tune(2, base_lr=3e-3)

learn = in_torchddp(ngpus, trainer, fine_tune, 2, base_lr=3e-3,
                    imports=imports, need="fine_tune")

In [None]:
# Distributed equivalent of: learn.fit_one_cycle(3, 3e-3)

learn = in_torchddp(ngpus, trainer, one_cycle, 3, 3e-3,
                    imports=imports, need="one_cycle")

In [None]:
# Unfreeze the learner returned by the previous distributed run, then run the
# learning-rate finder on it in the notebook.
learn.unfreeze()
learn.lr_find()

In [None]:
# Distributed equivalent of: learn.fit_one_cycle(6, lr_max=1e-5)
#
# NOTE(review): `trainer()` builds a new learner on every call and only restores
# saved state when `load=` is given.  No `load=` is passed here, so the
# `unfreeze()` done in the notebook just before this call does not appear to
# carry over to the workers -- confirm this is intended.

learn = in_torchddp(ngpus, trainer, one_cycle, 6, lr_max=1e-5,
                    imports=imports, need="one_cycle")

### 05 pet breeds: Discriminative Learning Rates

To perform:

```python
    learn.fit_one_cycle(3, 3e-3)
    learn.unfreeze()
    learn.fit_one_cycle(12, lr_max=slice(1e-6,1e-4))
```

We need to *save the model state* before the second `fit_one_cycle()`, then pass `load=<file>` through `in_torchddp()` so that `trainer()` reloads that file before training continues.

In [None]:
# Stage 1: distributed equivalent of learn.fit_one_cycle(3, 3e-3).
learn = in_torchddp(ngpus, trainer, one_cycle, 3, 3e-3,
                    imports=imports, need="one_cycle")
# Unfreeze in the notebook, then write a checkpoint named "after_unfreeze"
# (`with_opt=True` presumably includes the optimizer state -- confirm).
learn.unfreeze()
learn.save("after_unfreeze", with_opt=True, pickle_protocol=4)

# Stage 2: `load="after_unfreeze"` reaches `trainer()`, which calls
# `learn.load()` on a freshly built learner before the 12-epoch one-cycle fit.
# NOTE(review): whether the unfrozen state survives the save/load round trip
# into a new learner is not visible from this notebook -- confirm.
learn = in_torchddp(ngpus, trainer, one_cycle, 12, lr_max=slice(1e-6,1e-4),
                    load="after_unfreeze", imports=imports, need="one_cycle")

### 05 pet breeds: Deeper Architectures

To do:
```python
from fastai2.callback.fp16 import *
learn = cnn_learner(dls, resnet50, metrics=error_rate).to_fp16()
learn.fine_tune(6, freeze_epochs=3)
```

We copy `trainer()` into a new function `trainer_fp16_resnet50()`, replacing `resnet34` with `resnet50` and adding `.to_fp16()`, then pass it to `in_torchddp()`:

In [None]:
def trainer_fp16_resnet50(train_fn, nepochs, *args, load:str=None, **kwargs):
    """Build a half-precision resnet50 learner on the PETS images and hand it to `train_fn`.

    train_fn: callable invoked as `train_fn(learn, nepochs, *args, **kwargs)`;
              whatever it returns is returned from here.
    nepochs:  forwarded to `train_fn`.
    load:     when truthy, passed to `learn.load()` before `train_fn` runs.
    """
    images_dir = untar_data(URLs.PETS)/"images"

    # Labels come from the `name` attribute of each item, via the regex below.
    label_from_name = using_attr(RegexLabeller(r'(.+)_\d+.jpg$'), 'name')
    pets = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_items=get_image_files,
        splitter=RandomSplitter(seed=42),
        get_y=label_from_name,
        item_tfms=Resize(460),
        batch_tfms=aug_transforms(size=224, min_scale=0.75),
    )
    dls = pets.dataloaders(images_dir)

    # resnet50 backbone, converted to half precision with `.to_fp16()`.
    full_precision = cnn_learner(dls, resnet50, metrics=error_rate)
    learn = full_precision.to_fp16()

    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    return train_fn(learn, nepochs, *args, **kwargs)

# Distributed equivalent of: learn.fine_tune(6, freeze_epochs=3), using the
# resnet50 / fp16 learner built by `trainer_fp16_resnet50`.
learn = in_torchddp(ngpus, trainer_fp16_resnet50, fine_tune, 6, freeze_epochs=3,
                    imports=imports, need="fine_tune")