## Using `mpify` to Train Fastai2/Course-V4 Examples "Distributedly" on Multiple GPUs

###  To train a `fastai2` learner on multiple processes inside a Jupyter notebook
1. Ensure each process has its own copy of the model, access to its own GPU, and its own dataloader.

    * The `DataLoader` must be re-created fresh on each process, because the CUDA GPU context it might initialize cannot be reused in another process.
    CUDA PyTorch tensors created in the parent Jupyter process should not be passed to the subprocesses; otherwise each such tensor incurs 600MB of memory, *per subprocess*, on the original GPU associated with the tensor.

    * Other variables (e.g. `path` of an untar'ed dataset, or `df`, a loaded DataFrame) and the many helper functions can be passed to the distributed training API `in_torchddp()` via its `imports=` and `need=` parameters.

2. In each process, run `from fastai2.distributed import *`, and wrap the fitting call in a `with learn.distrib_ctx():` block.

### Quick links to course-v4 chapters `mpify`-ed:

[01_intro.ipynb](/examples/fastai2_course-v4_01_intro_distrib.ipynb)

[05_pet_breeds.ipynb](/examples/fastai2_course-v4_05_pet_breeds_distrib.ipynb)

[06_multicat.ipynb](/examples/fastai2_course-v4_06_multicat_distrib.ipynb)

[07_sizing_and_tta.ipynb](/examples/fastai2_course-v4_07_sizing_tta_distrib.ipynb)

[08_collab.ipynb](/examples/fastai2_course-v4_08_collab_distrib.ipynb)

### Below are distributed-training versions of the examples in fastai2 course-v4 <a href='https://github.com/fastai/course-v4/blob/master/nbs/01_intro.ipynb' target='_blank'>`01_intro.ipynb`</a>


### <a name="01intro"></a> 01_intro.ipynb

In [None]:
# Train a CNN image classifier on multiple GPUs.
from mpify import *

# Import statements, kept as a string, that are handed to in_torchddp()
# through its `imports=` argument in the launch call further down.
imports='''from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''

from fastai2.vision.all import *
# 'images' folder of the PETS dataset; train_cnn reads this notebook-level
# name, which is why the launch call lists it in `need=`.
path = untar_data(URLs.PETS)/'images'

def is_cat(x):
    """Return True when the first character of `x` is an uppercase letter.

    Used as the `label_func` for the image data loaders in `train_cnn`.
    """
    first_char = x[0]
    return first_char.isupper()
    
def train_cnn():
    """Build image data loaders, fine-tune a resnet34 learner, and return it.

    Reads `path` and `is_cat` from the enclosing namespace; the launch call
    names both in `need=`. The data loaders are created inside this function
    so that each process running it builds its own.
    """
    image_files = get_image_files(path)
    resize_tfm = Resize(224)
    loaders = ImageDataLoaders.from_name_func(
        path,
        image_files,
        valid_pct=0.2,
        seed=42,
        label_func=is_cat,
        item_tfms=resize_tfm,
    )

    classifier = cnn_learner(loaders, resnet34, metrics=error_rate)
    # The fitting call has to run inside the learner's distributed context.
    with classifier.distrib_ctx():
        classifier.fine_tune(4)
    return classifier

# Number of GPUs to train on; passed as the first argument to in_torchddp().
ngpus = 3

# Run train_cnn through in_torchddp(). `need=` lists the notebook-level names
# (`path`, `is_cat`) that train_cnn refers to; `learn` receives whatever
# in_torchddp() returns (presumably the learner returned by train_cnn -- confirm
# against the mpify documentation).
learn = in_torchddp(ngpus, train_cnn, imports=imports, need="path is_cat")

In [None]:
# Train a U-Net segmentation model on multiple GPUs.
from mpify import in_torchddp

# Import statements, kept as a string, that are handed to in_torchddp()
# through its `imports=` argument in the launch call further down.
imports='''from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''

from fastai2.vision.all import *
    
def train_unet():
    """Build CAMVID_TINY segmentation data loaders, fine-tune a resnet34 U-Net learner, and return it.

    Everything the function needs (dataset path, data loaders, learner) is
    created inside it, so the launch call passes no `need=` names.
    """
    camvid_dir = untar_data(URLs.CAMVID_TINY)
    image_files = get_image_files(camvid_dir/"images")
    class_codes = np.loadtxt(camvid_dir/'codes.txt', dtype=str)

    loaders = SegmentationDataLoaders.from_label_func(
        camvid_dir,
        bs=8,
        fnames=image_files,
        # Label file for image `o`: labels/<stem>_P<suffix> under the dataset folder.
        label_func=lambda o: camvid_dir/'labels'/f'{o.stem}_P{o.suffix}',
        codes=class_codes,
    )

    segmenter = unet_learner(loaders, resnet34)
    # The fitting call has to run inside the learner's distributed context.
    with segmenter.distrib_ctx():
        segmenter.fine_tune(20)
    return segmenter

# Number of GPUs to train on; passed as the first argument to in_torchddp().
ngpus = 3
# Run train_unet through in_torchddp(); no `need=` here because train_unet
# creates everything it uses internally.
learn = in_torchddp(ngpus, train_unet, imports=imports)
# Show up to 6 results using the object returned by in_torchddp().
learn.show_results(max_n=6, figsize=(7,8))

In [None]:
# Train a text classifier on multiple GPUs.

from mpify import in_torchddp

# Import statements, kept as a string, that are handed to in_torchddp()
# through its `imports=` argument in the launch call further down.
imports='''from utils import *
from fastai2.text.all import *
from fastai2.distributed import *
'''

def train_imdb_classifier():
    """Build IMDB text data loaders, fine-tune an AWD_LSTM classifier, and return the learner.

    The data loaders and learner are created inside the function so that each
    process running it builds its own copies.

    Returns:
        The learner after `fine_tune(4, 1e-2)` has run inside `distrib_ctx()`.
    """
    import os

    dls = TextDataLoaders.from_folder(untar_data(URLs.IMDB), valid='test', bs=96)
    learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

    # WORLD_SIZE is not set when this function is called outside a distributed
    # launcher; fall back to a single process instead of raising KeyError.
    world_size = int(os.environ.get('WORLD_SIZE', 1))

    # Workaround applied only for multi-process runs on PyTorch 1.4.x.
    # NOTE(review): `fup` presumably maps to DDP's find_unused_parameters -- confirm.
    if world_size > 1 and torch.__version__.startswith("1.4"):
        DistributedTrainer.fup = True

    # The fitting call has to run inside the learner's distributed context.
    with learn.distrib_ctx():
        learn.fine_tune(4, 1e-2)
    return learn

# Train on 3 GPUs with distributed data parallel; no `need=` here because
# train_imdb_classifier creates everything it uses internally.
learn = in_torchddp(3, train_imdb_classifier, imports=imports)

# Run a sample review through the object returned by in_torchddp().
learn.predict("I really liked that movie!")

In [None]:
# Train a tabular model on multiple GPUs.

from mpify import in_torchddp

# Import statements, kept as a string, that are handed to in_torchddp()
# through its `imports=` argument in the launch call further down.
imports='''from utils import *
from fastai2.tabular.all import *
from fastai2.distributed import *
'''

def train_tabular():
    """Build ADULT_SAMPLE tabular data loaders, run a 3-cycle fit, and return the learner.

    The target column is "salary"; categorical and continuous feature columns
    and the preprocessing steps are listed explicitly below.
    """
    data_dir = untar_data(URLs.ADULT_SAMPLE)

    categorical_cols = ['workclass', 'education', 'marital-status', 'occupation',
                        'relationship', 'race']
    continuous_cols = ['age', 'fnlwgt', 'education-num']
    preprocessing = [Categorify, FillMissing, Normalize]

    loaders = TabularDataLoaders.from_csv(
        data_dir/'adult.csv',
        path=data_dir,
        y_names="salary",
        cat_names=categorical_cols,
        cont_names=continuous_cols,
        procs=preprocessing,
    )

    model = tabular_learner(loaders, metrics=accuracy)
    # The fitting call has to run inside the learner's distributed context.
    with model.distrib_ctx():
        model.fit_one_cycle(3)
    return model

# Train on 3 GPUs with distributed data parallel; no `need=` here because
# train_tabular creates everything it uses internally.
learn = in_torchddp(3, train_tabular, imports=imports)

In [None]:
# Train a collaborative-filtering model on multiple GPUs.

from mpify import in_torchddp

# Import statements, kept as a string, that are handed to in_torchddp()
# through its `imports=` argument in the launch call further down.
imports='''from utils import *
from fastai2.collab import *
from fastai2.distributed import *
'''

def train_collab():
    """Build ML_SAMPLE collab data loaders from ratings.csv, fine-tune a collab learner, and return it."""
    sample_dir = untar_data(URLs.ML_SAMPLE)
    ratings_csv = sample_dir/'ratings.csv'
    loaders = CollabDataLoaders.from_csv(ratings_csv)

    recommender = collab_learner(loaders, y_range=(0.5,5.5))
    # The fitting call has to run inside the learner's distributed context.
    with recommender.distrib_ctx():
        recommender.fine_tune(40)
    return recommender

# Train on 3 GPUs with distributed data parallel; no `need=` here because
# train_collab creates everything it uses internally.
learn = in_torchddp(3, train_collab, imports=imports)
# Show results using the object returned by in_torchddp().
learn.show_results()