## course-v4, fastai2 notebooks Distributed Training Versions Using `mpify`

Below you'll find distributed training versions of those examples in 'Fastai2' course-v4 notebooks.

[01_intro.ipynb](#01intro)

[05_pet_breeds.ipynb](#05petbreeds)

[06 multicat.ipynb](#06multicat)

[07 Sizing and TTA.ipynb](#07sizingtta)

[08 Collab.ipynb](#08collab)

### <a name="01intro"></a> 01_intro.ipynb

To train a `fastai2` learner on multiple processes, the `DataLoaders` must be created fresh on each process: instantiating it accesses the GPU, so a `DataLoaders` object created in one process cannot be used in another.

Thus in all the examples here, both `DataBlock` and `DataLoaders` (often noted as `dls`) are all created inside the target function body.


In [None]:
# to train cnn on multiple GPUs
from mpify import *

imports='''from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''

from fastai2.vision.all import *
path = untar_data(URLs.PETS)/'images'

def is_cat(x):
    """Return True when the first character of `x` is uppercase.

    Used as the `label_func` for the Pets file names in `train_cnn()`.
    """
    first_char = x[0]
    return first_char.isupper()
    
def train_cnn():
    """Fine-tune a resnet34 image classifier; meant to run inside each DDP worker.

    The `DataLoaders` object is built here, inside the target function, so that
    every worker process creates its own instance.  `path` and `is_cat` are
    module-level names the caller hands to the workers through `need=`.

    Returns the trained learner.
    """
    image_files = get_image_files(path)
    dls = ImageDataLoaders.from_name_func(
        path,
        image_files,
        valid_pct=0.2,
        seed=42,
        label_func=is_cat,
        item_tfms=Resize(224),
    )
    learn = cnn_learner(dls, resnet34, metrics=error_rate)

    # distrib_ctx() wraps the training call for distributed execution.
    with learn.distrib_ctx(): learn.fine_tune(4)
    return learn

ngpus = 3

learn = in_torchddp(ngpus, train_cnn, imports=imports, need="path is_cat")

In [None]:
# to train unet on multiple GPUs
from mpify import in_torchddp

imports='''from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''

from fastai2.vision.all import *
    
def train_unet():
    """Fine-tune a resnet34-based U-Net on CAMVID_TINY; runs inside each DDP worker.

    The dataset path and the `DataLoaders` are both created inside this
    function, so nothing has to be passed in from the notebook process.

    Returns the trained learner.
    """
    path = untar_data(URLs.CAMVID_TINY)
    image_files = get_image_files(path/"images")
    class_codes = np.loadtxt(path/'codes.txt', dtype=str)

    dls = SegmentationDataLoaders.from_label_func(
        path,
        bs=8,
        fnames=image_files,
        # The label file of an image is labels/<stem>_P<suffix>.
        label_func=lambda o: path/'labels'/f'{o.stem}_P{o.suffix}',
        codes=class_codes,
    )

    learn = unet_learner(dls, resnet34)
    with learn.distrib_ctx():
        learn.fine_tune(20)
    return learn

ngpus = 3
learn = in_torchddp(ngpus, train_unet, imports=imports)
learn.show_results(max_n=6, figsize=(7,8))

In [None]:
# To train text classifier on multiple GPUs

from mpify import in_torchddp

imports='''from utils import *
from fastai2.text.all import *
from fastai2.distributed import *
'''

def train_imdb_classifier():
    """Fine-tune an AWD_LSTM text classifier on IMDB; runs inside each DDP worker.

    Returns the trained learner.
    """
    import os

    imdb_path = untar_data(URLs.IMDB)
    dls = TextDataLoaders.from_folder(imdb_path, valid='test', bs=96)
    learn = text_classifier_learner(dls, AWD_LSTM, drop_mult=0.5, metrics=accuracy)

    # Only when more than one process takes part AND torch is a 1.4.x release,
    # switch on the `fup` flag of DistributedTrainer.
    # NOTE(review): `fup` presumably stands for find_unused_parameters -- confirm.
    if int(os.environ['WORLD_SIZE']) > 1:
        if torch.__version__.startswith("1.4"):
            DistributedTrainer.fup = True

    with learn.distrib_ctx():
        learn.fine_tune(4, 1e-2)
    return learn

# To train on 3 GPUs with distributed data parallel
learn = in_torchddp(3, train_imdb_classifier, imports=imports)

learn.predict("I really liked that movie!")

In [None]:
# To train tabular in multiple GPUs

from mpify import in_torchddp

imports='''from utils import *
from fastai2.tabular.all import *
from fastai2.distributed import *
'''

def train_tabular():
    """Train a tabular learner on ADULT_SAMPLE for 3 epochs inside a DDP worker.

    Returns the trained learner.
    """
    path = untar_data(URLs.ADULT_SAMPLE)

    categorical = ['workclass', 'education', 'marital-status', 'occupation',
                   'relationship', 'race']
    continuous = ['age', 'fnlwgt', 'education-num']
    preprocessing = [Categorify, FillMissing, Normalize]

    dls = TabularDataLoaders.from_csv(
        path/'adult.csv',
        path=path,
        y_names="salary",
        cat_names=categorical,
        cont_names=continuous,
        procs=preprocessing,
    )

    learn = tabular_learner(dls, metrics=accuracy)
    with learn.distrib_ctx(): learn.fit_one_cycle(3)
    return learn

# To train on 3 GPUs with distributed data parallel
learn = in_torchddp(3, train_tabular, imports=imports)

In [None]:
# To train collab in multiple GPUs

from mpify import in_torchddp

imports='''from utils import *
from fastai2.collab import *
from fastai2.distributed import *
'''

def train_collab():
    """Fine-tune a collab learner on ML_SAMPLE for 40 epochs inside a DDP worker.

    Returns the trained learner.
    """
    sample_dir = untar_data(URLs.ML_SAMPLE)
    ratings_csv = sample_dir/'ratings.csv'

    dls = CollabDataLoaders.from_csv(ratings_csv)
    learn = collab_learner(dls, y_range=(0.5,5.5))

    with learn.distrib_ctx(): learn.fine_tune(40)
    return learn

# To train on 3 GPUs with distributed data parallel
learn = in_torchddp(3, train_collab, imports=imports)
learn.show_results()

### <a name="05petbreeds"></a> 05_pet_breeds.ipynb

In the `trainer` function for Chapter 5 "Pet Breeds", let's add two functionalities:

1. It delegates the actual training to another function that takes a `learner` object, the number of epochs, and other parameters.  This way, we can reuse the same `trainer()` body but call different `learner` training methods: sometimes `fit_one_cycle()`, other times `fine_tune()`.

2. Add `load=filename` to load up a model state to a freshly created learner object.  It'll become useful in the section "Discriminative Learning Rates" that follows.


In [None]:
from mpify import in_torchddp
ngpus = 3
imports='''
from utils import *
from fastai2.data import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''

def fine_tune(learn:Learner, nepochs, *args, **kwargs):
    """Run `learn.fine_tune(nepochs, *args, **kwargs)` under `distrib_ctx()`; return `learn`."""
    with learn.distrib_ctx():
        learn.fine_tune(nepochs, *args, **kwargs)
    return learn
    
def one_cycle(learn:Learner, nepochs, *args, **kwargs):
    """Run `learn.fit_one_cycle(nepochs, *args, **kwargs)` under `distrib_ctx()`; return `learn`."""
    with learn.distrib_ctx():
        learn.fit_one_cycle(nepochs, *args, **kwargs)
    return learn


def trainer(train_fn, nepochs, *args, load:str=None, **kwargs):
    """Build a resnet34 Pets learner and hand it to `train_fn` for training.

    train_fn: callable invoked as `train_fn(learn, nepochs, *args, **kwargs)`;
        its return value is returned from this function.
    nepochs:  number of epochs, passed through to `train_fn`.
    load:     optional name of a saved model state to load into the freshly
              created learner before training.
    """
    data_root = untar_data(URLs.PETS)

    # The label is extracted from the file name with this regex.
    name_labeller = using_attr(RegexLabeller(r'(.+)_\d+.jpg$'), 'name')
    pets = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_items=get_image_files,
        splitter=RandomSplitter(seed=42),
        get_y=name_labeller,
        item_tfms=Resize(460),
        batch_tfms=aug_transforms(size=224, min_scale=0.75),
    )
    dls = pets.dataloaders(data_root/"images")

    learn = cnn_learner(dls, resnet34, metrics=error_rate)

    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    return train_fn(learn, nepochs, *args, **kwargs)

In [None]:
learn = in_torchddp(ngpus, trainer, fine_tune, 1, base_lr=0.1, imports=imports, need="fine_tune")

In [None]:
lr_min,lr_steep = learn.lr_find()

In [None]:
# to do: learn.fine_tune(2, base_lr=3e-3)

learn = in_torchddp(ngpus, trainer, fine_tune, 2, base_lr=3e-3,
                    imports=imports, need="fine_tune")

In [None]:
# to do: learn.fit_one_cycle(3, 3e-3)

learn = in_torchddp(ngpus, trainer, one_cycle, 3, 3e-3,
                    imports=imports, need="one_cycle")

In [None]:
learn.unfreeze()
learn.lr_find()

In [None]:
# to do: learn.fit_one_cycle(6, lr_max=1e-5)


learn = in_torchddp(ngpus, trainer, one_cycle, 6, lr_max=1e-5,
                    imports=imports, need="one_cycle")

### 05 pets breeds: Discriminative Learning Rates

To perform:

```python
    learn.fit_one_cycle(3, 3e-3)
    learn.unfreeze()
    learn.fit_one_cycle(12, lr_max=slice(1e-6,1e-4))
```

we need to *save the model state* before the second `fit_one_cycle()`, then tell `in_torchddp()` to load from that file using 'load=file'

In [None]:
learn = in_torchddp(ngpus, trainer, one_cycle, 3, 3e-3,
                    imports=imports, need="one_cycle")
learn.unfreeze()
learn.save("after_unfreeze", with_opt=True, pickle_protocol=4)

learn = in_torchddp(ngpus, trainer, one_cycle, 12, lr_max=slice(1e-6,1e-4),
                    load="after_unfreeze", imports=imports, need="one_cycle")

### 05 pets breeds: Deeper Architectures

To do:
```python
from fastai2.callback.fp16 import *
learn = cnn_learner(dls, resnet50, metrics=error_rate).to_fp16()
learn.fine_tune(6, freeze_epochs=3)
```

We modify `trainer()` to write a new function `trainer_fp16_resnet50()`, replace `resnet34` with `resnet50`, and add `.to_fp16()`.  Then pass it to `in_torchddp()`:

In [None]:
def trainer_fp16_resnet50(train_fn, nepochs, *args, load:str=None, **kwargs):
    """Same flow as `trainer()`, but with a resnet50 learner in half precision.

    train_fn: callable invoked as `train_fn(learn, nepochs, *args, **kwargs)`;
        its return value is returned from this function.
    nepochs:  number of epochs, passed through to `train_fn`.
    load:     optional name of a saved model state to load before training.
    """
    data_root = untar_data(URLs.PETS)

    # The label is extracted from the file name with this regex.
    name_labeller = using_attr(RegexLabeller(r'(.+)_\d+.jpg$'), 'name')
    pets = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_items=get_image_files,
        splitter=RandomSplitter(seed=42),
        get_y=name_labeller,
        item_tfms=Resize(460),
        batch_tfms=aug_transforms(size=224, min_scale=0.75),
    )
    dls = pets.dataloaders(data_root/"images")

    # The two differences from trainer(): resnet50 and .to_fp16().
    learn = cnn_learner(dls, resnet50, metrics=error_rate).to_fp16()

    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    return train_fn(learn, nepochs, *args, **kwargs)

learn = in_torchddp(ngpus, trainer_fp16_resnet50, fine_tune, 6, freeze_epochs=3,
                    imports=imports, need="fine_tune")

### <a name='06multicat'></a> 06 Multicat

In the examples of Chapter 6 "Multi-Label Classification", the `learn` object is built from
pieces across many cells. 

`DataBlock` object needs `path`, and a few other functions: `get_x, get_y, splitter`, and `dls` needs `df`.  So we group some of them together in `need=`.

I do notice accuracy degradation using the same 3 epochs: `0.81`, as opposed to the `0.95` range in the book.

So what can we do?  Save the model after the first training, then use `load=filename` flag, tweak `nepochs` and `freeze_epochs=` values and call `in_torchddp()` again.



In [None]:
# These are defined earlier in the notebook:

path = untar_data(URLs.PASCAL_2007)
df = pd.read_csv(path/'train.csv')

def get_x(r):
    """Return the image path for row `r`: `path/'train'/<fname>`."""
    file_name = r['fname']
    return path/'train'/file_name
def get_y(r):
    """Return the list of labels of row `r`, splitting its 'labels' field on single spaces."""
    labels_field = r['labels']
    return labels_field.split(' ')
def splitter(df):
    """Split `df` by its boolean 'is_valid' column.

    Returns `(train, valid)`: two lists of index values, the rows where
    'is_valid' is False and the rows where it is True.
    """
    is_valid = df['is_valid']
    valid = df.index[is_valid].tolist()
    train = df.index[~is_valid].tolist()
    return train, valid

# to perform those trainings in DDP

from mpify import in_torchddp
ngpus = 3

def train_multicat(nepochs, *args, load:str=None, **kwargs):
    """Fine-tune a resnet50 multi-label classifier inside a DDP worker.

    Relies on `df`, `get_x`, `get_y` and `splitter`, which the caller ships to
    the workers through `need=`.

    nepochs: number of epochs for `fine_tune`.
    *args / **kwargs: forwarded to `learn.fine_tune`.
    load: optional name of a saved model state to load before training.
    Returns the trained learner.
    """
    dblock = DataBlock(
        blocks=(ImageBlock, MultiCategoryBlock),
        splitter=splitter,
        get_x=get_x,
        get_y=get_y,
        item_tfms=RandomResizedCrop(128, min_scale=0.35),
    )
    dls = dblock.dataloaders(df)

    multi_accuracy = partial(accuracy_multi, thresh=0.2)
    learn = cnn_learner(dls, resnet50, metrics=multi_accuracy)

    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    with learn.distrib_ctx(): learn.fine_tune(nepochs, *args, **kwargs)
    return learn
        
imports='''
from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''
need="path df get_x get_y splitter"

learn = in_torchddp(ngpus, train_multicat, 3, base_lr=3e-3, freeze_epochs=4,
                    imports=imports, need=need)


The next training uses a different dataset, `BIWI_HEAD_POSE`, and will need `get_ctr()`, `numpy`, and `cal`. I missed `img2pose` in the first pass, which caused a `NameError` exception to be thrown; simply adding it to `need=` fixes that.

The same trick with `load=` will help improve the accuracy.


In [None]:
path = untar_data(URLs.BIWI_HEAD_POSE)

def img2pose(x):
    """Map a file path to its pose file: drop the last 7 characters and append 'pose.txt'."""
    prefix = str(x)[:-7]
    return Path(prefix + 'pose.txt')

cal = np.genfromtxt(path/'01'/'rgb.cal', skip_footer=6)
def get_ctr(f):
    """Compute the two-element target point for image file `f`.

    Reads the pose file that belongs to `f` (skipping its first 3 lines) and
    combines its first three entries with the module-level calibration array
    `cal`.  Returns `tensor([c1, c2])`.
    """
    pose = np.genfromtxt(img2pose(f), skip_header=3)
    # NOTE(review): presumably pose[2] is the depth and this is a pinhole
    # projection using the focal lengths and principal point in `cal` -- confirm.
    first, second, divisor = pose[0], pose[1], pose[2]
    c1 = first * cal[0][0]/divisor + cal[0][2]
    c2 = second * cal[1][1]/divisor + cal[1][2]
    return tensor([c1, c2])

def train_biwi(nepochs, *args, load:str=None, **kwargs):
    """Fine-tune a resnet18 point-regression model on BIWI inside a DDP worker.

    Relies on `path` and `get_ctr` (and what `get_ctr` uses), which the caller
    ships to the workers through `need=`.

    nepochs: number of epochs for `fine_tune`.
    load: optional name of a saved model state to load before training.
    `*args` and `**kwargs` are accepted but not used; the learning rate is
    fixed at 1e-2.
    Returns the trained learner.
    """
    batch_transforms = [*aug_transforms(size=(240,320)),
                        Normalize.from_stats(*imagenet_stats)]
    biwi = DataBlock(
        blocks=(ImageBlock, PointBlock),
        get_items=get_image_files,
        get_y=get_ctr,
        # Files whose parent directory is named '13' are selected by the splitter.
        splitter=FuncSplitter(lambda o: o.parent.name=='13'),
        batch_tfms=batch_transforms,
    )
    dls = biwi.dataloaders(path)

    learn = cnn_learner(dls, resnet18, y_range=(-1,1))
    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    with learn.distrib_ctx():
        learn.fine_tune(nepochs, 1e-2)
    return learn

# Import statements handed to in_torchddp(imports=...) for the worker processes.
# The string must be valid Python: the original `improt numpy as np` was a typo
# that makes the whole import block a syntax error.
imports='''
from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
import numpy as np
'''

need="path cal get_ctr img2pose"

learn = in_torchddp(ngpus, train_biwi, 3, imports=imports, need=need)

# Not satisfied with the accuracy?  Save then train 5 more epochs, starting from the current state
learn.save('biwi_after3')
learn = in_torchddp(ngpus, train_biwi, 5, load='biwi_after3', imports=imports, need=need)

### <a name='07sizingtta'></a> 07 Sizing and TTA

In this chapter, imagenette dataset is trained with incremental optimization.  First add normalization, then progressive resizing and fine tune.

In [None]:
from mpify import in_torchddp
from fastai2.vision.all import *
path = untar_data(URLs.IMAGENETTE)


def get_dls_64_224_no_normalize():
    """Build the dataloaders from `path` with batch size 64 and 224px augmentation, without normalization."""
    augmentations = aug_transforms(size=224, min_scale=0.75)
    dblock = DataBlock(
        blocks=(ImageBlock(), CategoryBlock()),
        get_items=get_image_files,
        get_y=parent_label,
        item_tfms=Resize(460),
        batch_tfms=augmentations,
    )
    return dblock.dataloaders(path, bs=64)

def train_imgnette_64_224_no_normalize(nepochs, lr, *args, **kwargs):
    """Train an xresnet50 from scratch on the un-normalized dataloaders inside a DDP worker.

    nepochs, lr: passed to `learn.fit_one_cycle`.
    `*args` and `**kwargs` are accepted but not used.
    Returns the trained learner.
    """
    learn = Learner(
        get_dls_64_224_no_normalize(),
        xresnet50(),
        loss_func=CrossEntropyLossFlat(),
        metrics=accuracy,
    )
    with learn.distrib_ctx(): learn.fit_one_cycle(nepochs, lr)
    return learn

# The following pair will normalize batches, and allow custom sizes.
def get_dls(bs, size):
    """Build normalized dataloaders from `path`.

    bs:   batch size given to `dataloaders()`.
    size: target size given to `aug_transforms()`.
    """
    batch_transforms = [*aug_transforms(size=size, min_scale=0.75),
                        Normalize.from_stats(*imagenet_stats)]
    dblock = DataBlock(
        blocks=(ImageBlock, CategoryBlock),
        get_items=get_image_files,
        get_y=parent_label,
        item_tfms=Resize(460),
        batch_tfms=batch_transforms,
    )
    return dblock.dataloaders(path, bs=bs)

def train_imgnette_normalize(nepochs, lr, dls_bs, dls_size, *args,
                             fine_tune:bool=False, load:str=None, **kwargs):
    """Train an xresnet50 on normalized dataloaders inside a DDP worker.

    nepochs, lr:      passed to the training call.
    dls_bs, dls_size: batch size and image size given to `get_dls()`.
    fine_tune:        when true call `learn.fine_tune`, otherwise `learn.fit_one_cycle`.
    load:             optional name of a saved model state to load before training.
    `*args` and `**kwargs` are accepted but not used.
    Returns the trained learner.
    """
    learn = Learner(
        get_dls(dls_bs, dls_size),
        xresnet50(),
        loss_func=CrossEntropyLossFlat(),
        metrics=accuracy,
    )

    if load:
        learn.load(load)
        print(f"Model state loaded from {load}")

    with learn.distrib_ctx():
        # Pick the training method once, then call it with the same arguments.
        run_training = learn.fine_tune if fine_tune else learn.fit_one_cycle
        run_training(nepochs, lr)

    return learn


imports='''
from utils import *
from fastai2.vision.all import *
from fastai2.distributed import *
'''
ngpus=3


In [None]:
# Baseline run without normalization.
# NOTE(review): `nepochs, lr` are assigned here but the call below passes the
# literals `1, 3e-3`, so this run trains for 1 epoch, not 5 -- confirm intent.
nepochs, lr = 5, 3e-3
# Names the worker processes need in addition to `imports`.
need='get_dls_64_224_no_normalize path'
learn = in_torchddp(ngpus, train_imgnette_64_224_no_normalize, 1, 3e-3,
                    imports=imports, need=need)

# Then use normalization
# NOTE(review): again the literals `5, 3e-3` are passed rather than `nepochs, lr`;
# the values happen to be identical here.
nepochs, lr = 5, 3e-3
dls_bs, dls_size = 64, 224
need='get_dls path'
learn = in_torchddp(ngpus, train_imgnette_normalize, 5, 3e-3, dls_bs, dls_size,
                    imports=imports, need=need)

### 07 Sizing & TTA . Progressive resizing

Save the model after one stage, then fine tune at different sizes, starting from that.

In [None]:
# Progressive Resizing.  Save the intermediate model state
need='get_dls path'

nepochs, lr = 4, 3e-3
dls_bs, dls_size = 128, 128
learn = in_torchddp(ngpus, train_imgnette_normalize, nepochs, lr, dls_bs, dls_size,
                    imports=imports, need=need)

learn.save('4epochs_128_128')

# Then fine_tune at different size, starting from the saved state
nepochs, lr = 5, 1e-3
dls_bs, dls_size = 64, 224
learn = in_torchddp(ngpus, train_imgnette_normalize, nepochs, lr, dls_bs, dls_size,
                    load='4epochs_128_128', fine_tune=True, imports=imports, need=need)

### <a name='08collab'></a> 08 Collab

In the first two training examples in notebook 08_collab, we move the variables that depend on `dls` inside the target function.  Those created before `dls` can be passed in via `need=`:

In [None]:
from utils import *
from fastai2.collab import *
from fastai2.tabular.all import *

path = untar_data(URLs.ML_100k)
ratings = pd.read_csv(path/'u.data', delimiter='\t', header=None,
                      names=['user','movie','rating','timestamp'])

movies = pd.read_csv(path/'u.item',  delimiter='|', encoding='latin-1',
                     usecols=(0,1), names=('movie','title'), header=None)

ratings = ratings.merge(movies)

class DotProduct(Module):
    "Scores a (user, movie) pair as the dot product of their embedding vectors."
    def __init__(self, n_users, n_movies, n_factors):
        # One embedding table per side, both with `n_factors` columns.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)

    def forward(self, x):
        # Column 0 of `x` indexes the user table, column 1 the movie table.
        user_idx, movie_idx = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        return (user_vecs * movie_vecs).sum(dim=1)

def train_dotproduct(nepochs, *args, load:str=None, **kwargs):
    """Train a dot-product collaborative-filtering model inside a DDP worker.

    The model class is looked up through the module-level name
    `DotProductClass`, which the caller ships to the worker processes via
    `need=`.  Rebinding `DotProductClass` before calling `in_torchddp()`
    therefore selects a different model class without touching this function.
    (The original body instantiated `DotProduct` directly, so the rebinding
    had no effect.)

    nepochs: number of epochs for `fit_one_cycle`.
    *args / **kwargs: forwarded to `learn.fit_one_cycle`, e.g. the learning
        rate or `wd=0.1`.  Keyword arguments used to be silently dropped.
    load: optional name of a saved model state to load before training.
    Returns the trained learner.
    """
    dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)

    n_users  = len(dls.classes['user'])
    n_movies = len(dls.classes['title'])

    # 50 latent factors, as in the original call.
    model = DotProductClass(n_users, n_movies, 50)
    learn = Learner(dls, model, loss_func=MSELossFlat())

    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    with learn.distrib_ctx(): learn.fit_one_cycle(nepochs, *args, **kwargs)
    return learn

from mpify import in_torchddp
ngpus = 3

imports='''
from utils import *
from fastai2.collab import *
from fastai2.tabular.all import *
from fastai2.distributed import *
'''

need="path ratings DotProductClass"

DotProductClass = DotProduct
learn = in_torchddp(ngpus, train_dotproduct, 5, 5e-3, imports=imports, need=need)

Tune the `DotProduct` class, validation loss drops:

In [None]:
# A new dotproduct class
class DotProduct(Module):
    "Dot product of user and movie embeddings, passed through `sigmoid_range` with `y_range`."
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # One embedding table per side, both with `n_factors` columns.
        self.user_factors = Embedding(n_users, n_factors)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.y_range = y_range

    def forward(self, x):
        # Column 0 of `x` indexes the user table, column 1 the movie table.
        user_idx, movie_idx = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        raw_score = (user_vecs * movie_vecs).sum(dim=1)
        return sigmoid_range(raw_score, *self.y_range)

    
DotProductClass = DotProduct # Update DotProductClass pointer
learn = in_torchddp(ngpus, train_dotproduct, 5, 5e-3, imports=imports, need=need)

Now try a different DotProduct class, `DotProductBias`:

In [None]:
class DotProductBias(Module):
    "Dot product of user and movie embeddings plus a per-user and a per-movie bias."
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Factor tables have `n_factors` columns, bias tables a single column.
        self.user_factors = Embedding(n_users, n_factors)
        self.user_bias = Embedding(n_users, 1)
        self.movie_factors = Embedding(n_movies, n_factors)
        self.movie_bias = Embedding(n_movies, 1)
        self.y_range = y_range

    def forward(self, x):
        # Column 0 of `x` indexes the user tables, column 1 the movie tables.
        user_idx, movie_idx = x[:,0], x[:,1]
        user_vecs = self.user_factors(user_idx)
        movie_vecs = self.movie_factors(movie_idx)
        # keepdim=True keeps a trailing dimension of size 1, like the bias lookups.
        score = (user_vecs * movie_vecs).sum(dim=1, keepdim=True)
        score += self.user_bias(user_idx) + self.movie_bias(movie_idx)
        return sigmoid_range(score, *self.y_range)
    
DotProductClass=DotProductBias

learn = in_torchddp(ngpus, train_dotproduct, 5, 5e-3, imports=imports, need=need)


### 08 Collab . Weight Decay, and Create Our Own Embedding Module

First try adding weight decay.

In [None]:
DotProductClass=DotProductBias
learn = in_torchddp(ngpus, train_dotproduct, 5, 5e-3, wd=0.1, imports=imports, need=need)

Then create our own embedding module:

In [None]:
# Creating Our Own Embedding Module¶
def create_params(size):
    """Return an `nn.Parameter` of shape `size`, filled in place from a normal distribution (mean 0, std 0.01)."""
    weights = torch.zeros(*size)
    weights.normal_(0, 0.01)
    return nn.Parameter(weights)

class DotProductBias(Module):
    "Same model as the Embedding-based `DotProductBias`, built on raw parameters from `create_params`."
    def __init__(self, n_users, n_movies, n_factors, y_range=(0,5.5)):
        # Factor parameters are 2-d (rows x n_factors); bias parameters are 1-d.
        self.user_factors = create_params([n_users, n_factors])
        self.user_bias = create_params([n_users])
        self.movie_factors = create_params([n_movies, n_factors])
        self.movie_bias = create_params([n_movies])
        self.y_range = y_range

    def forward(self, x):
        # Plain tensor indexing replaces the embedding lookups.
        user_idx, movie_idx = x[:,0], x[:,1]
        user_vecs = self.user_factors[user_idx]
        movie_vecs = self.movie_factors[movie_idx]
        score = (user_vecs*movie_vecs).sum(dim=1)
        score += self.user_bias[user_idx] + self.movie_bias[movie_idx]
        return sigmoid_range(score, *self.y_range)

DotProductClass=DotProductBias
need = need+' create_params'  # add create_params
learn = in_torchddp(ngpus, train_dotproduct, 5, 5e-3, wd=0.1, imports=imports, need=need)


The tedious steps are all abstracted away in `collab_learner`:

In [None]:
def train_collab(nepochs, *args, load:str=None, **kwargs):
    """Train a `collab_learner` (50 factors, y_range 0..5.5) inside a DDP worker.

    Relies on `ratings`, which the caller ships to the workers via `need=`.

    nepochs: number of epochs for `fit_one_cycle`.
    *args / **kwargs: forwarded to `learn.fit_one_cycle`, e.g. the learning
        rate or `wd=0.1`.  Keyword arguments used to be silently dropped, so
        a `wd=` given by the caller never reached the training call.
    load: optional name of a saved model state to load before training.
    Returns the trained learner.
    """
    dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)

    learn = collab_learner(dls, n_factors=50, y_range=(0, 5.5))

    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    with learn.distrib_ctx(): learn.fit_one_cycle(nepochs, *args, **kwargs)
    return learn

learn = in_torchddp(ngpus, train_collab, 5, 5e-3, wd=0.1, imports=imports, need=need)


### 08 Collab . Bootstrapping Collab Model

Build learner in 2 different ways, from a CollabNN, or the default nn:


In [2]:
embs = get_emb_sz(learn.dls)

class CollabNN(Module):
    "Concatenates a user and an item embedding and feeds them through a small Linear-ReLU-Linear network."
    def __init__(self, user_sz, item_sz, y_range=(0,5.5), n_act=100):
        # `user_sz` / `item_sz` are unpacked as the arguments of Embedding;
        # element [1] of each is used as that embedding's width below.
        self.user_factors = Embedding(*user_sz)
        self.item_factors = Embedding(*item_sz)
        n_inputs = user_sz[1]+item_sz[1]
        self.layers = nn.Sequential(
            nn.Linear(n_inputs, n_act),
            nn.ReLU(),
            nn.Linear(n_act, 1))
        self.y_range = y_range

    def forward(self, x):
        # Column 0 of `x` indexes the user table, column 1 the item table.
        user_vecs = self.user_factors(x[:,0])
        item_vecs = self.item_factors(x[:,1])
        out = self.layers(torch.cat((user_vecs, item_vecs), dim=1))
        return sigmoid_range(out, *self.y_range)

def train_bootstrap(nepochs, *args, load:str=None, **kwargs):
    """Train a `CollabNN` model inside a DDP worker.

    Relies on `ratings`, `CollabNN` and `embs`, which the caller ships to the
    workers via `need=`.

    nepochs: number of epochs for `fit_one_cycle`.
    *args / **kwargs: forwarded to `learn.fit_one_cycle`, e.g. the learning
        rate or `wd=0.1`.  Keyword arguments used to be silently dropped, so
        a `wd=` given by the caller never reached the training call.
    load: optional name of a saved model state to load before training.
    Returns the trained learner.
    """
    dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)

    model = CollabNN(*embs)

    learn = Learner(dls, model, loss_func=MSELossFlat())
    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    with learn.distrib_ctx(): learn.fit_one_cycle(nepochs, *args, **kwargs)
    return learn

def train_collab_learner(nepochs, *args, load:str=None, **kwargs):
    """Train a neural-net `collab_learner` (`use_nn=True`, layers [100,50]) inside a DDP worker.

    Relies on `ratings`, which the caller ships to the workers via `need=`.

    nepochs: number of epochs for `fit_one_cycle`.
    *args / **kwargs: forwarded to `learn.fit_one_cycle`, e.g. the learning
        rate or `wd=0.1`.  Keyword arguments used to be silently dropped, so
        a `wd=` given by the caller never reached the training call.
    load: optional name of a saved model state to load before training.
    Returns the trained learner.
    """
    dls = CollabDataLoaders.from_df(ratings, item_name='title', bs=64)

    learn = collab_learner(dls, use_nn=True, y_range=(0, 5.5), layers=[100,50])
    if load:
        learn.load(load)
        print(f'Model and state loaded from {load}')

    with learn.distrib_ctx(): learn.fit_one_cycle(nepochs, *args, **kwargs)
    return learn

need='ratings CollabNN embs'
learn = in_torchddp(ngpus, train_bootstrap, 5, 5e-3, wd=0.1, imports=imports, need=need)

learn = in_torchddp(ngpus, train_collab_learner, 5, 5e-3, wd=0.1, imports=imports, need=need)

NameError: name 'learn' is not defined

### 10 NLP

The `language_model_learner` in `fastai2` does not quite work yet in distributed training.  It fails at collating batches, trying to stack two batches of different dimensions:
```python
~/Dropbox/fastai2/fastai2/data/load.py in <listcomp>(.0)
     44     b = t[0]
     45     return (default_collate(t) if isinstance(b, _collate_types)
---> 46             else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
     47             else default_collate(t))
     48 

~/Dropbox/fastai2/fastai2/data/load.py in fa_collate(t)
     43 def fa_collate(t):
     44     b = t[0]
---> 45     return (default_collate(t) if isinstance(b, _collate_types)
     46             else type(t[0])([fa_collate(s) for s in zip(*t)]) if isinstance(b, Sequence)
     47             else default_collate(t))

~/miniconda/envs/fastai2/lib/python3.7/site-packages/torch/utils/data/_utils/collate.py in default_collate(batch)
     53             storage = elem.storage()._new_shared(numel)
     54             out = elem.new(storage)
---> 55         return torch.stack(batch, 0, out=out)
     56     elif elem_type.__module__ == 'numpy' and elem_type.__name__ != 'str_' \
     57             and elem_type.__name__ != 'string_':

RuntimeError: invalid argument 0: Sizes of tensors must match except in dimension 0. Got 218 and 308 in dimension 1 at /pytorch/aten/src/TH/generic/THTensor.cpp:612
```

But the text classifier can be trained in DDP; see `fastai2/nbs/examples/train_imdbclassifier.py`.
