In [1]:
from fastai.vision import *
from fastai.callbacks import *

In [2]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision

In [3]:
from fasterai.BN_folder import *

In [4]:
path = untar_data(URLs.IMAGENETTE_160)

In [5]:
# Build the DataBunch used by every experiment below: images are read from
# `path`, split using the 'train' and 'val' folders, labelled from their folder,
# transformed with fastai's default `get_transforms()` at size=128, batched
# with bs=64 and normalised with `imagenet_stats`.
data = (ImageList.from_folder(path)
                .split_by_folder(train='train', valid='val')
                .label_from_folder()
                .transform(get_transforms(), size=128)
                .databunch(bs=64)
                .normalize(imagenet_stats))

In [6]:
def count_parameters(model):
    """Return the total number of elements in the trainable parameters of `model`.

    Only parameters whose `requires_grad` flag is set are counted.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

## VGG16

In [7]:
# VGG16 with batch norm and a 10-class output; no `pretrained` argument is passed.
learn = Learner(data, models.vgg16_bn(num_classes=10), metrics=[accuracy])

In [11]:
learn.fit_one_cycle(3, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.049496,2.178077,0.321783,00:33
1,1.809241,1.548142,0.484841,00:33
2,1.476879,1.257086,0.595414,00:33


In [12]:
learn.validate()

[1.2570859, tensor(0.5954)]

In [13]:
# Put the trained network in eval mode; `model` is bound to whatever `.eval()` returns
# (for a torch nn.Module that is the module itself, so it aliases `learn.model`).
model = learn.model.eval()

In [14]:
x,y = data.one_batch()

In [15]:
%%timeit
# Forward pass on a single item: x[0][None] takes the first element of the batch and re-adds a leading dimension.
# NOTE(review): the `.cuda()` copy runs inside the timed statement and no torch.no_grad() /
# torch.cuda.synchronize() is used, so the figure may not isolate pure inference time — confirm.
model(x[0][None].cuda())

2.76 ms ± 5.93 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [16]:
count_parameters(model)

134309962

In [17]:
new_model = models.vgg16_bn(num_classes=10)

In [18]:
new_model.load_state_dict(model.state_dict())

<All keys matched successfully>

In [19]:
# Switch the copy to eval mode, pass it through `bn_folding_model`, and move the result to the GPU.
# assumes bn_folding_model (presumably from fasterai.BN_folder) merges each BatchNorm into its
# preceding layer — TODO confirm against the fasterai source.
new_model.eval()
folded_model = bn_folding_model(new_model).cuda()

In [20]:
%%timeit
folded_model(x[0][None].cuda())

2.42 ms ± 420 ns per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [21]:
count_parameters(folded_model)

134301514

In [22]:
folded_learner = Learner(data, folded_model, metrics=[accuracy])

In [23]:
folded_learner.validate()

[1.2570859, tensor(0.5954)]

We have removed the computation of the BN layers, which is why we gain some computation time, but we haven't removed the parameters from the model: the BN layers are still held in memory. This is tricky to do because the model is built sequentially, so the layer indices no longer correspond. It is easier to do on ResNets because each layer is named.

## ResNet50

In [24]:
learn = Learner(data, models.resnet50(num_classes=10), metrics=[accuracy])

In [25]:
learn.fit_one_cycle(3, 1e-3)

epoch,train_loss,valid_loss,accuracy,time
0,2.095547,2.519928,0.277197,00:19
1,1.518275,1.446723,0.533248,00:20
2,1.139788,1.035072,0.66293,00:20


In [26]:
learn.validate()

[1.0350723, tensor(0.6629)]

In [27]:
model = learn.model.eval()

In [28]:
x,y = data.one_batch()

In [29]:
%%timeit
model(x[0][None].cuda())

6.16 ms ± 12.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [30]:
count_parameters(model)

23528522

In [36]:
from fasterai.resnet.folded_resnet import *

In [37]:
# NOTE(review): bare `resnet50` presumably resolves to the constructor star-imported from
# fasterai.resnet.folded_resnet in the previous cell, not torchvision's — confirm.
new_model = resnet50(num_classes=10)

In [38]:
new_model.load_state_dict(model.state_dict())

<All keys matched successfully>

In [39]:
# Same procedure as for VGG: eval mode first, then `bn_folding_model`, then move the result to the GPU.
new_model.eval()
folded_model = bn_folding_model(new_model).cuda()

In [40]:
%%timeit
folded_model(x[0][None].cuda())

4.44 ms ± 9.79 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [41]:
# NOTE(review): this counts `new_model`, whereas the equivalent VGG cell counts
# `folded_model` — confirm which model was meant to be measured here.
count_parameters(new_model)

23528522

In [42]:
folded_learner = Learner(data, folded_model, metrics=[accuracy])

In [43]:
folded_learner.validate()

[1.0350723, tensor(0.6629)]

In [44]:
from fasterai.resnet.folded_resnet_nobn import *

In [45]:
# NOTE(review): `resnet50` presumably now resolves to the constructor star-imported from
# fasterai.resnet.folded_resnet_nobn in the previous cell, shadowing the earlier one — confirm.
final_model = resnet50(num_classes=10)

In [46]:
# strict=False lets the load proceed even when the two state dicts do not have exactly
# the same keys (presumably the BN entries still present in `folded_model` — confirm).
final_model.load_state_dict(folded_model.state_dict(),strict=False)

<All keys matched successfully>

In [47]:
count_parameters(final_model)

23501962

In [48]:
final_learner = Learner(data, final_model, metrics=[accuracy])

In [49]:
%%timeit
# NOTE(review): no explicit `.cuda()` or `.eval()` call on `final_model` is visible before this cell;
# presumably the Learner built from it in the previous cell has already moved it to the GPU — confirm.
final_model(x[0][None].cuda())

4.43 ms ± 11.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)


In [50]:
final_learner.validate()

[1.0350723, tensor(0.6629)]

In [51]:
# Difference in trainable-parameter count between the trained ResNet (`model`) and `final_model`.
removed_params = count_parameters(model) - count_parameters(final_model)
print(f'# Parameters removed: {removed_params}')

# Parameters removed: 26560
