Data-efficient Image Transformers (DeiT) is a Vision Transformer model trained on ImageNet for image classification.

CNNs requires millions of images for training to achieve the SOTA results.

DeiT is a vision transformer model that requires a lot less data and computing resources for training to compete with the leading CNNs in performing image classification, which is made possible by two key components of DeiT:

    - Data augmentation that simulates training on a much larger dataset;
    
    - Native distillation that allows the transformer network to learn from a CNN's output.

In [6]:
from PIL import Image
import torch
import timm
import requests
import torchvision.transforms as transforms
from timm.data.constants import IMAGENET_DEFAULT_MEAN, IMAGENET_DEFAULT_STD

print(torch.__version__)

model = torch.hub.load(
    'facebookresearch/deit:main', 
    'deit_base_patch16_224', 
    pretrained=True
)

model.eval()

transform = transforms.Compose([
    transforms.Resize(256, interpolation=3),
    transforms.CenterCrop(224),
    transforms.ToTensor(),
    transforms.Normalize(
        IMAGENET_DEFAULT_MEAN,
        IMAGENET_DEFAULT_STD
    )
])

img = Image.open(
    requests.get(
        "https://raw.githubusercontent.com/pytorch/ios-demo-app/master/HelloWorld/HelloWorld/HelloWorld/image.png",
        stream=True).raw
)

img = transform(img)[None,] # [None,] add dimension at the beginning
out = model(img)
clsidx = torch.argmax(out)
print(clsidx.item())

1.11.0


Using cache found in /Users/bcp/.cache/torch/hub/facebookresearch_deit_main


269


### Scripting DeiT

In [7]:
model = torch.hub.load(
    'facebookresearch/deit:main', 
    'deit_base_patch16_224', 
    pretrained=True
)
# 346MB
scripted_model = torch.jit.script(model)
scripted_model.save('fbdeit_scripted.pt')

Using cache found in /Users/bcp/.cache/torch/hub/facebookresearch_deit_main


### Quantizing DeiT

apply dynamic-quantization to reduce the trained model size while keeping the inference accuracy.

In [9]:
# Use 'fbgemm' for server inference
# and 'qnnpack' for mobile inference
backend = 'qnnpack'

model.qconfig = torch.quantization.get_default_qconfig(backend)
torch.backends.quantized.engine = backend

quantized_model = torch.quantization.quantize_dynamic(
    model,
    qconfig_spec={torch.nn.Linear},
    dtype=torch.qint8
)

# 89MB
scripted_quantized_model = torch.jit.script(quantized_model)
scripted_quantized_model.save('fbdeit_scripted_quantized.pt')



In [10]:
out = scripted_quantized_model(img)
clsidx = torch.argmax(out)
print(clsidx.item())

269


### Optimizing DeiT

In [13]:
from torch.utils.mobile_optimizer import optimize_for_mobile

# 89MB
optimized_scripted_quantized_model = optimize_for_mobile(scripted_quantized_model)
optimized_scripted_quantized_model.save('fbdeit_optimized_scripted_quantized.pt')

In [14]:
out = optimized_scripted_quantized_model(img)
clsidx = torch.argmax(out)
print(clsidx.item())

269


### Using Lite Interpreter

In [15]:
# 89MB
optimized_scripted_quantized_model._save_for_lite_interpreter(
    'fbdeit_optimized_scripted_quantized_lite.ptl'
)
ptl = torch.jit.load('fbdeit_optimized_scripted_quantized_lite.ptl')

### Comparing Inference Speed

'fbgemm' should be use for comparing inference on Notebook, however M1 is not supporting 'fbgemm' quantization

In [17]:
with torch.autograd.profiler.profile(use_cuda=False) as prof1:
    out = model(img)
with torch.autograd.profiler.profile(use_cuda=False) as prof2:
    out = scripted_model(img)
with torch.autograd.profiler.profile(use_cuda=False) as prof3:
    out = scripted_quantized_model(img)
with torch.autograd.profiler.profile(use_cuda=False) as prof4:
    out = optimized_scripted_quantized_model(img)
with torch.autograd.profiler.profile(use_cuda=False) as prof5:
    out = ptl(img)

print("original model: {:.2f}ms".format(prof1.self_cpu_time_total/1000))
print("scripted model: {:.2f}ms".format(prof2.self_cpu_time_total/1000))
print("scripted & quantized model: {:.2f}ms".format(prof3.self_cpu_time_total/1000))
print("scripted & quantized & optimized model: {:.2f}ms".format(prof4.self_cpu_time_total/1000))
print("lite model: {:.2f}ms".format(prof5.self_cpu_time_total/1000))

original model: 123.96ms
scripted model: 253.00ms
scripted & quantized model: 197.53ms
scripted & quantized & optimized model: 175.56ms
lite model: 161.18ms


In [18]:
import pandas as pd
import numpy as np

df = pd.DataFrame({
    'Model': [
        'original model', 
        'scripted model', 
        'scripted & quantized model',
        'scripted & quantized & optimized model', 
        'lite model'
    ]
})

df = pd.concat([
    df,
    pd.DataFrame([
            ["{:.2f}ms".format(prof1.self_cpu_time_total/1000), "0%"],
            ["{:.2f}ms".format(prof2.self_cpu_time_total/1000),
             "{:.2f}%".format((prof1.self_cpu_time_total-prof2.self_cpu_time_total)/prof1.self_cpu_time_total*100)],
            ["{:.2f}ms".format(prof3.self_cpu_time_total/1000),
             "{:.2f}%".format((prof1.self_cpu_time_total-prof3.self_cpu_time_total)/prof1.self_cpu_time_total*100)],
            ["{:.2f}ms".format(prof4.self_cpu_time_total/1000),
             "{:.2f}%".format((prof1.self_cpu_time_total-prof4.self_cpu_time_total)/prof1.self_cpu_time_total*100)],
            ["{:.2f}ms".format(prof5.self_cpu_time_total/1000),
             "{:.2f}%".format((prof1.self_cpu_time_total-prof5.self_cpu_time_total)/prof1.self_cpu_time_total*100)]
        ],
        columns=['Inference Time', 'Reduction']
    )
], axis=1)

print(df)

                                    Model Inference Time Reduction
0                          original model       123.96ms        0%
1                          scripted model       253.00ms  -104.10%
2              scripted & quantized model       197.53ms   -59.35%
3  scripted & quantized & optimized model       175.56ms   -41.63%
4                              lite model       161.18ms   -30.03%
