This case study investigate how the model perform with devices
Findings:
1) CPU: computation correlates to latency (in all cases)
2) GPU:
    1. when the computation requirement is low (either small batch_size or small token_length), computation reduction doesn't reflect latency reduction.
    2. Two branch model takes two times latency, even it is possible to execute parallel.
       Have tried torch.jit.script & trace, no improvement

In [1]:
'''
A jupyter notebook to show how the CUDA acceleration works
'''
import torch.nn as nn
import torch
import time
from model.vit_model import Block

class DummyModel(nn.Module):
    def __init__(self, depth=12, embed_dim=768, num_heads=12, mlp_ratio=4):
        super().__init__()
        self.model = nn.ModuleList([
            Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio)
            for _ in range(depth)])
    def forward(self, x):
        for blk in self.model:
            x = blk(x)
        return x

class Multi_DummyModel(nn.Module):
    def __init__(self, depth=12, embed_dim=768, num_heads=12, mlp_ratio=4):
        super().__init__()
        self.model1 = nn.ModuleList([
            Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio)
            for _ in range(depth)])
        self.model2 = nn.ModuleList([
            Block(dim=embed_dim, num_heads=num_heads, mlp_ratio=mlp_ratio)
            for _ in range(depth)])

    def forward(self, x):
        x1 = x.clone()
        x2 = x.clone()
        for blk1, blk2 in zip(self.model1, self.model2):
            x1 = blk1(x1)
            x2 = blk2(x2)
            # future = torch.jit.fork(blk1, x1)
            # x2 = blk2(x2)
            # x1 = torch.jit.wait(future)
        return x1, x2

In [2]:
def throughput(images, model):
    model.eval()
    batch_size, token_length = images.shape[0:2]
    for i in range(50):
        model(images)
    torch.cuda.synchronize()
    tic1 = time.time()
    for i in range(30):
        model(images)
    torch.cuda.synchronize()
    tic2 = time.time()
    print(f"batch_size {batch_size} token_length {token_length} throughput {30 * batch_size / (tic2 - tic1)}")
    MB = 1024.0 * 1024.0
    print('memory:', torch.cuda.max_memory_reserved() / MB)

In [5]:
device = 'cuda'
model = DummyModel().to(device)
data = torch.rand((32, 194, 768)).to(device)
throughput(data, model)

batch_size 32 token_length 194 throughput 134.16930629735728
memory: 11546.0


In [141]:
device = 'cuda'
model = DummyModel().to(device)
data = torch.rand((1, 197, 768)).to(device)
throughput(data, model)

batch_size 1 token_length 197 throughput 117.64700602497504
memory: 11534.0


In [150]:
import gc
device = 'cuda'
gc.collect()
torch.cuda.empty_cache()

model = Multi_DummyModel().to(device, non_blocking=True)
data = torch.rand((1, 197, 768)).to(device, non_blocking=True)
throughput(data, model)

batch_size 1 token_length 197 throughput 58.027110558951215
memory: 11534.0


In [151]:
device = 'cuda'
gc.collect()
torch.cuda.empty_cache()
model = Multi_DummyModel().to(device, non_blocking=True)
data = torch.rand((1, 197, 768)).to(device, non_blocking=True)
scripted_model = torch.jit.trace(model, example_inputs=(data))
throughput(data, scripted_model)

batch_size 1 token_length 197 throughput 56.81814364167879
memory: 11534.0
