This case study investigates whether we can perform two branch parallel, but merging two branch
1. Linear
2. Matmul

Findings:
1. Sparse not good
2. CUDA Parallel only for small operation, don't have enough thread to do complete Parallel

In [76]:
import torch.nn as nn
import torch
from model.vit_model import Block
def throughput(images, model):
    model.eval()
    batch_size, token_length = images[0].shape[0:2]
    for i in range(50):
        model(*images)
    torch.cuda.synchronize()
    tic1 = time.time()
    for i in range(30):
        model(*images)
    torch.cuda.synchronize()
    tic2 = time.time()
    print(f"batch_size {batch_size} token_length {token_length} throughput {30 * batch_size / (tic2 - tic1)}")
    MB = 1024.0 * 1024.0
    print('memory:', torch.cuda.max_memory_reserved() / MB)
    return (tic2 - tic1) / 30

In [84]:
class DummyModel(nn.Module):
    def __init__(self, depth=12, embed_dim=768, stream=False, repeat=4):
        super().__init__()
        self.embed_dim = embed_dim
        self.stream = stream
        self.repeat = repeat
        self.model1 = nn.Sequential(*[
            Block(dim=embed_dim, num_heads=12, mlp_ratio=4)
            for _ in range(depth)])
        self.model2 = nn.Sequential(*[
            Block(dim=embed_dim, num_heads=12, mlp_ratio=4)
            for _ in range(depth)])

        self.s1 = torch.cuda.Stream(device=device)
        self.s2 = torch.cuda.Stream(device=device)
    def forward(self, x1, x2):
        if self.stream:
            with torch.cuda.stream(self.s1):
                x1 = self.model1(x1)
            with torch.cuda.stream(self.s2):
                x2 = self.model2(x2)
        else:
            for blk1, blk2 in zip(self.model1, self.model2):
                for i in range(self.repeat):
                    x1 = blk1(x1)
                    x2 = blk2(x2)

        return x1, x2

In [81]:
class weight_fusion(DummyModel):
    def __init__(self):
        super(weight_fusion, self).__init__()
        layers = []
        for blk1, blk2 in zip(self.model1, self.model2):
            weight = torch.zeros((2 * self.embed_dim, 2 * self.embed_dim))
            #print(type(blk1.weight))
            weight[:self.embed_dim, :self.embed_dim] = blk1.weight
            weight[self.embed_dim:, self.embed_dim:] = blk2.weight
            bias = torch.zeros(2 * self.embed_dim)
            bias[:self.embed_dim] = blk1.bias
            bias[self.embed_dim:] = blk2.bias
            layer = nn.Linear(2 * self.embed_dim, 2 * self.embed_dim)
            layer.weight = nn.Parameter(weight)
            layer.bias = nn.Parameter(bias)
            layers.append(layer)
        self.model = nn.ModuleList(layers)
    def forward(self, x):
        for blk in self.model:
            x = blk(x)
        return x

In [86]:
device = 'cuda'
token_length = 200
emb_dim = 300
batch_size = 32
non_blocking = True
model = DummyModel(stream=False, embed_dim=emb_dim, repeat=1).to(device, non_blocking=non_blocking)
data1 = torch.rand((batch_size, token_length, emb_dim)).to(device, non_blocking=non_blocking)
data2 = torch.rand((batch_size, token_length, emb_dim)).to(device, non_blocking=non_blocking)
latency = throughput((data1, data2), model)
print(latency)

batch_size 32 token_length 200 throughput 244.06631325974246
memory: 11538.0
0.13111190795898436


In [108]:
device = 'cuda'
ratio = 0.4
results = []
token_length = 10
for i in range(4):
    token_length = int(token_length * (ratio ** i + 0.05))
    model = DummyModel().to(device)
    data1 = torch.rand((batch_size, token_length, emb_dim)).to(device)
    data2 = torch.rand((batch_size, token_length, emb_dim)).to(device)
    latency = throughput((data1, data2), model)
    results.append(latency)
print(sum(results)/4)

batch_size 4 token_length 10 throughput 1518.9369901708408
memory: 2198.0
batch_size 4 token_length 4 throughput 3333.3762492300966
memory: 2198.0
batch_size 4 token_length 0 throughput 9998.539502175252
memory: 2198.0
batch_size 4 token_length 0 throughput 9996.355114200596
memory: 2198.0
0.0002896005908648173


In [25]:
device = 'cuda'
ratio = 0.4
results = []
token_length = 100
model = weight_fusion().to(device)
for i in range(4):
    token_length = int(token_length * (ratio ** i))
    data1 = torch.rand((batch_size, token_length, emb_dim)).to(device)
    data2 = torch.rand((batch_size, token_length, emb_dim)).to(device)
    data = torch.zeros((batch_size, 2 * token_length, 2 * emb_dim)).to(device)
    data[:, :token_length, :emb_dim] = data1
    data[:, token_length:, emb_dim:] = data2
    latency = throughput([data], model)
    results.append(latency)
print(sum(results)/4)

batch_size 32 token_length 200 throughput 643.430974057145
memory: 1510.0
batch_size 32 token_length 80 throughput 1526.2333964441866
memory: 1510.0
batch_size 32 token_length 12 throughput 9320.093049739948
memory: 1510.0
batch_size 32 token_length 0 throughput 191995.60556933054
memory: 1510.0
0.0005804698914289475


In [63]:
import sparselinear as sl

In [None]:
sl1 = sl.SparseLinear(20000, 20000, sparsity=.99).cuda()
# Reduce weight dimensions if memory errors are raised
fc1 = nn.Linear(20000, 20000).cuda()
x = torch.rand(20000, device=device)

In [None]:
%timeit y = sl1(x)
%timeit y = fc1(x)

In [32]:
emb_dim = 100
# num_connections = 4
input_dim = 2 * emb_dim
output_dim = 2 * emb_dim
col = torch.arange(input_dim).repeat_interleave(emb_dim).view(1,-1).long()
row = torch.cat([torch.arange(emb_dim).repeat(emb_dim).view(1,-1), torch.arange(emb_dim, 2*emb_dim).repeat(emb_dim).view(1,-1)], dim=1)
# row = torch.randint(low=0, high=output_dim, size=(input_dim*num_connections,)).view(1,-1).long()
connections = torch.cat((row, col), dim=0)
connections

tensor([[  0,   1,   2,  ..., 197, 198, 199],
        [  0,   0,   0,  ..., 199, 199, 199]])

In [62]:
import torch
from torch_sparse import spspmm
device = 'cuda'

emb_dim = 300
col = torch.arange(emb_dim * 2).repeat_interleave(emb_dim).view(1,-1).long()
row = torch.cat([torch.arange(emb_dim).repeat(emb_dim).view(1,-1), torch.arange(emb_dim, 2*emb_dim).repeat(emb_dim).view(1,-1)], dim=1)

indexA = torch.cat((col, row), dim=0).to(device)
valueA = torch.rand(2 * emb_dim * emb_dim).to(device)

indexB = torch.cat((col, row), dim=0).to(device)
valueB = torch.rand(2 * emb_dim * emb_dim).to(device)



matrixA = torch.zeros((2 * emb_dim, 2 * emb_dim)).to(device)
dataA = valueA.reshape(emb_dim, -1)
matrixA[:emb_dim, :emb_dim] = dataA[:, :emb_dim]
matrixA[emb_dim:, emb_dim:] = dataA[:, emb_dim:]

matrixB = torch.zeros((2 * emb_dim, 2 * emb_dim)).to(device)
dataB = valueB.reshape(emb_dim, -1)
matrixB[:emb_dim, :emb_dim] = dataB[:, :emb_dim]
matrixB[emb_dim:, emb_dim:] = dataB[:, emb_dim:]

%timeit spspmm(indexA, valueA, indexB, valueB, 2 * emb_dim, 2 * emb_dim, 2 * emb_dim)
%timeit matrixA @ matrixB

16.7 ms ± 541 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
89.8 µs ± 605 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)


In [36]:
device = 'cuda'
sl2 = sl.SparseLinear(input_dim, output_dim, connectivity=connections).cuda()
fc2 = nn.Linear(input_dim, output_dim).cuda()

t1, t2 = 100, 50
data1 = torch.rand((4, t1, emb_dim)).to(device)
data2 = torch.rand((4, t2, emb_dim)).to(device)
data = torch.zeros((4, t1 + t2, input_dim)).to(device)
data[:, :t1, :emb_dim] = data1
data[:, t1:, emb_dim:] = data2

In [37]:
%timeit y = sl2(data)

978 µs ± 21.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [38]:
%timeit y = fc2(data)

The slowest run took 4.48 times longer than the fastest. This could mean that an intermediate result is being cached.
71.7 µs ± 53.5 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)


In [44]:
import torch, time

# In RNN parlance, the dimensions we care about are:
# # of time-steps (T)
# Batch size (B)
# Hidden size/number of "channels" (C)
T, B, C = 50, 50, 1024

# A module that defines a single "bidirectional LSTM". This is simply two
# LSTMs applied to the same sequence, but one in reverse
class BidirectionalRecurrentLSTM(torch.nn.Module):
    def __init__(self):
        super().__init__()
        self.cell_f = torch.nn.LSTM(input_size=C, hidden_size=C)
        self.cell_b = torch.nn.LSTM(input_size=C, hidden_size=C)

    def forward(self, x : torch.Tensor) -> torch.Tensor:
        # Forward layer
        output_f, _ = self.cell_f(x)

        # Backward layer. Flip input in the time dimension (dim 0), apply the
        # layer, then flip the outputs in the time dimension
        x_rev = torch.flip(x, dims=[0])
        output_b, _ = self.cell_b(torch.flip(x, dims=[0]))
        output_b_rev = torch.flip(output_b, dims=[0])

        return torch.cat((output_f, output_b_rev), dim=2)

# An "ensemble" of `BidirectionalRecurrentLSTM` modules. The modules in the
# ensemble are run one-by-one on the same input then their results are
# stacked and summed together, returning the combined result.
class LSTMEnsemble(torch.nn.Module):
    def __init__(self, n_models):
        super().__init__()
        self.n_models = n_models
        self.models = torch.nn.ModuleList([
            BidirectionalRecurrentLSTM() for _ in range(self.n_models)])

    def forward(self, x : torch.Tensor) -> torch.Tensor:
        results = []
        for model in self.models:
            results.append(model(x))
        return torch.stack(results).sum(dim=0)
class LSTMEnsemble_Parallel(torch.nn.Module):
    def __init__(self, n_models):
        super().__init__()
        self.n_models = n_models
        self.models = torch.nn.ModuleList([
            BidirectionalRecurrentLSTM() for _ in range(self.n_models)])

    def forward(self, x : torch.Tensor) -> torch.Tensor:
        futures = [torch.jit.fork(model, x) for model in self.models]
        results = [torch.jit.wait(fut) for fut in futures]
        return torch.stack(results).sum(dim=0)

# For a head-to-head comparison to what we're going to do with fork/wait, let's
# instantiate the model and compile it with TorchScript
device = 'cpu'
ens = torch.jit.script(LSTMEnsemble(n_models=4).to(device))
ens_Parallel = torch.jit.script(LSTMEnsemble_Parallel(n_models=4).to(device))

# Normally you would pull this input out of an embedding table, but for the
# purpose of this demo let's just use random data.
x = torch.rand(T, B, C).to(device)
# Let's run the model once to warm up things like the memory allocator
ens(x)

x = torch.rand(T, B, C).to(device)
# Let's see how fast it runs!
s = time.time()
ens(x)
print('Inference took', time.time() - s, ' seconds')
# Let's see how fast it runs!
s = time.time()
ens_Parallel(x)
print('Inference took', time.time() - s, ' seconds')

Inference took 1.7449994087219238  seconds
Inference took 1.1109998226165771  seconds
