In [1]:
from fastai.vision.all import *
from fastaudio.core.all import *



In [2]:
# Spectrogram transform built from the "Voice" audio config, overridden with
# f_min=0 and n_mels=80.
aud2spec = AudioToSpec.from_cfg(
    AudioConfig.Voice(f_min=0., n_mels=80)
)
# Load the sample clip and run it through the transform.
spec = aud2spec(AudioTensor.create(Path("../大丈夫な.mp3")))
spec.shape

torch.Size([1, 80, 415])

In [3]:
def count_parameters(model):
    """Return the total element count of `model`'s trainable parameters.

    Only tensors yielded by `model.parameters()` whose `requires_grad` flag is
    set contribute to the total.
    """
    total = 0
    for param in model.parameters():
        if param.requires_grad:
            total += param.numel()
    return total

In [57]:
class NonSeparableQuartzSubblock(Module):
    r"""QuartzNet sub-block built around a single regular (non-separable) `nn.Conv1d`.

    Pipeline: conv -> batch norm -> optional residual add -> ReLU -> dropout.
    The convolution is padded with `(kernel-1)//2` on each side and any extra
    `kwargs` are forwarded to `nn.Conv1d`.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, drop = 0.2, **kwargs):
        pad = (kernel - 1) // 2
        self.conv = nn.Conv1d(in_channels, out_channels, kernel, stride=stride, padding=pad, **kwargs)
        self.norm = nn.BatchNorm1d(out_channels)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(drop)

    def forward(self, x, res = 0):
        r"Apply the sub-block to `x`; a non-int `res` is added in place right after the batch norm."
        out = self.norm(self.conv(x))
        # The int default acts as a "no residual" sentinel, so the block can be
        # called with a single argument (e.g. from inside an `nn.Sequential`).
        has_residual = not isinstance(res, int)
        if has_residual:
            out += res
        return self.drop(self.act(out))
    
    
class QuartzSubblock(Module):
    r"""QuartzNet sub-block using a depthwise-separable 1D convolution.

    `self.conv` chains a depthwise `nn.Conv1d` (`groups=in_channels`, padding
    `(kernel-1)//2`, extra `kwargs` forwarded to it) with a pointwise
    (kernel size 1, stride 1) `nn.Conv1d` mapping `in_channels` to `out_channels`.
    Pipeline: separable conv -> batch norm -> optional residual add -> ReLU -> dropout.
    """

    def __init__(self, in_channels, out_channels, kernel, stride, drop = 0.2, **kwargs):
        depthwise = nn.Conv1d(in_channels, in_channels, kernel, stride=stride,
                              padding=(kernel - 1) // 2, groups=in_channels, **kwargs)
        pointwise = nn.Conv1d(in_channels, out_channels, kernel_size=1, stride=1)
        self.conv = nn.Sequential(depthwise, pointwise)
        self.norm = nn.BatchNorm1d(out_channels)
        self.act = nn.ReLU()
        self.drop = nn.Dropout(drop)

    def forward(self, x, res = 0):
        r"Apply the sub-block to `x`; a non-int `res` is added in place right after the batch norm."
        out = self.norm(self.conv(x))
        # The int default acts as a "no residual" sentinel, so the block can be
        # called with a single argument (e.g. from inside an `nn.Sequential`).
        has_residual = not isinstance(res, int)
        if has_residual:
            out += res
        return self.drop(self.act(out))
    
class QuartzBlock(Module):
    r"""A QuartzNet residual block: a stack of separable sub-blocks plus a skip branch.

    The main path runs `num_subblocks - 1` `QuartzSubblock`s in `self.subblocks`
    (the first maps `in_channels` to `out_channels`, the rest keep `out_channels`),
    followed by `self.res_subblock`, which receives the skip branch as its residual.
    The skip branch is a kernel-size-1 `nn.Conv1d` plus `nn.BatchNorm1d` applied
    to the block input.

    Args:
        in_channels: channels of the block input.
        out_channels: channels produced by every sub-block and by the skip branch.
        kernel: kernel size of the depthwise convolutions (stride is always 1).
        num_subblocks: total sub-blocks including the residual one. Values below 2
            still build two sub-blocks (the first one and the residual one).
        drop: dropout probability forwarded to every sub-block.
    """

    def __init__(self, in_channels, out_channels, kernel, num_subblocks=3, drop = 0.2):
        # Forward `drop` to every sub-block so the caller's value actually takes effect.
        self.subblocks = nn.Sequential(
            QuartzSubblock(in_channels, out_channels, kernel, 1, drop = drop),
            *[QuartzSubblock(out_channels, out_channels, kernel, 1, drop = drop)
              for _ in range(num_subblocks-2)])
        self.res_subblock = QuartzSubblock(out_channels, out_channels, kernel, 1, drop = drop)
        self.res_conv = nn.Conv1d(in_channels, out_channels, 1)
        self.res_norm = nn.BatchNorm1d(out_channels)

    def forward(self, x):
        r"Run the main path on `x` and merge the skip branch inside the last sub-block."
        # The skip branch must be derived from the block *input*: pointwise conv,
        # then batch norm. Normalising the main-path output instead would leave
        # `res_conv` unused and remove the skip connection.
        res = self.res_norm(self.res_conv(x))
        x = self.subblocks(x)
        # `res_subblock` adds `res` after its own batch norm, before ReLU and dropout.
        return self.res_subblock(x, res)
    
class QuartzNet(Module):
    r"""QuartzNet-style 1D convolutional network, stored as one `nn.Sequential` in `self.model`.

    Layout, in order:
      * a stride-2 `NonSeparableQuartzSubblock` (kernel 33) mapping `n_mels` to 256 channels;
      * five groups of `QuartzBlock`s (kernels 33/39/51/63/75, widths 256/256/512/512/512),
        each group repeated `num_blocks_factor` times with `num_subblocks` sub-blocks per block;
      * a `QuartzSubblock` (kernel 87, 512 channels), a `NonSeparableQuartzSubblock`
        (kernel 1, 1024 channels), a kernel-1 `nn.Conv1d` to `n_vocab` channels and
        an `nn.Softmax` over dim 1.
    """

    def __init__(self, n_mels, n_vocab, num_blocks_factor=2, num_subblocks = 5):
        stem_width = 256
        group_kernels = (33, 39, 51, 63, 75)
        group_widths = (256, 256, 512, 512, 512)
        # Dropout value handed to the QuartzBlocks of each group.
        group_drops = (0.2, 0.2, 0.2, 0.3, 0.3)

        layers = [NonSeparableQuartzSubblock(n_mels, stem_width, 33, 2)]

        # Only the first block of a group changes the channel count; repeats
        # within the group map width -> width.
        prev_width = stem_width
        for kernel, width, drop in zip(group_kernels, group_widths, group_drops):
            for _ in range(num_blocks_factor):
                layers.append(QuartzBlock(prev_width, width, kernel, num_subblocks, drop))
                prev_width = width

        layers.append(QuartzSubblock(group_widths[-1], 512, 87, 1, 0.4))
        layers.append(NonSeparableQuartzSubblock(512, 1024, 1, 1, 0.4))
        # NOTE(review): dilation=2 sits on a kernel-size-1 convolution, where it
        # should have no effect -- confirm whether it was meant for the kernel-87
        # sub-block instead.
        layers.append(nn.Conv1d(1024, n_vocab, 1, dilation=2))
        layers.append(nn.Softmax(1))

        self.model = nn.Sequential(*layers)

    def forward(self, x):
        r"Feed `x` through the whole stack and return the softmax output."
        return self.model(x)

In [50]:
# Trainable-parameter counts for three model sizes. The "NxS" label is
# (5 block groups * num_blocks_factor) x (5 sub-blocks per block, the default).
print('Number of parameters of a 5x5 QuartzNet model:',
      count_parameters(QuartzNet(80, 27, num_blocks_factor=1)))
print('Number of parameters of a 10x5 QuartzNet model:',
      count_parameters(QuartzNet(80, 27, num_blocks_factor=2)))
print('Number of parameters of a 15x5 QuartzNet model:',
      count_parameters(QuartzNet(80, 27, num_blocks_factor=3)))

Number of parameters of a 5x5 QuartzNet model: 7393051
Number of parameters of a 10x5 QuartzNet model: 13245723
Number of parameters of a 15x5 QuartzNet model: 19098395


In [58]:
# 10x5 configuration; n_mels=80 matches the n_mels used for the spectrogram transform.
model = QuartzNet(n_mels=80, n_vocab=27, num_blocks_factor=2)

In [40]:
# Scratch check of zero-padded integer formatting: pads 55 with leading zeros to width 12.
"{:012d}".format(55)

'000000000055'

In [48]:
# Print every trainable parameter tensor of a default QuartzNet(80, 27):
# zero-padded element count, parameter name, requires_grad flag.
for n, l in QuartzNet(80, 27).named_parameters():
    if not l.requires_grad:
        continue
    print("{:08d}".format(l.numel()), n, l.requires_grad)

00675840 model.0.conv.weight True
00000256 model.0.conv.bias True
00000256 model.0.norm.weight True
00000256 model.0.norm.bias True
00008448 model.1.subblocks.0.conv.0.weight True
00000256 model.1.subblocks.0.conv.0.bias True
00065536 model.1.subblocks.0.conv.1.weight True
00000256 model.1.subblocks.0.conv.1.bias True
00000256 model.1.subblocks.0.norm.weight True
00000256 model.1.subblocks.0.norm.bias True
00008448 model.1.subblocks.1.conv.0.weight True
00000256 model.1.subblocks.1.conv.0.bias True
00065536 model.1.subblocks.1.conv.1.weight True
00000256 model.1.subblocks.1.conv.1.bias True
00000256 model.1.subblocks.1.norm.weight True
00000256 model.1.subblocks.1.norm.bias True
00008448 model.1.subblocks.2.conv.0.weight True
00000256 model.1.subblocks.2.conv.0.bias True
00065536 model.1.subblocks.2.conv.1.weight True
00000256 model.1.subblocks.2.conv.1.bias True
00000256 model.1.subblocks.2.norm.weight True
00000256 model.1.subblocks.2.norm.bias True
00008448 model.1.subblocks.3.conv.

In [59]:
# Smoke-test the forward pass on the sample spectrogram.
# NOTE(review): no `.eval()` / `torch.no_grad()` call is visible in the cells shown, so
# this presumably runs in training mode (dropout active, autograd tracking on) -- confirm
# that is intended before reading anything into the values.
model(spec)

AudioSpectrogram([[[0.0431, 0.0133, 0.0724,  ..., 0.0317, 0.0628, 0.0316],
         [0.0281, 0.0382, 0.0217,  ..., 0.0453, 0.0212, 0.0387],
         [0.0295, 0.0862, 0.0335,  ..., 0.0520, 0.0480, 0.0381],
         ...,
         [0.0570, 0.0655, 0.0248,  ..., 0.0481, 0.0550, 0.0488],
         [0.0148, 0.0383, 0.0360,  ..., 0.0210, 0.0267, 0.0359],
         [0.0196, 0.0295, 0.0273,  ..., 0.0461, 0.0380, 0.0333]]],
       grad_fn=<AliasBackward>)