In [1]:
!wget -q http://www.openslr.org/resources/53/asr_bengali_0.zip
!wget -q http://www.openslr.org/resources/53/utt_spk_text.tsv

!unzip -q -o asr_bengali_0.zip
!sudo rm -rf asr_bengali_0.zip

In [2]:
!pip install soundfile
!sudo apt install sox # use for converting flac to wav
import pandas as pd

df = pd.read_csv('utt_spk_text.tsv', sep="\t", header = None)
df.tail()

Reading package lists... Done
Building dependency tree       
Reading state information... Done
The following package was automatically installed and is no longer required:
  libnvidia-common-460
Use 'sudo apt autoremove' to remove it.
The following additional packages will be installed:
  libmagic-mgc libmagic1 libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa
  libsox-fmt-base libsox3
Suggested packages:
  file libsox-fmt-all
The following NEW packages will be installed:
  libmagic-mgc libmagic1 libopencore-amrnb0 libopencore-amrwb0 libsox-fmt-alsa
  libsox-fmt-base libsox3 sox
0 upgraded, 8 newly installed, 0 to remove and 34 not upgraded.
Need to get 760 kB of archives.
After this operation, 6,717 kB of additional disk space will be used.
Get:1 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libopencore-amrnb0 amd64 0.1.3-2.1 [92.0 kB]
Get:2 http://archive.ubuntu.com/ubuntu bionic/universe amd64 libopencore-amrwb0 amd64 0.1.3-2.1 [45.8 kB]
Get:3 http://archive.ubuntu.com

Unnamed: 0,0,1,2
127560,ffff6a7f59,77f82,সংস্কৃতি মন্ত্রী আবুল কালাম আযাদও
127561,ffff98efe5,80539,অভিযোগ রয়েছে
127562,ffffc017a0,15a75,আন্তোনিও দা মাগদালেনার
127563,ffffe3b62d,10a9f,মি. খানের
127564,fffff1c677,06e62,কাজী নজরুলের লেখা হিন্দু-মুসলমান


In [3]:
import os
from os.path import isdir, join
from pathlib import Path
import pandas as pd

# Math
import numpy as np
from scipy.fftpack import fft
from scipy import signal
from scipy.io import wavfile
import librosa

from sklearn.decomposition import PCA

# Visualization
import matplotlib.pyplot as plt
import seaborn as sns
import IPython.display as ipd
import librosa.display

import plotly.offline as py
py.init_notebook_mode(connected=True)
import plotly.graph_objs as go
import plotly.tools as tls
import pandas as pd
import albumentations
import torch

import numpy as np

from PIL import Image
from PIL import ImageFile
import soundfile as sf  
from torch.utils.data import Dataset

ImageFile.LOAD_TRUNCATED_IMAGES = True

SEED = 1234
import random
random.seed(SEED)
np.random.seed(SEED)
torch.manual_seed(SEED)
torch.cuda.manual_seed(SEED)
torch.backends.cudnn.deterministic = True


PAD_LEN = 72

def pad_sequences(x):
    #print(x)
    padded = torch.zeros((PAD_LEN), dtype=torch.long)
    if len(x) > PAD_LEN: padded[:] = torch.tensor(x[:PAD_LEN] , dtype=torch.long)
    else: padded[:len(x)] = torch.tensor(x, dtype=torch.long)
    return padded

class ASRDataset(Dataset):
    def __init__(self, image_paths, targets, resize=(75, 300)):
        # resize = (height, width)
        self.image_paths = image_paths
        self.targets = targets
        self.resize = resize



        mean = (0.485, 0.456, 0.406)
        std = (0.229, 0.224, 0.225)
        self.aug = albumentations.Compose(
            [
                albumentations.Normalize(
                    mean, std, max_pixel_value=255.0, always_apply=True
                )
            ]
        )

    def log_specgram(self, audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
        
        nperseg = int(round(window_size * sample_rate / 1e3))
        noverlap = int(round(step_size * sample_rate / 1e3))
        freqs, times, spec = signal.spectrogram(audio,
                                        fs=sample_rate,
                                        window='hann',
                                        nperseg=nperseg,
                                        noverlap=noverlap,
                                        detrend=False)
        return freqs, times, np.log(spec.T.astype(np.float32) + eps)

    def __len__(self):
        return len(self.image_paths)

    def __getitem__(self, item):
        samples, sample_rate = sf.read(self.image_paths[item]) 
        #freqs, times, spectrogram = self.log_specgram(samples, sample_rate)
        spectrogram = np.abs(librosa.stft(samples))
        
        mean = np.mean(spectrogram, axis=0)
        std = np.std(spectrogram, axis=0)
        spectrogram = (spectrogram - mean) / std
        #S = librosa.feature.melspectrogram(samples, sr=sample_rate, n_mels=128)

        # Convert to log scale (dB). We'll use the peak power (max) as reference.
        #log_S = librosa.power_to_db(S, ref=np.max)
        #log_S = (log_S-np.mean(log_S, axis=0))/np.std(log_S, axis=0)
        image = Image.fromarray(spectrogram , 'L').convert("RGB")
        targets = self.targets[item]
        
        if self.resize is not None:
            image = image.resize(
                (self.resize[1], self.resize[0]), resample=Image.BILINEAR
            )
        
        image = np.array(image).transpose(2, 0, 1)
        image[np.isnan(image)] = 0
        #augmented = self.aug(image=image)
        #image = augmented["image"]
        image = image.astype(np.float32)
        return {
            "images": torch.tensor(image, dtype=torch.float),
            "targets": torch.tensor(pad_sequences(targets), dtype=torch.long),
        }
%matplotlib inline

In [4]:
import torch
device = 'cpu'
if torch.cuda.is_available():
  device = 'cuda'
print(device)

cuda


In [5]:
from tqdm.auto import tqdm
img_paths = []
targets = []

for idx in tqdm(range(len(df))):
  pth = "asr_bengali/data/"+df.loc[idx][0][:2]+'/'+df.loc[idx][0]+'.flac'
  if os.path.exists(pth):
    img_paths.append(pth)
    targets.append(df.loc[idx][2].strip())
print(len(img_paths))
print(img_paths[1])

HBox(children=(FloatProgress(value=0.0, max=127565.0), HTML(value='')))


9552
asr_bengali/data/00/000039928e.flac


In [6]:
targets = [[c for c in x] for x in targets]
targets_flat = [c for clist in targets for c in clist]

print(len(set(targets_flat)))
uniques = list(set(targets_flat))
map = {}
reverse_map = {}
for i in range(len(uniques)):
  map[i+1] = uniques[i]
  reverse_map[uniques[i]] = i+1

targets_enc = [[reverse_map[c] for c in x] for x in targets]
targets_enc = [np.array(x) for x in targets_enc]
targets_enc = np.array(targets_enc)


128



Creating an ndarray from ragged nested sequences (which is a list-or-tuple of lists-or-tuples-or ndarrays with different lengths or shapes) is deprecated. If you meant to do this, you must specify 'dtype=object' when creating the ndarray



In [15]:
train_dataset = ASRDataset(
        image_paths=img_paths[:128*10], #:-512
        targets=targets_enc[:128*10],
    )

test_dataset = ASRDataset(
        image_paths=img_paths[-512:],
        targets=targets_enc[-512:],
    )
test_targets_orig = targets[-512:]

In [16]:
train_loader = torch.utils.data.DataLoader(
        train_dataset,
        batch_size=128,
        num_workers=8,
        shuffle=True,
    )

test_loader = torch.utils.data.DataLoader(
        test_dataset,
        batch_size=128,
        num_workers=8,
        shuffle=False,
    )

In [20]:
import torch
from torch import nn
from torch.nn import functional as F
import math


class ASRModel(nn.Module):
    def __init__(self, num_chars):
        super(ASRModel, self).__init__()
        self.conv_1 = nn.Conv2d(3, 128, kernel_size=(3, 6), padding=(1, 1))
        self.pool_1 = nn.MaxPool2d(kernel_size=(2, 2))
        self.conv_2 = nn.Conv2d(128, 64, kernel_size=(3, 6), padding=(1, 1))
        self.pool_2 = nn.MaxPool2d(kernel_size=(2, 2))
        self.linear_1 = nn.Linear(1152, 512)
        self.drop_1 = nn.Dropout(0.2)
        self.linear_2 = nn.Linear(512, 64)
        self.drop_2 = nn.Dropout(0.1)
        self.lstm = nn.GRU(64, 32, bidirectional=True, num_layers=2, dropout=0.1, batch_first=True)
        self.output = nn.Linear(64, num_chars + 1)
    

    def forward(self, images, targets=None, loss_fn = 'ctc'):
        bs, _, _, _ = images.size()
        x = F.relu(self.conv_1(images))
        #print(x.size())
        x = self.pool_1(x)
        #print(x.size())
        x = F.relu(self.conv_2(x))
        x = self.pool_2(x)
        #print(x.size())
        x = x.permute(0, 3, 1, 2)
        #print(x.size())
        x = x.view(bs, x.size(1), -1)
        #print(x.size())
        x = F.relu(self.linear_1(x))
        #print(x.size())
        x = self.drop_1(x)
        #print(x.size())
        x = F.relu(self.linear_2(x))
        x = self.drop_2(x)

        x, _ = self.lstm(x)
        x = self.output(x)
        
        if loss_fn == 'ctc':
          x = x.permute(1, 0, 2).contiguous()
          if targets is not None:
              log_probs = F.log_softmax(x, 2)
              #print(log_probs)
              input_lengths = torch.full(
                  size=(bs,), fill_value=log_probs.size(0), dtype=torch.int32
              )
              input_lengths.requires_grad=False

              target_lengths = torch.full(
                  size=(bs,), fill_value=targets.size(1), dtype=torch.int32
              )
              target_lengths.requires_grad=False
              loss = nn.CTCLoss(blank=0, reduction='mean', zero_infinity=True)(
                  log_probs, targets, input_lengths, target_lengths
              )
              #if loss.item() == float('inf') or math.isnan(loss.item()):
		          #    loss.data = torch.tensor(0.0)
              return log_probs, loss
        if loss_fn == 'mse':
          x = x.permute(0, 2, 1)
          loss = nn.CrossEntropyLoss()(x,targets)
          return x, loss

        return x, None


In [21]:

def train_fn(model, data_loader, optimizer):
    model.train()
    fin_loss = 0
    tk0 = tqdm(data_loader, total=len(data_loader))
    for data in tk0:
        for key, value in data.items():
            data[key] = value.to(device)

        optimizer.zero_grad()
        out, loss = model(**data)
        loss.backward()
        optimizer.step()
        
        fin_loss += loss.item()
    return fin_loss / len(data_loader)

def eval_fn(model, data_loader):
    model.eval()
    fin_loss = 0
    fin_preds = []
    with torch.no_grad():
        tk0 = tqdm(data_loader, total=len(data_loader))
        for data in tk0:
            for key, value in data.items():
                data[key] = value.to(device)
            out, loss = model(**data)
            fin_loss += loss.item()
            
    return fin_preds, fin_loss / len(data_loader)

In [22]:
from pprint import pprint
from sklearn import metrics
import warnings
warnings.simplefilter("ignore")
model = ASRModel(num_chars=len(map)+1)
model.to(device)

optimizer = torch.optim.Adam(model.parameters(), lr=1e-2) # increase lr

scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer, factor=0.9, patience=2, verbose=True
)
opt_level = 'O1'

#model, optimizer = amp.initialize(model, optimizer, opt_level=opt_level)

for epoch in range(5):
    train_loss = train_fn(model, train_loader, optimizer)
    valid_preds, test_loss = eval_fn(model, test_loader)
    print(f"Epoch={epoch}, Train Loss={train_loss}, Test Loss={test_loss}")
    
    #scheduler.step(test_loss)
    scheduler.step(train_loss)
    

HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Epoch=0, Train Loss=0.020054136961698533, Test Loss=0.04966798983514309


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Epoch=1, Train Loss=0.01632557958364487, Test Loss=0.05059768073260784


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Epoch=2, Train Loss=0.016257842630147935, Test Loss=0.047898503951728344


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Epoch=3, Train Loss=0.01566630695015192, Test Loss=0.04770579468458891


HBox(children=(FloatProgress(value=0.0, max=10.0), HTML(value='')))




HBox(children=(FloatProgress(value=0.0, max=4.0), HTML(value='')))


Epoch=4, Train Loss=0.015457034297287463, Test Loss=0.04808502644300461


In [None]:
idx = 0

def log_specgram(audio, sample_rate, window_size=20,
                 step_size=10, eps=1e-10):
        
    nperseg = int(round(window_size * sample_rate / 1e3))
    noverlap = int(round(step_size * sample_rate / 1e3))
    freqs, times, spec = signal.spectrogram(audio,
                                    fs=sample_rate,
                                    window='hann',
                                    nperseg=nperseg,
                                    noverlap=noverlap,
                                    detrend=False)
    return freqs, times, np.log(spec.T.astype(np.float32) + eps)

samples, sample_rate = sf.read(img_paths[idx])
#freqs, times, spectrogram = log_specgram(samples, sample_rate)
stft = np.abs(librosa.stft(samples))
image = Image.fromarray(stft , 'L').convert("RGB")
image = image.resize(
    (300,75), resample=Image.BILINEAR
)
print(image)
plt.imshow(image, interpolation='nearest')
plt.show()
