In [1]:
%%capture
# Install extra dependencies; the %%capture cell magic above suppresses pip's output.
# NOTE(review): pyctcdecode and kenlm are never imported directly in this notebook —
# presumably they are needed by the ASR pipeline's language-model decoder; confirm.
!pip install pyctcdecode
!python -m pip install pypi-kenlm
# Provides the `Normalizer` class imported in the imports cell.
!pip install bnunicodenormalizer



# Imports

In [2]:
import os
import numpy as np
from tqdm.auto import tqdm
from glob import glob
from transformers import AutoFeatureExtractor, pipeline
import pandas as pd
import librosa
import IPython
from datasets import load_metric
from tqdm.auto import tqdm
from torch.utils.data import Dataset, DataLoader
import torch
import re
import gc
import wave
from scipy.io import wavfile
import scipy.signal as sps

import torchaudio
from IPython.display import Audio, display

import random

from bnunicodenormalizer import Normalizer 

# Register tqdm's `progress_apply` on pandas objects (used when building file paths).
tqdm.pandas()
import warnings
# Silence all warnings from here on; note this also hides deprecation notices.
warnings.filterwarnings("ignore")
from pandarallel import pandarallel

# Enable `parallel_apply` on pandas objects with 8 workers and a progress bar.
pandarallel.initialize(progress_bar=True,nb_workers=8)


# Print library versions so the run can be reproduced.
print(torch.__version__)
print(torchaudio.__version__)

# Single shared normalizer instance, called once per word in `normalize`.
bnorm=Normalizer()

INFO: Pandarallel will run on 8 workers.
INFO: Pandarallel will use Memory file system to transfer data between the main process and workers.
1.11.0
0.11.0


# Configs

In [3]:

class CFG:
    """Static settings read by the cells below: model checkpoint and input locations."""

    # Checkpoint identifier passed to `pipeline(...)` and `AutoFeatureExtractor.from_pretrained(...)`.
    model_name = "arijitx/wav2vec2-xls-r-300m-bengali"
    # Directory whose entries are listed and transcribed.
    test = "../input/iiit-spoken-language-datasets/IIIT Spoken Language Datasets/Bengali/"
    # One file from that directory, used by the single-sample demo.
    single_SPEECH_FILE = "../input/iiit-spoken-language-datasets/IIIT Spoken Language Datasets/Bengali/ben_0001.wav"

    



# Single-sample inference demo

In [4]:
# Build the speech-recognition pipeline. Use the first GPU when CUDA is available;
# otherwise pass -1 so the pipeline runs on CPU instead of failing on `device=0`.
asr = pipeline(
    "automatic-speech-recognition",
    model=CFG.model_name,
    device=0 if torch.cuda.is_available() else -1,
)
# The feature extractor is loaded separately only to read the sampling rate
# that audio is resampled to before being passed to the pipeline.
feature_extractor = AutoFeatureExtractor.from_pretrained(
    CFG.model_name, cache_dir=None, use_auth_token=False
)
# librosa resamples to the requested rate while loading.
speech, sr = librosa.load(CFG.single_SPEECH_FILE, sr=feature_extractor.sampling_rate)
prediction = asr(speech, chunk_length_s=112, stride_length_s=None)

# The pipeline returns a dict; the transcript is under "text".
pred = prediction["text"]
pred


Downloading:   0%|          | 0.00/1.99k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.18G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/287 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/1.13k [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/25.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/309 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/261 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/755 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/3.48G [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/78.0 [00:00<?, ?B/s]

Downloading:   0%|          | 0.00/57.2M [00:00<?, ?B/s]

'এটা হয় ক্লিক করে অভ্র ব্যবহার করলে'

# Check the original audio

In [5]:
# Render an inline audio player for the demo file.
Audio(CFG.single_SPEECH_FILE)

In [6]:
# List the directory entries and sort them so row order is deterministic.
all_files = os.listdir(CFG.test)
all_files.sort()
len(all_files)

1000

In [7]:
# One row per listed file; `pseudo_labels` starts empty and is filled after inference.
column_names = ["file_paths", "pseudo_labels"]
numCols = len(column_names)
numRows = len(all_files)
df = pd.DataFrame(index=range(numRows), columns=column_names)

# Store bare names first, then expand each one to a path under CFG.test.
df["file_paths"] = all_files
df["file_paths"] = df["file_paths"].progress_apply(
    lambda name: os.path.join(CFG.test, str(name))
)
df.head(3)

  0%|          | 0/1000 [00:00<?, ?it/s]

Unnamed: 0,file_paths,pseudo_labels
0,../input/iiit-spoken-language-datasets/IIIT Sp...,
1,../input/iiit-spoken-language-datasets/IIIT Sp...,
2,../input/iiit-spoken-language-datasets/IIIT Sp...,


In [8]:
# Show the first full path as a sanity check.
df.loc[0, "file_paths"]

'../input/iiit-spoken-language-datasets/IIIT Spoken Language Datasets/Bengali/ben_0001.wav'

# Custom dataset class

In [9]:
class bn_asr_Dataset(Dataset):
    '''
    Map-style dataset that returns one resampled waveform per dataframe row.

    args:
        df            :  dataframe that contains .wav file paths in a
                         ``file_paths`` column
        sampling_rate :  target sampling rate; when None (the default) the
                         module-level ``feature_extractor.sampling_rate`` is
                         read at access time, as before
    '''
    def __init__(self, df, sampling_rate=None):
        self.df = df
        self.sampling_rate = sampling_rate

    def __len__(self):
        return len(self.df)

    def __getitem__(self, i):
        '''
        Read the file for positional row ``i`` and return it as a 1-D array
        at the target sampling rate.

        raises:
            IndexError : when ``i`` is outside the dataframe's row range
        '''
        # Positional lookup works with any dataframe index and raises
        # IndexError past the last row, which is what ends `for x in dataset`.
        file = self.df["file_paths"].iloc[i]
        # Force a .wav extension regardless of the extension stored in the frame.
        path = os.path.splitext(file)[0] + '.wav'
        source_rate, data = wavfile.read(path)

        # Multi-channel files are read as (samples, channels); average the
        # channels so the result is always one-dimensional.
        if data.ndim > 1:
            data = data.mean(axis=1)

        if self.sampling_rate is not None:
            target_rate = self.sampling_rate
        else:
            target_rate = feature_extractor.sampling_rate

        # Nothing to resample when the file is already at the target rate.
        if source_rate == target_rate:
            return np.asarray(data, dtype=np.float64)

        number_of_samples = round(len(data) * float(target_rate) / source_rate)
        return sps.resample(data, number_of_samples)
  


# Making predictions

In [10]:
%%time
df.pseudo_labels = df.pseudo_labels.apply(str)

valid_dataset = bn_asr_Dataset(df)

for i in range(len(valid_dataset)):
    pred = asr(valid_dataset.__getitem__(i), chunk_length_s=112, stride_length_s=None)
    df.loc[i, 'pseudo_labels'] = pred['text']

  

CPU times: user 1min 55s, sys: 1.9 s, total: 1min 57s
Wall time: 2min 25s


In [11]:
def normalize(sen):
    """Normalize a sentence word by word with the shared `bnorm` normalizer.

    The sentence is split on whitespace, each word is passed to `bnorm`, and
    the 'normalized' entry of each result is kept unless it is None. The
    surviving words are joined with single spaces.
    """
    kept = []
    for token in sen.split():
        cleaned = bnorm(token)['normalized']
        if cleaned is not None:
            kept.append(cleaned)
    return " ".join(kept)

# Normalize every transcript in parallel, then save the first CSV (full paths).
df["pseudo_labels"] = df["pseudo_labels"].parallel_apply(normalize)

df.to_csv('./v0_pseudo_labels.csv', index=False)
df.head(5)

VBox(children=(HBox(children=(IntProgress(value=0, description='0.00%', max=125), Label(value='0 / 125'))), HB…

Unnamed: 0,file_paths,pseudo_labels
0,../input/iiit-spoken-language-datasets/IIIT Sp...,এটা হয় ক্লিক করে অভ্র ব্যবহার করলে
1,../input/iiit-spoken-language-datasets/IIIT Sp...,ইংরেজি একই নিবন্ধে ব্যবহৃত টেম্পলেট সরাসরি ব্য...
2,../input/iiit-spoken-language-datasets/IIIT Sp...,বার আঞ্চলিক সংগীতের অধীনে বাংলাদেশের সিত তার অ...
3,../input/iiit-spoken-language-datasets/IIIT Sp...,তিনি মৌলিক সংখ্যার মধ্যে ভালো সম্পর্ক খুঁজে পান।
4,../input/iiit-spoken-language-datasets/IIIT Sp...,বাংলাদেশের রাজধানী ঢাকা শহরের পুরানো ঢাকার একট...


In [12]:
# Replace the full paths with bare file names under a `files` column, then save the second CSV.
df = df.rename(columns={"file_paths": "files"}, errors="raise").assign(files=all_files)

df.to_csv('./v1_pseudo_labels.csv', index=False)
df.head(5)

Unnamed: 0,files,pseudo_labels
0,ben_0001.wav,এটা হয় ক্লিক করে অভ্র ব্যবহার করলে
1,ben_0002.wav,ইংরেজি একই নিবন্ধে ব্যবহৃত টেম্পলেট সরাসরি ব্য...
2,ben_0003.wav,বার আঞ্চলিক সংগীতের অধীনে বাংলাদেশের সিত তার অ...
3,ben_0004.wav,তিনি মৌলিক সংখ্যার মধ্যে ভালো সম্পর্ক খুঁজে পান।
4,ben_0005.wav,বাংলাদেশের রাজধানী ঢাকা শহরের পুরানো ঢাকার একট...


In [13]:
torch.cuda.empty_cache() 
gc.collect()
!nvidia-smi

Thu Sep  8 06:03:37 2022       
+-----------------------------------------------------------------------------+
| NVIDIA-SMI 470.82.01    Driver Version: 470.82.01    CUDA Version: 11.4     |
|-------------------------------+----------------------+----------------------+
| GPU  Name        Persistence-M| Bus-Id        Disp.A | Volatile Uncorr. ECC |
| Fan  Temp  Perf  Pwr:Usage/Cap|         Memory-Usage | GPU-Util  Compute M. |
|                               |                      |               MIG M. |
|   0  Tesla P100-PCIE...  Off  | 00000000:00:04.0 Off |                    0 |
| N/A   51C    P0    34W / 250W |   2167MiB / 16280MiB |      0%      Default |
|                               |                      |                  N/A |
+-------------------------------+----------------------+----------------------+
                                                                               
+---------------------------------------------------------------------------