In [76]:
import malaya_speech
import torch
from malaya_speech.utils import torch_featurization
import numpy as np
import json
from datasets import Audio

In [77]:
import pandas as pd
from sklearn.model_selection import train_test_split

In [79]:
# Location of the parquet file that holds the dataset table.
PARQUET_PATH = "/home/ubuntu/bengali/data/LATEST_DATA_WAV2VEC2_DURATION.parquet"

# Read the table, then cut it into two equally sized halves.
# `random_state` pins the shuffle so the split is the same on every run.
df = pd.read_parquet(PARQUET_PATH)
train, val = train_test_split(df, random_state=42, test_size=0.5)

In [80]:
class BengaliDataset(torch.utils.data.Dataset):
    """Map-style dataset that decodes the audio files listed in a DataFrame.

    Item ``idx`` is the waveform of the file named in row ``idx`` (by
    position) of the ``path`` column, decoded through a ``datasets.Audio``
    feature configured with ``sampling_rate=SR`` and returned as a
    ``float32`` NumPy array. Only the audio is returned; the ``sentence``
    column is not read.
    """

    # Sampling rate handed to the `Audio` decoder.
    SR = 16000

    def __init__(self, df):
        """Store the frame and build the audio decoder.

        Args:
            df: DataFrame with a ``path`` column of audio file paths.
                Any index is accepted; rows are addressed by position.
        """
        self.df = df
        self.audio = Audio(sampling_rate=self.SR)

    def __getitem__(self, idx):
        """Decode and return the waveform for the row at position ``idx``.

        Raises:
            IndexError: if ``idx`` is outside ``range(-len(self), len(self))``.
        """
        # Positional lookup (`.iloc`) instead of label lookup (`.loc`): the
        # dataset no longer depends on the caller having reset the index to
        # 0..n-1, and an out-of-range position raises IndexError, which is
        # what Python's sequence protocol expects.
        path = self.df['path'].iloc[idx]

        decoded = self.audio.decode_example(self.audio.encode_example(path))
        return decoded['array'].astype(np.float32)

    def __len__(self):
        """Number of rows in the wrapped DataFrame."""
        return len(self.df)

In [81]:
# NOTE(review): despite its name, this dataset wraps the `val` half of the
# split, not `train` — confirm that this is intended.
# reset_index(drop=True) gives the frame a fresh 0..n-1 index so that integer
# item indices line up with its rows.
train_dataset = BengaliDataset(val.reset_index(drop=True))

In [82]:
# One item per batch (spelled out here; it is also the DataLoader default):
# generate_statistics reads `sample[0]` from every batch it receives.
dataloader = torch.utils.data.DataLoader(
    train_dataset,
    batch_size=1,
    num_workers=4,
)

In [83]:
# Sanity check: dtype of the first decoded item.
train_dataset[0].dtype

dtype('float32')

In [84]:
# Sanity check: dtype of the same item once wrapped in a torch.Tensor.
torch.Tensor(train_dataset[0]).dtype

torch.float32

In [85]:
# Show the installed torch version for this run.
torch.__version__

'2.0.1+cu118'

In [86]:
import torchaudio

In [87]:
# Show the installed torchaudio version for this run.
torchaudio.__version__

'2.0.2+cu118'

In [88]:
from tqdm import tqdm

def generate_statistics(samples):
    """Compute the running mean and standard deviation of scaled mel features.

    For every batch, ``sample[0].squeeze()`` is passed through
    ``torch_featurization.melspectrogram`` and
    ``torch_featurization.piecewise_linear_log``. The resulting tensor is
    reduced over dim 0 (presumably the frame axis — TODO confirm against
    ``torch_featurization``), and the statistics are pooled over all rows of
    all batches.

    Args:
        samples: iterable of batches; only element ``0`` of each batch is used.

    Returns:
        ``(mean, stddev)``: tensors in the dtype of the scaled features,
        with dim 0 reduced away. If no rows were seen at all, the untouched
        initial values ``(0, 0.0)`` are returned.
    """
    mean = 0
    mean_sq = 0
    n_seen = 0
    out_dtype = None

    for sample in tqdm(samples):
        mel_spec = torch_featurization.melspectrogram(sample[0].squeeze())
        scaled_mel_spec = torch_featurization.piecewise_linear_log(mel_spec)

        n_new = scaled_mel_spec.size(0)
        if n_new == 0:
            # Nothing to fold in; also avoids 0 / 0 when this is the first batch.
            continue

        if out_dtype is None:
            out_dtype = scaled_mel_spec.dtype
        # Accumulate in float64: E[x^2] - E[x]^2 cancels badly in float32
        # once many rows have been averaged.
        scaled_mel_spec = scaled_mel_spec.double()

        n_total = n_seen + n_new
        keep = n_seen / n_total  # weight of the statistics gathered so far
        mean = mean * keep + scaled_mel_spec.sum(0) / n_total
        mean_sq = mean_sq * keep + scaled_mel_spec.pow(2).sum(0) / n_total
        n_seen = n_total

    if out_dtype is None:
        # No rows were seen: `mean` and `mean_sq` are still the scalar zeros.
        return mean, (mean_sq - mean**2) ** 0.5

    # Rounding can push the variance a hair below zero; clamp so that the
    # square root never yields NaN.
    stddev = (mean_sq - mean**2).clamp(min=0).sqrt()
    return mean.to(out_dtype), stddev.to(out_dtype)

In [None]:
# Run generate_statistics over every batch of the dataloader.
# Long-running cell: the progress output recorded below shows about 3 items/s
# over 483,982 items, i.e. an estimated 40+ hours for a full pass.
mean, stddev = generate_statistics(iter(dataloader))

  9%|▉         | 43031/483982 [11:29<41:53:22,  2.92it/s]

In [None]:
# Store the mean and the reciprocal of the standard deviation as plain lists.
stats = {
    "mean": mean.tolist(),
    "invstddev": (1 / stddev).tolist(),
}
json_str = json.dumps(stats, indent=2)

with open('bengali-stats.json', "w") as f:
    f.write(json_str)