In [1]:
from glob import glob
from tqdm import tqdm
import pandas as pd
import mp
import numpy as np
from datasets import Audio
import json
import os
import random

# Notebook-wide settings. Neither name is referenced by the cells shown below.
sr = 16000   # presumably the audio sampling rate in Hz (Train also decodes at 16000) - TODO confirm
chunks = 30  # NOTE(review): unit/meaning not evident from this notebook - confirm

In [5]:
# Glob pattern matching every parquet shard of one pinned snapshot of the
# mesolitica/malaya-speech-malay-stt dataset in the local Hugging Face cache.
# NOTE: this is an absolute, machine-specific path (user `ubuntu`).
directory = (
    '/home/ubuntu/.cache/huggingface/hub/'
    'datasets--mesolitica--malaya-speech-malay-stt/'
    'snapshots/cfbfc6a2f28424107416b9872d659bca34d6c5ed/'
    'data/*.parquet'
)

In [15]:
# Collect all shard paths matching the pattern and order them lexicographically,
# so that list position follows the zero-padded shard number in the filename.
files = glob(directory)
files.sort()
len(files)

107

In [17]:
# Row counts of the first and the last shard. Only these two files are measured;
# `df` is left bound to the LAST shard and is inspected further down.
first_shape = pd.read_parquet(files[0]).shape[0]
df = pd.read_parquet(files[-1])
last_shape = df.shape[0]

first_shape, last_shape

(15286, 15285)

In [22]:
# Lookup table keyed by the global index of each shard's first row.
# Each value describes one shard:
#   'start'    - global index of the shard's first row
#   'end'      - number of rows in the shard (a length, not an end index)
#   'filename' - path of the parquet file
global_indices = {}
start = 0

# Every shard except the last is taken to hold `first_shape` rows; only
# files[0] and files[-1] were actually measured above.
for shard_path in files[:-1]:
    global_indices[start] = {
        'start': start,
        'end': first_shape,
        'filename': shard_path,
    }
    start += first_shape

# The final shard has its own measured row count.
global_indices[start] = {
    'start': start,
    'end': last_shape,
    'filename': files[-1],
}

In [31]:
# Sanity check: show the descriptor stored under global index 0 (the first shard).
global_indices[0]

{'start': 0,
 'end': 15286,
 'filename': '/home/ubuntu/.cache/huggingface/hub/datasets--mesolitica--malaya-speech-malay-stt/snapshots/cfbfc6a2f28424107416b9872d659bca34d6c5ed/data/train-00000-of-00107-9d147232352b7656.parquet'}

In [32]:
# Peek at the first row of `df` to see the column layout.
# NOTE(review): `df` was last bound to files[-1] in the cells above - confirm
# against the actual execution order.
df.iloc[0]

filename    {'bytes': b'\xff\xf3\x88\xc4\x00\x00\x00\x00\x...
Y           tak macam orang cakap sampai masanya kau dah m...
Name: 0, dtype: object

In [25]:
from torch.utils.data import DataLoader, Dataset
from datasets import Audio

In [42]:
class Train(Dataset):
    """
    Map-style dataset that serves decoded audio from a set of parquet shards.

    Every global row index is mapped to the descriptor of the shard holding it.
    Shards are read lazily with ``pd.read_parquet`` and a bounded number of
    them is kept in memory, evicting the least recently used one first.
    """

    def __init__(self, indices, maxlen_cache_df=5, sampling_rate=16000):
        """
        Parameters
        ----------
        indices : dict
            Maps the global index of a shard's first row to a descriptor
            ``{'start': int, 'end': int, 'filename': str}``. Despite its name,
            ``'end'`` is used as the NUMBER OF ROWS in the shard: the indices
            registered for a shard are ``range(key, start + end)``. Keys may
            be ints or numeric strings (they are passed through ``int``).
        maxlen_cache_df : int, default 5
            Maximum number of shard DataFrames kept in memory. At least one
            shard is always retained after a lookup.
        sampling_rate : int, default 16000
            Forwarded to ``Audio(sampling_rate=...)``.
        """
        self.indices = {}
        for k, v in indices.items():
            # Point every global row index of this shard at the same descriptor.
            self.indices.update(dict.fromkeys(range(int(k), v['start'] + v['end']), v))

        self.max_index = len(self.indices)
        self.cache_df = {}
        self.maxlen_cache_df = maxlen_cache_df
        self.audio = Audio(sampling_rate=sampling_rate)

    def __len__(self):
        """Return the total number of addressable rows."""
        return self.max_index

    def __getitem__(self, item):
        """
        Return ``{'array': ...}`` with the decoded audio of global row ``item``.

        Negative values index from the end. Raises ``IndexError`` when the
        index is out of range, so plain iteration over the dataset terminates.
        """
        requested = item
        if item < 0:
            item = self.max_index + item

        try:
            v = self.indices[item]
        except KeyError:
            # IndexError (not KeyError) is what the sequence protocol expects.
            raise IndexError(
                'index {} is out of range for dataset of size {}'.format(requested, self.max_index)
            ) from None

        # Position of the row inside its shard.
        chunk_index = item - v['start']
        filename = v['filename']

        # Pop-and-reinsert keeps the dict ordered from least to most recently used.
        df = self.cache_df.pop(filename, None)
        if df is None:
            df = pd.read_parquet(filename)
            # Make room by dropping the least recently used shards.
            while self.cache_df and len(self.cache_df) >= self.maxlen_cache_df:
                self.cache_df.pop(next(iter(self.cache_df)))
        self.cache_df[filename] = df

        row = df.iloc[chunk_index]
        # The 'filename' column carries the audio payload itself; round-trip it
        # through the Audio feature to obtain the decoded samples.
        audio = self.audio.decode_example(self.audio.encode_example(row['filename']))
        return {'array': audio['array']}

In [43]:
# Build the dataset over all shards described by `global_indices`
# (default cache size of 5 shard DataFrames).
train = Train(global_indices)

In [44]:
# Smoke test: fetch the very last sample via negative indexing; this reads the
# last shard from disk and decodes one audio row.
train[-1]

1635600 15284


{'array': array([-0.02027534, -0.02458465, -0.02167011, ...,  0.00041462,
         0.00079491,  0.00206928])}

In [45]:
# Persist the shard lookup table. JSON turns the integer keys into strings;
# Train.__init__ converts them back with int(k) when the file is reloaded.
with open('indices-crawl-malaya-speech.json', 'w') as index_file:
    json.dump(obj=global_indices, fp=index_file)