In [1]:
# https://huggingface.co/datasets/google/fleurs/blob/main/fleurs.py

# !wget https://storage.googleapis.com/xtreme_translations/FLEURS102/ms_my.tar.gz
# !tar -zxf ms_my.tar.gz
# !rm ms_my.tar.gz

In [2]:
import os

# Set CUDA_VISIBLE_DEVICES to an empty string before the heavier libraries
# below are imported -- presumably so this preprocessing runs on CPU only.
os.environ['CUDA_VISIBLE_DEVICES'] = ''

In [3]:
import pandas as pd
from glob import glob
from tqdm import tqdm
import json

In [4]:
# Working directory at run time; later used as the prefix of every audio path,
# so the paths written out are absolute and tied to where the notebook was run.
base_directory = os.getcwd()

In [5]:
# Test-split metadata, read as tab-separated values with no header row.
# Later cells take column 1 as the audio file name and column 3 as the
# transcription text.
ms_my_test = pd.read_csv('ms_my/test.tsv', sep = '\t', header=None)

In [6]:
import malaya
from malaya.text.normalization import cardinal

# Tokenizer options that are explicitly switched off for this notebook.
# NOTE(review): 'hypen' is spelled exactly as in the original call; it is a
# keyword name passed to the library, so do not "correct" it without checking
# the library's own spelling.
disabled_rules = (
    'hypen', 'parliament', 'time', 'time_pukul',
    'temperature', 'distance', 'volume', 'duration',
    'weight', 'date', 'money',
)
tokenizer = malaya.tokenizer.Tokenizer(**dict.fromkeys(disabled_rules, False))

# Spot check on one transcription (row 3, column 3) to see how it is split.
tokenizer.tokenize(ms_my_test.iloc[3, 3])

  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))
  self.tok = re.compile(r'({})'.format('|'.join(pipeline)))


['gimnastik',
 'as',
 'dan',
 'joas',
 'mempunyai',
 'matlamat',
 'yang',
 'sama',
 'menjadikan',
 'sukan',
 'gimnastik',
 'dan',
 'lain',
 '-',
 'lain',
 'selamat',
 'bagi',
 'para',
 'atlet',
 'untuk',
 'mengejar',
 'impian',
 'mereka',
 'dalam',
 'persekitaran',
 'yang',
 'selamat',
 'positif',
 'dan',
 'berdaya',
 'maju']

In [7]:
def tokenize_and_replace(t):
    """Tokenize `t` and rewrite each token through `cardinal`.

    Every token produced by the module-level `tokenizer` is passed to
    `cardinal`; when the result differs from the token, the result is used
    instead. The tokens are then joined back with single spaces.

    Args:
        t: text to process.

    Returns:
        The space-joined token sequence after replacement.
    """
    words = []
    for token in tokenizer.tokenize(t):
        converted = cardinal(token)
        # Keep the original token unless `cardinal` actually changed it.
        words.append(converted if converted != token else token)
    return ' '.join(words)

# Spot check: the digit tokens "15" and "1940" should come back spelled out
# in words, with the rest of the sentence unchanged.
tokenize_and_replace('pada 15 ogos 1940 pihak berikat menyerang perancis selatan serangan ini dipanggil operation dragoon')

'pada lima belas ogos seribu sembilan ratus empat puluh pihak berikat menyerang perancis selatan serangan ini dipanggil operation dragoon'

In [8]:
# Pair every test utterance's absolute audio path with its transcription after
# number replacement. Column 1 holds the audio file name, column 3 the text.
ms_my = [
    (f'{base_directory}/ms_my/audio/test/{a}', tokenize_and_replace(t))
    for a, t in zip(ms_my_test.iloc[:, 1], ms_my_test.iloc[:, 3])
]

In [9]:
# Number of (audio path, transcription) pairs built from the test split.
len(ms_my)

749

In [10]:
# Split the list of (audio path, text) pairs into two parallel tuples.
audios, texts = zip(*ms_my)

In [11]:
# Sanity check: there should be one transcription per audio path.
len(texts)

749

In [12]:
import unicodedata
import re
import itertools

# Characters that preprocessing_text keeps; any other character is replaced by
# a space. Digits are part of this list, but the filtering step further down
# rejects every utterance whose processed text still contains a digit.
vocabs = [" ", "a", "e", "n", "i", "t", "o", "u", "s", "k", "r", "l", "h", "d", "m", "g", "y", "b", "p", "w", "c", "f", "j", "v", "z", "0", "1", "x", "2", "q", "5", "3", "4", "6", "9", "8", "7"]

def preprocessing_text(string):
    """Normalise a transcription down to the characters listed in `vocabs`.

    Steps, in order:
      1. tokenize with the module-level `tokenizer` and re-join with spaces;
      2. lowercase, then apply Unicode NFC normalisation;
      3. replace every character that is not in `vocabs` with a space;
      4. collapse runs of spaces and strip the ends;
      5. shorten any run of one repeated character to at most two.

    Args:
        string: raw transcription text.

    Returns:
        The cleaned string; it may be empty.
    """
    joined = ' '.join(tokenizer.tokenize(string))
    folded = unicodedata.normalize('NFC', joined.lower())

    # Anything outside the allowed character list becomes a space.
    kept = []
    for ch in folded:
        kept.append(ch if ch in vocabs else ' ')
    cleaned = re.sub(r'[ ]+', ' ', ''.join(kept)).strip()

    # Cap each run of identical characters at two, e.g. "aaaa" -> "aa".
    pieces = []
    for _, run in itertools.groupby(cleaned):
        pieces.append(''.join(run)[:2])
    return ''.join(pieces)

In [13]:
# Clean every transcription; the order stays aligned with `audios`.
processed_text = list(map(preprocessing_text, tqdm(texts)))

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 749/749 [00:00<00:00, 8715.66it/s]


In [14]:
# Keep utterances whose cleaned text is non-empty and digit-free; record the
# positions of everything else in `rejected`.
filtered_audios, filtered_texts, rejected = [], [], []
for idx in tqdm(range(len(processed_text))):
    text = processed_text[idx]
    if not text or re.search(r'\d+', text):
        rejected.append(idx)
        continue
    filtered_audios.append(audios[idx])
    filtered_texts.append(text)

100%|████████████████████████████████████████████████████████████████████████████████████████████████| 749/749 [00:00<00:00, 650421.06it/s]


In [15]:
# Number of utterances kept, and the fraction of the test split they represent.
len(filtered_audios), len(filtered_audios) / len(audios)

(739, 0.986648865153538)

In [16]:
import soundfile as sf
import numpy as np

In [17]:
# Duration in seconds of every kept clip. sf.info reads only the file's
# metadata, so the samples are never decoded; its `duration` is
# frames / samplerate, the same value a full read would give.
# NOTE(review): for compressed formats the header frame count may be an
# estimate -- confirm if the audio is ever not plain WAV.
lengths = []
for f in tqdm(filtered_audios):
    lengths.append(sf.info(f).duration)

# Total audio in hours (seconds / 60 / 60).
np.sum(lengths) / 60 / 60

100%|██████████████████████████████████████████████████████████████████████████████████████████████████| 739/739 [00:00<00:00, 4267.57it/s]


2.226116666666667

In [18]:
# Save the evaluation set as JSON: 'X' holds the audio paths and 'Y' the
# cleaned transcriptions, index-aligned.
with open('malay-asr-test.json', 'w') as manifest_file:
    manifest = {'X': filtered_audios, 'Y': filtered_texts}
    json.dump(manifest, manifest_file)

In [19]:
# Save the kept audio paths as plain text, one per line, with no trailing
# newline after the last path.
with open('malay-audio-test.txt', 'w') as listing_file:
    listing = '\n'.join(filtered_audios)
    listing_file.write(listing)