In [1]:
# !pip3 install azure-cognitiveservices-speech
# !pip3 install malaya -U --no-deps

In [2]:
# !wget https://f000.backblazeb2.com/file/malay-dataset/wikidump1-raw.json
# !wget https://f000.backblazeb2.com/file/malay-dataset/dumping/news/dumping-news-6-july-2019.json

In [3]:
import numpy as np
import soundfile as sf
from scipy.io import wavfile

def to_ndarray(array):
    """
    Coerce a list / tuple / bytes-like input into a numpy array.

    Parameters
    ----------
    array: list / tuple / bytes / bytearray
        Lists and tuples are passed through ``np.array``. ``bytes`` and
        ``bytearray`` are reinterpreted as a flat buffer of ``np.int16``.

    Returns
    -------
    result : np.array
        The converted array. Any other input type is returned unchanged.
    """

    if isinstance(array, (list, tuple)):
        return np.array(array)

    if isinstance(array, (bytes, bytearray)):
        # A bytearray is first copied into immutable bytes before being viewed.
        raw = bytes(array) if isinstance(array, bytearray) else array
        return np.frombuffer(raw, np.int16)

    return array

In [7]:
import re

# Building blocks of the symbol table: three multi-character tokens,
# then the single-character groups.
_pad = 'pad'
_start = 'start'
_eos = 'eos'
_special = '-'
_punctuation = "!'(),.:;? "
_letters = 'ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz'

# Final order: pad/start/eos tokens, special characters, punctuation, letters.
# Each string is unpacked into its individual characters.
MALAYA_SPEECH_SYMBOLS = [
    _pad,
    _start,
    _eos,
    *_special,
    *_punctuation,
    *_letters,
]

In [8]:
import json
import malaya
from unidecode import unidecode

# Malaya normalizer built with date = False, time = False, money = True
# (presumably toggling which entity types get normalized; confirm against the malaya docs).
# NOTE(review): `normalizer` is not referenced by any function in the visible cells; confirm it is still needed.
normalizer = malaya.normalize.normalizer(date = False, time = False, money = True)

def put_spacing_num(string):
    """
    Pad every run of ASCII letters with one space on each side, then squeeze
    repeated spaces and trim both ends.

    The effect is that letters get separated from adjacent digits and
    punctuation, e.g. ``'RM100juta'`` becomes ``'RM 100 juta'``.
    """
    padded = re.sub('[A-Za-z]+', r' \g<0> ', string)
    return re.sub(r'[ ]+', ' ', padded).strip()

def convert_to_ascii(string):
    """Return ``unidecode(string)``, i.e. the unidecode transliteration of the input text."""
    return unidecode(string)

def collapse_whitespace(string):
    """
    Replace every run of whitespace characters with a single space.

    Parameters
    ----------
    string: str

    Returns
    -------
    result : str
        The text with each whitespace run (spaces, tabs, newlines) collapsed
        to one space. Leading and trailing whitespace is collapsed, not removed.
    """
    # `_whitespace_re` is not defined in any visible cell, so the previous
    # body raised NameError on a fresh kernel. The pattern is spelled out here.
    return re.sub(r'\s+', ' ', string)

def cleaning(string, normalize = True, add_eos = False):
    """
    Prepare one sentence for downstream use.

    Steps, in order: transliterate with ``convert_to_ascii``, squeeze repeated
    spaces, drop a single trailing '-' or ',', make sure the text ends with
    '.', then separate letter runs from digits / punctuation with
    ``put_spacing_num``.

    Parameters
    ----------
    string: str
    normalize: bool
        Currently unused; kept so existing callers keep working.
    add_eos: bool
        Currently unused; kept so existing callers keep working.

    Returns
    -------
    result : str
        The cleaned sentence. An input that is empty (or becomes empty after
        trimming and removing the trailing '-' / ',') yields an empty string
        instead of raising IndexError.
    """
    string = convert_to_ascii(string)
    string = re.sub(r'[ ]+', ' ', string).strip()

    # endswith() is safe on an empty string, unlike string[-1].
    if string.endswith(('-', ',')):
        string = string[:-1]

    # Nothing left to terminate with a full stop.
    if not string:
        return string

    if not string.endswith('.'):
        string = string + '.'

    string = put_spacing_num(string)
    return re.sub(r'[ ]+', ' ', string).strip()

2022-04-15 23:41:01.995554: I tensorflow/stream_executor/platform/default/dso_loader.cc:53] Successfully opened dynamic library libcudart.so.11.0
 The versions of TensorFlow you are currently using is 2.5.0 and is not supported. 
Some things might work, some things might not.
If you were to encounter a bug, do not file an issue.
If you want to make sure you're using a tested and supported configuration, either change the TensorFlow version or the TensorFlow Addons's version. 
You can find the compatibility matrix in TensorFlow Addon's readme:
https://github.com/tensorflow/addons
TensorFlow Addons has compiled its custom ops against TensorFlow 2.4.0, and there are no compatibility guarantees between the two versions. 
This means that you might get segfaults when loading the custom op, or other kind of low-level errors.
 If you do, do not file an issue on Github. This is a known limitation.

It might help you to fallback to pure Python ops with TF_ADDONS_PY_OPS . To do that, see https://

In [9]:
import pandas as pd

# Read the pipe-delimited metadata file. header = None treats the first row as
# data, so the columns are labelled by integer position.
df = pd.read_csv('haqkiem/metadata.csv', header = None, sep = '|')
df.head()

Unnamed: 0,0,1
0,LJ001-000001,Sultan Johor Sultan Ibrahim Iskandar selamat t...
1,LJ002-000001,Menerusi entri terbaharu dalam laman Facebook ...
2,LJ003-000001,Kepulangan Sultan Ibrahim disambut oleh Tunku ...
3,LJ004-000001,Sultan Ibrahim berlepas ke luar negara pada mi...
4,LJ005-000001,Kepulangan Sultan Ibrahim dijangka dapat menye...


In [10]:
# Keep only rows whose text contains an "RM <amount>" figure with a decimal part
# (with or without a thousands comma), and cut each kept row at the first '.,,'.
haqkiem_text = [
    row.split('.,,')[0]
    for row in df[1].tolist()
    if re.search(r'(RM \d+,\d+\.\d+|RM \d+\.\d+)', row)
]
haqkiem_text[:10]

['Beliau berkata, keseluruhan dadah dirampas seberat 656.9 gram dengan nilai RM 66836.50',
 'Syarikat gergasi elektronik, Apple hari ini mengumumkan pendapatan suku tahunan sebanyak RM 374.68 bilion iaitu peningkatan sebanyak 9%',
 'Penyanyi Ifa Raziah mengakui rasa takut apabila memakai barang kemas bernilai RM 3.5 juta sehinggakan terpaksa mengupah 3 orang pengawal',
 'Pada majlis akad nikah itu, Eizlan telah menyerahkan mas kahwin sebanyak RM 80 dan sebentuk cincin berlian serta wang hantaran berjumlah RM 12,121.90',
 'Bukan itu sahaja, koleksi filem Marvel ini lengkap dengan pakej bundle yang berharga antara RM 6 hingga RM 19.90 jer']

In [11]:
# Spot-check the cleaner on the first filtered transcript.
cleaning(haqkiem_text[0])

'Beliau berkata , keseluruhan dadah dirampas seberat 656.9 gram dengan nilai RM 66836.50.'

In [12]:
# Same call as the preceding cell, run again; the displayed output is identical.
cleaning(haqkiem_text[0])

'Beliau berkata , keseluruhan dadah dirampas seberat 656.9 gram dengan nilai RM 66836.50.'

In [13]:
# Word count (whitespace-separated tokens) of every filtered transcript.
len_texts = [len(sentence.split()) for sentence in haqkiem_text]

np.max(len_texts), np.min(len_texts), np.mean(len_texts)

(23, 12, 18.6)

In [14]:
# Load the wiki dump into memory. The encoding is pinned to UTF-8 so the read
# does not depend on the platform's locale default.
with open('wikidump1-raw.json', encoding = 'utf-8') as fopen:
    wiki = json.load(fopen)

len(wiki)

1748387

In [15]:
# Load the news dump into memory. The encoding is pinned to UTF-8 so the read
# does not depend on the platform's locale default.
with open('dumping-news-6-july-2019.json', encoding = 'utf-8') as fopen:
    news = json.load(fopen)

len(news)

399251

In [21]:
from tqdm import tqdm

# Word-count window for candidate texts. Both bounds are exclusive:
# a text is kept only when min_len < word count < max_len.
min_len = 2
max_len = 20

selected_wiki = [
    text for text in tqdm(wiki)
    if min_len < len(text.split()) < max_len
]

100%|██████████| 1748387/1748387 [00:01<00:00, 1365227.05it/s]


In [22]:
# Apply the same exclusive word-count window to the news texts.
selected_news = [
    text for text in tqdm(news)
    if min_len < len(text.split()) < max_len
]

100%|██████████| 399251/399251 [00:00<00:00, 1290666.23it/s]


In [23]:
# Number of texts that passed the word-count filter, as (wiki, news).
len(selected_wiki), len(selected_news)

(1077492, 196733)

In [28]:
import random

# Draw a fixed-size sample from each corpus and pair every raw text with its
# cleaned form. A dedicated, seeded generator makes the draw repeatable across
# kernel restarts without touching the global `random` state.
N_WIKI_SAMPLES = 20000
N_NEWS_SAMPLES = 30000
SAMPLE_SEED = 42

_sampler = random.Random(SAMPLE_SEED)
samples = (
    _sampler.sample(selected_wiki, N_WIKI_SAMPLES)
    + _sampler.sample(selected_news, N_NEWS_SAMPLES)
)
samples = [{'text': t, 'cleaned': cleaning(t)} for t in samples]

In [29]:
# Inspect one record to compare the raw text with its cleaned counterpart.
samples[0]

{'text': "Sedangkan dalam bahasa Perancis, ''frire'' hanya bererti menggoreng di dalam minyak goreng yang banyak hingga terendam.",
 'cleaned': "Sedangkan dalam bahasa Perancis , '' frire '' hanya bererti menggoreng di dalam minyak goreng yang banyak hingga terendam ."}

In [30]:
# Character length of each sampled raw text.
lengths = [len(sample['text']) for sample in tqdm(samples)]

100%|██████████| 50000/50000 [00:00<00:00, 3231159.87it/s]


In [31]:
# Total character count over all sampled raw texts.
np.sum(lengths)

4445405

In [32]:
# Write the list of {'text', 'cleaned'} records to disk as JSON.
with open('populated-text.json', 'w') as fopen:
    json.dump(samples, fopen)