In [1]:
from glob import glob
import soundfile as sf
import librosa
import pandas as pd
import json
from multiprocess import Pool
import itertools

def chunks(l, n):
    """Lazily split `l` into consecutive slices of at most `n` items.

    Yields `(slice, chunk_number)` tuples, where `chunk_number` counts the
    slices from 0 in the order they are produced.
    """
    starts = range(0, len(l), n)
    for chunk_no, start in enumerate(starts):
        stop = start + n
        yield (l[start:stop], chunk_no)

def multiprocessing(strings, function, cores=6, returned=True):
    """Run `function` over `strings` in parallel using a process pool.

    `strings` is cut into consecutive chunks by `chunks`, and every chunk is
    passed to `function` as a `(items, chunk_number)` tuple.

    Parameters
    ----------
    strings : sized, sliceable sequence of work items (list, range, ...).
    function : callable taking one `(items, chunk_number)` tuple.
    cores : number of worker processes; also used to derive the chunk size.
    returned : when true, the per-chunk results are flattened into one list
        and returned; otherwise nothing is returned.

    Returns
    -------
    list or None
        Flattened results when `returned` is true. Chunks whose worker
        returned `None` (side-effect-only workers) contribute nothing.
    """
    # Floor division yields 0 when there are fewer items than cores, and
    # `chunks` cannot step by 0, so fall back to one item per chunk.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Always release the worker processes, even when a worker raised.
        pool.close()
        pool.join()

    if returned:
        # Skip None so a worker that only writes files does not make the
        # flattening step fail with "NoneType is not iterable".
        return list(
            itertools.chain.from_iterable(r for r in pooled if r is not None)
        )

In [2]:
# !pip3 install librosa soundfile

In [3]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-TTS-v2/resolve/main/tts-force-alignment.zip

In [4]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-TTS-v2/resolve/main/tts.parquet

In [5]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-TTS-v2/resolve/main/processed.parquet

In [6]:
# !wget https://huggingface.co/datasets/mesolitica/Malaysian-TTS-v2/resolve/main/chunk.parquet

In [7]:
# import zipfile

# destination_folder = './'
# with zipfile.ZipFile('tts-force-alignment.zip', 'r') as zip_ref:
#     zip_ref.extractall(destination_folder)

In [8]:
# Load the processed table and preview it; the `filename_audio` and
# `alignment` columns shown here are the ones the chunking worker reads.
df = pd.read_parquet('processed.parquet')
df.head()

Unnamed: 0,reference_text,generate_text,normalized_generate_text,reference_audio,filename_audio,speaker,similarity,audio_length,index,alignment,averaged_pitch,distances
0,"Uhm, hello, selamat pagi ye, saya dari custome...","Encik, bolehkah Encik memberikan maklum balas ...","Encik, bolehkah Encik memberikan maklum balas ...",husein-assistant.mp3,response-husein-v3/55420.mp3,husein,0.800952,4.400181,1367694,"[{'end': 0.38, 'score': -3.89, 'start': 0.12, ...","[279.365, 104.309, 97.318, 94.798, 95.233, 91....","[0.073, 0.007, 0.012, 0.006, 0.007, 0.012, 0.0..."
1,"Hi, saya adalah pembantu AI anda, selamat berk...","Puan, saya dari DanceFit, uhm, kami ada kelas ...","Puan, saya dari DanceFit, uhm, kami ada kelas ...",shafiqah-idayu-enhanced-v2-v2-trim.mp3,introduction-idayu-v2/102406.mp3,idayu,0.737765,7.291066,1772554,"[{'end': 0.72, 'score': -1.533, 'start': 0.28,...","[248.865, 237.913, 218.932, 253.237, 209.599, ...","[0.016, 0.06, 0.02, 0.004, 0.065, 0.035, 0.027..."
2,"Uhm, hello, selamat pagi ye, saya dari custome...","Cik, rider sedang dalam perjalanan untuk hanta...","Cik, rider sedang dalam perjalanan untuk hanta...",husein-assistant.mp3,response-husein-v2/107749.mp3,husein,0.731988,6.153288,1218381,"[{'end': 0.36, 'score': -1.588, 'start': 0.12,...","[182.189, 152.634, 165.402, 135.936, 120.261, ...","[0.11, 0.012, 0.01, 0.012, 0.008, 0.012, 0.01,..."
3,"Hi, saya adalah pembantu AI anda, selamat berk...",open quote Yang itu kurungan terbuka denda R M...,open quote Yang itu kurungan terbuka denda R M...,,prepare-dataset-normalizer-text-malay-news-ida...,idayu,0.776151,11.877007,2166069,"[{'end': 0.4, 'score': -1.74, 'start': 0.14, '...","[339.107, 230.672, 210.806, 278.877, 266.024, ...","[0.025, 0.008, 0.02, 0.04, 0.008, 0.014, 0.028..."
4,"Hi, saya adalah pembantu AI anda, selamat berk...","Cik, nak tanya, berapa lebar pintu masuk Cik? ...","Cik, nak tanya, berapa lebar pintu masuk Cik? ...",shafiqah-idayu-enhanced-v2-v2-trim.mp3,response-idayu-v2/160707.mp3,idayu,0.760246,5.21288,66815,"[{'end': 0.34, 'score': -0.103, 'start': 0.18,...","[222.045, 234.713, 231.469, 224.546, 214.563, ...","[0.015, 0.013, 0.033, 0.013, 0.008, 0.012, 0.0..."


In [9]:
# !rm -rf chunk-streaming chunk-streaming-done chunk-streaming-metadata
# !mkdir chunk-streaming chunk-streaming-done chunk-streaming-metadata

In [10]:
from tqdm import tqdm
import os

def _split_on_pauses(alignment, min_gap=0.15):
    """Group aligned words into segments separated by pauses.

    A new segment starts whenever the gap between the previous word's `end`
    and the next word's `start` is at least `min_gap` seconds. Returns a list
    of segments, each a non-empty list of the original word dicts; an empty
    or missing alignment gives an empty list.
    """
    if alignment is None:
        return []

    segments = []
    current = []
    last_end = None
    for word in alignment:
        if current and (word['start'] - last_end) >= min_gap:
            segments.append(current)
            current = []
        current.append(word)
        last_end = word['end']

    if current:
        segments.append(current)
    return segments


def loop(indices):
    """Cut the audio of the given rows into pause-delimited chunks.

    Parameters
    ----------
    indices : tuple `(row_positions, chunk_number)` as produced by `chunks`;
        the chunk number is ignored.

    For every row position `i` of `processed.parquet` this writes:
      * `chunk-streaming/<name>_<k>.mp3` - one audio file per segment,
      * `chunk-streaming-metadata/<i>.json` - list of `(text, audio_filename)`,
      * `chunk-streaming-done/<i>.json` - marker used to skip the row on re-runs.

    Returns nothing; all results are written to disk.
    """
    indices, _ = indices
    df = pd.read_parquet('processed.parquet')
    # Look the columns up once instead of on every iteration.
    filenames = df['filename_audio']
    alignments = df['alignment']
    tail_padding = 0.1  # seconds of audio kept after the last word of a segment

    for i in tqdm(indices):

        filename_done = f'chunk-streaming-done/{i}.json'
        filename_metadata = f'chunk-streaming-metadata/{i}.json'
        try:
            # Must be opened for reading: opening with 'w' truncates the
            # marker, makes json.load fail, and the row is never skipped.
            with open(filename_done) as fopen:
                json.load(fopen)
            continue
        except (OSError, ValueError):
            # Missing or unreadable/corrupt marker: (re)process this row.
            pass

        source_audio = filenames.iloc[i]
        f = source_audio.replace('/', '_').replace('.mp3', '')
        f = os.path.join('chunk-streaming', f)

        segments = _split_on_pauses(alignments.iloc[i])

        inner = []
        if segments:
            # sr=None keeps the file's native sample rate.
            y, sr = librosa.load(source_audio, sr = None)
            for no, segment in enumerate(segments):
                start = segment[0]['start']
                end = segment[-1]['end'] + tail_padding
                # Alignment times are in seconds; convert to sample offsets.
                y_ = y[int(start * sr): int(end * sr)]
                audio_filename = f'{f}_{no}.mp3'
                sf.write(audio_filename, y_, sr)
                text = ' '.join([s['text'] for s in segment])
                inner.append((text, audio_filename))

        with open(filename_metadata, 'w') as fopen:
            json.dump(inner, fopen)

        # Written last so the marker only exists once all outputs are on disk.
        with open(filename_done, 'w') as fopen:
            json.dump('done', fopen)

In [11]:
# loop((range(10), 0))

In [20]:
# Spot-check one generated metadata file. Index 2 is an arbitrary pick, and
# glob() does not promise a stable order, so the chosen file may differ per run.
with open(glob('chunk-streaming-metadata/*.json')[2]) as fopen:
    d = json.load(fopen)

In [21]:
# Display the loaded metadata: a list of [text, chunk audio path] pairs.
d

[['Encik,', 'chunk-streaming/introduction-idayu-v2_131817_0.mp3'],
 ['saya dari HouseClean,',
  'chunk-streaming/introduction-idayu-v2_131817_1.mp3'],
 ['uhm,', 'chunk-streaming/introduction-idayu-v2_131817_2.mp3'],
 ['boleh saya tahu berapa bilik tidur rumah Encik?',
  'chunk-streaming/introduction-idayu-v2_131817_3.mp3']]

In [25]:
# Listen to one exported chunk to check that the cut matches its text.
import IPython.display as ipd

ipd.Audio('chunk-streaming/introduction-idayu-v2_131817_3.mp3')

In [None]:
# `loop` only writes files and returns None, so ask the helper not to flatten
# the per-chunk results: flattening a list of None values raises TypeError
# after the whole run has finished.
multiprocessing(range(len(df)), loop, cores = 30, returned = False)

 34%|███▍      | 18536/54848 [35:18<1:13:54,  8.19it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 43%|████▎     | 23643/54848 [45:23<1:06:27,  7.83it/s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_rate_limit`.

Current values:
ServerApp.iopub_msg_rate_limit=1000.0 (msgs/sec)
ServerApp.rate_limit_window=3.0 (secs)

 49%|████▉     | 26894/54848 [51:55<59:56,  7.77it/s]s]IOPub message rate exceeded.
The Jupyter server will temporarily stop sending output
to the client in order to avoid crashing it.
To change this limit, set the config variable
`--ServerApp.iopub_msg_