In [1]:
# !pip3 install textgrid
import textgrid

In [2]:
import os

# S3 endpoint URL (a Backblaze B2 host) and access credentials, exported as
# environment variables before the S3 session is created.
# NOTE(review): both credentials are empty strings here (presumably redacted
# before sharing); these assignments overwrite any values already present in
# the environment, so fill them in or remove the two lines before running.
os.environ['AWS_ENDPOINT_URL'] = 'https://s3.us-west-000.backblazeb2.com'
os.environ['AWS_ACCESS_KEY_ID'] = ''
os.environ['AWS_SECRET_ACCESS_KEY'] = ''

In [3]:
from tqdm import tqdm
import boto3
import subprocess
from unidecode import unidecode
import IPython.display as ipd
from glob import glob
import soundfile as sf

# Local working folders: transcripts and raw audio for the same-room and
# separate-room recordings. Existing folders are left untouched.
for directory in (
    'part4-same',
    'part4-same-audio',
    'part4-separate',
    'part4-separate-audio',
):
    os.makedirs(directory, exist_ok = True)

In [4]:
!mkdir part4-same-audio-mp3
!mkdir part4-separate-audio-mp3
!rm part4-same-audio-mp3/*.mp3
!rm part4-separate-audio-mp3/*.mp3

mkdir: cannot create directory ‘part4-same-audio-mp3’: File exists
mkdir: cannot create directory ‘part4-separate-audio-mp3’: File exists
rm: cannot remove 'part4-same-audio-mp3/*.mp3': No such file or directory
rm: cannot remove 'part4-separate-audio-mp3/*.mp3': No such file or directory


In [5]:
# Default boto3 session.
# NOTE(review): presumably configured from the AWS_* environment variables set earlier — confirm.
session = boto3.Session()

In [6]:
# Handle to the 'imda-singapore' bucket; the download cells list its objects
# by prefix and fetch them through this object.
s3 = session.resource('s3')
bucket = s3.Bucket('imda-singapore')

In [7]:
# Fetch the same-room transcripts into `part4-same`; files that are already
# present locally are not downloaded again.
prefix = 'IMDA - National Speech Corpus (Additional)/PART4/Codeswitching/Same Room Scripts'
for f in tqdm(bucket.objects.filter(Prefix=prefix)):
    filename = os.path.join('part4-same', os.path.basename(f.key))
    if not os.path.exists(filename):
        bucket.download_file(f.key, filename)

1072it [00:00, 1093.64it/s]


In [8]:
# Fetch the same-room recordings into `part4-same-audio`; files that are
# already present locally are not downloaded again.
prefix = 'IMDA - National Speech Corpus (Additional)/PART4/Codeswitching/Same Room Audio'
for f in tqdm(bucket.objects.filter(Prefix=prefix)):
    filename = os.path.join('part4-same-audio', os.path.basename(f.key))
    if not os.path.exists(filename):
        bucket.download_file(f.key, filename)

1072it [35:36,  1.99s/it]


In [9]:
# Fetch the separate-room ("Diff Room") transcripts into `part4-separate`;
# files that are already present locally are not downloaded again.
prefix = 'IMDA - National Speech Corpus (Additional)/PART4/Codeswitching/Diff Room Scripts'
for f in tqdm(bucket.objects.filter(Prefix=prefix)):
    filename = os.path.join('part4-separate', os.path.basename(f.key))
    if not os.path.exists(filename):
        bucket.download_file(f.key, filename)

972it [18:38,  1.15s/it]


In [10]:
# Fetch the separate-room ("Diff Room") recordings into `part4-separate-audio`;
# files that are already present locally are not downloaded again.
prefix = 'IMDA - National Speech Corpus (Additional)/PART4/Codeswitching/Diff Room Audio'
for f in tqdm(bucket.objects.filter(Prefix=prefix)):
    filename = os.path.join('part4-separate-audio', os.path.basename(f.key))
    if not os.path.exists(filename):
        bucket.download_file(f.key, filename)

972it [1:22:13,  5.08s/it]


In [19]:
import re

def cleaning(s):
    """Collapse every run of spaces in `s` into a single space and strip
    leading/trailing whitespace from the result."""
    collapsed = re.sub(r'[ ]+', ' ', s)
    return collapsed.strip()

# Substrings that are deleted from a merged transcript (replaced with '')
# by the parsing cells before whitespace is normalised.
replaces = [
    '<malay>',
    '</malay>',
    '_',
    '<UNK>',
    '(ppl)',
    '(ppo)',
    '(mm)',
    '(um)',
    '(ppb)'
]

# Bracketed / parenthesised annotations that are kept as plain words:
# each key is replaced by its value (the same token without the brackets).
replaces_with = {
    '[lah]': 'lah',
    '[ah]': 'ah',
    '[sia]': 'sia',
    '[eh]': 'eh',
    '(uh)': 'uh',
    '[what]': 'what',
    '[oh]': 'oh',
    '(err)': 'err',
    '[lor]': 'lor',
    '[ha]': 'ha',
    '[meh]': 'meh',
    '[one]': 'one',
    "[a'ah]": "a'ah",
    '[hor]': 'hor',
    '[leh]': 'leh',
    '[mah]': 'mah',
    '[nah]': 'nah',
    '[tau]': 'tau',
    '[uh]': 'uh',
    '[wah]': 'wah'
}

# Threshold on the accumulated interval duration (maxTime - minTime) at which
# the parsing cells close the current segment and start a new one.
# NOTE(review): presumably seconds, the TextGrid time unit — confirm.
maxlen = 15

In [20]:
# Parse every same-room TextGrid transcript and merge consecutive intervals of
# its first tier into segments. A segment is closed once the accumulated
# interval duration reaches `maxlen`; whatever is left at the end of the file
# becomes a final, shorter segment. For every recording id, `all_results`
# holds a list of (cleaned_text, [segment_start, segment_end]) tuples.
scripts = sorted(glob('part4-same/*'))

all_results = {}
for s in tqdm(scripts):
    results = []
    id_file = os.path.split(s)[1].replace('.TextGrid', '')
    texts = []
    total_l = 0
    start = []
    end = []
    try:
        tg = textgrid.TextGrid.fromFile(s)
    except Exception:
        # Best effort: transcripts that cannot be parsed are skipped.
        # `Exception` instead of a bare `except` so that interrupting the
        # kernel still stops the loop.
        continue
    for tg_ in tg[0]:
        if tg_.mark is None:
            continue
        if not len(tg_.mark):
            continue
        # Every non-empty interval counts towards the segment's time span and
        # duration, even when its text is skipped below.
        start.append(tg_.minTime)
        end.append(tg_.maxTime)
        total_l += (tg_.maxTime - tg_.minTime)
        # NOTE(review): this skips any mark that starts with '<' OR ends with
        # '>', which includes utterances that merely begin or end with an
        # inline tag such as <malay>...</malay> — confirm that is intended.
        if tg_.mark[0] != '<' and tg_.mark[-1] != '>':
            filtered = []
            for t_ in tg_.mark.split():
                # Tokens ending with '~' are dropped.
                if t_[-1] == '~':
                    continue
                # Tokens wrapped in '#' lose the markers and are title-cased.
                if t_[0] == '#' and t_[-1] == '#':
                    t_ = t_[1:-1].title()
                filtered.append(t_)

            texts.append(' '.join(filtered))
        if total_l >= maxlen:
            t = ', '.join(texts)
            if len(t) > 3:
                for r in replaces:
                    t = t.replace(r, '')
                for r, v in replaces_with.items():
                    t = t.replace(r, v)
                t = cleaning(t)
                results.append((t, [start[0], end[-1]]))
            texts = []
            total_l = 0
            start = []
            end = []

    # Flush the trailing segment that never reached `maxlen`.
    if len(texts):
        t = ', '.join(texts)
        if len(t) > 3:
            for r in replaces:
                t = t.replace(r, '')
            for r, v in replaces_with.items():
                t = t.replace(r, v)
            t = cleaning(t)
            # Store the cleaned text, exactly like the in-loop branch does.
            results.append((t, [start[0], end[-1]]))

    all_results[id_file] = results

100%|██████████| 1072/1072 [00:29<00:00, 36.85it/s]


In [21]:
# Pair every recording id with its list of (text, [start, end]) segments,
# in the insertion order of `all_results`.
keys = list(all_results.items())

In [22]:
def loop(keys):
    """
    Cut every transcript segment out of its same-room recording and save it
    as an individual clip under `part4-same-audio-mp3`.

    Parameters
    ----------
    keys : tuple
        A 2-item tuple. The first item is an iterable of
        `(recording_id, rows)` pairs; the second item is ignored.
        Each row is `(text, [start, end])`; `start` and `end` are multiplied
        by the recording's sample rate to obtain the slice offsets.

    Returns
    -------
    list of dict
        One `{'filename': ..., 'text': ...}` record per row, including rows
        whose clip already existed on disk. Recordings without a matching
        `.wav` file contribute nothing.
    """
    results = []
    keys, _ = keys
    for key, rows in tqdm(keys):
        audio_file = os.path.join('part4-same-audio', f'{key}.wav')
        if not os.path.exists(audio_file):
            continue
        # Decode the recording lazily: when every clip of this recording is
        # already on disk (resumed run) the source file is never read.
        y, sr = None, None
        for index, row in enumerate(rows):
            filename = os.path.join('part4-same-audio-mp3', f'{key}-{index}.mp3')
            if not os.path.exists(filename):
                if y is None:
                    y, sr = sf.read(audio_file)
                begin, finish = row[-1][0], row[-1][-1]
                sf.write(filename, y[int(sr * begin): int(sr * finish)], sr)
            results.append({
                'filename': filename,
                'text': row[0]
            })
    return results

In [23]:
import mp

# NOTE(review): `mp` is a local helper module that is not part of this notebook.
# Presumably it splits `keys` into chunks, runs `loop` on each chunk (passed as
# the first item of a 2-tuple) across 20 worker processes and concatenates the
# returned lists — confirm against the module.
results = mp.multiprocessing(keys, loop, cores = 20)

100%|██████████| 52/52 [25:41<00:00, 29.65s/it] 
100%|██████████| 52/52 [30:53<00:00, 35.64s/it]
100%|██████████| 52/52 [31:07<00:00, 35.92s/it]
100%|██████████| 52/52 [31:42<00:00, 36.59s/it]
100%|██████████| 52/52 [32:24<00:00, 37.39s/it]
100%|██████████| 52/52 [32:45<00:00, 37.79s/it]
100%|██████████| 52/52 [32:53<00:00, 37.95s/it]
100%|██████████| 52/52 [33:05<00:00, 38.18s/it]
100%|██████████| 52/52 [33:16<00:00, 38.39s/it]
100%|██████████| 14/14 [08:04<00:00, 34.58s/it]
100%|██████████| 52/52 [33:50<00:00, 39.04s/it]
100%|██████████| 52/52 [34:33<00:00, 39.87s/it]
100%|██████████| 52/52 [34:34<00:00, 39.89s/it]
100%|██████████| 52/52 [34:37<00:00, 39.94s/it]
100%|██████████| 52/52 [35:05<00:00, 40.50s/it]
100%|██████████| 52/52 [35:18<00:00, 40.74s/it]
100%|██████████| 52/52 [35:22<00:00, 40.82s/it]
100%|██████████| 52/52 [35:23<00:00, 40.85s/it]
100%|██████████| 52/52 [35:56<00:00, 41.46s/it]
100%|██████████| 52/52 [36:18<00:00, 41.90s/it]
100%|██████████| 52/52 [37:26<00:00, 43

In [24]:
# Number of segment records returned for the same-room recordings.
len(results)

172166

In [30]:
# Spot-check one record: clip filename and its transcript.
results[-2]

{'filename': 'part4-same-audio-mp3/sur_2014_6028_phns_cs-mly-72.mp3',
 'text': 'and then when you grade when you grade for higher higher grading eh, this is what I heard from ah from harries ah he told me that if you grade yourself high ah then management will think that you have no room for improvement so ah management will actually move down to lower lower'}

In [31]:
# Listen to the clip of the same record to compare it with its transcript.
ipd.Audio(results[-2]['filename'], rate = 16000)

In [32]:
import json

# Manifest for the same-room clips: one JSON object per line.
with open('imda-same-part4.jsonl', 'w') as manifest:
    for record in tqdm(results):
        manifest.write(json.dumps(record) + '\n')

100%|██████████| 172166/172166 [00:00<00:00, 271654.58it/s]


In [34]:
# Total disk usage of the exported same-room clips.
!du -hs part4-same-audio-mp3

16G	part4-same-audio-mp3


In [35]:
# Parse every separate-room TextGrid transcript and merge consecutive intervals
# of its first tier into segments. A segment is closed once the accumulated
# interval duration reaches `maxlen`; whatever is left at the end of the file
# becomes a final, shorter segment. For every recording id, `all_results`
# holds a list of (cleaned_text, [segment_start, segment_end]) tuples.
scripts = sorted(glob('part4-separate/*'))

all_results = {}
for s in tqdm(scripts):
    results = []
    id_file = os.path.split(s)[1].replace('.TextGrid', '')
    texts = []
    total_l = 0
    start = []
    end = []
    try:
        tg = textgrid.TextGrid.fromFile(s)
    except Exception:
        # Best effort: transcripts that cannot be parsed are skipped.
        # `Exception` instead of a bare `except` so that interrupting the
        # kernel still stops the loop.
        continue
    for tg_ in tg[0]:
        if tg_.mark is None:
            continue
        if not len(tg_.mark):
            continue
        # Every non-empty interval counts towards the segment's time span and
        # duration, even when its text is skipped below.
        start.append(tg_.minTime)
        end.append(tg_.maxTime)
        total_l += (tg_.maxTime - tg_.minTime)
        # NOTE(review): this skips any mark that starts with '<' OR ends with
        # '>', which includes utterances that merely begin or end with an
        # inline tag such as <malay>...</malay> — confirm that is intended.
        if tg_.mark[0] != '<' and tg_.mark[-1] != '>':
            filtered = []
            for t_ in tg_.mark.split():
                # Tokens ending with '~' are dropped.
                if t_[-1] == '~':
                    continue
                # Tokens wrapped in '#' lose the markers and are title-cased.
                if t_[0] == '#' and t_[-1] == '#':
                    t_ = t_[1:-1].title()
                filtered.append(t_)

            texts.append(' '.join(filtered))
        if total_l >= maxlen:
            t = ', '.join(texts)
            if len(t) > 3:
                for r in replaces:
                    t = t.replace(r, '')
                for r, v in replaces_with.items():
                    t = t.replace(r, v)
                t = cleaning(t)
                results.append((t, [start[0], end[-1]]))
            texts = []
            total_l = 0
            start = []
            end = []

    # Flush the trailing segment that never reached `maxlen`.
    if len(texts):
        t = ', '.join(texts)
        if len(t) > 3:
            for r in replaces:
                t = t.replace(r, '')
            for r, v in replaces_with.items():
                t = t.replace(r, v)
            t = cleaning(t)
            # Store the cleaned text, exactly like the in-loop branch does.
            results.append((t, [start[0], end[-1]]))

    all_results[id_file] = results

100%|██████████| 972/972 [00:25<00:00, 38.78it/s]


In [36]:
# Pair every recording id with its list of (text, [start, end]) segments,
# in the insertion order of `all_results`.
keys = list(all_results.items())

In [37]:
def loop(keys):
    """
    Cut every transcript segment out of its separate-room recording and save
    it as an individual clip under `part4-separate-audio-mp3`.

    Parameters
    ----------
    keys : tuple
        A 2-item tuple. The first item is an iterable of
        `(recording_id, rows)` pairs; the second item is ignored.
        Each row is `(text, [start, end])`; `start` and `end` are multiplied
        by the recording's sample rate to obtain the slice offsets.

    Returns
    -------
    list of dict
        One `{'filename': ..., 'text': ...}` record per row, including rows
        whose clip already existed on disk. Recordings without a matching
        `.wav` file contribute nothing.
    """
    results = []
    keys, _ = keys
    for key, rows in tqdm(keys):
        audio_file = os.path.join('part4-separate-audio', f'{key}.wav')
        if not os.path.exists(audio_file):
            continue
        # Decode the recording lazily: when every clip of this recording is
        # already on disk (resumed run) the source file is never read.
        y, sr = None, None
        for index, row in enumerate(rows):
            filename = os.path.join('part4-separate-audio-mp3', f'{key}-{index}.mp3')
            if not os.path.exists(filename):
                if y is None:
                    y, sr = sf.read(audio_file)
                begin, finish = row[-1][0], row[-1][-1]
                sf.write(filename, y[int(sr * begin): int(sr * finish)], sr)
            results.append({
                'filename': filename,
                'text': row[0]
            })
    return results

In [38]:
import mp

# NOTE(review): `mp` is a local helper module that is not part of this notebook.
# Presumably it splits `keys` into chunks, runs `loop` on each chunk (passed as
# the first item of a 2-tuple) across 20 worker processes and concatenates the
# returned lists — confirm against the module.
results_separate = mp.multiprocessing(keys, loop, cores = 20)

100%|██████████| 48/48 [30:54<00:00, 38.63s/it]
100%|██████████| 1/1 [00:37<00:00, 37.63s/it]t]
100%|██████████| 48/48 [31:33<00:00, 39.45s/it]
100%|██████████| 48/48 [32:04<00:00, 40.10s/it]
100%|██████████| 48/48 [32:14<00:00, 40.30s/it]
100%|██████████| 48/48 [32:23<00:00, 40.50s/it]
100%|██████████| 48/48 [32:31<00:00, 40.67s/it]
100%|██████████| 48/48 [32:39<00:00, 40.82s/it]
100%|██████████| 48/48 [32:51<00:00, 41.07s/it]
100%|██████████| 48/48 [33:01<00:00, 41.28s/it]
100%|██████████| 48/48 [33:16<00:00, 41.59s/it]
100%|██████████| 48/48 [33:21<00:00, 41.69s/it]
100%|██████████| 48/48 [34:04<00:00, 42.60s/it]
100%|██████████| 48/48 [34:27<00:00, 43.07s/it]
100%|██████████| 48/48 [34:35<00:00, 43.23s/it]
100%|██████████| 48/48 [34:37<00:00, 43.29s/it]
100%|██████████| 48/48 [34:45<00:00, 43.44s/it]
100%|██████████| 48/48 [34:52<00:00, 43.60s/it]
100%|██████████| 48/48 [35:01<00:00, 43.77s/it]
100%|██████████| 48/48 [35:15<00:00, 44.07s/it]
100%|██████████| 48/48 [35:24<00:00, 44.

In [39]:
# Number of segment records returned for the separate-room recordings.
len(results_separate)

163431

In [40]:
# Spot-check one record: clip filename and its transcript.
results_separate[1000]

{'filename': 'part4-separate-audio-mp3/sur_0010_1021_phnd_cs-chn-75.mp3',
 'text': "I can do fifteen in one shot that's my max ah usually I just do ten ten pull-ups right now but I think I got damn fat leh !huh! ya pull-ups"}

In [41]:
# Listen to the clip of the same record to compare it with its transcript.
ipd.Audio(results_separate[1000]['filename'], rate = 16000)

In [42]:
# Manifest for the separate-room clips: one JSON object per line.
with open('imda-separate-part4.jsonl', 'w') as manifest:
    for record in tqdm(results_separate):
        manifest.write(json.dumps(record) + '\n')

100%|██████████| 163431/163431 [00:00<00:00, 253934.83it/s]


In [43]:
from huggingface_hub import HfApi
# Hugging Face Hub client used by the upload cells that follow.
api = HfApi()

In [44]:
# Publish the same-room manifest to the `mesolitica/IMDA-STT` dataset
# repository under the same file name; the cell displays the returned URL.
api.upload_file(
    path_or_fileobj='imda-same-part4.jsonl',
    path_in_repo='imda-same-part4.jsonl',
    repo_id='mesolitica/IMDA-STT',
    repo_type='dataset',
)

imda-same-part4.jsonl:   0%|          | 0.00/38.9M [00:00<?, ?B/s]

'https://huggingface.co/datasets/mesolitica/IMDA-STT/blob/main/imda-same-part4.jsonl'

In [45]:
# Publish the separate-room manifest to the `mesolitica/IMDA-STT` dataset
# repository under the same file name; the cell displays the returned URL.
api.upload_file(
    path_or_fileobj='imda-separate-part4.jsonl',
    path_in_repo='imda-separate-part4.jsonl',
    repo_id='mesolitica/IMDA-STT',
    repo_type='dataset',
)

imda-separate-part4.jsonl:   0%|          | 0.00/37.6M [00:00<?, ?B/s]

'https://huggingface.co/datasets/mesolitica/IMDA-STT/blob/main/imda-separate-part4.jsonl'

In [46]:
# !sudo apt install p7zip-full p7zip-rar -y

In [47]:
# Pack each clip folder into a multi-part 7z archive (`-v5g` volume size; the
# parts are named .7z.001, .7z.002, ...). Console output is discarded.
!7z -v5g a part4-same-audio-mp3.7z part4-same-audio-mp3 > /dev/null
!7z -v5g a part4-separate-audio-mp3.7z part4-separate-audio-mp3 > /dev/null

In [48]:
# Upload every part of the same-room archive to the dataset repository.
# `glob` returns matches in arbitrary order, so sort them to upload the
# parts in a deterministic .001, .002, ... sequence.
for f in sorted(glob('part4-same-audio-mp3.7z*')):
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id='mesolitica/IMDA-STT',
        repo_type='dataset',
    )

part4-same-audio-mp3.7z.003


part4-same-audio-mp3.7z.003:   0%|          | 0.00/4.94G [00:00<?, ?B/s]

part4-same-audio-mp3.7z.001


part4-same-audio-mp3.7z.001:   0%|          | 0.00/5.37G [00:00<?, ?B/s]

part4-same-audio-mp3.7z.002


part4-same-audio-mp3.7z.002:   0%|          | 0.00/5.37G [00:00<?, ?B/s]

In [49]:
# Upload every part of the separate-room archive to the dataset repository.
# `glob` returns matches in arbitrary order, so sort them to upload the
# parts in a deterministic .001, .002, ... sequence.
for f in sorted(glob('part4-separate-audio-mp3.7z*')):
    print(f)
    api.upload_file(
        path_or_fileobj=f,
        path_in_repo=f,
        repo_id='mesolitica/IMDA-STT',
        repo_type='dataset',
    )

part4-separate-audio-mp3.7z.001


part4-separate-audio-mp3.7z.001:   0%|          | 0.00/5.37G [00:00<?, ?B/s]

part4-separate-audio-mp3.7z.002


part4-separate-audio-mp3.7z.002:   0%|          | 0.00/5.37G [00:00<?, ?B/s]

part4-separate-audio-mp3.7z.003


part4-separate-audio-mp3.7z.003:   0%|          | 0.00/4.17G [00:00<?, ?B/s]

In [50]:
# Sum the duration, in seconds, of every separate-room recording.
# `sf.info` exposes the frame count and sample rate without loading the
# samples, so each file no longer has to be decoded just to measure it.
total_len = 0
for f in tqdm(glob('part4-separate-audio/*.wav')):
    info = sf.info(f)
    total_len += info.frames / info.samplerate

100%|██████████| 971/971 [05:04<00:00,  3.19it/s]


In [51]:
# Add the duration, in seconds, of every same-room recording to the running
# `total_len`. `sf.info` exposes the frame count and sample rate without
# loading the samples, so each file no longer has to be decoded.
# Re-running this cell alone adds the same-room durations again.
for f in tqdm(glob('part4-same-audio/*.wav')):
    info = sf.info(f)
    total_len += info.frames / info.samplerate

100%|██████████| 1072/1072 [06:08<00:00,  2.91it/s]


In [53]:
# `total_len` is in seconds; convert the combined duration to hours.
total_len / 60 / 60

2133.685638097089

In [54]:
# Free disk space: delete the downloaded source recordings.
# Re-running the clip export or the duration cells afterwards requires downloading them again.
!rm -rf part4-same-audio part4-separate-audio

In [56]:
# Delete the 7z archive parts.
# NOTE(review): the pattern matches every file in the working directory whose
# name contains "7z", not only the archives created by this notebook.
!rm *7z*