In [1]:
from glob import glob
from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer
import json
import os
import mp
import re
import subprocess
import soundfile as sf

In [2]:
# Gather every transcript JSON from both processed podcast corpora into one list.
files = (
    glob('malaysian-podcast_processed/**/*/*.json', recursive = True)
    + glob('/home/husein/ssd3/sg-podcast_processed/**/*/*.json', recursive = True)
)

len(files)

22492

In [3]:
# Transcripts that `loop` drops outright. Each entry is compared for exact
# equality against the segment text after it has been lowercased and stripped
# of everything except `a-z` and spaces, so entries must already be in that
# normalised form.
rejected = [
    'terima kasih kerana menonton',
    'terima kasih',
    'thank you for watching',
]

In [4]:
# !rm -rf new_chunk
# Create the output directory for the re-cut audio. Unlike a plain `mkdir`,
# this does not fail when the cell is re-run and the directory already exists.
os.makedirs('new_chunk', exist_ok = True)

In [5]:
len(glob('new_chunk/*.mp3'))

0

In [6]:
def new_path(f):
    """
    Derive the alignment file path that corresponds to an audio path.

    Every `.mp3` in the path becomes `.alignment` and every `_processed/`
    becomes `_processed_alignment/`; the rest of the path is left untouched.
    """
    return f.replace('.mp3', '.alignment').replace('_processed/', '_processed_alignment/')

def new_path_lang(f):
    """
    Derive the language file path that corresponds to an audio path.

    Every `.mp3` in the path becomes `.language` and every `_processed/`
    becomes `_processed_language/`; the rest of the path is left untouched.
    """
    return f.replace('.mp3', '.language').replace('_processed/', '_processed_language/')

def chunk(alignment, reject = -15, minimum_length = 3.0):
    """
    Split an alignment into runs of consecutive accepted entries.

    Entries whose `score` is at or below `reject` act as separators and are
    never part of a run. When a separator closes a run, the run's last entry
    is stretched so that its `end` equals the separator's `start`. A run is
    kept only if `end` of its last entry minus `start` of its first entry is
    at least `minimum_length`.

    Parameters
    ----------
    alignment : iterable of dict
        Each dict must provide `score`, `start` and `end`.
    reject : number
        Score threshold; entries with `score <= reject` split the runs.
    minimum_length : number
        Minimum span (same unit as `start` / `end`) for a run to be kept.

    Returns
    -------
    list of list of dict
        The kept runs, in input order. The input dicts are never modified:
        a run's stretched last entry is a shallow copy of the original.
    """
    def _long_enough(run):
        # Span from the first entry's start to the last entry's end.
        return (run[-1]['end'] - run[0]['start']) >= minimum_length

    alls, temp = [], []
    for a in alignment:
        if a['score'] <= reject:
            if temp:
                # Stretch the run up to the rejected entry, but on a copy so
                # the caller's alignment data keeps its original `end` values.
                last = dict(temp[-1])
                last['end'] = a['start']
                temp[-1] = last
                if _long_enough(temp):
                    alls.append(temp)
                temp = []
        else:
            temp.append(a)

    # The trailing run has no separator after it, so it is kept as-is.
    if temp and _long_enough(temp):
        alls.append(temp)
    return alls
        
def loop(files):
    """
    Re-cut podcast segments into shorter clips around badly aligned words.

    Parameters
    ----------
    files : tuple
        `(paths, _)` where `paths` is a list of transcript JSON paths; the
        second element is ignored. Each JSON is iterated as a sequence of
        objects with a `text` key, and segment `i` is expected to have its
        audio at `<folder>/<folder name>_<i>.mp3` next to the JSON.

    Returns
    -------
    list of dict
        One `{'audio': path, 'transcription': text}` entry per clip written
        to (or already present in) `new_chunk/`.

    A segment is skipped when any of the following holds:
    its normalised text equals an entry in `rejected`; any word trigram occurs
    more than 3 times; the trigram vectoriser fails on the text (presumably
    texts too short to contain a trigram - TODO confirm); the audio or the
    alignment file is missing; or no aligned entry scores at or below the
    rejection threshold.
    """
    files, _ = files
    # Single threshold shared by the pre-filter and by `chunk`.
    reject = -15
    data = []
    for file in tqdm(files):
        folder = os.path.split(file)[0]
        folder_folder = os.path.split(folder)[1]

        try:
            with open(file) as fopen:
                d = json.load(fopen)
        except Exception:
            # Unreadable / malformed transcript: skip the whole file.
            # `Exception` (not a bare except) so Ctrl-C still stops the run.
            continue

        for no, obj in enumerate(d):
            text = obj["text"].strip()

            # Compare against `rejected` on lowercase letters and spaces only.
            rt_ = re.sub('[^a-z ]+', '', text.lower()).strip()
            if rt_ in rejected:
                continue

            try:
                dense = CountVectorizer(ngram_range = (3,3)).fit_transform([text]).todense()
                # Any trigram seen more than 3 times marks the text as repetitive.
                repeat = (dense > 3).sum() >= 1
                if repeat:
                    continue
            except Exception:
                continue

            audio_path = os.path.join(folder, f'{folder_folder}_{no}.mp3')

            if not os.path.exists(audio_path):
                continue

            align_path = new_path(audio_path)

            if not os.path.exists(align_path):
                continue

            with open(align_path) as fopen:
                align = json.load(fopen)

            # Only segments with at least one rejected entry are re-cut here.
            if not any(a['score'] <= reject for a in align):
                continue

            chunks = chunk(align, reject = reject)
            if not chunks:
                continue

            y, sr = sf.read(audio_path)
            # Separate index name so the segment index `no` is not clobbered.
            for chunk_no, c in enumerate(chunks):
                try:
                    t = ' '.join([c_['text'] for c_ in c])
                    start = c[0]['start']
                    end = c[-1]['end']
                    a = audio_path.replace('/', '_').replace('.mp3', '')
                    a = os.path.join('new_chunk', f'{a}_{chunk_no}.mp3')
                    if not os.path.exists(a):
                        # start / end are scaled by the sample rate to get sample offsets.
                        sf.write(a, y[int(sr * start): int(sr * end)], sr)

                    data.append({
                        'audio': a,
                        'transcription': t,
                    })
                except Exception:
                    # Best effort: a clip that cannot be written is dropped
                    # and not recorded in `data`.
                    continue

    return data

In [7]:
# Smoke test on the last 100 transcripts. `loop` unpacks a 2-tuple and ignores
# the second element, hence the dummy `0`.
d = loop((files[-100:], 0))

100%|███████████████████████████████████████████████████████████████████████████████████████| 100/100 [00:19<00:00,  5.16it/s]


In [9]:
len(d)

509

In [11]:
d[-1]

{'audio': 'new_chunk/_home_husein_ssd3_sg-podcast_processed_What To Do And Eat In Sydney, Australia [H4y5ydsnMuk]_What To Do And Eat In Sydney, Australia [H4y5ydsnMuk]_0_1.mp3',
 'transcription': 'take a look up and look into the sky. The sky is so'}

In [12]:
import IPython.display as ipd
ipd.Audio(d[-1]['audio'])

In [13]:
# Full run across 15 worker processes. `mp` is a project-local helper;
# presumably it splits `files` into per-worker batches, calls `loop` with a
# (batch, index) tuple for each and concatenates the returned lists - TODO
# confirm against the `mp` module.
data = mp.multiprocessing(files, loop, cores = 15)

100%|█████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [12:36<00:00,  1.98it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [12:37<00:00,  1.98it/s]
100%|██████████████████████████████████████████████████████████████████████████████████████████| 7/7 [00:00<00:00, 341.29it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [12:42<00:00,  1.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [12:45<00:00,  1.96it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [12:48<00:00,  1.95it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [13:07<00:00,  1.90it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████| 1499/1499 [13:10<00

In [18]:
len(data)

192679

In [15]:
from collections import defaultdict

# Count clips per source corpus: everything before `_processed` in the clip
# path identifies the corpus. The loop variable is deliberately not called `d`
# so the smoke-test result `d` from the earlier cell is not overwritten.
uniques = defaultdict(int)
for row in tqdm(data):
    uniques[row['audio'].split('_processed')[0]] += 1
    
uniques

100%|████████████████████████████████████████████████████████████████████████████| 192679/192679 [00:00<00:00, 3202665.87it/s]


defaultdict(int,
            {'new_chunk/malaysian-podcast': 157210,
             'new_chunk/_home_husein_ssd3_sg-podcast': 35469})

In [16]:
import pandas as pd

# One row per clip, with the `audio` and `transcription` keys produced by `loop` as columns.
df = pd.DataFrame(data)
df.head()

Unnamed: 0,audio,transcription
0,new_chunk/malaysian-podcast_processed_3 Teknik...,"Ada satu pepatah dalam MMA ni kata, don't give..."
1,new_chunk/malaysian-podcast_processed_3 Teknik...,"di dalam, you know, sukan grappling ataupun MM..."
2,new_chunk/malaysian-podcast_processed_3 Teknik...,"Tapi yang common orang buat ni biasa, rear nak..."
3,new_chunk/malaysian-podcast_processed_3 Teknik...,"tu berada di belakang apa, posisi, maksudnya b..."
4,new_chunk/malaysian-podcast_processed_3 Teknik...,"lock. So, memang, bahaya lah. Okay, nombor sat..."


In [17]:
# Persist the clip manifest; the `audio` paths point into `new_chunk/`.
df.to_parquet('verify-text-chunk-podcasts.parquet')

In [19]:
!du -hs new_chunk

8.1G	new_chunk


In [20]:
# Bundle the clip directory recursively (-r) and quietly (-q) into one archive.
!zip -rq text-chunk-podcasts.zip new_chunk