In [2]:
import json
import os
import random
from tqdm import tqdm
import pandas as pd

def detect_mandarin(text):
    """Return True if `text` contains at least one CJK ideograph.

    Every character's code point is tested against the inclusive ranges
    listed below; the first hit short-circuits to True. Note that the check
    is script-based (Han characters), so it cannot tell Mandarin apart from
    other languages written with the same ideographs.
    """
    han_ranges = (
        (0x4E00, 0x9FFF),    # CJK Unified Ideographs
        (0x3400, 0x4DBF),    # CJK Unified Ideographs Extension A
        (0x20000, 0x2A6DF),  # Extension B
        (0x2A700, 0x2B73F),  # Extension C
        (0x2B740, 0x2B81F),  # Extension D
        (0x2B820, 0x2CEAF),  # Extension E
        (0x2CEB0, 0x2EBEF),  # Extension F
    )
    return any(
        low <= ord(ch) <= high
        for ch in text
        for low, high in han_ranges
    )

# Pool of instruction prompts; each output row picks one with random.choice.
# With a single entry, every row ends up with the same prompt.
transcribe = ['transcribe the audio']

In [3]:
# Load the Cantonese radio records. The encoding is pinned to UTF-8 so the
# CJK transcripts decode the same way regardless of the platform's locale
# default (open() otherwise falls back to the locale's preferred encoding).
with open('processed-cantonese-radio.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

In [4]:
# Collect one instruction row per usable Cantonese clip.
processed = []
for d in tqdm(data):
    # Swap the '-mp3/' path segment for '-mp3-16k/'
    # (presumably the 16 kHz copy of the audio — confirm).
    new_f = d['audio_filename'].replace('-mp3/', '-mp3-16k/')

    # Keep the row only when that audio file exists on disk and the transcript
    # contains at least one character accepted by detect_mandarin. The `and`
    # short-circuits, so the transcript is only read for files that exist.
    if os.path.exists(new_f) and detect_mandarin(d['transcript_whisper']):
        processed.append({
            'question': random.choice(transcribe),
            'answer': d['transcript_whisper'],
            'audio_filename': new_f,
        })

100%|█████████████████████████████████████████████████████████████████████████| 1096211/1096211 [00:04<00:00, 259592.33it/s]


In [5]:
# Show how many rows passed both the file-existence and the character filter.
len(processed)

1095346

In [6]:
# Draw the subset of rows that gets published. A dedicated seeded generator
# makes the selection reproducible across kernel restarts without touching
# the global `random` state, and the size is clamped so the cell does not
# raise ValueError when fewer than 500000 rows survive filtering.
cantonese = random.Random(42).sample(processed, min(500000, len(processed)))

In [7]:
# Turn the sampled Cantonese rows into a DataFrame and write it out as parquet.
pd.DataFrame(data=cantonese).to_parquet(path='cantonese-stt.parquet')

In [8]:
!huggingface-cli upload mesolitica/Transcription-Instructions \
cantonese-stt.parquet /data/cantonese-00000-of-00001.parquet --repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|██████████████████████████| 123M/123M [00:13<00:00, 9.42MB/s]
https://huggingface.co/datasets/mesolitica/Transcription-Instructions/blob/main//data/cantonese-00000-of-00001.parquet


In [9]:
# Load the extra Emilia Mandarin records. The encoding is pinned to UTF-8 so
# the CJK transcripts decode the same way regardless of the platform's locale
# default (open() otherwise falls back to the locale's preferred encoding).
with open('extra-emilia-mandarin.json', encoding='utf-8') as fopen:
    data = json.load(fopen)

In [10]:
# Collect one instruction row per usable Mandarin clip.
processed = []
for d in tqdm(data):
    # The audio path is used exactly as stored in the record.
    new_f = d['audio_filename']

    # Keep the row only when that audio file exists on disk and the transcript
    # contains at least one character accepted by detect_mandarin. The `and`
    # short-circuits, so the transcript is only read for files that exist.
    if os.path.exists(new_f) and detect_mandarin(d['transcript_whisper']):
        processed.append({
            'question': random.choice(transcribe),
            'answer': d['transcript_whisper'],
            'audio_filename': new_f,
        })

100%|███████████████████████████████████████████████████████████████████████████| 868185/868185 [00:02<00:00, 295710.00it/s]


In [11]:
# Show how many rows passed both the file-existence and the character filter.
len(processed)

865405

In [12]:
# Draw the subset of rows that gets published. A dedicated seeded generator
# makes the selection reproducible across kernel restarts without touching
# the global `random` state, and the size is clamped so the cell does not
# raise ValueError when fewer than 500000 rows survive filtering.
mandarin = random.Random(42).sample(processed, min(500000, len(processed)))

In [13]:
# Turn the sampled Mandarin rows into a DataFrame and write it out as parquet.
pd.DataFrame(data=mandarin).to_parquet(path='mandarin-stt.parquet')

In [14]:
!huggingface-cli upload mesolitica/Transcription-Instructions \
mandarin-stt.parquet /data/mandarin-00000-of-00001.parquet --repo-type=dataset

Uploading files using Xet Storage..
Uploading...: 100%|████████████████████████| 56.8M/56.8M [00:10<00:00, 5.22MB/s]
https://huggingface.co/datasets/mesolitica/Transcription-Instructions/blob/main//data/mandarin-00000-of-00001.parquet
