In [33]:
import json
import pandas as pd
import random
import mp
import os
from huggingface_hub import snapshot_download
from glob import glob
from tqdm import tqdm
from transformers import AutoProcessor

In [21]:
# Load the Qwen2-Audio processor; its tokenizer supplies the special audio
# tokens and the padding id used when building training examples.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
tokenizer = processor.tokenizer

# Special tokens that mark the audio placeholder and its boundaries in a prompt.
audio_token = "<|AUDIO|>"
audio_bos_token = "<|audio_bos|>"
audio_eos_token = "<|audio_eos|>"

# Resolve the placeholder id through the public tokenizer API instead of the
# private `_convert_token_to_id_with_added_voc` helper, and reuse `audio_token`
# so the literal is defined in exactly one place.
audio_token_id = tokenizer.convert_tokens_to_ids(audio_token)
pad_token_id = tokenizer.pad_token_id

preprocessor_config.json:   0%|          | 0.00/342 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/638k [00:00<?, ?B/s]

vocab.json:   0%|          | 0.00/2.78M [00:00<?, ?B/s]

merges.txt:   0%|          | 0.00/1.67M [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/7.03M [00:00<?, ?B/s]

In [14]:
# Instruction prompts paired with each audio clip. One list per target output
# format; English and Malay phrasings are mixed on purpose.

# Prompts asking for word-level Whisper timestamps.
templates_word = [
    "transcribe the audio into Whisper format in word timestamp",
    "audio to Whisper ASR format word timestamp",
    "transkrip audio ke format word timestamp whisper",
]

# Prompts asking for segment-level Whisper timestamps.
templates_segment = [
    "transcribe the audio into Whisper format in segment timestamp",
    "audio to Whisper ASR format segment timestamp",
    "transkrip audio ke format segment timestamp whisper",
]

# Prompts that request a transcription without naming a timestamp format.
templates_empty = [
    "transcribe the audio",
    "transkrip audio",
]

In [8]:
# Pull only the parquet shards of the dataset repo into the current directory;
# the call's return value is left as the cell output.
snapshot_download(
    repo_id='malaysia-AI/STT-Whisper',
    repo_type='dataset',
    allow_patterns='data/*.parquet',
    local_dir='./',
    max_workers=10,
)

Fetching 9 files:   0%|          | 0/9 [00:00<?, ?it/s]

'/home/husein/ssd3/cooking/qwen2audio/sft'

In [10]:
# Collect the downloaded parquet shards, leaving out any whose path mentions
# 'synthetic', then display the resulting list.
files = [path for path in glob('data/*.parquet') if 'synthetic' not in path]
files

['data/tatabahasa_word-00000-of-00001.parquet',
 'data/dialects_word-00000-of-00001.parquet',
 'data/mallm_segment-00000-of-00001.parquet',
 'data/tatabahasa_segment-00000-of-00001.parquet',
 'data/malaysian_speech_instructions_segment_timestamp-00000-of-00001.parquet',
 'data/mallm_word-00000-of-00001.parquet',
 'data/dialects_segment-00000-of-00001.parquet',
 'data/malaysian_speech_instructions_word_timestamp-00000-of-00001.parquet']

In [36]:
def loop(files):
    """
    Turn parquet shards into chat-formatted transcription training rows.

    Parameters
    ----------
    files : tuple
        A two-element tuple whose first element is an iterable of parquet
        paths; the second element is ignored.

    Returns
    -------
    list of dict
        One dict per sampled row with keys ``'text'`` (the conversation
        rendered through the processor's chat template, not tokenized) and
        ``'audio'`` (the audio path joined onto a local root directory).
    """
    files, _ = files

    # Upper bound on rows taken from a single parquet file.
    max_rows_per_file = 100000
    # Root directories that the relative `audio_filename` values are joined to.
    dialects_root = '/home/husein/ssd3'
    default_root = '/home/husein/ssd3/dataset/speech-instructions'

    data = []
    for parquet_path in files:
        # Decide on the file name only, so a parent directory that happens to
        # contain 'word' cannot flip the choice.
        is_word_level = 'word' in os.path.basename(parquet_path)
        templates = templates_word if is_word_level else templates_segment

        rows = pd.read_parquet(parquet_path).to_dict(orient = 'records')
        rows = random.sample(rows, min(len(rows), max_rows_per_file))
        for r in tqdm(rows):
            audio_filename = r['audio_filename']
            # Draw the instruction per row: drawing once per file would give
            # every row of a dataset the exact same prompt.
            template = random.choice(templates)
            conversation = [
                {"role": "user", "content": [
                    {"type": "audio", "audio_url": audio_filename},
                    {"type": "text", "text": template},
                ]},
                {"role": "assistant", "content": r['new_text']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)

            # Kept in its own variable so the outer loop's path is never
            # overwritten.
            if 'dialects_processed' in audio_filename:
                audio_path = os.path.join(dialects_root, audio_filename)
            else:
                audio_path = os.path.join(default_root, audio_filename)

            data.append({
                'text': text,
                'audio': audio_path,
            })
    return data

In [37]:
# Fan the parquet files out to `loop` using as many cores as there are files.
# NOTE(review): `mp` is a project-local helper; presumably it hands each worker
# a (chunk, index) tuple and concatenates the returned lists; confirm.
data = mp.multiprocessing(
    files,
    loop,
    cores=len(files),
)

100%|███████████████████████████████████████████████████████████████████████████████████| 671/671 [00:00<00:00, 3780.96it/s]
 61%|█████████████████████████████████████████████████▍                               | 1477/2422 [00:00<00:00, 7473.96it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 2422/2422 [00:00<00:00, 7602.97it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 2422/2422 [00:00<00:00, 7119.64it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:11<00:00, 8947.50it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:11<00:00, 8996.12it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:11<00:00, 8933.43it/s]
100%|█████████████████████████████████████████████████████████████████████████████| 100000/100000 [00:11<00:00, 8820.52it/s]


In [39]:
# Sanity check: number of examples gathered from all workers.
len(data)

406186

In [40]:
# Spot-check one rendered example; the index is arbitrary and requires that
# `data` holds more than 100000 items.
data[100000]

{'text': "<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ntranscribe the audio into Whisper format in word timestamp<|im_end|>\n<|im_start|>assistant\n<|startoftranscript|><|ms|><|transcribeprecise|><|0.16|> Uh,<|0.18|><|0.32|> and<|0.42|><|0.48|> which<|0.62|><|0.68|> is<|0.76|><|0.88|> why<|1.18|><|1.60|> we,<|1.68|><|2.30|> we,<|2.38|><|2.66|> and<|2.74|><|2.82|> we've<|2.96|><|3.04|> done<|3.16|><|3.20|> this<|3.32|><|3.36|> to<|3.40|><|3.50|> ourselves.<|4.08|><|4.18|> We've,<|4.44|><|4.68|> you<|4.72|><|4.76|> know,<|4.82|><|5.16|> highly<|5.46|><|5.56|> recommend<|5.96|><|6.00|> putting<|6.22|><|6.28|> a<|6.28|><|6.36|> governance<|6.82|><|7.00|> mechanism<|7.48|><|7.56|> in<|7.58|><|7.68|> process<|8.20|><|8.74|> when<|8.82|><|8.88|> you're<|9.00|><|9.04|> deploying<|9.46|><|9.52|> general<|9.82|><|9.98|> AI<|10.04|><|10.12|> solutions<|10.54|><|10.60|> to<|10.64|><|10.74|> manage<|11.02|><|11.06|> the<

In [42]:
# Persist every prepared example as a single JSON array.
with open('prepare-transcription-instructions.json', 'w') as out_file:
    json.dump(data, out_file)