In [1]:
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="mesolitica/CoVoST2-Instruction",
    repo_type='dataset',
    allow_patterns="data/train*.parquet",
    local_dir="./CoVoST2-Instruction",
)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

'/home/mesolitica/stt/CoVoST2-Instruction'

In [2]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    for i in range(0, len(l), n):
        yield (l[i: i + n], i // n)


def multiprocessing(strings, function, cores=6, returned=True):
    df_split = chunks(strings, len(strings) // cores)
    pool = Pool(cores)
    pooled = pool.map(function, df_split)
    pool.close()
    pool.join()

    if returned:
        return list(itertools.chain(*pooled))

In [1]:
from glob import glob
import pandas as pd
import os
import json
from transformers import AutoProcessor



In [4]:
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
tokenizer = processor.tokenizer

In [2]:
rows = pd.read_parquet('CoVoST2-Instruction/data/train-00000-of-00001.parquet').to_dict(orient = 'records')
rows[0]

{'question': 'please transcribe to English',
 'from_language': 'Arabic',
 'to_language': 'English',
 'audio_filename': 'covost-mp3/common_voice_ar_19340854.mp3',
 'answer': 'Who owns this villa.'}

In [7]:
[r for r in rows if ['audio_filename'] == 'covost-mp3/common_voice_en_19982218.mp3']

[]

In [5]:
from collections import defaultdict

count = defaultdict(int)
for r in rows:
    count[r['audio_filename']] += 1

In [6]:
for k, v in count.items():
    if v > 1:
        print(k)
        break

covost-mp3/common_voice_en_19982218.mp3


In [15]:
def loop(rows):
    rows, _ = rows
    data = []
    for r in tqdm(rows):
        if not os.path.exists(r['audio_filename']):
            continue
            
        try:
            conversation = [
                {"role": "user", "content": [
                    {"type": "audio", "audio_url": "audio.wav"},
                    {"type": "text", "text": r['question']},
                ]},
                {"role": "assistant", "content": r['answer']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)
        except Exception as e:
            continue

        data.append({
            'text': text,
            'audio': r['audio_filename'],
        })
    return data

In [16]:
processed = loop((rows[:100], 0))

100%|███████████████████████████████████████████████████████████████████████████████████| 100/100 [00:00<00:00, 3859.10it/s]


In [18]:
processed = multiprocessing(rows, loop, cores = 30)

100%|███████████████████████████████████████████████████████████████████████████████| 35135/35135 [00:06<00:00, 5544.18it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 35135/35135 [00:06<00:00, 5600.10it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 35135/35135 [00:06<00:00, 5505.19it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 35135/35135 [00:06<00:00, 5758.12it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 35135/35135 [00:06<00:00, 5754.27it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 35135/35135 [00:06<00:00, 5648.28it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 35135/35135 [00:06<00:00, 5643.64it/s]
100%|███████████████████████████████████████████████████████████████████████████████| 35135/35135 [00:06<00:00, 5636.73it/s]


In [19]:
len(processed)

1054060

In [20]:
with open('prepare-CoVoST2-Instruction.json', 'w') as fopen:
    json.dump(processed, fopen)