In [3]:
# from huggingface_hub import snapshot_download

# snapshot_download(
#     repo_type='dataset',
#     repo_id="mesolitica/Sampling-Multitask-National-Speech-Corpus-v1", 
#     allow_patterns='*.zip', local_dir = './')

In [9]:
from huggingface_hub import hf_hub_download

# Fetch the single parquet shard holding the dataset rows; `f` receives the
# value returned by hf_hub_download and is passed to pd.read_parquet later on.
f = hf_hub_download(
    repo_type='dataset',
    repo_id="mesolitica/Sampling-Multitask-National-Speech-Corpus-v1",
    local_dir="./Sampling-Multitask-National-Speech-Corpus-v1",
    filename="data/train-00000-of-00001.parquet",
)

In [4]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """Yield ``(piece, position)`` pairs covering `l` in consecutive slices.

    Each piece is ``l[offset: offset + n]``; the last one may be shorter.
    `position` is the zero-based ordinal of the piece.
    """
    offsets = range(0, len(l), n)
    for position, offset in enumerate(offsets):
        yield l[offset: offset + n], position


def multiprocessing(strings, function, cores=6, returned=True):
    """
    Split `strings` into contiguous chunks and map `function` over them in a
    pool of worker processes.

    Parameters
    ----------
    strings : sequence
        Items to process; must support ``len`` and slicing.
    function : callable
        Called once per chunk with a ``(chunk, chunk_index)`` tuple, as
        yielded by `chunks`. When `returned` is true it must return an
        iterable, because the per-chunk results are concatenated.
    cores : int
        Number of worker processes; also the divisor used to size the chunks.
    returned : bool
        When true, return the per-chunk results flattened into one list;
        otherwise return None.

    Returns
    -------
    list or None
    """
    if len(strings) == 0:
        # Nothing to do: skip spawning workers entirely.
        return [] if returned else None

    # Floor division gives 0 when there are fewer items than cores, and
    # range() inside `chunks` rejects a zero step, so clamp to at least 1.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)

    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the workers even when `function` raised inside one of them.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain.from_iterable(pooled))

def loop(files):
    """Unpack each zip archive of one work chunk into the current directory.

    `files` is a ``(paths, chunk_index)`` pair such as the ones `chunks`
    yields; the index is ignored. Every archive is deleted once it has been
    extracted.
    """
    archive_paths, _ = files
    target_dir = './'
    for archive_path in tqdm(archive_paths):
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(target_dir)
        # Reached only after extractall returned without raising.
        os.remove(archive_path)

# Extract any zip archives found in the working directory, in parallel.
files = glob('*.zip')
if files:
    # At most one worker per archive, capped at 20 processes; the workers
    # return nothing, so the results are not collected.
    multiprocessing(files, loop, returned=False, cores=min(20, len(files)))

In [5]:
from transformers import AutoProcessor



In [6]:
# Load the Qwen2-Audio processor; later cells call its apply_chat_template
# to render conversations into training text.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
# Keep a direct handle on the processor's tokenizer.
tokenizer = processor.tokenizer

In [19]:
import pandas as pd

# NOTE: despite its name, `df` is a list of per-row dicts, not a DataFrame;
# later cells index and slice it as a list.
df = pd.read_parquet(f).to_dict(orient = 'records')
# Show the first record to inspect the available fields.
df[0]

{'instruction': 'What was the reason behind Speaker1 crying during the parent-teacher meeting?',
 'answer': 'Speaker1 cried because their dad scolded them for not reading up, as instructed by their teachers.',
 'audio_filename': 'sampling-audio/SQA-PART3-Train-audio_train-00153-of-00171-0.mp3',
 'start': None,
 'end': None,
 'context': None,
 'system': None,
 'sliced_audio_filename': None}

In [20]:
# Example conversation built from the first record: a fixed system prompt,
# a user turn with one audio item plus the instruction text, and the
# reference answer as the assistant turn.
conversation = [
    {'role': 'system', 'content': 'You are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.'},
    {"role": "user", "content": [
        # "audio.wav" is a fixed placeholder; the rendered text shown in the
        # next cell's output contains only audio tokens, not this path.
        {"type": "audio", "audio_url": "audio.wav"},
        {"type": "text", "text": df[0]['instruction']},
    ]},
    {"role": "assistant", "content": df[0]['answer']},
]

In [21]:
# Render the example conversation to a plain string (tokenize=False) so the
# template output can be inspected.
processor.apply_chat_template(conversation, tokenize=False)

'<|im_start|>system\nYou are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nWhat was the reason behind Speaker1 crying during the parent-teacher meeting?<|im_end|>\n<|im_start|>assistant\nSpeaker1 cried because their dad scolded them for not reading up, as instructed by their teachers.<|im_end|>\n'

In [22]:
import json
import pandas as pd

def loop(rows):
    """
    Render one chunk of dataset rows into chat-template training records.

    Parameters
    ----------
    rows : tuple
        ``(records, chunk_index)``; the index is ignored. Each record is a
        dict read for 'audio_filename', 'instruction' and 'answer'.

    Returns
    -------
    list of dict
        One ``{'text': rendered_conversation, 'audio': audio_filename}``
        entry per row that was kept. Rows are skipped when their audio file
        is absent on disk or when building/rendering the conversation raises;
        skips are reported through ``tqdm.write`` instead of being dropped
        silently.
    """
    rows, _ = rows
    system_prompt = 'You are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.'
    data = []
    missing_audio = 0
    failed = 0
    for r in tqdm(rows):
        audio_path = r['audio_filename']
        # os.path.exists raises TypeError on None / non-path values, which
        # would abort the whole chunk; treat those rows as missing audio.
        if not isinstance(audio_path, (str, bytes, os.PathLike)) or not os.path.exists(audio_path):
            missing_audio += 1
            continue

        try:
            conversation = [
                {'role': 'system', 'content': system_prompt},
                {"role": "user", "content": [
                    {"type": "audio", "audio_url": "audio.wav"},
                    {"type": "text", "text": r['instruction']},
                ]},
                {"role": "assistant", "content": r['answer']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)
        except Exception as e:
            # Best-effort: keep going, but say which row was lost and why.
            failed += 1
            tqdm.write(f'loop: skipping {audio_path!r}: {type(e).__name__}: {e}')
            continue

        data.append({
            'text': text,
            'audio': audio_path,
        })

    if missing_audio or failed:
        tqdm.write(
            f'loop: skipped {missing_audio} row(s) without audio on disk '
            f'and {failed} row(s) that failed to render'
        )
    return data

In [23]:
# Smoke test: run the worker in-process on the last 10 rows. The 0 is the
# chunk index that `loop` unpacks and ignores.
processed = loop((df[-10:], 0))
# Number of rows that were kept.
len(processed)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 4281.21it/s]


10

In [24]:
# Inspect one rendered record ('text' and 'audio' keys).
processed[0]

{'text': "<|im_start|>system\nYou are AI assistant from Infocomm Media Development Authority (IMDA) Singapore.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\nPlease transcribe.<|im_end|>\n<|im_start|>assistant\n<Speaker1>: Previously they also ask Glen and #bi-hun# for a lot of help, Especially Hannah. He will ask #bi-hun# for a lot of help. because #bi-hun# and Hannah. <Speaker2>: But the whole time they were there also [ah] <Speaker1>: Same course, [what] no [lah] they will not [lah] <Speaker2>: Ya [loh] <Speaker1>: I am sure they will did their own stuff. Also like only hand wipe machine, but really a lot of work. I think like Hannah need to do a lot of things ya, Hannah, #Ru-Shin# <Speaker2>: A lot work. <Speaker1>: That's why I say you really will be prepared, because you need to juggle with studies cause you need to go for talks. You don't go get loans. You knew like all this kind of big stuff is all is you do, [eh]<|im_end|>\n",
 'audio': 'sampling-au

In [26]:
# Full run over every row using 30 worker processes; this overwrites the
# 10-row `processed` from the smoke test above.
processed = multiprocessing(df, loop, cores = 30)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5347.86it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5368.13it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5198.79it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 4976.36it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5094.28it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00, 5296.65it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████| 4734/4734 [00:00<00:00

In [27]:
# Write all rendered records out as a single JSON array.
with open('prepare-Sampling-Multitask-National-Speech-Corpus-v1.json', 'w') as fopen:
    json.dump(processed, fopen)