In [1]:
from huggingface_hub import snapshot_download

# Fetch only the parquet shards under data/ from the dataset repository into a
# local folder next to the notebook. `f` holds whatever snapshot_download
# returns (presumably the local snapshot path -- confirm against the
# huggingface_hub documentation).
f = snapshot_download(
    repo_id="mesolitica/Emilia-Mandarin-Description-Instructions",
    repo_type='dataset',
    allow_patterns="data/*.parquet",
    local_dir="./Emilia-Mandarin-Description-Instructions")

Fetching 2 files:   0%|          | 0/2 [00:00<?, ?it/s]

data/train-00000-of-00002.parquet:   0%|          | 0.00/159M [00:00<?, ?B/s]

data/train-00001-of-00002.parquet:   0%|          | 0.00/160M [00:00<?, ?B/s]

In [2]:
from glob import glob
from tqdm import tqdm
from multiprocess import Pool
import itertools
import zipfile
import os

def chunks(l, n):
    """Yield consecutive slices of `l` together with their position.

    Every yielded item is a tuple `(piece, index)`: `piece` is the slice
    `l[start:start + n]` and `index` numbers the pieces starting at 0.
    The final piece is shorter than `n` when `len(l)` is not a multiple of `n`.
    """
    for index, start in enumerate(range(0, len(l), n)):
        stop = start + n
        yield (l[start:stop], index)


def multiprocessing(strings, function, cores=6, returned=True):
    """Apply `function` to slices of `strings` in a pool of worker processes.

    Parameters
    ----------
    strings : sequence
        Items to process. It is cut into slices of `len(strings) // cores`
        items (at least 1), so there may be more slices than `cores` when the
        length is not an exact multiple.
    function : callable
        Called once per slice with a `(slice, slice_index)` tuple, as yielded
        by `chunks`. Must return an iterable when `returned` is true.
    cores : int
        Number of worker processes in the pool.
    returned : bool
        When true, the per-slice results are concatenated in input order and
        returned as one list; otherwise nothing is returned.

    Returns
    -------
    list or None
    """
    # Floor division yields 0 when there are fewer items than cores (or no
    # items at all); a step of 0 makes range() inside chunks() raise
    # ValueError, so never go below one item per slice.
    chunk_size = max(1, len(strings) // cores)
    df_split = chunks(strings, chunk_size)
    pool = Pool(cores)
    try:
        pooled = pool.map(function, df_split)
    finally:
        # Release the worker processes even when one of the slices raised.
        pool.close()
        pool.join()

    if returned:
        return list(itertools.chain(*pooled))

def loop(files):
    """Extract each zip archive of one chunk into the current directory, then delete it.

    `files` is a `(paths, chunk_index)` tuple as yielded by `chunks`; the
    index is ignored.

    NOTE: a later cell defines another function named `loop`, which replaces
    this one in the kernel namespace once that cell has run.
    """
    archive_paths, _ = files
    destination_folder = './'
    for archive_path in tqdm(archive_paths):
        with zipfile.ZipFile(archive_path, 'r') as archive:
            archive.extractall(destination_folder)
        # Reached only when extraction did not raise; the archive is then removed.
        os.remove(archive_path)

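# Optional step, currently disabled: extract (and then delete) every *.zip
# archive in the working directory in parallel, using the zip-extracting
# `loop` defined in this cell.
# files = glob('*.zip')
# if len(files):
#     multiprocessing(files, loop, cores = min(len(files), 20), returned = False)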

In [3]:
from transformers import AutoProcessor



In [4]:
# Load the Qwen2-Audio processor; its chat template is what a later cell uses
# to render each conversation to text. The tokenizer is also exposed under its
# own name.
processor = AutoProcessor.from_pretrained("Qwen/Qwen2-Audio-7B-Instruct")
tokenizer = processor.tokenizer

In [5]:
from glob import glob
import pandas as pd

# Collect every record of every parquet shard into one flat list of dicts.
rows = []

# glob() returns paths in a file-system dependent order; sort them so the row
# order (and therefore the exported JSON) is the same on every run.
files = sorted(glob('Emilia-Mandarin-Description-Instructions/data/*.parquet'))
for f in files:
    # One dict per row, keyed by column name.
    df = pd.read_parquet(f).to_dict(orient = 'records')
    rows.extend(df)

In [6]:
# Sanity check: total number of records loaded from the parquet shards.
len(rows)

292649

In [7]:
import json
import pandas as pd

def loop(rows):
    """Render one chunk of dataset records into chat-template training examples.

    Parameters
    ----------
    rows : tuple
        `(records, chunk_index)` as yielded by `chunks`; the index is ignored.
        Each record is read through its 'audio_filename', 'question' and
        'answer' keys.

    Returns
    -------
    list of dict
        One `{'text': ..., 'audio': ...}` entry per kept record, in input
        order. Records whose audio file does not exist on disk, or whose
        template rendering raised, are left out; a warning summarising the
        skipped records is logged instead of dropping them silently.
    """
    import logging

    logger = logging.getLogger(__name__)

    records, _ = rows
    data = []
    missing_audio = 0
    failed = 0
    first_error = None
    for r in tqdm(records):
        f = r['audio_filename']
        if not os.path.exists(f):
            missing_audio += 1
            continue

        try:
            conversation = [
                {"role": "user", "content": [
                    # "audio.wav" appears to be a placeholder only; the real
                    # path is stored in the 'audio' field of the result.
                    {"type": "audio", "audio_url": "audio.wav"},
                    {"type": "text", "text": r['question']},
                ]},
                {"role": "assistant", "content": r['answer']},
            ]
            text = processor.apply_chat_template(conversation, tokenize=False)
        except Exception as e:
            # Best effort: a record that cannot be rendered is skipped, but it
            # is counted so the loss is visible in the summary below.
            failed += 1
            if first_error is None:
                first_error = repr(e)
            continue

        data.append({
            'text': text,
            'audio': f,
        })

    if missing_audio or failed:
        logger.warning(
            'skipped %d record(s) with missing audio and %d record(s) that failed '
            'to render (first error: %s)',
            missing_audio, failed, first_error,
        )
    return data

In [8]:
# Render all records in parallel with 30 worker processes; the per-chunk
# results are flattened into a single list.
processed = multiprocessing(rows, loop, cores = 30)

100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5578.95it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5599.02it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5621.83it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5612.75it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5575.41it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5591.80it/s]
100%|█████████████████████████████████████████████████████████████████████████████████| 9754/9754 [00:01<00:00, 5585.33it/s]
 73%|███████████████████████████████████████████████████████████                      | 7115/9754 [00:01<00:00, 5387.39it/s]


In [9]:
# Number of rendered examples; comparing it with len(rows) shows how many
# records were skipped (none in the recorded run).
len(processed)

292649

In [10]:
# Persist all rendered examples as one JSON array in the working directory.
with open('prepare-Emilia-Mandarin-Description-Instructions.json', 'w') as fopen:
    json.dump(processed, fopen)

In [11]:
# Inspect the first rendered example to check the chat-template output.
processed[0]

{'text': '<|im_start|>system\nYou are a helpful assistant.<|im_end|>\n<|im_start|>user\nAudio 1: <|audio_bos|><|AUDIO|><|audio_eos|>\ncan you describe the audio<|im_end|>\n<|im_start|>assistant\nThe speaker is discussing the role of Baidu, a prominent Chinese internet company, in collecting user data through its products. The statement highlights how Baidu can directly access and utilize user information and habits through the various products it offers.\n\nHere’s a breakdown of the key points:\n\n1. **Products as Data Collection Tools**: The phrase "依靠这些产品" (relying on these products) suggests that Baidu\'s products are the primary means through which the company gathers user data. These products could include search engines, maps, news apps, and other services that users interact with regularly.\n\n2. **Direct Access to User Information**: "可以直接掌握用户资讯" (can directly grasp user information) indicates that Baidu has immediate and direct access to the data generated by users when they u