In [10]:
from datasets import get_dataset_config_names,load_dataset
from itertools import islice
import pyarrow as pa
import pyarrow.parquet as pq

configs = get_dataset_config_names("fixie-ai/covost2")
print("Available Configurations:")
print(configs,len(configs))

README.md:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

Available Configurations:
['ar_en', 'ca_en', 'cy_en', 'de_en', 'en_ar', 'en_ca', 'en_cy', 'en_de', 'en_et', 'en_fa', 'en_id', 'en_ja', 'en_lv', 'en_mn', 'en_sl', 'en_sv-SE', 'en_ta', 'en_tr', 'en_zh-CN', 'es_en', 'et_en', 'fa_en', 'fr_en', 'id_en', 'it_en', 'ja_en', 'lv_en', 'mn_en', 'nl_en', 'pt_en', 'ru_en', 'sl_en', 'sv-SE_en', 'ta_en', 'tr_en', 'zh-CN_en'] 36


In [9]:
import os
import pyarrow as pa
import pyarrow.parquet as pq
from tqdm import tqdm
from itertools import islice
import soundfile as sf
import pandas as pd
from concurrent.futures import ThreadPoolExecutor

In [11]:
parquet_dir = 'covost2'
mp3_dir = 'covost-mp3'

os.makedirs(parquet_dir, exist_ok=True)
os.makedirs(mp3_dir, exist_ok=True)

def save_audio_file(example):
    try:
        audio_info = example.get('audio')
        if audio_info:
            audio_path = os.path.join(mp3_dir, audio_info['path'])
            sf.write(audio_path, audio_info['array'], audio_info['sampling_rate'], format='MP3')
        return True
    except Exception as e:
        print(f"Error saving audio: {e}")
        return False

def process_split(split):
    try:
        streamed_ds = load_dataset("fixie-ai/covost2", split, streaming=True)['train']
        
        batch = []
        count = 0
        with ThreadPoolExecutor() as executor:
            for example in islice(streamed_ds, 51_000):
                filtered_example = {
                    'file': example.get('audio')['path'],
                    'sentence': example.get('sentence'),
                    'translation': example.get('translation'),
                }
                batch.append(filtered_example)
                executor.submit(save_audio_file, example)
                count += 1

        if not batch:
            print(f"No valid data found in {split}.")
            return

        table = pa.Table.from_pandas(pd.DataFrame(batch))
        save_path = os.path.join(parquet_dir, f"covost2_{split}.parquet")
        pq.write_table(table, save_path)
        print(f"Saved {len(batch)} samples to {save_path}")

    except Exception as e:
        print(f"Error processing {split}: {e}")

for split in tqdm(configs):
    process_split(split)

  3%|███                                                                                                         | 1/36 [00:36<21:29, 36.85s/it]

Saved 2283 samples to covost2/covost2_ar_en.parquet


  6%|█████▊                                                                                                   | 2/36 [05:23<1:44:16, 184.02s/it]

Saved 51000 samples to covost2/covost2_ca_en.parquet


  8%|████████▊                                                                                                | 3/36 [05:48<1:01:17, 111.45s/it]

Saved 1241 samples to covost2/covost2_cy_en.parquet


 11%|███████████▋                                                                                             | 4/36 [10:27<1:34:41, 177.56s/it]

Saved 51000 samples to covost2/covost2_de_en.parquet


 14%|██████████████▌                                                                                          | 5/36 [15:29<1:54:48, 222.23s/it]

Saved 51000 samples to covost2/covost2_en_ar.parquet


 17%|█████████████████▌                                                                                       | 6/36 [20:30<2:04:33, 249.13s/it]

Saved 51000 samples to covost2/covost2_en_ca.parquet


 19%|████████████████████▍                                                                                    | 7/36 [25:34<2:08:59, 266.87s/it]

Saved 51000 samples to covost2/covost2_en_cy.parquet


 22%|███████████████████████▎                                                                                 | 8/36 [30:41<2:10:31, 279.70s/it]

Saved 51000 samples to covost2/covost2_en_de.parquet


 25%|██████████████████████████▎                                                                              | 9/36 [35:57<2:11:00, 291.11s/it]

Saved 51000 samples to covost2/covost2_en_et.parquet


 28%|████████████████████████████▉                                                                           | 10/36 [41:08<2:08:45, 297.14s/it]

Saved 51000 samples to covost2/covost2_en_fa.parquet


 31%|███████████████████████████████▊                                                                        | 11/36 [46:22<2:05:59, 302.38s/it]

Saved 51000 samples to covost2/covost2_en_id.parquet


 33%|██████████████████████████████████▋                                                                     | 12/36 [51:25<2:01:04, 302.70s/it]

Saved 51000 samples to covost2/covost2_en_ja.parquet


 36%|█████████████████████████████████████▌                                                                  | 13/36 [57:06<2:00:27, 314.23s/it]

Saved 51000 samples to covost2/covost2_en_lv.parquet


README.md:   0%|          | 0.00/26.8k [00:00<?, ?B/s]

 39%|███████████████████████████████████████▋                                                              | 14/36 [1:02:16<1:54:42, 312.85s/it]

Saved 51000 samples to covost2/covost2_en_mn.parquet


 42%|██████████████████████████████████████████▌                                                           | 15/36 [1:07:15<1:48:01, 308.63s/it]

Saved 51000 samples to covost2/covost2_en_sl.parquet


 44%|█████████████████████████████████████████████▎                                                        | 16/36 [1:12:14<1:41:56, 305.84s/it]

Saved 51000 samples to covost2/covost2_en_sv-SE.parquet


 47%|████████████████████████████████████████████████▏                                                     | 17/36 [1:17:17<1:36:33, 304.91s/it]

Saved 51000 samples to covost2/covost2_en_ta.parquet


 50%|███████████████████████████████████████████████████                                                   | 18/36 [1:22:21<1:31:27, 304.86s/it]

Saved 51000 samples to covost2/covost2_en_tr.parquet


 53%|█████████████████████████████████████████████████████▊                                                | 19/36 [1:27:23<1:26:07, 303.96s/it]

Saved 51000 samples to covost2/covost2_en_zh-CN.parquet


 56%|████████████████████████████████████████████████████████▋                                             | 20/36 [1:30:23<1:11:08, 266.77s/it]

Error processing es_en: Error opening <_io.BytesIO object at 0x7f38c4a649a0>: Format not recognised.


 58%|████████████████████████████████████████████████████████████▋                                           | 21/36 [1:30:51<48:46, 195.11s/it]

Saved 1782 samples to covost2/covost2_et_en.parquet


 61%|███████████████████████████████████████████████████████████████▌                                        | 22/36 [1:34:25<46:51, 200.80s/it]

Saved 51000 samples to covost2/covost2_fa_en.parquet


 64%|██████████████████████████████████████████████████████████████████▍                                     | 23/36 [1:38:56<48:01, 221.66s/it]

Saved 51000 samples to covost2/covost2_fr_en.parquet


 67%|█████████████████████████████████████████████████████████████████████▎                                  | 24/36 [1:39:16<32:14, 161.17s/it]

Saved 1243 samples to covost2/covost2_id_en.parquet


 69%|████████████████████████████████████████████████████████████████████████▏                               | 25/36 [1:42:20<30:48, 168.02s/it]

Saved 31698 samples to covost2/covost2_it_en.parquet


 72%|███████████████████████████████████████████████████████████████████████████                             | 26/36 [1:42:41<20:39, 123.95s/it]

Saved 1119 samples to covost2/covost2_ja_en.parquet


 75%|██████████████████████████████████████████████████████████████████████████████▊                          | 27/36 [1:43:08<14:14, 94.93s/it]

Saved 2337 samples to covost2/covost2_lv_en.parquet


 78%|█████████████████████████████████████████████████████████████████████████████████▋                       | 28/36 [1:43:34<09:52, 74.06s/it]

Saved 2067 samples to covost2/covost2_mn_en.parquet


 81%|████████████████████████████████████████████████████████████████████████████████████▌                    | 29/36 [1:44:20<07:39, 65.66s/it]

Saved 7108 samples to covost2/covost2_nl_en.parquet


 83%|███████████████████████████████████████████████████████████████████████████████████████▌                 | 30/36 [1:45:15<06:15, 62.53s/it]

Saved 9158 samples to covost2/covost2_pt_en.parquet


 86%|██████████████████████████████████████████████████████████████████████████████████████████▍              | 31/36 [1:46:42<05:48, 69.80s/it]

Saved 12112 samples to covost2/covost2_ru_en.parquet


 89%|█████████████████████████████████████████████████████████████████████████████████████████████▎           | 32/36 [1:47:06<03:44, 56.06s/it]

Saved 1843 samples to covost2/covost2_sl_en.parquet


 92%|████████████████████████████████████████████████████████████████████████████████████████████████▎        | 33/36 [1:47:28<02:17, 45.93s/it]

Saved 2160 samples to covost2/covost2_sv-SE_en.parquet


 94%|███████████████████████████████████████████████████████████████████████████████████████████████████▏     | 34/36 [1:47:48<01:16, 38.13s/it]

Saved 1358 samples to covost2/covost2_ta_en.parquet


 97%|██████████████████████████████████████████████████████████████████████████████████████████████████████   | 35/36 [1:48:19<00:35, 35.98s/it]

Saved 3966 samples to covost2/covost2_tr_en.parquet


100%|████████████████████████████████████████████████████████████████████████████████████████████████████████| 36/36 [1:49:26<00:00, 182.40s/it]

Saved 7085 samples to covost2/covost2_zh-CN_en.parquet





In [1]:
from glob import glob

files = sorted(glob('covost2/*.parquet'))
files

['covost2/covost2_ar_en.parquet',
 'covost2/covost2_ca_en.parquet',
 'covost2/covost2_cy_en.parquet',
 'covost2/covost2_de_en.parquet',
 'covost2/covost2_en_ar.parquet',
 'covost2/covost2_en_ca.parquet',
 'covost2/covost2_en_cy.parquet',
 'covost2/covost2_en_de.parquet',
 'covost2/covost2_en_et.parquet',
 'covost2/covost2_en_fa.parquet',
 'covost2/covost2_en_id.parquet',
 'covost2/covost2_en_ja.parquet',
 'covost2/covost2_en_lv.parquet',
 'covost2/covost2_en_mn.parquet',
 'covost2/covost2_en_sl.parquet',
 'covost2/covost2_en_sv-SE.parquet',
 'covost2/covost2_en_ta.parquet',
 'covost2/covost2_en_tr.parquet',
 'covost2/covost2_en_zh-CN.parquet',
 'covost2/covost2_et_en.parquet',
 'covost2/covost2_fa_en.parquet',
 'covost2/covost2_fr_en.parquet',
 'covost2/covost2_id_en.parquet',
 'covost2/covost2_it_en.parquet',
 'covost2/covost2_ja_en.parquet',
 'covost2/covost2_lv_en.parquet',
 'covost2/covost2_mn_en.parquet',
 'covost2/covost2_nl_en.parquet',
 'covost2/covost2_pt_en.parquet',
 'covost

In [2]:
import pandas as pd
import random
from sklearn.model_selection import train_test_split
from langcodes import *



In [3]:
question = [
    'please transcribe to {lang}',
    'transcribe the audio in {lang}',
    'translate the narrative to {lang}',
]

In [5]:
df = pd.read_parquet(files[0]).to_dict(orient = 'records')

In [11]:
trains, tests = [], []

for f in tqdm(files):
    to_lang = f.split('_')[-1].split('.')[0]
    from_lang = f.split('_')[1]
    from_lang = Language.get(from_lang).display_name()
    to_lang = Language.get(to_lang).display_name()
    df = pd.read_parquet(f).to_dict(orient = 'records')
    train, test = train_test_split(df, test_size = 100)
    for t in train:
        audio_filename = os.path.join('covost-mp3', t['file'])
        if not os.path.exists(audio_filename):
            continue
        trains.append({
            'question': random.choice(question).format(lang = to_lang),
            'from_language': from_lang,
            'to_language': to_lang,
            'audio_filename': audio_filename,
            'answer': t['translation']
        })
    for t in test:
        audio_filename = os.path.join('covost-mp3', t['file'])
        if not os.path.exists(audio_filename):
            continue
        tests.append({
            'question': random.choice(question).format(lang = to_lang),
            'from_language': from_lang,
            'to_language': to_lang,
            'audio_filename': audio_filename,
            'answer': t['translation']
        })

100%|███████████████████████████████████████████████████████████████████████████████████████| 35/35 [00:07<00:00,  4.38it/s]


In [12]:
len(trains), len(tests)

(1054060, 3500)

In [13]:
from datasets import Dataset

dataset = Dataset.from_list(trains)

In [16]:
dataset.push_to_hub('mesolitica/CoVoST2-Instruction', split = 'train')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/1055 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/64.1M [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction/commit/83aa36186609bc30c4e1e4303e3069e1ec1e54c4', commit_message='Upload dataset', commit_description='', oid='83aa36186609bc30c4e1e4303e3069e1ec1e54c4', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CoVoST2-Instruction'), pr_revision=None, pr_num=None)

In [17]:
dataset = Dataset.from_list(tests)

In [18]:
dataset.push_to_hub('mesolitica/CoVoST2-Instruction', split = 'test')

Uploading the dataset shards:   0%|          | 0/1 [00:00<?, ?it/s]

Creating parquet from Arrow format:   0%|          | 0/4 [00:00<?, ?ba/s]

Uploading...:   0%|          | 0.00/208k [00:00<?, ?B/s]

README.md:   0%|          | 0.00/446 [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction/commit/4a16a036634940551fa2b89aafbc7d8aed37c7d6', commit_message='Upload dataset', commit_description='', oid='4a16a036634940551fa2b89aafbc7d8aed37c7d6', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CoVoST2-Instruction'), pr_revision=None, pr_num=None)

In [87]:
!du -hs covost-mp3

9.8G	covost-mp3


In [89]:
import zipfile
import os

def zip_folder(folder_path, output_zip_path):
    with zipfile.ZipFile(output_zip_path, 'w', zipfile.ZIP_DEFLATED) as zipf:
        for root, dirs, files in os.walk(folder_path):
            for file in files:
                file_path = os.path.join(root, file)
                arcname = os.path.relpath(file_path, start=folder_path)
                zipf.write(file_path, arcname)

zip_folder('covost-mp3', 'covost-mp3.zip')

In [90]:
from huggingface_hub import HfApi
api = HfApi()

api.upload_file(
    path_or_fileobj="covost-mp3.zip",
    path_in_repo="covost-mp3.zip",
    repo_id="mesolitica/CoVoST2-Instruction",
    repo_type="dataset",
)

Uploading...:   0%|          | 0.00/9.48G [00:00<?, ?B/s]

CommitInfo(commit_url='https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction/commit/6f1bf159d655b28a25f82f0f7ef1dde4e4160a43', commit_message='Upload covost-mp3.zip with huggingface_hub', commit_description='', oid='6f1bf159d655b28a25f82f0f7ef1dde4e4160a43', pr_url=None, repo_url=RepoUrl('https://huggingface.co/datasets/mesolitica/CoVoST2-Instruction', endpoint='https://huggingface.co', repo_type='dataset', repo_id='mesolitica/CoVoST2-Instruction'), pr_revision=None, pr_num=None)