In [None]:
!pip install datasets

In [None]:
# Set git's global credential helper to `store`, then log in through the
# Hugging Face command-line client.
# NOTE(review): this looks redundant with the `notebook_login()` cell that
# follows — confirm whether both are needed. The `store` helper presumably keeps
# the token on disk; verify that is acceptable on shared machines.
!git config --global credential.helper store
!huggingface-cli login

In [1]:
# Authenticate this notebook session with the Hugging Face Hub; the call
# renders an interactive login widget as the cell output.
from huggingface_hub import notebook_login
notebook_login()

VBox(children=(HTML(value='<center> <img\nsrc=https://huggingface.co/front/assets/huggingface_logo-noborder.sv…

In [2]:
import os
from typing import List, Dict
from datasets import load_dataset, Dataset, Audio
from huggingface_hub import HfApi, create_repo

In [3]:
def create_dataset_entries(filtered_dataset) -> List[Dict]:
    """Build one upload-ready record per row of ``filtered_dataset``.

    Every record gets a sequential ``line_id`` (``SW0000``, ``SW0001``, ...)
    derived from the row's position, and carries the row's ``path``,
    ``sentence`` and ``client_id`` values re-keyed as ``audio``, ``text`` and
    ``speaker_id`` respectively.

    Args:
        filtered_dataset: Iterable of mappings, each providing the keys
            ``path``, ``sentence`` and ``client_id``.

    Returns:
        A list of dicts, in the same order as the input rows. Empty input
        yields an empty list.
    """
    return [
        {
            'line_id': f"SW{position:04d}",
            'audio': row['path'],
            'text': row['sentence'],
            'speaker_id': row['client_id'],
        }
        for position, row in enumerate(filtered_dataset)
    ]

def upload_to_huggingface(dataset: Dataset, repo_id: str) -> None:
    """Upload the dataset to Hugging Face.

    Makes sure a dataset repository named ``repo_id`` exists on the Hub and
    then pushes ``dataset`` to it, printing a short status line after each
    step.

    Args:
        dataset: The dataset to publish; its ``push_to_hub`` method is called.
        repo_id: Repository identifier passed to both ``create_repo`` and
            ``push_to_hub``.

    Raises:
        Exception: Any error raised by ``create_repo`` or ``push_to_hub``
            (e.g. authentication or network failures) is propagated to the
            caller instead of being swallowed.
    """
    # exist_ok=True turns "repository already exists" into a no-op, so the only
    # errors that surface here are genuine failures. Previously every exception
    # was caught and reported with the same ambiguous message.
    create_repo(repo_id=repo_id, repo_type="dataset", exist_ok=True)
    print(f"Repository ready: {repo_id}")

    dataset.push_to_hub(repo_id)
    print("Dataset uploaded successfully!")

In [4]:
def create_and_upload_dataset(repo_id: str, client_id: str, sampling_rate: int = 48000) -> None:
    """
    Create a dataset from the Mozilla Common Voice dataset for a specific client_id and upload it to Hugging Face.

    Loads the ``train`` split of the Swahili (``"sw"``) configuration of
    ``mozilla-foundation/common_voice_17_0``, keeps only the rows whose
    ``client_id`` equals the given one, converts them with
    ``create_dataset_entries`` into ``line_id`` / ``audio`` / ``text`` /
    ``speaker_id`` columns, casts ``audio`` to an ``Audio`` feature and hands
    the result to ``upload_to_huggingface``.

    Args:
        repo_id: Hub repository identifier the dataset is uploaded to.
        client_id: Common Voice ``client_id`` value selecting the speaker.
        sampling_rate: Sampling rate given to the ``Audio`` feature of the
            uploaded dataset. Defaults to 48000.

    Raises:
        ValueError: If no row of the train split matches ``client_id``.
    """
    # Only the train split is ever used below, so request just that split
    # instead of filtering every split of the DatasetDict.
    train_split = load_dataset("mozilla-foundation/common_voice_17_0", "sw", split="train")

    # input_columns restricts what is handed to the predicate to the client_id
    # column, so the other columns (notably the audio) need not be materialised
    # for every row just to compare an ID.
    speaker_rows = train_split.filter(
        lambda row_client_id: row_client_id == client_id,
        input_columns=["client_id"],
    )

    # Print the number of rows after filtering
    num_rows = len(speaker_rows)
    print(f"Number of rows for client ID {client_id}: {num_rows}")

    # Fail early with a clear message rather than building and pushing an
    # empty dataset.
    if num_rows == 0:
        raise ValueError(f"No rows found in the train split for client ID {client_id}")

    # create_dataset_entries only reads these three keys; dropping the other
    # columns first keeps the row iteration from touching the audio column.
    data = create_dataset_entries(
        speaker_rows.select_columns(["path", "sentence", "client_id"])
    )

    # Pivot the list of records into the column-oriented mapping expected by
    # Dataset.from_dict.
    column_names = ('line_id', 'audio', 'text', 'speaker_id')
    dataset = Dataset.from_dict({
        column: [entry[column] for entry in data] for column in column_names
    })

    # The audio column holds file paths at this point; casting it to Audio
    # makes it an audio feature with the requested sampling rate.
    dataset = dataset.cast_column("audio", Audio(sampling_rate=sampling_rate))

    # Upload to Hugging Face
    upload_to_huggingface(dataset, repo_id)

In [None]:
# Speaker selection for this run. The first hash is an alternative speaker that
# is kept for reference but disabled; only the second one is used.
# client_id = "052c5091df7681302a2117b2d21db1540c2156f5254ebe9876a7d0146588eab582e11cb47761a18f84200a510a5386bdf024374f76113cd15fe1cc8d7b9fcf0b"
client_id = "fe3befae02733265c3fc953eb67840c57d970340a76386ffda9ab3226d31e376790d7eddefde5f434647687e6136c44e50513edebca32377799b15363919310d"

# Build the single-speaker dataset and publish it under the given repository name.
create_and_upload_dataset(repo_id="mcv-sw-female", client_id=client_id)

Downloading builder script: 100%|██████████| 8.19k/8.19k [00:00<00:00, 1.84MB/s]
Downloading readme: 100%|██████████| 12.7k/12.7k [00:00<00:00, 3.18MB/s]
Downloading extra modules: 100%|██████████| 3.92k/3.92k [00:00<00:00, 1.96MB/s]
Downloading extra modules: 100%|██████████| 132k/132k [00:00<00:00, 304kB/s] 
Downloading data: 100%|██████████| 17.5k/17.5k [00:00<00:00, 4.38MB/s]
Downloading data: 100%|██████████| 1.33G/1.33G [11:06<00:00, 2.00MB/s]
Downloading data: 100%|██████████| 216M/216M [02:06<00:00, 1.70MB/s]
Downloading data: 100%|██████████| 423M/423M [03:18<00:00, 2.13MB/s]/it]
Downloading data: 100%|██████████| 422M/422M [03:14<00:00, 2.17MB/s]t]  
Downloading data files:  50%|█████     | 3/6 [20:15<16:55, 338.58s/it]