In [None]:
import datasets
import numpy as np
import torch
! pip install wandb

In [None]:
# Ask the `tts` command-line tool to print the model names it knows about.
! tts --list_models

In [None]:
# Synthesize one French sentence with the multilingual XTTS v2 model, using
# ./reference_speaker.wav as the speaker reference, and write it to ../output.wav.
!tts --text "Bonjour, j'aime bien la viande. Et je suis vraiment content de te voir !" --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --speaker_wav "./reference_speaker.wav" --out_path "../output.wav" --language_idx fr 

In [None]:
import IPython

# Render an inline audio player for ../output.wav (the --out_path of the `tts`
# synthesis command earlier in the notebook).
IPython.display.Audio("../output.wav")

In [178]:
from datasets import load_dataset, DatasetDict, concatenate_datasets, Dataset
from speechbrain.inference.separation import SepformerSeparation as separator
from IPython.display import display
import torchaudio
import datasets
import pandas as pd
import os
import re
import numpy as np
import gc
import soundfile as sf
import IPython
import torch
from tqdm import tqdm

In [None]:
from huggingface_hub import notebook_login

# Authenticate this notebook session with the Hugging Face Hub (interactive prompt).
notebook_login()

In [139]:
# Load the "oza75/bambara-tts" dataset; the bare expression on the last line
# displays its splits and columns.
dataset = load_dataset("oza75/bambara-tts")
dataset

DatasetDict({
    train: Dataset({
        features: ['audio', 'bambara', 'french', 'duration', 'speaker_embeddings', 'speaker_id'],
        num_rows: 4430
    })
})

In [18]:
def iterable_to_dataset(iterable_dataset, num_rows):
    """
    Materialize the first `num_rows` items of a streaming dataset as a regular Dataset.

    Parameters:
    - iterable_dataset (IterableDataset): The streaming Hugging Face dataset to read from.
    - num_rows (int): Maximum number of rows to pull from the stream.

    Returns:
    - Dataset: The collected rows. It holds fewer than `num_rows` rows when the
      stream runs out early; a notice is printed in that case.
    """
    stream = iter(iterable_dataset)
    collected = []
    bar = tqdm(total=num_rows, desc='Converting', unit='row')

    try:
        # zip() consults range() first, so the stream is never advanced past
        # the requested number of rows.
        for _, row in zip(range(num_rows), stream):
            collected.append(row)
            bar.update(1)

        if len(collected) < num_rows:
            print("End of iterable dataset reached before requested number of rows.")
    finally:
        # Close the progress bar even if reading from the stream raised.
        bar.close()

    # Build the Dataset through an intermediate pandas DataFrame.
    return Dataset.from_pandas(pd.DataFrame(collected))

In [25]:
print("> Loading the data using streaming mode....")
# The three CML-TTS languages share every argument except the config name.
mls_fr, mls_es, mls_it = (
    load_dataset("ylacombe/cml-tts", language, split="train", streaming=True)
    for language in ("french", "spanish", "italian")
)
# English comes from a separate dataset.
lj_en = load_dataset("lj_speech", split="train", streaming=True)

Resolving data files:   0%|          | 0/373 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/130 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/373 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/203 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/373 [00:00<?, ?it/s]

Resolving data files:   0%|          | 0/61 [00:00<?, ?it/s]

In [138]:
# Number of rows sampled from each auxiliary-language stream.
ROWS_PER_LANGUAGE = 500
# Column layout shared by every per-language dataset.
SHARED_COLUMNS = ['audio', 'text', 'lang', 'speaker_id']


def _build_language_subset(stream, lang, speaker_id=None):
    """Materialize `stream` and tag every row with `lang` (and optionally a constant `speaker_id`).

    The constant columns are sized from the number of rows actually collected,
    so a stream that ends before ROWS_PER_LANGUAGE rows does not make
    `add_column` fail with a length mismatch.
    """
    subset = iterable_to_dataset(stream, ROWS_PER_LANGUAGE)
    subset = subset.add_column("lang", [lang] * len(subset))
    if speaker_id is not None:
        subset = subset.add_column("speaker_id", [speaker_id] * len(subset))
    return subset


print("> Building datasets....")
fr_ds = _build_language_subset(mls_fr, 'fr')
es_ds = _build_language_subset(mls_es, 'es')
it_ds = _build_language_subset(mls_it, 'it')
# The LJSpeech rows all receive the constant speaker label 'ljspeech'.
en_ds = _build_language_subset(lj_en, 'en', speaker_id='ljspeech')
print("> Selecting columns....")
fr_ds = fr_ds.select_columns(SHARED_COLUMNS)
es_ds = es_ds.select_columns(SHARED_COLUMNS)
it_ds = it_ds.select_columns(SHARED_COLUMNS)
en_ds = en_ds.select_columns(SHARED_COLUMNS)
print("> Casting the columns...")
en_ds = en_ds.cast_column('audio', datasets.Audio(sampling_rate=22050))
# Reuse the English feature spec so all four datasets share one schema and
# can be concatenated afterwards.
fr_ds = fr_ds.cast(en_ds.features.copy())
es_ds = es_ds.cast(en_ds.features.copy())
it_ds = it_ds.cast(en_ds.features.copy())

> Building datasets....


Converting: 100%|██████████| 500/500 [00:15<00:00, 32.11row/s]
Converting: 100%|██████████| 500/500 [00:17<00:00, 29.03row/s]
Converting: 100%|██████████| 500/500 [00:14<00:00, 33.49row/s]
Converting: 100%|██████████| 500/500 [00:10<00:00, 46.74row/s]


> Selecting columns....
> Casting the columns...


Casting the dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/500 [00:00<?, ? examples/s]

In [147]:
def assign_integer_ids_to_speakers(dataset, id_offset=28):
    """
    Replace every value of the 'speaker_id' column with a compact integer ID.

    Distinct speaker labels are numbered in sorted order, so the label -> integer
    mapping is the same on every run (iterating a plain `set` of strings can
    yield a different order after each kernel restart).

    Parameters:
    - dataset: The input dataset loaded using the Hugging Face `datasets` library.
      It must have a 'speaker_id' column.
    - id_offset (int): Value added to every generated ID. Defaults to 28, the
      offset previously hard-coded here (presumably chosen to stay clear of
      speaker IDs used by another dataset — TODO confirm).

    Returns:
    - The dataset whose 'speaker_id' column holds `id_offset + rank` for each
      row, where `rank` is the position of the row's original label in the
      sorted list of distinct labels.
    """
    # Step 1: collect the distinct speaker labels.
    unique_speakers = set(dataset['speaker_id'])

    # Step 2: order them deterministically. Sorting on (type name, text) also
    # copes with a column that mixes integers and strings.
    ordered_speakers = sorted(
        unique_speakers,
        key=lambda label: (type(label).__name__, str(label)),
    )
    speaker_to_id = {label: id_offset + rank for rank, label in enumerate(ordered_speakers)}

    # Step 3: rewrite the column batch by batch.
    def add_speaker_id(examples):
        examples['speaker_id'] = [speaker_to_id[label] for label in examples['speaker_id']]
        return examples

    return dataset.map(add_speaker_id, batched=True, batch_size=10000)

In [157]:
# Merge the four auxiliary-language datasets, renumber their speakers, and
# store the resulting IDs as int32.
multi_ds = assign_integer_ids_to_speakers(
    concatenate_datasets([en_ds, fr_ds, es_ds, it_ds])
).cast_column('speaker_id', datasets.Value(dtype='int32'))
multi_ds

Map:   0%|          | 0/2000 [00:00<?, ? examples/s]

Casting the dataset:   0%|          | 0/2000 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'text', 'lang', 'speaker_id'],
    num_rows: 2000
})

In [166]:
# Reshape the Bambara training split to the same columns as `multi_ds`:
# keep audio / text / speaker, rename 'bambara' to 'text', tag the language.
bam_ds = (
    dataset['train']
    .select_columns(['audio', 'bambara', 'speaker_id'])
    .rename_column('bambara', 'text')
    .add_column('lang', ['bm'] * len(dataset['train']))
)
# `cast` returns a new Dataset instead of modifying in place, so the result
# has to be assigned for the cast to take effect.
bam_ds = bam_ds.cast(multi_ds.features.copy())
bam_ds

Dataset({
    features: ['audio', 'text', 'speaker_id', 'lang'],
    num_rows: 4430
})

In [167]:
# Shuffle with a fixed seed: the row order decides the audio_{index}.wav file
# names written later, and those files are reused when they already exist, so
# the order has to be identical on every run.
tts_ds = concatenate_datasets([multi_ds, bam_ds]).shuffle(seed=42)
tts_ds

Dataset({
    features: ['audio', 'text', 'lang', 'speaker_id'],
    num_rows: 6430
})

In [169]:
# Maximum number of characters accepted per language code.
CHAR_LIMIT = dict(en=250, fr=273, es=239, it=213, bm=230)


def is_valid_items(batch):
    """
    Build the keep/drop mask for a batch of rows.

    A row is kept when its speaker_id is at least 3 and its text is no longer
    than the CHAR_LIMIT entry for its language. The length check is only
    evaluated for rows that pass the speaker check.

    Parameters:
    - batch (dict): Column-oriented batch with 'speaker_id', 'text' and 'lang' lists.

    Returns:
    - list: One flag per row, in batch order.
    """
    keep_flags = []
    for speaker, sentence, language in zip(batch['speaker_id'], batch['text'], batch['lang']):
        keep = speaker >= 3
        if keep:
            keep = len(sentence) <= CHAR_LIMIT[language]
        keep_flags.append(keep)
    return keep_flags


# Keep only the rows that is_valid_items flags as valid; batched=True means
# the function receives whole batches of columns rather than single rows.
filtered_ds = tts_ds.filter(is_valid_items, batched=True)
filtered_ds

Filter:   0%|          | 0/6430 [00:00<?, ? examples/s]

Dataset({
    features: ['audio', 'text', 'lang', 'speaker_id'],
    num_rows: 5727
})

In [179]:
# Count how many rows of each language remain after filtering.
np.unique(filtered_ds['lang'], return_counts=True)

(array(['bm', 'en', 'es', 'fr', 'it'], dtype='<U2'),
 array([3810,  500,  492,  500,  425]))

In [185]:
def create_audio_files_and_update_dataset(dataset, audio_column, output_dir):
    """
    Write every row's audio to `output_dir` and record the file paths in the dataset.

    Row `i` is stored as `audio_{i}.wav`. A file that already exists is reused
    without loading the row, so an interrupted export can be resumed cheaply.

    Parameters:
    - dataset: The input dataset. It must support `len()` and integer indexing.
    - audio_column: The name of the column containing the audio data; each value
      is read as a mapping with 'array' and 'sampling_rate' entries.
    - output_dir: The directory where audio files will be saved (created if missing).

    Returns:
    - The dataset with an added 'audio_file_path' column holding the path of
      each row's audio file. The original audio column is left untouched.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Collected separately so the dataset is not modified while it is being read
    audio_file_paths = []

    for index in tqdm(range(len(dataset)), total=len(dataset), desc="Creating audio files", unit="file"):
        audio_filepath = os.path.join(output_dir, f"audio_{index}.wav")
        audio_file_paths.append(audio_filepath)

        # Checked before the row is accessed so existing files cost no decoding.
        if os.path.isfile(audio_filepath):
            continue

        audio = dataset[index][audio_column]

        # Write to a temporary name and rename afterwards: an interrupted write
        # then never leaves a truncated audio_{index}.wav that a later run
        # would mistake for a finished file.
        partial_filepath = os.path.join(output_dir, f"audio_{index}.tmp.wav")
        sf.write(partial_filepath, audio['array'], audio['sampling_rate'])
        os.replace(partial_filepath, audio_filepath)

    # Attach the paths as a new column
    return dataset.add_column("audio_file_path", audio_file_paths)


# Function to create the metadata file
def create_metadata_file(dataset, output_dir='MyTTSDataSet'):
    """
    Write a pipe-separated `metadata.txt` describing every item of `dataset`.

    Each line has the form
    `<audio path without .wav>|<text>|<normalized text>|<speaker_id>|<lang>`,
    where the normalized text is currently identical to the text.

    Parameters:
    - dataset: Iterable of items with 'audio_file_path', 'text', 'speaker_id'
      and 'lang' entries.
    - output_dir: Directory that receives `metadata.txt` (created if missing).

    Returns:
    - The path of the written metadata file.
    """
    # exist_ok avoids the check-then-create race of a separate exists() test
    os.makedirs(output_dir, exist_ok=True)

    metadata_path = os.path.join(output_dir, 'metadata.txt')

    with open(metadata_path, 'w', encoding='utf-8') as f:
        for item in dataset:
            # Strip only a trailing ".wav"; a plain str.replace would also
            # remove ".wav" occurring inside a directory name.
            audio_path = item['audio_file_path']
            if audio_path.endswith(".wav"):
                audio_path = audio_path[:-len(".wav")]

            # Flatten the text to one line with ordinary spaces.
            # NOTE(review): the two space replacements are spelled as explicit
            # escapes for NO-BREAK SPACE and NARROW NO-BREAK SPACE, which is
            # assumed to be what the visually blank literals stood for — confirm.
            text = (
                item['text']
                .replace("\u00a0", " ")
                .replace("\u202f", " ")
                .replace("\n", " ")
            )
            normalized_text = text
            speaker_id = item['speaker_id']
            lang = item['lang']

            f.write(f"{audio_path}|{text}|{normalized_text}|{speaker_id}|{lang}\n")

    return metadata_path

In [186]:
# Write the clips relative to the working directory instead of under one
# user's home directory, so the notebook runs on other machines.
# NOTE(review): assumes the notebook is run from the directory that contains
# ./dataset (the same relative root the metadata cell writes to) — confirm.
AUDIO_OUTPUT_DIR = os.path.abspath("./dataset/audios")

# The guard keeps the cell re-runnable: adding 'audio_file_path' a second time
# would fail because the column already exists.
if "audio_file_path" not in filtered_ds.column_names:
    filtered_ds = create_audio_files_and_update_dataset(
        filtered_ds,
        audio_column="audio",
        output_dir=AUDIO_OUTPUT_DIR,
    )
filtered_ds

Creating audio files:  16%|█▌        | 917/5727 [00:06<00:31, 150.97row/s]


KeyboardInterrupt: 

In [ ]:
# Pass the filtered dataset: it is the one that received the 'audio_file_path'
# column read by create_metadata_file. `tts_ds` has no such column and still
# contains the rows removed by the filter.
create_metadata_file(filtered_ds, output_dir="./dataset")

In [ ]:
# Run ./train_gpt_xtts_2.py (presumably the XTTS fine-tuning entry point —
# confirm) with only GPU 0 visible to CUDA.
! CUDA_VISIBLE_DEVICES="0" python ./train_gpt_xtts_2.py

In [113]:
def resample_audio_to_2d(audio, orig_sr, target_sr):
    """
    Convert audio to a mono, two-dimensional float tensor at the target sampling rate.

    The channel downmix is applied whether or not resampling is needed, so the
    shape of the result does not depend on the input's sampling rate.

    Parameters:
    - audio (Tensor or numpy.ndarray): The original audio data, either 1-D
      (samples) or 2-D. NOTE(review): 2-D input is averaged over its first
      axis, i.e. it is treated as (channels, samples) — confirm for callers
      that pass multi-channel audio.
    - orig_sr (int): The original sampling rate of the audio data.
    - target_sr (int): The target sampling rate to resample the audio data to.

    Returns:
    - Tensor: A tensor of shape (1, samples) holding the mono audio at `target_sr`.
    """
    # Ensure the audio is a floating-point tensor (mean() rejects integer tensors)
    if not isinstance(audio, torch.Tensor):
        audio = torch.tensor(audio, dtype=torch.float32)
    elif not audio.is_floating_point():
        audio = audio.to(torch.float32)

    # Add a channel dimension if the audio is 1D
    if audio.ndim == 1:
        audio = audio.unsqueeze(0)

    # Collapse the first axis to a single channel; a no-op for mono input
    audio = audio.mean(dim=0, keepdim=True)

    # Resample only when the sampling rates differ
    if orig_sr != target_sr:
        resampler = torchaudio.transforms.Resample(orig_freq=orig_sr, new_freq=target_sr)
        audio = resampler(audio)

    return audio

In [124]:
# Load the pretrained "speechbrain/sepformer-whamr-enhancement" SepFormer model;
# `savedir` is the local directory handed to from_hparams for its files.
speech_enhancement_model = separator.from_hparams(
    source="speechbrain/sepformer-whamr-enhancement",
    savedir='pretrained_models/sepformer-whamr-enhancement'
)


def speech_enhancement(item):
    """
    Run one dataset row through the SepFormer enhancement model.

    The row's audio is resampled to the model's sample rate, processed by the
    model, and written back into `item['audio']`, so the row is modified in place.

    Parameters:
    - item (dict): Row with an 'audio' entry holding 'array' and 'sampling_rate'.

    Returns:
    - dict: The same `item`, whose audio now holds the model output (a CPU
      tensor) and the model's sample rate.
    """
    model_sr = speech_enhancement_model.hparams.sample_rate
    clip = item['audio']
    mixture = resample_audio_to_2d(clip['array'], clip['sampling_rate'], model_sr)
    separated = speech_enhancement_model.separate_batch(mixture)

    # Keep index 0 of the model output's last axis and move it off the graph/GPU.
    item['audio']['array'] = separated[:, :, 0].detach().cpu()
    item['audio']['sampling_rate'] = model_sr
    return item

In [132]:
def test_speech_enhancement(item):
    """
    Show two audio players for a row: the original clip, then the enhanced one.

    The original is displayed before `speech_enhancement` runs, because that
    call overwrites the audio stored in `item`.
    """
    before = item['audio']
    display(IPython.display.Audio(before['array'], rate=before['sampling_rate']))

    after = speech_enhancement(item)['audio']
    display(IPython.display.Audio(after['array'], rate=after['sampling_rate']))

In [137]:
# Listen to row 85 of the Spanish subset before and after enhancement.
test_speech_enhancement(es_ds[85])