In [1]:
! pip install wandb



In [None]:
! tts --list_models

In [None]:
!tts --text "Bonjour, j'aime bien la viande. Et je suis vraiment content de te voir !" --model_name "tts_models/multilingual/multi-dataset/xtts_v2" --speaker_wav "./reference_speaker.wav" --out_path "../output.wav" --language_idx fr 

In [7]:
import IPython

IPython.display.Audio("../output.wav")

ValueError: rate must be specified when data is a numpy array or list of audio samples.

In [17]:
from datasets import load_dataset, DatasetDict, concatenate_datasets
import os
import re
import gc
import soundfile as sf

In [None]:
from huggingface_hub import notebook_login

notebook_login()

In [3]:
dataset = load_dataset("oza75/bambara-tts")
dataset

Downloading readme:   0%|          | 0.00/5.92k [00:00<?, ?B/s]

Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 378M/378M [00:29<00:00, 12.8MB/s] 
Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 309M/309M [00:22<00:00, 13.4MB/s] 
Downloading data: 100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 381M/381M [00:26<00:00, 14.4MB/s] 


Generating train split:   0%|          | 0/4430 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'bambara', 'french', 'duration', 'speaker_embeddings', 'speaker_id'],
        num_rows: 4430
    })
})

In [11]:
tts_ds = dataset.filter(lambda item: item['speaker_id'] >= 27)
tts_ds

Filter:   0%|          | 0/4430 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'bambara', 'french', 'duration', 'speaker_embeddings', 'speaker_id'],
        num_rows: 997
    })
})

In [4]:
tts_ds['train'][0]

{'audio': {'path': None,
  'array': array([ 2.00997878e-04, -3.56181881e-05,  1.74966917e-05, ...,
          2.45530282e-05, -1.61255230e-06,  0.00000000e+00]),
  'sampling_rate': 22050},
 'bambara': 'Jigi, i bolo degunnen don wa ?',
 'french': 'Jigi, es-tu occup√© ?',
 'duration': 2.646,
 'speaker_embeddings': [-2.564516305923462,
  -20.928388595581055,
  69.90596008300781,
  8.361804962158203,
  14.13325309753418,
  50.45071792602539,
  80.53385162353516,
  20.306468963623047,
  -35.76181411743164,
  -18.653125762939453,
  -4.586198329925537,
  -88.45294952392578,
  14.038538932800293,
  -1.9949610233306885,
  29.295623779296875,
  35.923561096191406,
  -4.508488655090332,
  22.126203536987305,
  -20.97467803955078,
  39.27812194824219,
  15.961697578430176,
  35.7476806640625,
  26.484188079833984,
  -12.542716979980469,
  -35.30205154418945,
  92.43451690673828,
  -11.966684341430664,
  -48.78108596801758,
  -42.39558792114258,
  -20.03965187072754,
  21.1246395111084,
  -3.3788418

In [9]:
tts_ds['train'][0]['audio']

{'path': None,
 'array': array([ 2.00997878e-04, -3.56181881e-05,  1.74966917e-05, ...,
         2.45530282e-05, -1.61255230e-06,  0.00000000e+00]),
 'sampling_rate': 22050}

In [39]:

def create_audio_files_and_update_dataset(dataset, audio_column, output_dir):
    """
    Create audio files from the 'audio' column of a Hugging Face dataset and update the dataset with file paths.

    Parameters:
    - dataset: The input dataset that contains the 'audio' column.
    - audio_column: The name of the column containing the audio data (datasets.Audio feature).
    - output_dir: The directory where audio files will be saved.

    Returns:
    - The updated dataset with the 'audio' column containing the file paths of saved audio files.
    """
    # Make sure the output directory exists
    os.makedirs(output_dir, exist_ok=True)

    # Prepare a list to hold the file paths, to avoid modifying the dataset in-place
    audio_file_paths = []

    for index, example in enumerate(dataset):
        audio_filename = f"audio_{index}.wav"
        audio_filepath = os.path.join(output_dir, audio_filename)

        if os.path.isfile(audio_filepath):
            audio_file_paths.append(audio_filepath)
            continue

        audio_data = example[audio_column]['array']
        # Typically, the sample rate should also be retrieved from the dataset
        sample_rate = example[audio_column]['sampling_rate']

        # Save the audio file
        sf.write(audio_filepath, audio_data, sample_rate)

        # Append the file path to the list
        audio_file_paths.append(audio_filepath)

        # Option to clear memory if needed, uncomment if large arrays are involved
        del audio_data
        gc.collect()

    # Update the dataset with the new file paths
    dataset = dataset.add_column("audio_file_path", audio_file_paths)

    return dataset


# Function to create the metadata file
def create_metadata_file(dataset, output_dir='MyTTSDataSet'):
    # Create the output directory if it doesn't exist
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)

    # Define the path to the metadata file
    metadata_path = os.path.join(output_dir, 'metadata.txt')

    # Open the metadata file in write mode
    with open(metadata_path, 'w', encoding='utf-8') as f:
        # Iterate over each item in the dataset
        for item in dataset['train']:
            # Your dataset should have an 'audio' column with a dictionary containing the file path and 'array' for the audio data
            audio_path = item['audio_file_path'].replace(".wav", "")
            bambara_text = item['bambara'].replace("‚ÄØ", " ").replace("¬†", " ").replace("\n", " ")
            normalized_text = bambara_text
            speaker_id = item['speaker_id']

            # Write the formatted data to the metadata file
            f.write(f"{audio_path}|{bambara_text}|{normalized_text}|{speaker_id}\n")

    return metadata_path

In [24]:
tts_ds['train'] = create_audio_files_and_update_dataset(tts_ds['train'], audio_column="audio",
                                               output_dir="/home/aboubacar/lab/others/bambara-translation/code/coqui-TTS/finetuning/bambara/dataset/audios/")
tts_ds

Flattening the indices:   0%|          | 0/997 [00:00<?, ? examples/s]

DatasetDict({
    train: Dataset({
        features: ['audio', 'bambara', 'french', 'duration', 'speaker_embeddings', 'speaker_id', 'audio_file_path'],
        num_rows: 997
    })
})

In [25]:
tts_ds['train']['audio_file_path'][0]

'/home/aboubacar/lab/others/bambara-translation/code/coqui-TTS/finetuning/bambara/dataset/audios/audio_0.wav'

In [40]:
create_metadata_file(tts_ds, output_dir="./dataset")

'./dataset/metadata.txt'

In [38]:
os.path.realpath("./dataset/audios/")

'/home/aboubacar/lab/others/bambara-translation/code/coqui-TTS/finetuning/bambara/dataset/audios'

In [30]:
! CUDA_VISIBLE_DEVICES="0" python ./train_gpt_xtts.py

>> DVAE weights restored from: /home/aboubacar/lab/others/bambara-translation/code/coqui-TTS/finetuning/bambara/run/training/XTTS_v2.0_original_model_files/dvae.pth
 | > Found 997 files in /home/aboubacar/lab/others/bambara-translation/code/coqui-TTS/finetuning/bambara/dataset
 > Training Environment:
 | > Backend: Torch
 | > Mixed precision: False
 | > Precision: float32
 | > Num. of CPUs: 20
 | > Num. of Torch Threads: 1
 | > Torch seed: 1
 | > Torch CUDNN: True
 | > Torch CUDNN deterministic: False
 | > Torch CUDNN benchmark: False
 | > Torch TF32 MatMul: False
[34m[1mwandb[0m: Currently logged in as: [33mabouba181[0m. Use [1m`wandb login --relogin`[0m to force relogin
[34m[1mwandb[0m: Tracking run with wandb version 0.16.6
[34m[1mwandb[0m: Run data is saved locally in [35m[1m/home/aboubacar/lab/others/bambara-translation/code/coqui-TTS/finetuning/bambara/wandb/run-20240408_013112-5wq75mwn[0m
[34m[1mwandb[0m: Run [1m`wandb offline`[0m to turn off