# Split filelist file into train and test sets

Specify either a train ratio or the number of samples to hold out for the test set.


In [13]:
# Load the data from the csv file
import pandas as pd
import numpy as np
import os
import random

# Seed both Python's and NumPy's global RNGs. pandas' DataFrame.sample draws
# from NumPy's global random state when no random_state is passed, so seeding
# only the `random` module would leave the train/test shuffle unreproducible.
random.seed(42)
np.random.seed(42)

# Prefix used to name the generated filelist files.
dataset_name = "rw_kinyarwanda_22050_male"
# One row per utterance: utterance id, raw text and (optionally) phonemes.
data: pd.DataFrame = pd.read_csv("../cv-corpus-5.1-2020-06-22/rw/speaker_321_wav.csv")
data

Unnamed: 0,uttid,text,phonemes
0,common_voice_rw_20996796,Ni izihe ngaruka zo kutagira ubushake bw’imibo...,"ni izihe nɡaɾu,uːka zo ku,uːtaɡiɾa u,uːβu,uːʃa..."
1,common_voice_rw_20996798,Hari bamwe bifuza ko iri rushanwa ryacika buru...,"haɾi βamŋe βifu,uːza ko iɾi ɾu,uːʃanŋwa ɾɟat͡ʃ..."
2,common_voice_rw_20996793,Iki kibazo cy’amazi ni karande muri aka gace.,"iki kiβazo camazi ni kaɾande mu,uːɾi aka ɡat͡ʃe"
3,common_voice_rw_20996794,Nubwo ariko amakorali yose yatumiwe yari yitab...,"nu,uːbɡo aɾiko amakoɾai jose jatu,uːmiwe jaɾi ..."
4,common_voice_rw_20997532,Ufite inshingano zo gukomeza kureba mu ndorerw...,"u,uːfite inʃinɡano zo ɡu,uːkomeza ku,uːɾeβa mu..."
...,...,...,...
30309,common_voice_rw_21586982,Dr Bizimana yavuze icyatumye aba baganga bahin...,"dɾ βizimana javu,uːze icatu,uːmɲe aβa βaɡanɡa ..."
30310,common_voice_rw_21586984,Kwinjira muri iri torero ngo ntakiguzi bisaba ...,"kwinʒiɾa mu,uːɾi iɾi toɾeɾo nɡo ntakiɡu,uːzi β..."
30311,common_voice_rw_21586999,Bwana Buhari ntiyahishuye indwara ye ariko avu...,"bɡana βu,uːhaɾi ntijahiʃu,uːje indɡwaɾa je aɾi..."
30312,common_voice_rw_21586987,yavugaga ko Abanyarwanda bagiye kunguka umunya...,"javu,uːɡaɡa ko aβaɲaɾɡwanda βaɡije ku,uːnɡu,uː..."


In [14]:
# Support for DataFrames
def split_file_list(orig_data: pd.DataFrame, train_ratio=None, test_samples=None,
                    max_samples=None, random_state=None):
    """Shuffle ``orig_data`` and split it into a train and a test DataFrame.

    Args:
        orig_data: DataFrame to split; it is not modified.
        train_ratio: Fraction (between 0 and 1) of the rows that go to the
            train set. Only used when ``test_samples`` is None.
        test_samples: Number of rows to hold out for the test set. Takes
            precedence over ``train_ratio`` when both are given.
        max_samples: If given, only the first ``max_samples`` rows of the
            shuffled data are kept before splitting.
        random_state: Optional seed / random state forwarded to
            ``DataFrame.sample``. With the default ``None`` pandas uses
            NumPy's global random state.

    Returns:
        A ``(train_set, test_set)`` tuple of DataFrames.

    Raises:
        ValueError: If neither ``train_ratio`` nor ``test_samples`` is given,
            if ``test_samples`` is negative, or if ``train_ratio`` is outside
            the range [0, 1].
    """
    # Validate up front so that bad arguments fail before any work is done.
    if test_samples is None and train_ratio is None:
        raise ValueError("Either 'train_ratio' or 'test_samples' should be provided.")
    if test_samples is not None and test_samples < 0:
        raise ValueError("'test_samples' must be non-negative.")
    if test_samples is None and not 0 <= train_ratio <= 1:
        raise ValueError("'train_ratio' must be between 0 and 1.")

    # Shuffle the data
    data = orig_data.sample(frac=1, random_state=random_state).reset_index(drop=True)

    if max_samples is not None:
        data = data.iloc[:max_samples]

    if test_samples is not None:
        # Use an explicit split index instead of negative slicing:
        # data[:-0] is empty, which would put every row in the test set
        # when test_samples == 0.
        split_at = max(len(data) - test_samples, 0)
    else:
        split_at = int(len(data) * train_ratio)

    return data.iloc[:split_at], data.iloc[split_at:]


# Hold out 240 samples for validation; `train_data` and `val_data` are the frames written to the filelists by the cells below.
train_data, val_data = split_file_list(data, test_samples=240)

### Save phonemes and text of train_data, val_data


In [15]:
# Directory that is searched recursively for the .wav files.
# NOTE(review): absolute, machine-specific path; adjust before running elsewhere.
i_dir = "/home/navneeth/EgoPro/deep_learning/vits_new/cv-corpus-5.1-2020-06-22/rw/rw_updated_22050"
# Output filelists for the train and validation splits.
o_file_train = f"../filelists/{dataset_name}_audio_sid_text_train_filelist.txt"
o_file_val = f"../filelists/{dataset_name}_audio_sid_text_test_filelist.txt"

# Name substituted for `i_dir` in the paths written to the filelists; also the
# name of the symlink to `i_dir` created in the last cell.
link_name = "DUMMY3"

In [16]:
def create_path_map(source_dir):
    """Map every ``.wav`` file name found under ``source_dir`` to its full path.

    The directory tree is walked recursively with ``os.walk``. When the same
    file name occurs in several directories, the one visited last is kept.
    """
    return {
        name: os.path.join(folder, name)
        for folder, _subdirs, names in os.walk(source_dir)
        for name in names
        if name.endswith(".wav")
    }


def save_file_list(data, out_file_path, source_dir, path_map, link_name, cleaned_text=False):
    """Write one ``<path>|<text>`` line per row of ``data`` to ``out_file_path``.

    Args:
        data: DataFrame with an ``uttid`` column and a ``text`` column
            (and a ``phonemes`` column when ``cleaned_text`` is True).
        out_file_path: Path of the filelist to write; an existing file is
            overwritten.
        source_dir: Substring of each audio path that is replaced by
            ``link_name``.
        path_map: Mapping from ``"<uttid>.wav"`` to the full audio path.
        link_name: Replacement for ``source_dir`` in the written paths.
        cleaned_text: If True, write the ``phonemes`` column instead of
            ``text``.

    Raises:
        KeyError: If ``"<uttid>.wav"`` of a row is missing from ``path_map``.
    """
    # The transcripts contain non-ASCII characters (IPA phonemes, typographic
    # apostrophes), so write UTF-8 explicitly instead of relying on the
    # platform's default encoding.
    with open(out_file_path, "w", encoding="utf-8") as file:
        for row in data.itertuples():
            uttid = f"{row.uttid}.wav"
            path = path_map[uttid].replace(source_dir, link_name)
            info = row.text if not cleaned_text else row.phonemes

            file.write(f"{path}|{info}\n")
            # Print every nth sample
            if row.Index % 5000 == 0:
                print(f"{row.Index}: {path}|{info}")

    print(f"Saved to '{out_file_path}' ({len(data)} samples).")


def save_files(data, out_file_path, source_dir, path_map, link_name):
    """Write the text filelist for ``data`` and, if available, the phoneme one.

    The raw-text filelist is written to ``out_file_path``. When ``data`` has a
    ``phonemes`` column, a second filelist with the phonemes is written next
    to it as ``out_file_path + ".cleaned"``.

    Args:
        data: DataFrame with the rows to write.
        out_file_path: Path of the raw-text filelist.
        source_dir, path_map, link_name: Forwarded unchanged to
            ``save_file_list``.
    """
    # Write the rows that were passed in. Reading the notebook-level
    # `train_data` here would put training rows into every text filelist,
    # including the validation one.
    save_file_list(data, out_file_path, source_dir, path_map, link_name)
    if "phonemes" in data.columns:
        # Append the suffix rather than string-replacing ".txt": a path
        # without ".txt" would otherwise stay unchanged and overwrite the
        # text filelist, and a ".txt" in a directory name would be rewritten.
        cleaned_file_path = f"{out_file_path}.cleaned"
        save_file_list(data, cleaned_file_path, source_dir,
                       path_map, link_name, cleaned_text=True)
In [17]:
# Index every .wav file under the dataset directory by its file name.
path_map = create_path_map(i_dir)


# Write the filelists for the train and validation splits.
save_files(train_data, o_file_train, i_dir, path_map, link_name)
save_files(val_data, o_file_val, i_dir, path_map, link_name)

0: DUMMY3/common_voice_rw_21456474.wav|Mc Buryohe ni we wayoboye iki gitaramo.
5000: DUMMY3/common_voice_rw_21551504.wav|Mico asoje agira ati ‘reka dutegereze cash zacu’.
10000: DUMMY3/common_voice_rw_21389154.wav|Imihanda ishamikiye kuri uyu muhanda nk’uwa Cumi na gatanu nayo zizajya zikoreshwamo.
15000: DUMMY3/common_voice_rw_21268601.wav|Chili ntiratsinda imikino itatu ikurikirana umu gikombe cy’Isi.
20000: DUMMY3/common_voice_rw_21199421.wav|Yumvikanye abasaba kuzamura amaboko bakamwereka ko bamushyigikiye.
25000: DUMMY3/common_voice_rw_21390108.wav|Uyu mukino ukaba uyobowe na Monika Gintersdorfer afatanyije na knut Klassen.
30000: DUMMY3/common_voice_rw_21434063.wav|Issa Bigirimana yazitiwe na Habimana Hussein.
Saved to '../filelists/rw_kinyarwanda_22050_male_audio_sid_text_train_filelist.txt' (30074 samples).
0: DUMMY3/common_voice_rw_21456474.wav|mt͡ʃ βu,uːɾɟohe ni we wajoβoje iki ɡitaɾamo
5000: DUMMY3/common_voice_rw_21551504.wav|mit͡ʃo asoʒe aɡiɾa ati ɾeka du,uːteɡeɾeze t͡ʃaʃ 

### Create a symlink to the dataset


In [7]:
# Create symlink to the dataset.
# Done in Python instead of `!ln -s` so the cell can be re-run safely: an
# existing entry named `link_name` is left untouched (a second `ln -s` would
# fail or create a nested link inside the linked directory), and the paths
# are not subject to shell word-splitting.
if not os.path.lexists(link_name):
    os.symlink(i_dir, link_name)