In [1]:
# Corpus root path: one sub-directory of .wav files per speaker
__CORPUSPATH__ = "./data/vocalset"

# Output path: chunks go to <output>/<speaker>/, the train/val lists to <output>/
__OUTPATH__ = "./data/original"

In [4]:
import os
from scipy.io import wavfile
from pydub import AudioSegment

from pydub import AudioSegment
from pydub.silence import split_on_silence
import os

import glob

def split(sound, min_silence_len=100, silence_margin_db=16, keep_silence=100):
    """Cut an audio segment into chunks at its silent passages.

    Args:
        sound: pydub ``AudioSegment`` (must expose ``dBFS``).
        min_silence_len: shortest pause, in milliseconds, that counts as a
            split point.
        silence_margin_db: how far below the segment's own average loudness
            (``sound.dBFS``) audio must fall to be treated as silence.
        keep_silence: milliseconds of silence kept around each chunk, as
            interpreted by ``split_on_silence``.

    Returns:
        The chunks produced by ``pydub.silence.split_on_silence``.
    """
    # The threshold is relative to this recording's loudness rather than an
    # absolute level, so quiet and loud recordings are split comparably.
    silence_thresh = sound.dBFS - silence_margin_db
    return split_on_silence(
        sound,
        min_silence_len=min_silence_len,
        silence_thresh=silence_thresh,
        keep_silence=keep_silence,
    )

def combine(_src):
    """Concatenate every ``.wav`` file directly inside ``_src`` into one segment.

    Args:
        _src: directory to read; entries not ending in ``.wav`` are ignored
            and sub-directories are not descended into.

    Returns:
        A single ``AudioSegment`` with the files appended in sorted filename
        order (an empty segment if there are no ``.wav`` files).
    """
    audio = AudioSegment.empty()
    # os.listdir() returns entries in arbitrary order; sorting makes the
    # concatenation, and therefore the chunks later cut from it, reproducible.
    for filename in sorted(os.listdir(_src)):
        if filename.endswith('.wav'):
            audio += AudioSegment.from_wav(os.path.join(_src, filename))
    return audio

def save_chunks(chunks, directory, target_length=5 * 1000, frame_rate=24000):
    """Merge short chunks and export them as mono WAV files.

    Consecutive chunks are appended to the current output chunk until it
    reaches ``target_length``; only then is a new output chunk started.
    Output chunks can therefore exceed ``target_length``, and the last one
    may be shorter. Files are written as ``1.wav``, ``2.wav``, ... inside
    ``directory``.

    Args:
        chunks: sequence of pydub ``AudioSegment`` objects; may be empty, in
            which case only the directory is created.
        directory: output directory, created if it does not exist.
        target_length: minimum length (``len(chunk)``, i.e. milliseconds for
            pydub segments) an output chunk must reach before a new one is
            started.
        frame_rate: sample rate in Hz the chunks are resampled to.
    """
    os.makedirs(directory, exist_ok=True)

    output_chunks = []
    for chunk in chunks:
        # Keep growing the last output chunk while it is still too short.
        if output_chunks and len(output_chunks[-1]) < target_length:
            output_chunks[-1] = output_chunks[-1] + chunk
        else:
            output_chunks.append(chunk)

    for counter, chunk in enumerate(output_chunks, start=1):
        chunk = chunk.set_frame_rate(frame_rate).set_channels(1)
        chunk.export(os.path.join(directory, str(counter) + '.wav'), format="wav")

In [2]:
# Source: http://speech.ee.ntu.edu.tw/~jjery2243542/resource/model/is18/en_speaker_used.txt
# Source: https://github.com/jjery2243542/voice_conversion
# NOTE(review): the links above look inherited from another notebook -- confirm they still apply.

# Speaker folder names: female1..female9 followed by male1..male11.
speakers = [
    "{}{}".format(prefix, number)
    for prefix, count in (("female", 9), ("male", 11))
    for number in range(1, count + 1)
]

In [7]:
# For every speaker: concatenate the recordings, cut them at silences and
# export the merged chunks (save_chunks resamples them to mono 24 kHz).
from tqdm import tqdm

for p in tqdm(speakers):
    directory = os.path.join(__CORPUSPATH__, p)
    save_directory = os.path.join(__OUTPATH__, p)
    if not os.path.isdir(directory):
        # Creating the missing input directory would only hide the problem and
        # feed empty audio into the pipeline, so report and skip instead.
        print("Skipping " + p + ": corpus directory not found: " + directory)
        continue
    audio = combine(directory)
    chunks = split(audio)
    save_chunks(chunks, save_directory)

100%|██████████| 20/20 [05:10<00:00, 15.54s/it]


In [5]:
# Collect every exported chunk, shuffle reproducibly and hold out 10 % of the
# files for validation.

# Fixed seed so the train/val split is the same on every run.
SPLIT_SEED = 0

# glob order depends on the filesystem; sort so that the seeded shuffle
# starts from the same sequence everywhere.
chunk_paths = sorted(glob.glob(os.path.join(__OUTPATH__, '*', '*.wav')))
if not chunk_paths:
    raise FileNotFoundError("no .wav chunks found under " + __OUTPATH__)

import pandas as pd

data_list = pd.DataFrame(chunk_paths)
data_list = data_list.sample(frac=1, random_state=SPLIT_SEED)

import random

split_idx = round(len(data_list) * 0.1)

# Column 0 holds the file paths; the first 10 % of the shuffled rows are held out.
test_data = data_list.iloc[:split_idx, 0].to_list()
train_data = data_list.iloc[split_idx:, 0].to_list()

In [6]:
from sklearn.preprocessing import LabelEncoder

# Map speaker names to integer ids. Fitting on the full speaker list keeps
# the ids independent of which files end up in the train or val split.
le = LabelEncoder()
le.fit(speakers)

In [8]:
# write to file

def _write_file_list(paths, list_path):
    """Write one ``.<path>|<speaker id>`` line per audio file to ``list_path``.

    Args:
        paths: chunk file paths laid out as ``<out>/<speaker>/<n>.wav``.
        list_path: text file to create; an existing file is overwritten.
    """
    lines = []
    for k in paths:
        # The speaker name is the chunk's parent directory; this does not
        # depend on how deep the output path is or on the path separator.
        speaker = os.path.basename(os.path.dirname(k))
        speaker_le = le.transform([speaker])[0]
        # The leading '.' turns './data/...' into '../data/...'.
        # NOTE(review): presumably the lists are read from a sub-directory -- confirm.
        lines.append('.' + k + "|" + str(speaker_le) + '\n')
    # The context manager closes the file even if the write fails.
    with open(list_path, "w") as text_file:
        text_file.write("".join(lines))


_write_file_list(train_data, __OUTPATH__ + "/train_list.txt")
_write_file_list(test_data, __OUTPATH__ + "/val_list.txt")

In [12]:
# Display the shuffled DataFrame of chunk paths as the cell output
data_list