# Initial setup
**NOTE:** you must install `virtualenv` and `ipynbname` globally before running this notebook

Should be run in a Unix environment.

*Estimated time*: 5 minutes

In [None]:
import ipynbname, os

def install_package(package, data_path):
    """Install ``package`` using the pip executable located under ``data_path``.

    The command ``{data_path}bin/pip install --ignore-requires-python {package}``
    is run through the shell. A non-zero exit status is reported with a
    printed message; nothing is returned and no exception is raised.
    """
    command = f"{data_path}bin/pip install --ignore-requires-python {package}"
    failed = os.system(command) != 0
    if failed:
        print(f"** Error installing: {package}")

def is_package_installed(package, data_path):
    """Return True when ``pip show`` for ``package`` exits with status 0.

    The pip executable under ``{data_path}bin/`` is invoked through the shell
    with its standard output redirected to ``/dev/null``.
    """
    exit_status = os.system(f"{data_path}bin/pip show {package} > /dev/null")
    return exit_status == 0
# Work from the directory containing this notebook so relative paths resolve.
os.chdir(ipynbname.path().parent.absolute())
# The venv lives in a folder named after the notebook.
data_path = f"./{ipynbname.name()}/"
# Make the venv if it doesn't already exist
if not os.path.exists(f"{data_path}bin/"):
    print(f"** Creating venv at {data_path}")
    if os.system(f"virtualenv {data_path}") != 0:
        print(f"** Error creating venv at {data_path}")
# Install the required packages
for package in ['ffmpeg', 'demucs', 'pyworld', 'so-vits-svc-fork', 'pydub']:
    if not is_package_installed(package, data_path):
        print(f"** Installing {package}")
        install_package(package, data_path)
os.chdir(data_path)
# Create ~/separated and expose it as ./separated, the folder later cells
# read the separated stems from. expanduser() needs the '~' argument; the
# exist_ok / lexists guards make re-running this cell safe.
separated_dir = os.path.join(os.path.expanduser('~'), 'separated')
os.makedirs(separated_dir, exist_ok=True)
if not os.path.lexists('./separated'):
    os.symlink(separated_dir, './separated')

# Split the audio
*Estimated time*: Many hours, depending on the audio clip length. It took ~3 hours with a 3 GB WAV file. You will have to manually download an audio file for the dataset and place it in the directory where the notebook is.

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from pathlib import Path
# Split the audio if it exists
audio_basename = ''
@interact_manual(audio_path='')
def split(audio_path):
    """Run demucs on ``audio_path`` to separate vocals from the rest.

    The file's stem (name without extension) is stored in the module-level
    ``audio_basename`` so later cells can locate the separated output.
    If the path does not exist, a message is printed and demucs is not run.

    Parameters:
        audio_path: path to the audio file, as typed into the widget.
    """
    global audio_basename
    import shlex  # local import keeps this cell runnable on its own

    audio_basename = Path(audio_path).stem
    if os.path.exists(audio_path):
        # Quote the path: file names with spaces or shell metacharacters
        # would otherwise be split or interpreted by the shell.
        os.system(f"./bin/demucs --two-stems=vocals {shlex.quote(audio_path)}")
    else:
        print(f"Could not find or use file: {audio_path}")

# Remove silences from audio file
*Estimated time*: A few minutes. Took 30 min with 3 GB WAV file

In [None]:
from pydub import AudioSegment
from pydub.silence import split_on_silence

# Vocals stem for the clip whose basename is held in audio_basename.
audio_vocals_path = f"./separated/htdemucs/{audio_basename}/vocals.wav"
# Folder that receives the individual clips cut from the vocals.
out_dir = f"./dataset_raw/{audio_basename}/"
# exist_ok avoids the check-then-create race and makes re-runs harmless.
os.makedirs(out_dir, exist_ok=True)

@interact_manual(
    minimum_silence_length=widgets.IntSlider(min=0, max=5000, step=1, value=600),
    # min was written as --125 (i.e. +125), which put min above max.
    silence_threshold=widgets.IntSlider(min=-125, max=0, step=1, value=-45),
    keep_silence_in_audio_clips=False,
)
def fragment(minimum_silence_length, silence_threshold, keep_silence_in_audio_clips):
    """Cut the vocals file into clips at silences and save each clip as MP3.

    Loads ``audio_vocals_path``, splits it with pydub's ``split_on_silence``
    and writes the resulting chunks to ``{out_dir}{index}.mp3``, updating a
    progress bar after each file. If the vocals file does not exist, a
    message is printed and nothing else happens.

    Parameters:
        minimum_silence_length: passed as ``min_silence_len``.
        silence_threshold: passed as ``silence_thresh``.
        keep_silence_in_audio_clips: passed as ``keep_silence``.
    """
    if not os.path.exists(audio_vocals_path):
        print(f"Could not access vocals file {audio_vocals_path}")
        return

    # Load the audio
    print("Loading audio file...")
    audio = AudioSegment.from_file(audio_vocals_path)

    # Get segments with talking and put each into a file
    print("Splitting audio clips...")
    # TODO: change keep silence to make audio clips longer
    chunks = split_on_silence(
        audio,
        min_silence_len=minimum_silence_length,
        silence_thresh=silence_threshold,
        keep_silence=keep_silence_in_audio_clips,
    )
    progress_bar = widgets.IntProgress(
        value=0,
        min=0,
        # The value counts saved clips, so it reaches len(chunks); using
        # len(chunks) - 1 left the bar one short and gave max=-1 for no chunks.
        max=len(chunks),
        step=1,
        description='Audio clips saved:',
        bar_style='info',
        orientation='horizontal'
    )
    print("Saving audio clips...")
    display(progress_bar)
    for i, chunk in enumerate(chunks):
        chunk.export(f"{out_dir}{i}.mp3", format='mp3')
        progress_bar.value = i + 1
    print("Done.")

# Preparation for training
*Estimated Time*: ~10 minutes

In [None]:
svc = './bin/svc'
@interact_manual(f0_method=["dio", "crepe-tiny", "parselmouth", "crepe", "harvest"])
def pre_train(f0_method):
    """Run the three ``svc`` preprocessing commands one after another.

    Each stage prints a short banner and then invokes the command through
    the shell; exit statuses are not inspected.

    Parameters:
        f0_method: value forwarded to ``svc pre-hubert`` via ``-fm``.
    """
    stages = (
        ("Resampling...", f"{svc} pre-resample"),
        ("Configuring...", f"{svc} pre-config -t so-vits-svc-4.0v1-legacy"),
        ("HuBERTing...", f"{svc} pre-hubert -fm {f0_method}"),
    )
    for banner, command in stages:
        print(banner)
        os.system(command)

# Train!
*Estimated Time*: Several hours **per** epoch

In [None]:
import json

@interact_manual(epochs = widgets.IntSlider(min=1, max=10000, step=1, value=1))
def train(epochs):
    """Write ``epochs`` into the training config, then launch ``svc train``.

    ``./configs/44k/config.json`` is parsed, its ``train.epochs`` entry is
    replaced, and the file is rewritten with a 4-space indent before the
    training command is run through the shell.

    Parameters:
        epochs: number stored under ``config['train']['epochs']``.
    """
    config_location = './configs/44k/config.json'

    with open(config_location, "r") as reader:
        settings = json.load(reader)

    settings['train']['epochs'] = epochs

    with open(config_location, "w") as writer:
        json.dump(settings, writer, indent=4)

    os.system(f"{svc} train")

# Prepare song for inference
*Estimated time*: A few minutes

At this point, you must manually download the song you want into the folder named after this notebook, which sits inside the folder where this notebook is stored. Just `yt-dlp` it. Once it is downloaded, enter its filename in the box.

In [None]:
import ipywidgets as widgets
from ipywidgets import interact, interact_manual
from pathlib import Path
# Split the audio if it exists
song_basename=''
@interact_manual(audio_path='')
def split(audio_path):
    """Run demucs on the song at ``audio_path`` to separate its vocals.

    The file's stem (name without extension) is stored in the module-level
    ``song_basename`` so later cells can locate the separated output.
    If the path does not exist, a message is printed and demucs is not run.

    Parameters:
        audio_path: path to the song file, as typed into the widget.
    """
    global song_basename
    import shlex  # local import keeps this cell runnable on its own

    song_basename = Path(audio_path).stem
    if os.path.exists(audio_path):
        # Quote the path: song file names often contain spaces or other
        # characters the shell would otherwise split or interpret.
        os.system(f"./bin/demucs --two-stems=vocals {shlex.quote(audio_path)}")
    else:
        print(f"Could not find or use file: {audio_path}")

# Do it!
*Estimated time*: ~15 minutes at most

Pitch -12 is down one octave; 12 is up one octave. You must indicate whether the audio is singing so that the correct options are set and the pitch isn't messed up.

In [None]:
@interact_manual(pitch=widgets.IntSlider(min=-120, max=120, step=1, value=0), singing=True)
def infer(pitch,singing):
    """Run ``svc infer`` on the separated vocals of the current song.

    Parameters:
        pitch: value forwarded to ``svc infer`` via ``-t``.
        singing: when true, the ``-na`` flag is appended to the command.
    """
    import shlex  # local import keeps this cell runnable on its own

    vocals_path = f"./separated/htdemucs/{song_basename}/vocals.wav"
    na_arg = ' -na' if singing else ''
    # Quote the path: song_basename comes from a user-supplied file name and
    # may contain spaces or shell metacharacters.
    os.system(f"{svc} infer {shlex.quote(vocals_path)} -t {pitch}{na_arg}")

# Merge background and vocals
*Estimated time*: 1 minute

In [None]:
@interact_manual(output_wav_filename=f"{song_basename}_{audio_basename}.wav")
def merge(output_wav_filename):
    """Overlay the converted vocals on the instrumental track and save as WAV.

    Reads ``vocals.out.wav`` and ``no_vocals.wav`` from the current song's
    ``./separated/htdemucs/`` folder, overlays the vocals on the background
    starting at position 0, and exports the mix.

    Parameters:
        output_wav_filename: destination path for the mixed WAV file.
    """
    stem_dir = f"./separated/htdemucs/{song_basename}"
    vocals = AudioSegment.from_file(f"{stem_dir}/vocals.out.wav")
    background = AudioSegment.from_file(f"{stem_dir}/no_vocals.wav")
    result = background.overlay(vocals, position=0)
    # pydub's export() defaults to format='mp3'; without an explicit format
    # the ".wav" file would actually contain MP3 data.
    result.export(output_wav_filename, format='wav')