

This notebook provides the means to:

- download a solo saxophone dataset from the repository provided.
- augment and chunk the data to make it suitable for training a SampleRNN model on.
- train a SampleRNN model.
- generate fake saxophone playing from the trained model.

All that is required is a Google Drive account and a HuggingFace account. You can use your Google account to create a HuggingFace account. Cell 4 below requires you to paste an access token, which you can generate at https://huggingface.co/settings/tokens

Before running any of the cells below, go to Edit -> Notebook Settings above and select the "High RAM" option; otherwise the notebook will crash. Also make sure you have a GPU selected (a T4 is fine; an A100 is much faster if available).


In [None]:
# 1 CONNECT TO YOUR GOOGLE DRIVE

import os, sys
from google.colab import drive

# Mount Drive, then expose it through a symlink whose path contains no space
# (the mounted folder is '/content/mnt/My Drive/').
drive.mount('/content/mnt', force_remount=True)
nb_path = '/content/notebooks'

# os.symlink raises FileExistsError when the link is already there, which made
# this cell fail on every re-run. lexists() is also true for a dangling link.
if not os.path.lexists(nb_path):
    os.symlink('/content/mnt/My Drive/', nb_path)

# Avoid stacking duplicate sys.path entries when the cell is run again.
if nb_path not in sys.path:
    sys.path.insert(0, nb_path)

In [None]:
# 2 CLONE SAMPLERNN REPO

%cd {nb_path}
!mkdir SampleRNN
%cd SampleRNN
!git clone https://github.com/rncm-prism/prism-samplernn
%cd prism-samplernn

In [None]:
# 3 INSTALL HUGGING FACE DATASETS DEPENDENCY

!pip install datasets

In [None]:
# 4 DOWNLOAD DATASET FROM HUGGINGFACE
from datasets import load_dataset

# PASTE YOUR HUGGING FACE ACCESS TOKEN HERE, GENERATE ONE FROM YOUR ACCOUNT AT https://huggingface.co/settings/tokens
MY_ACCESS_TOKEN = ""

# An empty string is not a usable token, so pass None when nothing was pasted
# and let the library apply its default authentication handling.
dataset = load_dataset("markhanslip/markhanslip_phd_saxophone_data", token=MY_ACCESS_TOKEN or None)

# Index the row first and the column second. `dataset['train']['audio'][0]`
# requests the whole audio column before picking item 0, and the original did
# that twice (once for the samples, once for the sampling rate).
first_audio = dataset['train'][0]['audio']
data = first_audio['array']
sr = first_audio['sampling_rate']

In [None]:
# 5 PREPARE DATASET FOR TRAINING (TAKES A WHILE)

import soundfile as sf
import numpy as np
import os
import librosa

chunk_len = int(sr*8) # 8 second chunks
out_dir='./ToneRows_dataset/'


def _write_chunks(signal, out_dir, chunk_len, sr):
    """Write `signal` to `out_dir` as fixed-length 16-bit PCM WAV chunks.

    For every complete `chunk_len`-sample slice two files are written:
    `NNNNNN.wav` (the slice) and `NNNNNN_inv.wav` (the slice negated), where
    NNNNNN is the 1-based chunk number zero-padded to six digits. A trailing
    slice shorter than `chunk_len` is not written at all, so no read-back and
    delete pass is needed afterwards.

    Args:
        signal: 1-D array of audio samples.
        out_dir: existing directory that receives the files.
        chunk_len: chunk length in samples.
        sr: sample rate stored in each WAV header.

    Returns:
        The number of chunk positions written (two files per position).
    """
    n_chunks = len(signal) // chunk_len
    for idx in range(n_chunks):
        chunk = signal[idx * chunk_len:(idx + 1) * chunk_len]
        stem = str(idx + 1).zfill(6)
        sf.write(os.path.join(out_dir, '{}.wav'.format(stem)), samplerate=sr, data=chunk, subtype='PCM_16')
        # Negating per chunk avoids holding a second full-length copy in RAM.
        sf.write(os.path.join(out_dir, '{}_inv.wav'.format(stem)), samplerate=sr, data=-chunk, subtype='PCM_16')
    return n_chunks


# Augmentation: append a copy time-stretched at rate 1.05 to the original.
# The result gets its own name so that `data` is left untouched and re-running
# this cell does not augment already-augmented audio.
data_aug = np.hstack((data, librosa.effects.time_stretch(data, rate=1.05)))

os.makedirs(out_dir, exist_ok=True)
count = _write_chunks(data_aug, out_dir, chunk_len, sr)


In [None]:
# 5b WRITE THE CONFIG FILE THAT DEFINES THE MODEL ARCHITECTURE

import json

# Settings serialised to ./3l_lstm.config.json, the file later cells pass to
# train.py and generate.py via --config_file.
config = dict(
    seq_len=1024,
    frame_sizes=[16, 64],
    dim=1024,
    rnn_type="lstm",
    num_rnn_layers=3,
    q_type="mu-law",
    q_levels=256,
    emb_size=256,
    skip_conn=False,
)

with open('./3l_lstm.config.json', 'w') as outfile:
    outfile.write(json.dumps(config))

In [None]:
# 6 TRAIN THE MODEL (TAKES A FEW HOURS)
# Runs the repository's train.py on the chunks in ./ToneRows_dataset/ with the
# architecture from ./3l_lstm.config.json. `{sr}` is filled in by IPython from
# the notebook variable `sr`, so the cell that defines `sr` must have been run
# in the current kernel session before this one.
!python ./train.py --data_dir ./ToneRows_dataset/ --config_file ./3l_lstm.config.json --output_dir ./ToneRows_training --id ToneRows_training --output_file_dur 3 --batch_size 64 --sample_rate {sr} --num_epochs 120  # if training stops then add --resume True and run again

In [None]:
# 7 POST-TRAINING - PICK THE MOST RECENT SAVED CHECKPOINT FOLDER

import ipywidgets as widgets
from ipywidgets import interact, interactive, fixed, interact_manual
import os

training_logdir = './logdir/ToneRows_training'

# Keep sub-directories only and order them by modification time, newest first,
# so the dropdown's default selection is the most recently written folder.
# os.listdir() alone returns entries in arbitrary order.
ckpt_dirs = sorted(
    (entry for entry in os.listdir(training_logdir)
     if os.path.isdir(os.path.join(training_logdir, entry))),
    key=lambda entry: os.path.getmtime(os.path.join(training_logdir, entry)),
    reverse=True,
)

def f(x):
    """Identity callback; `interactive` exposes its return value as `.result`."""
    return x

w = interactive(f, x=widgets.Dropdown(options=ckpt_dirs, description='ckpt dir: ', disabled=False))
display(w)

In [None]:
# Store the folder name currently selected in the 'ckpt dir' dropdown `w`.
ckpt_dir = w.result

In [None]:
# 8 - CHOOSE A CHECKPOINT FROM WHICH TO GENERATE

import os
import re

# Checkpoint files are matched as `model.ckpt-<N>` followed by either a dot
# (e.g. `.index`, `.data-...`) or the end of the name. Capturing the digits
# with a regex replaces the fixed three-character slice, which truncated any
# checkpoint number with four or more digits.
_ckpt_pattern = re.compile(r'model\.ckpt-(\d+)(?:\.|$)')

ckpt_files = []
for fname in os.listdir(os.path.join('./logdir/ToneRows_training', ckpt_dir)):
    found = _ckpt_pattern.match(fname)
    if found:
        ckpt_files.append(found.group(1))

# Each checkpoint number appears once per file belonging to it; de-duplicate
# and list the highest number first so it is the dropdown's default.
# Values stay strings because they are spliced into a path later.
unique_ckpts = sorted(set(ckpt_files), key=int, reverse=True)

def f(x):
    """Identity callback; `interactive` exposes its return value as `.result`."""
    return x

v = interactive(f, x=widgets.Dropdown(options=unique_ckpts, description='ckpt: ', disabled=False))
display(v)

In [None]:
# Store the checkpoint number currently selected in the 'ckpt' dropdown `v`.
ckpt = v.result

In [None]:
# 9 - GENERATE SAMPLES
import os

NUM_FILES = 5 # number of audio files to generate, can be any int
FILE_LENGTH = 8 # length per generated file in seconds, can be any int (very long samples will take a long time to generate)
OUTPUT_PATH = "ToneRows_samples"
FILE_PREFIX = "ToneRows"

if not os.path.exists(OUTPUT_PATH):
  os.mkdir(OUTPUT_PATH)

!python ./generate.py --output_path ./{OUTPUT_PATH}/{FILE_PREFIX} --checkpoint_path ./logdir/ToneRows_training/{ckpt_dir}/model.ckpt-{ckpt} --config_file ./3l_lstm.config.json --dur {FILE_LENGTH} --num_seqs {NUM_FILES} --sample_rate 22050 --temperature 0.95

In [None]:
# 10 - LISTEN BACK TO GENERATED SAMPLES

import soundfile
from IPython.display import Audio, display
import os

# Sorted so the players appear in a stable order. Only .wav entries are kept,
# on the assumption that generate.py writes WAV files - confirm; anything else
# in the folder may not be readable by soundfile.
wavfiles = sorted(name for name in os.listdir(OUTPUT_PATH) if name.lower().endswith('.wav'))

for wavfile in wavfiles:
    # `rate` rather than `sr`: the training cell interpolates the notebook
    # variable `sr`, so it should not be rebound here.
    y, rate = soundfile.read(os.path.join(OUTPUT_PATH, wavfile))
    # The players are created with autoplay=False, so nothing needs to finish
    # playing before the next one is shown; the former per-clip time.sleep only
    # blocked the cell for the total duration of all clips.
    display(Audio(y, rate=rate, autoplay=False))