### Main script to train RNN type architectures on audio data with control parameters
* Loading/batching from audioDataloader
* Control parameter creation/manipulation from paramManager
* NN architectures found in utils.architectures
* The teacher forcing ratio (the probability of feeding the known target as the next input) can be set below 1 so that the model is sometimes fed its own predictions instead (scheduled sampling)
* The stopping criterion can be a number of steps, a number of epochs (set the number of steps higher than the steps required for one epoch, or simply to a very large number) or a target loss value
* Training parameters, the model state dict and the optimizer state dict are saved into a single Python dict if savemodel=True

To do:
* Put the training/generation routines into their own script so they can be reused and versioned more easily. These functions rely on many global variables defined here, so those variables would then need to be imported into that script. All functions are to be moved into the utils folder afterwards
* Move all training visualizations to Tensorboard
* Better integration with the attention training (in another notebook currently)
* Streamline checkpoint saving - it currently works for all conditions except saving after each epoch

In [None]:
import numpy as np
import math
import time
from datetime import datetime

import torch
import torch.nn as nn
import torchvision.transforms as transform

import audioDataloader.dataloader as dataloader
from audioDataloader.transforms import mulawnEncode,mulaw,array2tensor,dic2tensor,injectNoise,normalizeDim
from paramManager import paramManager
from utils.architectures import RNN
from utils.myUtils import time_taken,plot_signal
#import utils.training as process

import matplotlib.pylab as plt
%matplotlib inline

In [None]:
# Read/write directory parameters
#*************************************
datadir = 'data/faustData_2019.06.22/dataset' #passed to paramManager and AudioDataset as the data directory
paramdir = 'data/faustData_2019.06.22/dataparams' #passed to paramManager and AudioDataset as the parameter directory
savemodeldir = 'model' #directory part of every checkpoint path written during training
savename = 'model' #tag embedded in every checkpoint filename: <start>_<savename>_epoch<e>_step<s>.tar
loadmodelpath = 'model/2019-06-23_19-20-24_model_epoch0_step4400.tar' #shared path to load model, optimizer and TrainingParams

# Pytorch parameters
#*************************************
#--- Note all intervals below are counted in no. of steps. 1 epoch = [len(dataset)//batch_size] steps ---

device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu') #first GPU when CUDA is available, otherwise CPU
savemodel = True
savemodel_interval = 1000 #if 0 (and savemodel=True) will only save model at the end of entire training
loadmodel = False #NOTE: if continuing training from previous save remember to also enable loadTrainingParams

# Training parameters
#*************************************
loadTrainingParams = False #if True the values below are replaced by the training params stored in the checkpoint at loadmodelpath
sr = 16000 #first argument to AudioDataset; stored in checkpoints under 'sample_rate'
seqLen = 512 #no. of time steps the training loop unrolls per sequence
stride = 1 #passed to AudioDataset - presumably the hop between consecutive training sequences (confirm in audioDataloader)
batch_size = 256
num_epochs = 1
lr = 0.005 #learning rate given to the Adam optimizer
log_interval = 100 #the loss averaged over the last log_interval steps is printed and appended to list_of_losses
max_steps = 10000 #compared with the step counter inside each epoch: set max_steps > (len(dataset)//batch_size) if training for more than 1 epoch
loss_target = 1.0 #alternative to using max_steps, training will end if model achieves this loss target
teacher_forcing_ratio = 1.0 #stochastically use either targets or own predictions as input for training. Set to 1 to always use targets

#Generation parameters 
#*************************************
max_length = seqLen*3 #no. of samples generated after the primer (uses the seqLen defined above)
evaluate_interval = 200 #will generate an audio sequence each evaluate_interval for a visual check on the progress

#Network parameters
#*************************************
n_layers = 4
hidden_size = 40
output_size = 256 #also the no. of mu-law intervals. Both encoding and decoding will depend on this for consistency

#Pre-processing parameters
#*************************************
noise = 0.1 #weight given to injectNoise, eg. 0.1 == noise at 10% of signal
lowNote =  63 #midi pitch no., lower bound given to normalizeDim for 'midiPitch'
hiNote = 75 #midi pitch no., upper bound given to normalizeDim for 'midiPitch'
prop = ['instID', 'pressure', 'midiPitch', 'tongue'] #will train on these parameters (need not use all available params)

#Define variables that change between runs (do not alter these!)
#*************************************
list_of_losses = [] #one averaged loss value is appended every log_interval steps
start_epoch = 0 #offset added to the epoch index in checkpoint names/contents
start_step = 0 #offset added to the step count in checkpoint names/contents


if loadTrainingParams: #when True every setting restored below replaces the value defined above
    print("Loading existing training params...")
    checkpoint = torch.load(loadmodelpath, map_location=device) #map_location in case using cpu

    #bookkeeping of the run that produced the checkpoint
    list_of_losses, start_epoch, start_step = checkpoint['loss'], checkpoint['epoch'], checkpoint['step']
    sr, start = checkpoint['sample_rate'], checkpoint['start_time']

    #directories
    datadir, paramdir, savemodeldir = checkpoint['datadir'], checkpoint['paramdir'], checkpoint['savemodeldir']

    #batching and optimisation settings
    seqLen, stride, batch_size = checkpoint['seqLen'], checkpoint['stride'], checkpoint['batch_size']
    num_epochs = checkpoint['num_epochs'] #can comment out if want to change between runs
    lr, log_interval = checkpoint['lr'], checkpoint['log_interval']
    #max_steps = checkpoint['max_steps'] #can comment out if want to change between runs
    teacher_forcing_ratio, prop = checkpoint['teacher_forcing_ratio'], checkpoint['prop']

    #network and pre-processing settings
    output_size, hidden_size, n_layers = checkpoint['output_size'], checkpoint['hidden_size'], checkpoint['n_layers']
    noise, lowNote, hiNote = checkpoint['noise'], checkpoint['lowNote'], checkpoint['hiNote']

    #echo everything stored in the checkpoint apart from the two state dicts
    skipped = ('model_state_dict', 'optimizer_state_dict')
    for key, value in checkpoint.items():
        if key not in skipped:
            print(key,'=',value)
    

# Loading warnings below
# Exactly one of the four loadmodel/loadTrainingParams combinations applies, so a
# single if/elif chain reports it.
print('*****************')
if loadmodel and loadTrainingParams:
    print('Will continue training from a previous checkpoint...')
elif loadmodel:
    print('Will load existing model weights but not use trained parameters! (Are you sure?)')
elif loadTrainingParams:
    print('Will initialize new model but use parameters trained from a previous run! (Are you sure?)')
else:
    print('Will start training from scratch...')
# torch.__version__ is the PyTorch release; torch.version.cuda is the CUDA version
# PyTorch was built with (None for CPU-only builds), so the two are labelled separately.
print('using',device, 'pytorch',torch.__version__, 'cuda',torch.version.cuda)

In [None]:
# Let's check out the available conditional parameters first
#*************************************
# Build a paramManager over the data/parameter directories, list the data files it
# reports for datadir, and print the keys of the parameters returned for the first file.
pm = paramManager.paramManager(datadir, paramdir)
datafiles = pm.filenames(datadir)
params = pm.getParams(datafiles[0]) 
print(params.keys())

#note midiPitch has to be scaled since the large raw values interfere with the learning
#(the scaling itself is done by the normalizeDim('midiPitch',lowNote,hiNote) transform given to the datasets)

In [None]:
# Initialize dataset & dataloader
#*************************************
audiocoding = mulawnEncode(output_size,0,1) #initialize the mu-law encodings
targetcoding = mulaw(output_size)
rescalePitch = normalizeDim('midiPitch',lowNote,hiNote)
#rescaleAmp = normalizeDim('volume',0,0.9)
cond_size = len(prop)

# Keyword arguments common to both datasets / both loaders
_dataset_kwargs = dict(datadir=datadir, extension='wav', paramdir=paramdir, prop=prop)
_loader_kwargs = dict(shuffle=True, num_workers=4, drop_last=True)

# Training data: noise injection + mu-law encoding on the input, mu-law class index as target
adataset = dataloader.AudioDataset(
    sr, seqLen, stride,
    transform=transform.Compose([
        injectNoise(weight=noise),
        audiocoding,
        array2tensor(torch.FloatTensor),
    ]),
    param_transform=transform.Compose([
        rescalePitch,
        dic2tensor(torch.FloatTensor),
    ]),
    target_transform=transform.Compose([
        targetcoding,
        array2tensor(torch.LongTensor),
    ]),
    **_dataset_kwargs)

# Priming data for generation: audio is only converted to a tensor here (no noise, no mu-law)
testdataset = dataloader.AudioDataset(
    sr, seqLen, stride,
    transform=transform.Compose([
        array2tensor(torch.FloatTensor),
    ]),
    param_transform=transform.Compose([
        rescalePitch,
        dic2tensor(torch.FloatTensor),
    ]),
    target_transform=transform.Compose([
        array2tensor(torch.LongTensor),
    ]),
    **_dataset_kwargs)

train_loader = torch.utils.data.DataLoader(dataset=adataset, batch_size=batch_size, **_loader_kwargs)

test_loader = torch.utils.data.DataLoader(dataset=testdataset, batch_size=1, **_loader_kwargs)

In [None]:
#Just to visualize the audio encoding in dataloader

def _plot_strip(signal):
    """Plot signal against its sample index on a wide, short figure with a grid."""
    plt.figure(figsize=(20,1))
    plt.plot(np.arange(len(signal)), signal)
    plt.grid()
    plt.show()


print("size of dataset is",len(adataset))
print("no. of steps per epoch is",len(adataset)//batch_size)

# 1) a randomly picked section of audio from the dataset
samp = adataset.rand_sample()
print("shape of audio seq is",samp.shape)
_plot_strip(samp)

# 2) the same section plus noise drawn between the signal's own min and max
#    (to get roughly constant signal-to-noise ratio across data samples)
print("audio seq + 10% noise")
amplitude_noise = np.random.uniform(samp.min(), samp.max(), size=len(samp)).reshape(-1,1)
samp2 = samp + 0.1 * amplitude_noise
_plot_strip(samp2)

# 3) alternatively noise drawn from a fixed [-1, 1] range
#    (constant=True) - see injectNoise() in transforms.py
print("audio seq + fixed noise")
fixed_noise = np.random.uniform(-1, 1, size=len(samp)).reshape(-1,1)
samp3 = samp + 0.1 * fixed_noise
_plot_strip(samp3)

# 4) the noisy section after mu-law encoding
print("audio seq  + 10% noise + mu-law encoding -> this is our input")
samp_coded = audiocoding(samp2)
_plot_strip(samp_coded)

In [None]:
# Sanity check: print the first sequence of the first training batch so the
# input values (audio + parameters) can be inspected by eye
for step, batch in enumerate(train_loader):
    inp, target = batch
    print(inp[0])
    break

In [None]:
# Define the training cycle
#*************************************
def _save_checkpoint(model,epoch,steps_done):
    """Save model + optimizer state and all training parameters into one .tar file.

    epoch      : epoch index within the current run
    steps_done : no. of steps completed in this epoch
    Both are offset by start_epoch/start_step before being stored and used in the
    filename <savemodeldir>/<start>_<savename>_epoch<e>_step<s>.tar
    """
    filename = '{}_{}_epoch{}_step{}.tar'.format(
        start,savename,start_epoch+epoch,start_step+steps_done)
    torch.save({ #the training parameters that will be saved
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': list_of_losses,
        'epoch': start_epoch+epoch,
        'step': start_step+steps_done,
        'sample_rate': sr,
        'start_time': start,
        'datadir': datadir,
        'paramdir': paramdir,
        'savemodeldir': savemodeldir,
        'seqLen': seqLen,
        'stride': stride,
        'batch_size': batch_size,
        'num_epochs': num_epochs,
        'lr': lr,
        'log_interval': log_interval,
        'max_steps': max_steps,
        'teacher_forcing_ratio': teacher_forcing_ratio,
        'prop': prop,
        'output_size': output_size,
        'hidden_size': hidden_size,
        'n_layers': n_layers,
        'noise': noise,
        'lowNote': lowNote,
        'hiNote': hiNote
        },
        '{}/{}'.format(savemodeldir,filename))
    print('model {} saved'.format(filename))


def train(model,epoch):
    """Train model for one pass over train_loader (or until a stop criterion fires).

    model : network to train; called as model(input, hidden, batch_size)
    epoch : index of the current epoch (used for logging and checkpoint naming)

    Relies on the notebook globals (train_loader, optimizer, criterion, device, ...).
    The pass ends early once max_steps steps have been done in this epoch or the
    logged loss drops below loss_target. Checkpoints are written only if savemodel
    is True: every savemodel_interval steps (when non-zero) and when stopping early.

    Returns True if the pass ended on a stop criterion, False if the loader was
    exhausted, so the caller can decide whether to run further epochs.
    """
    model.train() #put in training mode
    ave_loss_over_steps = 0
    current_loss = 10000 #set at a large initial value
    #np.random.random() < ratio is always True for ratio >= 1, so the model's own
    #prediction is never fed back and need not be decoded at every time step
    needs_prediction = teacher_forcing_ratio < 1

    for step, (inp,target) in enumerate(train_loader):
        steps_done = step+1
        inp, target = inp.to(device), target.to(device)

        # Forward + Backward + Optimize
        hidden = model.init_hidden(batch_size).to(device)
        optimizer.zero_grad()
        loss = 0

        for i in range(seqLen):
            #similar to Bengio et al, Scheduled Sampling for Sequence Prediction with Recurrent Neural Networks
            #the first time step always uses the target since there is no prediction yet
            use_teacher_forcing = np.random.random() < teacher_forcing_ratio or i==0

            if use_teacher_forcing: #feed the target as the next input
                step_inp = inp[:,i,:]  #input dim: (batch, seq, feature)
            else: #feed its own predictions (output of t-1) as next input
                step_inp = inp[:,i,:].clone()
                step_inp[:,0] = torch.squeeze(torch.tensor(audiocoding(predicted_sample),
                                                           dtype=torch.float,device=device,requires_grad=True),1)

            outputs, hidden = model(step_inp,hidden,batch_size)
            loss += criterion(outputs, torch.squeeze(target[:,i],1))

            if needs_prediction:
                outputs = nn.functional.log_softmax(outputs,dim=1)
                topv, topi = outputs.detach().topk(1)  #choose the strongest activation
                predicted_sample = targetcoding.index2float(topi)

        loss.backward()
        optimizer.step()

        ave_loss_per_sample = loss.item()/seqLen   #over each minibatch
        ave_loss_over_steps += ave_loss_per_sample

        if steps_done % log_interval == 0:
            current_loss = ave_loss_over_steps/log_interval
            print ('{:%Y-%m-%d %H:%M:%S} Epoch [{}/{}], Step [{}/{}] Loss: {:.4f}'.format(
                datetime.now(), epoch, num_epochs, steps_done, len(adataset)//batch_size, current_loss))

            list_of_losses.append(current_loss)
            ave_loss_over_steps = 0

        if steps_done % evaluate_interval == 0:
            result, hs = generate(model,max_length)
            plot_signal(result,start=seqLen-1,start_min_max=[-.5,.5])
            model.train() #put model back to training mode

        #steps_done counts completed steps, so training stops after exactly max_steps steps
        stop_reached = steps_done >= max_steps or current_loss < loss_target
        interval_reached = savemodel_interval != 0 and steps_done % savemodel_interval == 0

        #a single save covers the case where an interval save and the stop coincide
        if savemodel and (interval_reached or stop_reached):
            _save_checkpoint(model,epoch,steps_done)

        if stop_reached:
            return True

    return False
 

 
def generate(model,max_length,primer=None,paramvect=None,returnHiddenSequence=False):
    """Generate max_length samples with model, after priming it on one sequence.

    model                : network called as model(input, hidden) -> (outputs, hidden)
    max_length           : no. of samples to generate after the primer
    primer               : optional tensor used in place of the batch drawn from test_loader
                           (it is cloned, so the caller's tensor is left untouched)
    paramvect            : optional array, or callable taking the generation step index and
                           returning an array, written over the conditioning features
                           (columns 1 onwards) of the next input
    returnHiddenSequence : if True the hidden state is recorded before priming and after
                           every generated sample

    Returns (signal, hidden_states): the un-encoded primer audio followed by the generated
    samples, and the list of recorded hidden states (empty if returnHiddenSequence=False).
    The model is left in eval mode.
    """
    hidden_states = []

    def _to_device_float(array):
        #numpy array -> float tensor on the working device
        return torch.from_numpy(array).type(torch.FloatTensor).to(device)

    model.eval()
    with torch.no_grad():
        #only the first batch of the loader is used
        for prime, _target in test_loader:
            if primer is not None:
                # must clone else primer is changed outside this function
                prime.data = primer.clone()
            raw_audio = np.copy(prime[0,:,0])  #keep the original (un-encoded) samples
            #overwrite the audio column with its mu-law encoding for the network
            prime[0,:,0] = array2tensor(torch.FloatTensor)(audiocoding(raw_audio))
            break
        signal = raw_audio
        prime = prime.to(device)

        state = model.init_hidden().to(device)
        if returnHiddenSequence:
            hidden_states.append(torch.squeeze(state).cpu().numpy())

        #build up the hidden state on all but the last primer step
        #(the range is empty when the primer has a single step)
        for t in range(prime.shape[1]-1):
            _, state = model(prime[:,t,:],state)
        current = prime[:,-1,:]  #the last primer value starts the actual generation

        for t in range(max_length):
            logits, state = model(current,state)
            log_probs = nn.functional.log_softmax(logits,dim=1)
            _, best = log_probs.detach().topk(1)  #choose the strongest activation
            sample = targetcoding.index2float(best)

            signal = np.append(signal,sample)

            #the encoded prediction becomes the audio feature of the next input
            current[:,0] = _to_device_float(audiocoding([sample]))
            if paramvect is not None:
                next_params = paramvect(t) if callable(paramvect) else paramvect
                current[:,1:] = _to_device_float(next_params)

            if returnHiddenSequence:
                hidden_states.append(torch.squeeze(state).cpu().numpy())

        return signal, hidden_states
        

In [None]:
# Initialize the network, optimizer and objective func
#*************************************
model = RNN(input_size=1,cond_size=cond_size,hidden_size=hidden_size,output_size=output_size,n_layers=n_layers).to(device)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)

if loadmodel: # load checkpoint if needed
    print("Loading existing model and optimizer state...")
    #read the checkpoint file once and take both state dicts from it
    checkpoint = torch.load(loadmodelpath, map_location=device) #map_location in case using cpu
    model.load_state_dict(checkpoint['model_state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
    
criterion = nn.CrossEntropyLoss() #applied per time step to the network outputs and the target class indices

In [None]:
# Train!
#*************************************
if not loadTrainingParams:
    #timestamp identifying this run; it prefixes every checkpoint filename
    #(when training params are loaded, start comes from the checkpoint instead)
    start = '{:%Y-%m-%d_%H-%M-%S}'.format(datetime.now())
print('{:%Y-%m-%d %H:%M:%S} Starting training at epoch{} step{}...'.format(datetime.now(),start_epoch,start_step))
start_time = time.monotonic()
for epoch in range(num_epochs):
    #a truthy return value from train() is taken as "stop criterion reached, end training";
    #a train() that returns None simply runs for all epochs
    if train(model,epoch):
        break
elapsed_time = time.monotonic() - start_time
print('Training time taken:',time_taken(elapsed_time))

#End-of-training checkpoint, written only when savemodel is True.
#NOTE(review): savemodel_interval is counted in steps while num_epochs is counted in epochs;
#the second condition is kept as it was in the original code - confirm the intent.
#!NOTE the naming scheme for this saving part in still incorrect (for steps): the step count
#reached inside train() is not visible here, so start_step is used as is
if savemodel and savemodel_interval in (0, num_epochs):
    final_name = '{}_{}_epoch{}_step{}.tar'.format(start,savename,start_epoch+epoch,start_step)
    torch.save({
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'loss': list_of_losses,
        'epoch': start_epoch+epoch,
        'step': start_step,
        'sample_rate': sr,
        'start_time': start,
        'datadir': datadir,
        'paramdir': paramdir,
        'savemodeldir': savemodeldir,
        'seqLen': seqLen,
        'stride': stride,
        'batch_size': batch_size,
        'num_epochs': num_epochs,
        'lr': lr,
        'log_interval': log_interval,
        'max_steps': max_steps,
        'teacher_forcing_ratio': teacher_forcing_ratio,
        'prop': prop,
        'output_size': output_size,
        'hidden_size': hidden_size,
        'n_layers': n_layers,
        'noise': noise,
        'lowNote': lowNote,
        'hiNote': hiNote
        },
        '{}/{}'.format(savemodeldir,final_name))
    print('model {} saved'.format(final_name))

In [None]:
# Plot the loss over time
#*************************************
# one point per log_interval steps (the values collected in list_of_losses)
fig, ax = plt.subplots()
ax.plot(list_of_losses)