## Imports

In [18]:
%load_ext autoreload
%autoreload 2

import tensorflow as tf
from tensorflow.keras.layers import Dense, Dropout, PReLU
from tensorflow.keras.optimizers import Adam

from music_generator.basic.random import generate_dataset
from music_generator.basic.signalproc import SamplingInfo
from music_generator.musical.timing import Tempo
from music_generator.musical.scales import GenericScale
from music_generator.basic.signalproc import mix_at
from music_generator.analysis import preprocessing

from music_generator.musical import scales
import numpy as np
from multiprocessing import Pool
from functools import partial

import matplotlib.pyplot as plt
from IPython.display import Audio
%matplotlib inline

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Filtering lead instrument

## Goal

We are going to generate some music with more than one synthesizer

We will filter out the lead tone using a feed-forward neural network.

```
Model: input wave with 3 instruments -> output wave with 1 instrument
```

We will use an auto-encoder-like setup. In the diagram below, replace the image by a short fragment of 1024 samples (~1/40th of a second) of sound data.

<img src="images/ae.png">

[Image source](https://medium.com/@curiousily/credit-card-fraud-detection-using-autoencoders-in-keras-tensorflow-for-hackers-part-vii-20e0c85301bd)

## Generating the data...

In [3]:
# Sample rate (samples per second) used for both synthesis and playback:
# it feeds SamplingInfo here and the `rate=` argument of every Audio(...) cell.
sr = 44100
sampling_info = SamplingInfo(sr)

In [4]:
# Generate in all keys: collect the note symbols of the chromatic scale built on C.
all_roots = scales.chromatic_scale('C')
# Judging by the printed output, generate(0, 1) yields the twelve notes C..B.
# NOTE(review): the meaning of its two arguments is defined in the project
# library and should be confirmed there.
roots = [n.get_symbol() for n in all_roots.generate(0, 1)]
print(roots)

def generate_dataset_for_root(root):
    """Generate one data set of 32 measures at Tempo(120) in the minor scale on `root`.

    Defined at cell level so it can be passed to `Pool.map`. The return value
    is whatever `generate_dataset` returns; another cell of this notebook
    unpacks it as (score_tracks, audio_tracks, mix).
    """
    # Natural-minor pattern (the notebook text calls this scale "minor");
    # presumably semitone offsets relative to the root.
    minor_intervals = [0, 2, 3, 5, 7, 8, 10]
    tempo = Tempo(120)
    scale = GenericScale(root, minor_intervals)
    return generate_dataset(n_measures=32,
                            tempo=tempo,
                            scale=scale,
                            sampling_info=sampling_info)
    
# Build the data set for every root in parallel worker processes.
# NOTE(review): if generate_dataset draws from a process-global random state,
# forked workers may start from identical state -- confirm that the generated
# data sets really differ from each other.
with Pool(processes=8) as pool:
    datasets = pool.map(generate_dataset_for_root, roots)

# Make one big data set and make sure data is of same size
audio_tracks, mix = preprocessing.combine_datasets(datasets)

['C', 'C#', 'D', 'D#', 'E', 'F', 'F#', 'G', 'G#', 'A', 'A#', 'B']


In [5]:
# Sanity check: the combined mix is a single 1-D array of samples.
mix.shape

(33807948,)

In [6]:
# Sanity check: three rows (presumably one per instrument), each as long as the mix.
audio_tracks.shape

(3, 33807948)

## Training and target

In [7]:
# Arguments for preprocessing.create_training_data_set (called in a later cell).
# Presumably the number of training fragments to create -- confirm in preprocessing.
n_samples = 1024 * 40
# Number of audio samples per fragment.
fragment_length = 1024 * 1
# Network input: the full mix of all instruments.
input_track = mix
# Network target. NOTE(review): presumably index 2 is the lead instrument -- confirm.
target_track = audio_tracks[2]

In [8]:
# Listen to the first 8 seconds of the network input (the full mix).
Audio(data=input_track[:8 * sr], rate=sr)

In [9]:
# Listen to the first 8 seconds of the track the network should reproduce.
Audio(data=target_track[:8 * sr], rate=sr)

<img width='75%' src="images/ae.png">

# Create training set

In [10]:
# Build the training inputs (x) and targets (y) that are later passed to model.fit.
# NOTE(review): how fragments are picked from the tracks is decided inside
# preprocessing.create_training_data_set.
x, y = preprocessing.create_training_data_set(
    n_samples, fragment_length, input_track, target_track
)

In [13]:
# Play a single training input; one fragment is so short that it is barely audible.
Audio(x[0], rate=sr)

# Looping the fragment would make it easier to hear. play_array is not imported
# in the visible cells, so the call is left disabled:
# play_array(np.tile(x[0], 100))

In [14]:
def x_fade_profile(batch_dim):
    """Return a triangular cross-fade window of length `batch_dim`.

    The weight is 0 at index 0, rises linearly to its peak of 1 at
    batch_dim / 2 and falls off linearly again after that.
    """
    half_width = batch_dim / 2
    offsets = np.arange(batch_dim) - half_width
    return 1 - np.abs(offsets) / half_width

def model_predict(model, input_track):
    """Run `model` over `input_track` fragment by fragment and cross-fade the results.

    The track is cut into consecutive fragments twice: once starting at sample
    0 and once shifted by half a fragment. Every predicted fragment is weighted
    with the triangular window from `x_fade_profile`, and the two weighted
    streams are combined with `mix_at` at an offset of half a fragment.

    Parameters
    ----------
    model : Keras model
        Maps batches of shape (n, dim) to batches of the same shape. The
        fragment length `dim` is read from `model.input_shape`, so the function
        does not rely on any notebook-level variable.
    input_track : 1-D numpy array of samples
        Must contain at least two fragments.

    Returns
    -------
    The result of `mix_at` applied to the two weighted prediction streams.

    Raises
    ------
    ValueError
        If `input_track` is shorter than two fragments.
    """
    dim = model.input_shape[-1]
    half = dim // 2

    # One fragment is held back so the half-shifted stream stays inside the track.
    n_batches = len(input_track) // dim - 1
    if n_batches < 1:
        raise ValueError(
            "input_track must contain at least %d samples, got %d"
            % (2 * dim, len(input_track))
        )

    pred_batches = input_track[:n_batches * dim].reshape((-1, dim))
    pred_batches_shifted = input_track[half:n_batches * dim + half].reshape((-1, dim))

    xfp = x_fade_profile(dim)

    # Broadcasting applies the window to every predicted fragment at once.
    x0 = (xfp * model.predict(pred_batches)).reshape(-1)
    x1 = (xfp * model.predict(pred_batches_shifted)).reshape(-1)

    return mix_at(x0, x1, half)

## Time for some (deep) learning: build an auto-encoder-like network

The model is just a simple feed forward neural network

The architecture is that of a simple auto-encoder: the output dimension equals the input dimension. However, the data that we present is different: targets $\neq$ inputs

<img src="images/ae.png">

[Image source](https://medium.com/@curiousily/credit-card-fraud-detection-using-autoencoders-in-keras-tensorflow-for-hackers-part-vii-20e0c85301bd)

In [15]:
# Shape of one input fragment; passed to the first Dense layer.
input_shape = x[0].shape
# Width of one target fragment; read from the targets (y) so the output layer
# matches what the network is trained to produce.
output_shape = y[0].shape[0]

In [19]:
# Feed-forward network: 1024 -> 512 -> output_shape units, with a PReLU
# activation after each of the two hidden Dense layers.
model = tf.keras.models.Sequential([
    Dense(1024, input_shape=input_shape),
    PReLU(),
    Dense(512),
    PReLU(),
    Dense(output_shape),
])
model.compile(optimizer=Adam(), loss='mse')
model.summary()

Model: "sequential"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
dense (Dense)                (None, 1024)              1049600   
_________________________________________________________________
p_re_lu (PReLU)              (None, 1024)              1024      
_________________________________________________________________
dense_1 (Dense)              (None, 512)               524800    
_________________________________________________________________
p_re_lu_1 (PReLU)            (None, 512)               512       
_________________________________________________________________
dense_2 (Dense)              (None, 1024)              525312    
Total params: 2,101,248
Trainable params: 2,101,248
Non-trainable params: 0
_________________________________________________________________


## How does the network sound before training?

In [20]:
# Output of the still-untrained network for the first 15 seconds of the mix.
# Note that model_predict processes the whole mix before the slice is taken.
Audio(data=model_predict(model, mix)[:15 * sr], rate=sr)

## Fit the model in two epochs

In [21]:
# Train on the (input fragment, target fragment) pairs for two passes over the data.
model.fit(x=x, y=y, epochs=2)

Epoch 1/2
Epoch 2/2


<tensorflow.python.keras.callbacks.History at 0x7f385f65ac88>

## Let's test the model

In [22]:
# Compare the same five seconds (0:40 to 0:45): first the raw mix, then the
# trained network's output. model_predict runs over the whole mix here.
display(Audio(data=mix[40 * sr:45 * sr], rate=sr))
display(Audio(data=model_predict(model, mix)[40 * sr:45 * sr], rate=sr))

## Is it overfitted?

Of course it is overfitted on this particular sound of synth and backing track, and it will not work for any other sounds than this. But how well can it predict if we generate a completely new data set using a different scale (Phrygian Dominant, instead of minor)?

In [23]:
# Fresh data set in E Phrygian dominant (a scale not used for training).
phrygian_dominant_scale = GenericScale('E', [0, 1, 4, 5, 7, 8, 10])
(score_tracks_test,
 audio_tracks_test,
 mix_test) = generate_dataset(n_measures=64,
                              tempo=Tempo(120),
                              scale=phrygian_dominant_scale,
                              sampling_info=sampling_info)

In [24]:
# First 15 seconds of the unseen test mix, filtered by the trained model.
# The slice length is derived from sr so it stays correct if the sample rate changes.
Audio(model_predict(model, mix_test[0:15*sr]), rate=sr)