<a href="https://colab.research.google.com/github/Birkbeck/bsc-computer-science-project-2021_22-mohammadreza490/blob/main/project_notebook.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Setting up paths and importing libraries

In [9]:
# Set the following variables before running the rest of the notebook.
# PATH_TO_SRC_FOLDER is appended to sys.path further down so that the project
# modules (config_handler, dataset_handler, model_handler, ...) can be imported.
PATH_TO_SRC_FOLDER = None
# PATH_TO_PROJECT_FOLDER is passed to Config_Handler.init(...) further down.
PATH_TO_PROJECT_FOLDER = None

In [2]:
# If the unzipped project is stored in Google Drive, mount the drive in Colab
# at /content/gdrive as shown below.
from google.colab import drive
drive.mount("/content/gdrive")

Mounted at /content/gdrive


In [None]:
!pip install musdb
!pip install pydub
!pip install tensorflow-addons
!pip install ipdb
!pip install museval
!pip install librosa

import os
import numpy as np
import librosa
import librosa.display
import matplotlib.pyplot as plt
import tensorflow as tf
import pandas as pd
import IPython.display as ipd
import musdb
import subprocess
import pydub
import tensorflow_addons as tfa
import time
import shutil
import ipdb
import scipy as sp
import soundfile as sf
import museval
import random

In [5]:
# Load IPython's autoreload extension (configured in the next cell).
%load_ext autoreload

In [6]:
# Mode 2: re-import all modules before each cell is executed, so edits to the
# project's .py files take effect without restarting the kernel.
%autoreload 2

In [11]:
import sys

# The project modules imported in the next cell live in PATH_TO_SRC_FOLDER, so
# that folder must be on sys.path. Fail early with a clear message instead of
# appending None and getting a ModuleNotFoundError later.
if PATH_TO_SRC_FOLDER is None:
    raise ValueError(
        "PATH_TO_SRC_FOLDER is not set: assign the path of the project's source "
        "folder in the cell that defines it, then re-run this cell."
    )

# Only strings on sys.path are honoured by the import system, so path-like
# values (e.g. pathlib.Path) are converted first.
_src_folder = str(PATH_TO_SRC_FOLDER)
if _src_folder not in sys.path:
    sys.path.append(_src_folder)  # https://stackoverflow.com/questions/48905127/importing-py-files-in-google-colab

In [12]:
from config_handler import Config_Handler
from dataset_handler import Dataset_Handler
from model_architecture_builder import Model_Architecture_Builder
from model_handler import Model_Handler
from printer import Printer
from visualiser import Visualiser
from wav_file_handler import Wav_File_Handler

In [None]:
# Initialise the project configuration with the project folder path set at the
# top of the notebook; the path getters used by data_generator come from
# Config_Handler.
Config_Handler.init(PATH_TO_PROJECT_FOLDER)

## Training a model (same as model_shfl_vocal_mdl6)


In [24]:
def loss_function(y_true:np.array, y_pred:np.array, alpha:float=1.0)->float:
    """
    Weighted L1 loss over the two target channels, as described in the paper:

        loss = alpha * L(vocal, channel_vocal) + (1 - alpha) * L(acc, channel_acc)

    L() is the mean absolute error over every pixel. Index 0 of the last axis
    is used as the vocal channel and index 1 as the accompaniment channel.
    With alpha = 1.0 (the default) the accompaniment term is multiplied by
    zero, so only the vocals are optimised.

    L1 loss reference: https://afteracademy.com/blog/what-are-l1-and-l2-loss-functions
    """
    def channel_l1(channel_index):
        # mean absolute difference for one slice of the last axis
        return tf.reduce_mean(tf.abs(y_true[..., channel_index] - y_pred[..., channel_index]))

    vocal_term = alpha * channel_l1(0)
    accompaniment_term = (1 - alpha) * channel_l1(1)
    return vocal_term + accompaniment_term

def learning_rate_scheduler(epoch:int, lr:float)->float:
    """
    Learning-rate schedule: returns 1e-4 when ``epoch`` equals 20 and returns
    the incoming ``lr`` unchanged for every other epoch.
    """
    return 1e-4 if epoch == 20 else lr

def data_generator(model:Model_Handler, batch_size:int=8):
    """
    Yield shuffled (X, y) training batches built from the cached spectrograms.

    Every entry of the training data directory is treated as a track, except
    the two spectrogram folders that are excluded below. Tracks are visited in
    random order; within a track the segment ids are reshuffled before each
    batch and every segment is used exactly once. A batch never mixes segments
    from different tracks, so the last batch of a track may hold fewer than
    ``batch_size`` segments. The generator is finite: it stops after one pass
    over all segments of all tracks.

    Parameters
    ----------
    model : Model_Handler
        Only ``model._model_name`` is read, to skip that model's
        "<name>-spectrograms" folder in the training directory.
    batch_size : int, default 8
        Maximum number of segments per yielded batch; must be >= 1.

    Yields
    ------
    (X, y) : tuple of tensors
        X is the stack of "mixture" spectrograms with a trailing channel axis.
        y stacks the "vocals" and "accompaniment" spectrograms on a new last
        axis and then also gets a trailing axis appended.

    Raises
    ------
    ValueError
        If ``batch_size`` is smaller than 1 (0 would never consume a segment
        and the generator would loop forever).
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be a positive integer, got {batch_size}")

    train_data_dir = Config_Handler.PATH_TO_TRAIN_DATA_DIR()

    # Directory entries that are not tracks and must be skipped.
    non_track_entries = (
        f"{model._model_name}-spectrograms",
        "model_9_vocal_with_model_6_data_generator-spectrograms",
    )
    track_names = [name for name in os.listdir(train_data_dir) if name not in non_track_entries]

    # Segment ids are the strings "0" .. "n-1"; they are the keys used to look
    # segments up in the loaded spectrogram dictionary.
    remaining_ids = {}
    for track_name in track_names:
        mixture_path = os.path.join(train_data_dir, track_name, "mixture.wav")
        segment_count = Wav_File_Handler(audio_path = mixture_path).get_number_of_possible_segments()
        remaining_ids[track_name] = [str(segment_id) for segment_id in range(segment_count)]

    np.random.shuffle(track_names)
    for track_name in reversed(track_names):
        segment_ids = remaining_ids[track_name]
        if not segment_ids:
            # A track without any segment has nothing to yield. Previously such
            # a track was never removed from the work list, which made the
            # generator spin forever without yielding.
            continue

        spectrograms_path = os.path.join(
            Config_Handler.PATH_TO_TEMP_SPECTROGRAMS_FOR_TRAINING_DIR(), track_name, "spectrograms.npy"
        )
        # SECURITY NOTE: allow_pickle=True unpickles the file, which can execute
        # arbitrary code. Presumably these files are only ever written by this
        # project's own preprocessing -- never point this at untrusted data.
        specs = np.load(spectrograms_path, allow_pickle=True).item()

        while segment_ids:
            np.random.shuffle(segment_ids)
            selected_ids = segment_ids[:batch_size]  # fewer than batch_size on the last batch
            segment_ids = segment_ids[batch_size:]

            mixtures = np.array([specs[segment_id]["mixture"] for segment_id in selected_ids])
            vocals = np.array([specs[segment_id]["vocals"] for segment_id in selected_ids])
            accompaniments = np.array([specs[segment_id]["accompaniment"] for segment_id in selected_ids])

            # tf.squeeze drops every size-1 axis, then a trailing axis is added
            # (for X this is the single input channel of the CNN, see the paper).
            X = tf.expand_dims(tf.squeeze(mixtures), -1)
            y = tf.expand_dims(tf.squeeze(np.stack([vocals, accompaniments], axis=-1)), -1)
            if len(X.shape) == 3:
                # A batch with a single spectrogram lost its batch axis in the
                # squeeze above, so put a batch axis of size one back in front.
                X = tf.expand_dims(X, 0)
                y = tf.expand_dims(y, 0)
            yield (X, y)

In [None]:
# Create a Model_Handler for the model named "new_model".
new_model = Model_Handler("new_model")

In [None]:
# Train with the default loss (alpha = 1.0, so only the vocal term counts),
# the spectrogram batch generator and the learning-rate schedule defined above.
new_model.train(loss_function, data_generator, learning_rate_scheduler)

## Load the pretrained model

In [None]:
model_shfl_vocal_mdl6 = Model_Handler("model_shfl_vocal_mdl6")  # load the pretrained vocal model

In [None]:
# To separate a song, either load it yourself (e.g. with librosa.load) and pass
# the array as wav_array, or pass the full path of the song as audio_path.

# NOTE(review): v, a, o are presumably the vocals, the accompaniment and the
# original mixture -- confirm against Model_Handler.predict.
v, a, o = model_shfl_vocal_mdl6.predict()  # set either wav_array or audio_path here

## The four models trained in this project were coded like this:

In [None]:
# One handler per trained model; each one is trained in the cells below with a
# different alpha passed to loss_function.
model_shfl_vocal_mdl6 = Model_Handler("model_shfl_vocal_mdl6")  # only vocals (alpha = 1.0)
model_shfl_accompaniment_mdl6 = Model_Handler("model_shfl_accompaniment_mdl6")  # only accompaniments (alpha = 0.0)
model_shfl_half_alpha_mdl6 = Model_Handler("model_shfl_half_alpha_mdl6")  # half vocals and half accompaniments (alpha = 0.50)
model_shfl_more_vocals_mdl6 = Model_Handler("model_shfl_more_vocals_mdl6")  # more vocals (alpha = 0.707)

In [None]:
# Default alpha = 1.0: the accompaniment term is multiplied by zero, so only
# the vocal channel is optimised.
model_shfl_vocal_mdl6.train(loss_function, data_generator, learning_rate_scheduler)

In [None]:
# alpha = 0.0: the vocal term is multiplied by zero, so only the accompaniment
# channel is optimised.
model_shfl_accompaniment_mdl6.train(lambda y_true, y_pred: loss_function(y_true, y_pred, 0.0), data_generator, learning_rate_scheduler)

In [None]:
# alpha = 0.50: vocal and accompaniment terms are weighted equally.
model_shfl_half_alpha_mdl6.train(lambda y_true, y_pred: loss_function(y_true, y_pred, 0.50), data_generator, learning_rate_scheduler)

In [None]:
# alpha = 0.707: the vocal term gets weight 0.707 and the accompaniment term
# the remaining 0.293.
model_shfl_more_vocals_mdl6.train(lambda y_true, y_pred: loss_function(y_true, y_pred, 0.707), data_generator, learning_rate_scheduler)