In [None]:
import pandas as pd
import numpy as np
import os, datetime

from pyAudioAnalysis import audioBasicIO, ShortTermFeatures
from pydub import AudioSegment

import soundfile

import tensorflow as tf
from tensorflow import keras


This script is used to convert the user's recorded audio (vocal-only) and transform it using a vocal profile of one of the speakers in the datasets. The full conversion is not expected to be a complete vocal replacement but more of a shift towards the chosen speaker's voice.

The first part of this script is very similar to Audio_Properties_Extract: it extracts the properties per sample in the audio recording. (Sample = one sample at the sample rate of an audio file.)
Conversions will be stored in the ~\Vocal_Replace\User_Vocal_Data\User_Transformation folder.

The second part feeds the audio extraction and the vocal profile of the chosen speaker into the model as input variables, from which the model will attempt to predict the amplitude of each sample. Most of the code comes from the Deep_Learning_Two_To_One_Model script.
Final prediction audio will be placed in ~\Vocal_Replace\Model\Model_Outputs folder. There will be two predictions done: one using a model trained on all speakers and another prediction using a model trained only with male or female speakers (depending on the gender of the chosen speaker).

Manual file moving and variable declaration is required before using this script:

1) Move the user's audio file (in WAV file format and ONLY IN MONO not stereo) into the ~\Vocal_Replace\User_Vocal_Data\Recordings folder
    If the user's audio is not in WAV and/or is not a mono channel recording, it is better to use Audacity to manually convert the audio over. (Although there are Python packages that will cover some of the conversions needed, they have limited capabilities on reading different audio files. Audacity is free and widely used, and has a lot of support for almost anything audio-related.)

2) Rename the file with a unique filename

3) Take that filename (with the file extension) and set it to be the value for the variable, user_audio_filename
    user_audio_filename = "audio file.wav"
    
4) Choose a speaker/vocal profile for the vocal replacement and set their ID number as the value for the speaker_ID variable. 
    To see an overview of speakers, look at the SPEAKER_Modified_Final.csv file. To see the vocal profiles, look in the Vocal Profile Summary.csv file. Do note that not all of the speakers in the SPEAKER_Modified_Final.csv have vocal profiles--that is because not all the datasets available were used in this project, so it is better to review the available speakers using the Vocal Profile Summary.csv instead. If the datasets were downloaded and unzipped, you can listen to the speaker's voice by playing the audio file in that dataset. The dataset that the speaker belongs to is listed in the SPEAKER_modified_final.csv file.
    
Note: One audio recording will take up a lot of storage, especially RAM! A ~8 minute recording will take up 10-12 GB of storage and RAM!

In [None]:
# ============================== MANUAL INPUT: EDIT THIS CELL ==============================

# File name (including the .wav extension) of the user's vocal recording.
# The file is looked up in the Vocal_Replace\User_Vocal_Data\Recordings folder.
user_audio_filename = "Fire and Ice - Robert Frost - Recording.wav"

# SPEAKER_ID of the vocal profile the recording should be shifted towards.
# It is matched against the SPEAKER_ID column of "Vocal Profile Summary.csv".
speaker_ID = 61

# ==========================================================================================

In [None]:
# Setting up the filepaths to the user's audio file:
# The directory the notebook is run from is treated as the project root; every
# other path in this notebook is built relative to it.
current_directory = os.getcwd()

# os.path.join inserts the separator of the host OS, so the same folder layout
# (Vocal_Replace/User_Vocal_Data/Recordings) resolves on Windows and elsewhere
# instead of relying on hard-coded backslashes.
user_audio_filepath = os.path.join(current_directory, 'Vocal_Replace', 'User_Vocal_Data', 'Recordings',
                                   user_audio_filename)

In [None]:
# Audio Extraction:

###### Loading in converted audio file:
sample_rate, audio_amplitude = audioBasicIO.read_audio_file(user_audio_filepath)

###### This notebook only supports mono recordings. Check that up front, because the
###### extraction below uses step= 1 and is very expensive to run on an unusable signal.
if np.ndim(audio_amplitude) != 1 or np.size(audio_amplitude) == 0:
    raise ValueError("Expected a non-empty mono recording, but '{}' was loaded with shape {}. "
                     "Convert the file to a mono WAV first.".format(user_audio_filename,
                                                                     np.shape(audio_amplitude)))

###### "window" = 512, "step" = 1
###### NOTE(review): pyAudioAnalysis appears to take both values in samples, which would
###### give one feature row per audio sample -- confirm against the installed version.
feature_values, feature_names = ShortTermFeatures.feature_extraction(audio_amplitude, sample_rate, window= 512, step= 1)

###### Creating a DF of the features extracted in the previous line of code:
###### The features and their definitions are found here (https://github.com/tyiannak/pyAudioAnalysis/wiki/3.-Feature-Extraction)
###### feature_values holds one row per feature name, so each (name, row) pair becomes a column.
###### The frame is built in a single call instead of being grown one column at a time.
feature_df = pd.DataFrame(dict(zip(feature_names, feature_values)))

###### The sample rate is stored alongside the features (same value on every row) so that it
###### can be recovered later from the CSV alone.
feature_df['SAMPLE_RATE'] = sample_rate

###### The index is written as the first CSV column; the reload further down reads it back with index_col= 0.
feature_df.to_csv(os.path.join(current_directory, 'Vocal_Replace', 'User_Vocal_Data', 'User_Transformation',
                               user_audio_filename + ' - Audio Extraction.csv'))

In [None]:
# Loading in the Vocal Profiles, converting genders, and selecting only the chosen speaker according to the input speaker_ID:
vocal_profile_df = pd.read_csv(os.path.join(current_directory, 'Vocal_Replace', 'Data', 'Modified_Data',
                                            'Vocal_Profiles', 'Vocal Profile Summary.csv'),
                               index_col=0)

# Converting gender into numeric: 'M' -> 1, every other value -> 0.
# Done as one vectorised comparison rather than a row-by-row loop with .loc writes,
# which is slow and would overwrite several rows at once if index labels repeat.
vocal_profile_df['GENDER'] = (vocal_profile_df['GENDER'] == 'M').astype(np.int64)

vocal_profile_selected = vocal_profile_df[vocal_profile_df.SPEAKER_ID == speaker_ID]

# Fail here with a clear message instead of letting an empty selection surface later
# as an IndexError when the first profile row is read.
if vocal_profile_selected.empty:
    raise ValueError("speaker_ID {!r} has no entry in 'Vocal Profile Summary.csv'; "
                     "choose a SPEAKER_ID listed in that file.".format(speaker_ID))

In [None]:
# Loading in trained models and predicting an audio file:

# Reloading in the feature_df file:
# Reading the CSV back means this cell can run without the in-memory result of the
# extraction cell. The first CSV column is the index written by to_csv, hence index_col= 0.
# The path is built with os.path.join so it does not depend on Windows separators.
feature_df = pd.read_csv(os.path.join(current_directory, 'Vocal_Replace', 'User_Vocal_Data', 'User_Transformation',
                                      user_audio_filename + ' - Audio Extraction.csv'),
                         index_col= 0)

# Loading in the trained models and definitions of custom metrics used:
def custom_metric_mean(y_true, y_pred):
    """Return mean(y_true) - mean(y_pred), each mean taken over all elements.

    The result is signed: positive when the targets average higher than the
    predictions, negative in the opposite case. Want this to be as close to 0
    as possible.
    """
    backend = keras.backend
    true_mean = backend.mean(y_true, axis=None, keepdims=False)
    predicted_mean = backend.mean(y_pred, axis=None, keepdims=False)
    return true_mean - predicted_mean

# Defining custom metric:
def custom_metric_difference(y_true, y_pred):
    """Return sum(y_true) - sum(y_pred), each sum taken over all elements.

    The result is signed: positive when the targets add up to more than the
    predictions, negative in the opposite case. Want this to be as close to 0
    as possible.
    """
    backend = keras.backend
    true_total = backend.sum(y_true, axis=None, keepdims=False)
    predicted_total = backend.sum(y_pred, axis=None, keepdims=False)
    return true_total - predicted_total

# Loading in the pre-trained full model (the one saved as Saved_Model_Full.h5):
# Only this model is loaded here; no male-only or female-only model is loaded in this cell.
#
# custom_objects maps serialized names to the callables/classes needed to rebuild the model.
# 'PReLU' is mapped to the layer CLASS (not an instance), which is the form Keras documents
# for custom_objects.
model_full_loaded = tf.keras.models.load_model(
    os.path.join(current_directory, 'Vocal_Replace', 'Model', 'Saved_Models', 'Saved_Model_Full.h5'),
    custom_objects={"custom_metric_mean": custom_metric_mean,
                    'custom_metric_difference': custom_metric_difference,
                    'PReLU': keras.layers.PReLU},
    compile=False)

# The model is loaded uncompiled (compile=False) and compiled explicitly here with
# Adam, mean squared error loss and the two custom metrics.
model_full_loaded.compile(optimizer= keras.optimizers.Adam(learning_rate=0.001), loss= 'mean_squared_error',
                          metrics= [custom_metric_mean, custom_metric_difference])



# Repeating the chosen speaker's profile so that it has one identical row per row of feature_df.
# Column order matters: it is the order in which the values are handed to the model.
profile_columns = ['SPEAKER_ID', 'GENDER', 'MIN_RANGE', 'MAX_RANGE', 'AVERAGE_RANGE', 'STD_RANGE', 'MEDIAN_RANGE']

# Scalar taken from the first selected row for each profile column.
first_profile_values = {column: vocal_profile_selected[column].values[0] for column in profile_columns}

# Both model inputs are passed on as plain numpy arrays:
audio_input = feature_df.values

# Each scalar is repeated across an index as long as feature_df, then the frame is
# reduced to its underlying array.
extended_vocal_profile = pd.DataFrame(first_profile_values, index=range(len(feature_df))).values

# Predicting & converting the predictions into a WAV file:
# The model takes two inputs, in this order: the per-row audio features and the
# repeated vocal profile of the chosen speaker.
model_full_prediction = model_full_loaded.predict([audio_input, extended_vocal_profile])

# Output goes to Vocal_Replace/Model/Model_Outputs; the file name records the speaker used
# and the source recording. os.path.join keeps the path independent of Windows separators.
output_filepath = os.path.join(current_directory, 'Vocal_Replace', 'Model', 'Model_Outputs',
                               'Full Model - Transformed With ' + str(speaker_ID) + ' - ' + user_audio_filename)

# SAMPLE_RATE holds the same value on every row, so the first row is read positionally
# (.iloc) and cast to a plain Python int before being handed to soundfile.
# NOTE(review): soundfile expects floating-point data in [-1.0, 1.0]; if the model was
# trained on integer-scaled amplitudes the predictions may need rescaling -- confirm.
soundfile.write(file= output_filepath,
                data= model_full_prediction,
                samplerate= int(feature_df.SAMPLE_RATE.iloc[0]))


    
