In [None]:
import pandas as pd
import numpy as np
import os, datetime

import soundfile

import tensorflow as tf
from tensorflow import keras


This script creates and trains two neural network models that converge into one full model.

The first model takes in the vocal profile of the speaker, and the second model takes in the audio properties of a sample from an audio file belonging to the speaker of that vocal profile.

Once the training is completed, testing can be done on vocal profiles and audio files set aside.

If the model can mostly reproduce the original audio file, then all of the testing data will also be used to train the model so that it can be improved further. The main purpose of this is to add more vocal profiles to the model, so that it might learn more vocal profiles and become better at the main goal of the project: replacing/transforming the user's voice into the speaker's voice.

With the final model, the vocal profile will either be custom-made or be taken from one of the speakers in the database.

Note: Because the files with the audio properties are so large, the training will be done one file at a time.

In [None]:
current_directory = os.getcwd()

# Creating Model directories:
# os.makedirs also creates any missing parent folder (e.g. 'Vocal_Replace') and, with
# exist_ok=True, silently accepts a directory that is already there, so this cell can be
# re-run safely and works on a fresh checkout.
for model_subdirectory in ('', 'Saved_Models', 'Model_Outputs', 'Trained_Speakers'):
    os.makedirs(current_directory + '\\Vocal_Replace\\Model\\' + model_subdirectory, exist_ok=True)
  

In [None]:
# Loading the speakers DF, which lists the speakers whose audio properties have been fully extracted:
analyzed_speakers_df = pd.read_csv(current_directory + '\\Vocal_Replace\\Data\\Modified_Data\\Finished Speakers.csv')
analyzed_speakers_list = list(analyzed_speakers_df['SPEAKER_ID'])

# Loading the speakers DF to associate the gender to the speaker ID:
speaker_df = pd.read_csv(current_directory + "\\Vocal_Replace\\Data\\Modified_Data\\SPEAKER_Modified_Final.csv")

# Creating a DF of speaker_df that contains only the speaker IDs from analyzed_speakers_list:
access_speakers_df = speaker_df[speaker_df.ID.isin(analyzed_speakers_list)]


# Creating a DF of speakers that have been used to train the model and how many times the training has been
# done on that speaker:
speakers_trained_filepath = (current_directory
                             + '\\Vocal_Replace\\Model\\Trained_Speakers\\Speakers - Model Training.csv')

if os.path.exists(speakers_trained_filepath):
    speakers_trained_df = pd.read_csv(speakers_trained_filepath, index_col=0)

    # Updating the DF with speakers that have been newly extracted.
    # The missing IDs are collected first (in their original order, each one only once) and
    # appended with a single concat, instead of re-building the DataFrame once per speaker.
    known_speakers = set(speakers_trained_df.SPEAKER_ID.values)
    new_speakers = []
    for speaker in analyzed_speakers_list:
        if speaker not in known_speakers:
            known_speakers.add(speaker)
            new_speakers.append(speaker)

    if new_speakers:
        new_rows = pd.DataFrame({'SPEAKER_ID': new_speakers, 'AMOUNT_TRAINED': 0})
        speakers_trained_df = pd.concat([speakers_trained_df, new_rows], ignore_index=True)

else:
    # No bookkeeping file yet: every analyzed speaker starts with zero training.
    speakers_trained_df = pd.DataFrame({'SPEAKER_ID': analyzed_speakers_list, 'AMOUNT_TRAINED': 0})

In [None]:
# Custom metric: signed gap between the mean of the targets and the mean of the predictions.
def custom_metric_mean(y_true, y_pred):
    """Return mean(y_true) - mean(y_pred), each mean taken with axis=None.

    The closer the result is to 0, the better.
    """
    target_mean = keras.backend.mean(y_true, axis=None, keepdims=False)
    prediction_mean = keras.backend.mean(y_pred, axis=None, keepdims=False)
    return target_mean - prediction_mean

# Custom metric: signed gap between the sum of the targets and the sum of the predictions.
def custom_metric_difference(y_true, y_pred):
    """Return sum(y_true) - sum(y_pred), each sum taken with axis=None.

    The closer the result is to 0, the better.
    """
    target_total = keras.backend.sum(y_true, axis=None, keepdims=False)
    prediction_total = keras.backend.sum(y_pred, axis=None, keepdims=False)
    return target_total - prediction_total
    

# Defining Model:
def create_model():
    """Build and compile the two-input model that predicts a single amplitude value.

    One branch receives the 69 audio-property features, the other the 7 vocal-profile
    features. The last layer of each branch is concatenated, passed through three more
    dense layers and reduced to one linear unit named 'Output_Amplitude'.

    Returns:
        The keras.Model, compiled with Adam (learning rate 0.001), mean-squared-error loss
        and the metrics custom_metric_mean / custom_metric_difference.
    """
    def prelu_dense(units, tensor, **prelu_options):
        # The PReLU instance is built first and handed to the Dense layer as its activation;
        # the Dense layer is then applied to `tensor` straight away.
        prelu = keras.layers.PReLU(**prelu_options)
        dense = keras.layers.Dense(units, activation=prelu, kernel_initializer='RandomNormal')
        return dense(tensor)

    ## Model inputs:
    audio_features = keras.Input(shape=(69,), name='Audio_Properties_Input')
    vocal_features = keras.Input(shape=(7,), name='Vocal_Profile_Input')

    ## Audio Properties Model: 69 -> 900 -> 700 -> 500 -> 400 -> 300 -> 100
    audio_branch = prelu_dense(69, audio_features, input_shape=(69,))
    for width in (900, 700, 500, 400, 300, 100):
        audio_branch = prelu_dense(width, audio_branch)

    ## Vocal Profile Model: 7 -> 20 -> 100
    vocal_branch = prelu_dense(7, vocal_features, input_shape=(7,))
    for width in (20, 100):
        vocal_branch = prelu_dense(width, vocal_branch)

    ## Merging Audio Model + Vocal Model: concat -> 200 -> 200 -> 100
    merged_branch = keras.layers.concatenate([audio_branch, vocal_branch])
    for width in (200, 200, 100):
        merged_branch = prelu_dense(width, merged_branch)

    ## Output Layer:
    output_layer = keras.layers.Dense(1, activation='linear', kernel_initializer='RandomNormal',
                                      name='Output_Amplitude')
    output_amplitude = output_layer(merged_branch)

    ## Molding the Model together and compiling it:
    model = keras.Model(inputs=[audio_features, vocal_features], outputs=output_amplitude)
    model.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
                  loss='mean_squared_error',
                  metrics=[custom_metric_mean, custom_metric_difference])

    return model

In [None]:
# Training using TensorFlow:

# Restore the previously saved full model when one exists; otherwise build a fresh one.
# The saved model is loaded with compile=False and compiled again right away because, per the
# original author's note, TensorFlow was unable to save/load the custom metrics. The author also
# observed that a restored model starts off performing better than a new one, so the saved model
# seems to be intact.
# The HDF5 format is used because the author found the default & recommended SavedModel format
# extremely unstable.
saved_model_filepath = current_directory + '\\Vocal_Replace\\Model\\Saved_Models\\Saved_Model_Full.h5'

if not os.path.exists(saved_model_filepath):
    model_full_loaded = create_model()

else:
    restore_objects = {"custom_metric_mean": custom_metric_mean,
                       'custom_metric_difference': custom_metric_difference,
                       'PReLU': keras.layers.PReLU()}
    model_full_loaded = tf.keras.models.load_model(saved_model_filepath,
                                                   custom_objects=restore_objects,
                                                   compile=False)
    # Same optimizer, loss and metrics as create_model() uses.
    model_full_loaded.compile(optimizer=keras.optimizers.Adam(learning_rate=0.001),
                              loss='mean_squared_error',
                              metrics=[custom_metric_mean, custom_metric_difference])
    
    

# Loading in the summary of vocal profiles:
vocal_profile_df = pd.read_csv(current_directory
                               + '\\Vocal_Replace\\Data\\Modified_Data\\Vocal_Profiles\\Vocal Profile Summary.csv',
                               index_col=0)

# Converting gender into numeric: 1 where GENDER is 'M', 0 for every other value.
# A single vectorized comparison replaces the row-by-row loop; it is positional, so it stays
# correct even if the index read from the CSV contains repeated labels.
vocal_profile_df['GENDER'] = (vocal_profile_df['GENDER'] == 'M').astype(np.int64)


# Accessing all analyzed audio & training on it one chunk at a time (Keras prints the progress bar):
EPOCHS_PER_CHUNK = 100   # epochs run by each fit() call; also the amount added to AMOUNT_TRAINED
BATCH_SIZE = 4000
CHUNK_ROWS = 100000      # number of CSV rows read per chunk of an audio-properties file

# Columns fed to the 'Vocal_Profile_Input' branch, in this order (7 values per sample):
VOCAL_PROFILE_COLUMNS = ['SPEAKER_ID', 'GENDER', 'MIN_RANGE', 'MAX_RANGE',
                         'AVERAGE_RANGE', 'STD_RANGE', 'MEDIAN_RANGE']

with tf.device('/GPU:0'):
    for _, speaker_row in access_speakers_df.iterrows():
        speaker_ID = speaker_row.ID

        ## This is to prevent over-training on speakers that are trained early--allowing all speakers to be
        ## trained equally: only speakers at or below the current average training amount are trained.
        times_trained = speakers_trained_df.AMOUNT_TRAINED[speakers_trained_df.SPEAKER_ID == speaker_ID].values[0]
        if not (times_trained <= np.average(speakers_trained_df.AMOUNT_TRAINED)):
            continue

        # Folder name used by the audio-properties files:
        gender = 'Male' if speaker_row.SEX == 'M' else 'Female'

        ### Loading in the summary vocal profile of a speaker:
        vocal_profile = vocal_profile_df[vocal_profile_df.SPEAKER_ID == speaker_ID]
        if vocal_profile.empty:
            # Without a profile there is nothing to feed the vocal branch; report and move on
            # instead of failing with an IndexError in the middle of the run.
            print('Vocal Profile Not Present; Speaker: ', speaker_ID)
            continue

        # The profile is identical for every sample of this speaker, so extract it once here
        # and only repeat it to the chunk length inside the loop below.
        vocal_profile_row = vocal_profile[VOCAL_PROFILE_COLUMNS].values[0]

        ### Loading in the audio properties of a speaker:
        filepath_audio_properties = (current_directory + '\\Vocal_Replace\\Data\\Modified_Data\\Audio_Properties\\'
                                     + gender + '\\' + str(speaker_ID) + ' - Audio Properties.csv')

        if os.path.exists(filepath_audio_properties):
            audio_properties_load = pd.read_csv(filepath_audio_properties, index_col=0, chunksize=CHUNK_ROWS)

            #### Training the model:
            for audio_properties in audio_properties_load:
                label = audio_properties['AMPLITUDE'].values

                audio_input = audio_properties.drop(['SPEAKER_ID', 'SPEAKER_GENDER', 'FILENAME', 'AMPLITUDE'],
                                                    axis=1).values

                # One copy of the speaker's vocal profile per audio sample in this chunk:
                extended_vocal_profile = np.tile(vocal_profile_row, (len(audio_properties), 1))

                print(speaker_ID)

                # Only the full model is trained: it is the only model this notebook creates,
                # loads or saves. (Gender-specific models were referenced here before but never
                # defined, which raised a NameError on the first chunk.)
                model_full_loaded.fit(x=[audio_input, extended_vocal_profile],
                                      y=label,
                                      epochs=EPOCHS_PER_CHUNK,
                                      batch_size=BATCH_SIZE)

            ### Updating the number of times the speaker was used to train the model
            ### (first matching row only):
            trained_rows = speakers_trained_df.index[speakers_trained_df.SPEAKER_ID == speaker_ID]
            speakers_trained_df.loc[trained_rows[0], 'AMOUNT_TRAINED'] += EPOCHS_PER_CHUNK

        else:
            print('Audio Properties File Not Present; Speaker: ', speaker_ID)

        # Exporting updated models and speakers trained:
        speakers_trained_df.to_csv(current_directory + '\\Vocal_Replace\\Model\\Trained_Speakers\\Speakers - Model Training.csv')

        model_full_loaded.save(current_directory + '\\Vocal_Replace\\Model\\Saved_Models\\Saved_Model_Full.h5',
                               overwrite=True, include_optimizer=True)
        


In [None]:
# Optional: writes a diagram of the model (with layer shapes) to a PNG file. This is a bit buggy and
# might require installation of other packages (Keras' plot_model relies on pydot and Graphviz):
keras.utils.plot_model(model_full_loaded,
                       to_file='multi_input_and_output_model.png',
                       show_shapes=True)