In [1]:
import pandas as pd
import numpy as np
import tensorflow as tf
from tensorflow.keras import layers
from tensorflow.keras.layers.experimental import preprocessing
from sklearn.model_selection import train_test_split
from os import listdir, makedirs
from sklearn.model_selection import KFold
import time
import random

In [2]:
def df_to_dataset(dataframe, shuffle=True, batch_size=32):
    dataframe = dataframe.copy()
    labels = dataframe.pop('targetValue')
    ds = tf.data.Dataset.from_tensor_slices((dict(dataframe), labels))
    if shuffle:
        ds = ds.shuffle(buffer_size=len(dataframe))
    ds = ds.batch(batch_size)
    ds = ds.prefetch(batch_size)
    return ds

def get_normalization_layer(name, dataset):
    # Create a Normalization layer for our feature.
    normalizer = preprocessing.Normalization()
    # Prepare a Dataset that only yields our feature.
    feature_ds = dataset.map(lambda x, y: x[name])
    # Learn the statistics of the data.
    normalizer.adapt(feature_ds)
    return normalizer

def load_subsample_data(sample_array, subsample_array):
    """
    %author Łukasz Ozimek
    Function to use arrays of sample numbers to load in data of subsample audio features.
    
    Parameter:
    sample_array - array of sample numbers
    subsample_array - array of subsample data
    
    Returns:
    final_array - array of subsample data for selected samples
    """
    final_array = np.zeros(shape=(int(len(sample_array)*5),len(subsample_array.columns)))
    idx = 0
    for i in sample_array:
        num = i*5
        for n in range(0,5):
            final_array[idx]=subsample_array[num+n:num+n+1]
            idx+=1
    return final_array

In [3]:
def data_model(path, save_path, save_name_start = 'train', iter_num = 5, epoch_num =20, neurons = 32):
    """
    Function that creates datasets out of csv files and normalizes them using Keras layer preprocessing. Then a model is 
    created     and trained multiple times. Then it's metrics are saved and their summary is printed in the console.
    
    Parameters:
    path - filepath of the csv document with features
    save_path - directory where model metrics will be saved
    save_name_start - beginning of each saved file
    iter_num - number of Kfold splits
    epoch_num - number of epochs
    neurons - number of neurons on first dense layer
    
    Returns:
    df - Padnas DataFrame with best metrics from each cycle
    summary - model summary
    """
    try:
        makedirs(save_path)
    except:
        pass
    save_path = save_path+save_name_start
    file = path
    df = pd.read_csv(file)
    df.drop('Unnamed: 0', axis=1, inplace = True)
    base = list(range(0,int(len(df)/5))) #array of numbers equal to number of whole samples
    
    # Defining list to store best values
    best_loss = []
    best_acc = []
    best_loss_val = []
    best_acc_val = []
    batch_size = 20
    iterator = 0
    kfold = KFold(n_splits=iter_num, shuffle=True)
    
    for train_base, val_base in kfold.split(base):
        train = pd.DataFrame(load_subsample_data(train_base, df),columns=df.columns)
        val = pd.DataFrame(load_subsample_data(val_base, df),columns=df.columns)
        train_ds = df_to_dataset(train, batch_size=batch_size)
        column_list = train.columns[0:-1]
        all_inputs = []
        encoded_features = []
        count = 0
        for header in column_list:
            count += 1
            print('Processed %d column out of %d' % (count, len(column_list)), end="\r", flush=True)
            numeric_col = tf.keras.Input(shape=(1,), name=header)
            normalization_layer = get_normalization_layer(header, train_ds)
            encoded_numeric_col = normalization_layer(numeric_col)
            all_inputs.append(numeric_col)
            encoded_features.append(encoded_numeric_col)

        print("")
        # Build model
        all_features = tf.keras.layers.concatenate(encoded_features)
        x = tf.keras.layers.Dense(neurons, activation="relu")(all_features)
        x = tf.keras.layers.Dropout(0.4)(x)
        output = tf.keras.layers.Dense(1)(x)
        val_ds = df_to_dataset(val, shuffle=False, batch_size=batch_size)
        iterator += 1
        print('Training cycle %d out of %d' % (iterator, iter_num), end="\r", flush=True)
        model = tf.keras.Model(all_inputs, output)
        model.compile(optimizer='adam',
                      loss=tf.keras.losses.BinaryCrossentropy(),
                      metrics=["accuracy"])
        model.fit(train_ds, epochs=epoch_num, validation_data=val_ds, verbose=0)
        df2 = pd.DataFrame()
        for part in ['loss', 'accuracy', 'val_loss', 'val_accuracy']: 
            df2[part] = model.history.history[part]
        df2.index.names = ['Epoch']
        df2.to_csv(save_path+'_training'+str(iterator)+'.csv')

        # Save best values
        best_loss.append(max(model.history.history['loss']))
        best_acc.append(max(model.history.history['accuracy']))
        best_loss_val.append(max(model.history.history['val_loss']))
        best_acc_val.append(max(model.history.history['val_accuracy']))
        print("")
    
    # Post training
    summary = model.summary # Every model is the same so the summary can be called post training loop
    df = pd.DataFrame()
    df['Best_Loss'] = best_loss
    df['Best_Acc'] = best_acc
    df['Best_Loss_Val'] = best_loss_val
    df['Best_Acc_Val'] = best_acc_val
    df.index.names = ['Model_Num']
    df.to_csv(save_path+'_best'+'.csv')
    return df, summary

In [98]:
start_time = time.time()
df, summary = data_model('./csvs/GeMAPS/ReadText_GeMAPSv01b.csv', './train_outputs/ReadText_GeMAPSv01b/',
                         'Metrics', iter_num=8, epoch_num=100)
print('Elapsed time: ' , time.time() - start_time)
display(df)

Processed 62 column out of 62
Training cycle 1 out of 8
Processed 62 column out of 62
Training cycle 2 out of 8
Processed 62 column out of 62
Training cycle 3 out of 8
Processed 62 column out of 62
Training cycle 4 out of 8
Processed 62 column out of 62
Training cycle 5 out of 8
Processed 62 column out of 62
Training cycle 6 out of 8
Processed 62 column out of 62
Training cycle 7 out of 8
Processed 62 column out of 62
Training cycle 8 out of 8
Elapsed time:  92.15992093086243


Unnamed: 0_level_0,Best_Loss,Best_Acc,Best_Loss_Val,Best_Acc_Val
Model_Num,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
0,5.661548,0.95,8.046413,0.64
1,6.1151,0.89375,2.107062,0.88
2,4.969769,0.95625,6.487897,0.8
3,6.188941,0.93125,6.258512,0.72
4,5.123322,0.9,5.390261,0.92
5,5.09642,0.939394,3.411947,0.95
6,6.479378,0.909091,7.1578,0.95
7,5.809065,0.890909,4.011446,0.85


In [11]:
print(summary()) # It's in new cell because of the summary's size

Model: "functional_25"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
F0semitoneFrom27.5Hz_sma3nz_ame [(None, 1)]          0                                            
__________________________________________________________________________________________________
F0semitoneFrom27.5Hz_sma3nz_std [(None, 1)]          0                                            
__________________________________________________________________________________________________
F0semitoneFrom27.5Hz_sma3nz_per [(None, 1)]          0                                            
__________________________________________________________________________________________________
F0semitoneFrom27.5Hz_sma3nz_per [(None, 1)]          0                                            
______________________________________________________________________________________

In [4]:
# Training models for GeMAPS and eGeMAPS csv datasets
start_time_full = time.time()
for directory in ['./csvs/GeMAPS/','./csvs/eGeMAPS/']:
    file_list = listdir(directory)
    print('File list:')
    print(file_list)
    print('Model training: ')
    for file in file_list:
        a = file[0:-4]
        print('')
        print('---------------------------------------------------------------------------------------------------------------')
        print('')
        print('\t \t \t Processing file ', a)
        print('')
        print('---------------------------------------------------------------------------------------------------------------')
        print('')
        start_time = time.time()
        df, summary = data_model(directory+file, './train_outputs/'+a+'/', 'Metrics', iter_num=8, epoch_num=100)
        print('Elapsed time: ' , time.time() - start_time)

print('Full elapsed time: ' , time.time() - start_time_full)
    

File list:
['ReadText_GeMAPSv01b.csv', 'SpontaneousDialogue_GeMAPSv01b.csv']
Model training: 

---------------------------------------------------------------------------------------------------------------

	 	 	 Processing file  ReadText_GeMAPSv01b

---------------------------------------------------------------------------------------------------------------

Processed 62 column out of 62
Training cycle 1 out of 8
Processed 62 column out of 62
Training cycle 2 out of 8
Processed 62 column out of 62
Training cycle 3 out of 8
Processed 62 column out of 62
Training cycle 4 out of 8
Processed 62 column out of 62
Training cycle 5 out of 8
Processed 62 column out of 62
Training cycle 6 out of 8
Processed 62 column out of 62
Training cycle 7 out of 8
Processed 62 column out of 62
Training cycle 8 out of 8
Elapsed time:  91.77682757377625

---------------------------------------------------------------------------------------------------------------

	 	 	 Processing file  SpontaneousDialog