## Final DysarthrAI Dynamic Time Warping (DTW) Algorithm

##### ----------The following code should be EXCLUDED from DysarthrAI backend. It is used for testing in this notebook---------- #####

In [59]:
import cv2

In [None]:
 def create_df(speaker = 'ALL'):
    """
    Create a dataframe containing information to allow the creation of path names and 
    easy identification of prompts
    
    The various actions are
    - filter on the speaker fed in as an argument
    - select only recordings made using the head mic
    - remove instances without text propmts
    - remove single words to just leave multi-word phrases
    - select only phrases that were recorded twice
    """
    
    df = pd.read_csv('../index_TORGO.txt', sep="|", converters={'prompt_id': lambda x: str(x)})
    df = df.loc[(df['mic'] == 'wav_headMic')]
    if speaker != 'ALL':
        df = df.loc[(df['speaker'] == speaker)]
    df['remove'] = (df['prompt']==None) | (df['prompt']=='None') | (df['prompt'].str.contains('jpg')) | (df['prompt'].str.endswith(']')) 
    df = df.loc[df['remove'] == False]
    df['remove'] = (df['prompt'].str.contains(' '))
    df = df.loc[df['remove'] == True]
    df_filter = df.groupby(['speaker', 'prompt']).size().reset_index(name='counts')
    df_filter = df_filter[df_filter['counts']==2]
    df = df.merge(df_filter, on=['speaker', 'prompt'], how='inner')
    df['audloc'] = '../data/TORGO/' + df['speaker'] + '/' + df['session'] + '/' + df['mic'] + '/' +  df['prompt_id'] + '.wav'
    return df

###### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------

## Packages - Include in App

In [60]:
import pandas as pd
import numpy as np
from numpy.linalg import norm
from fastdtw import fastdtw, dtw
import librosa
import librosa.display

## Functions - Include in App

In [61]:
def generate_mfcc_lists(df=df, number_mfcc=20):
    
    """ 
    Takes a fixed-format dataframe containing information about the recordings, and outputs
    lists that are then used in the DTW calculation. Assumes that the dataframe has been reindexed
    so that the indexing begins at zero and ends at df.shape[0]-1, and that there are sequential 
    pairs of recordings
    
    mfcc: a list of the MFCCs corresponding to each recording
    prompts: a numerical code representing each prompt
    prompt_text: the actual text of each prompt
    train: whether the recording should be considered a test or training example
    pretrim len: the length of each recording before trimming leading silences
    trim_len: the length of each recording after trimming leading silences
    aud_locs: path to the audio recording
    
    Returns a list of the MFCC's to be used in training, a list of the test MFCC's
    the length of each (for use when applying a threshold) and the correct labels of the test data
    """

    # Load data
    mfcc = []
    prompts = []
    prompt_text = []
    train = []
    pretrim_len = []
    trim_len = []
    aud_locs = []

    # Loop through each row of the dataframe
    for p in range(df.shape[0]):
        aud_loc = aud_loc = '/'.join(['../data/TORGO', df['speaker'][p], df['session'][p], df['mic'][p], df['prompt_id'][p]+'.wav'])           
        aud_locs.append(aud_loc)
        data, rate = librosa.load(aud_loc)

        # Trim leading and trailing silence
        pretrim_len.append(round(librosa.get_duration(data, rate),1))
        data, index = librosa.effects.trim(data, top_db=15)
        trim_len.append(round(librosa.get_duration(data, rate),1))

        mfcc.append(librosa.feature.mfcc(data, rate, n_mfcc=number_mfcc))
        prompts.append(p//2) # Each pair of audio files has the same prompt
        prompt_text.append(df['prompt'][p])
        train.append(1 if (p+2) % 2== 0 else -1) # Assign every other record to train
    
    # Scale features
    for i,x in enumerate(mfcc): 
        mfcc[i] = sklearn.preprocessing.scale(mfcc[i], axis=1)
        
    # Assign data to train or test
    x_train = [mfcc[i] for i,x in enumerate(train) if x==-1]
    x_test = [mfcc[i] for i,x in enumerate(train) if x==1]
    y_test = [prompts[i] for i,x in enumerate(train) if x==1]
    
    # Store length of recordings for use in threshold testing
    train_len = [trim_len[i] for i,x in enumerate(train) if x==-1]
    test_len = [trim_len[i] for i,x in enumerate(train) if x==1]
    
    # Store audio location
    train_aud = [aud_locs[i] for i,x in enumerate(train) if x==-1]
    test_aud = [aud_locs[i] for i,x in enumerate(train) if x==1]
    
    return x_train, x_test, train_aud, test_aud, train_len, test_len, y_test

In [62]:
#  The following logic has been applied to speed up algorithm and increase accuracy:
#    Only run DTW distance calculation on stored training phrases that are within -/+ 30%
#    seconds in legnth from the requested test phrase. This range should be a min of -/+ 5 seconds
#    This way most small phrases will be compared to each other.

# In addition, steps will not be a hard coded range
#   The range of the steps will be % based on the width of the requested test MFCC vector, using -/+30%
#   (It seems like 1 syllable takes around 15 width - very rough estimate)

# Calculate the DTW distance

def calc_dtw(x_train, x_test, train_len, test_len, radius=1, total_shifts = 7):
    """
    Calculates the DTW distance between the test cases and the training data
    after applying a series of time shifts to the test data
    
    Returns an array of the DTW dist of each shifted MFCC against the training
    prompt, and prints out the time taken to run the calculation"""
    
    master_dist = []
    for i,x in enumerate(x_test):
        mfcc_dist = []
        # Default: For 7 total vectors - 3 shifts left, no shift, and 3 shifts right @ 15% range
        max_shift = x.shape[1]*0.15   # Indicate % range here
        # Total shifts will always be an odd number so there is the same number of shifts in each direction
        total_shifts = total_shifts + 1 if total_shifts % 2 == 0 else total_shifts
        shift = int(max_shift/int(total_shifts/2))
        for d in range(shift * int(total_shifts/2) * -1, shift * int(total_shifts/2) + 1, shift):
            dist = []
            for i2,x2 in enumerate(x_train):
                len_threshold = max(train_len[i]*0.3, 5)
                min_thres = train_len[i] - len_threshold
                max_thres = train_len[i] + len_threshold

                # Run DTW dist if stored phrase is within -/+ 30% seconds as requested test phrase
                if min_thres <= test_len[i2] <= max_thres:
                    distance, path = fastdtw(np.roll(x,d).T, x2.T, radius=radius, dist=lambda x, y: norm(x - y))
                # else assume they are not the same by assuming a very large distance
                else:
                    distance = 1000000

                dist.append(distance)

            mfcc_dist.append(dist)
        master_dist.append(mfcc_dist)
        
    #print('MFCCs:{0}, Radius:{1}, Time:{2:.2f} sec'.format(x_train[0].shape[0], radius))
    
    return master_dist


def prediction(master_dist, y_test, test_len):
    
    """
    Given an array of DTW distances and the correct labels associated with the test case
    check what the predicted label would be for each shifted MFCC vector by recording
    the minimum DTW distance between the test and training examples
    The overall prediction is then the minimum DTW distance across the entire array of
    shifted vectors
    
    Return a table showing the correct label, the overall prediction, and the intermediate
    predictions for each shift of the test MFCC"""
    
    prediction_overalldist = []
    dtw_distance = []
    votes = []

    # Loop through each training example
    for i,x in enumerate(master_dist):
        vote = []
        # For each of the shifted vectors, get the prediction with min distance - the votes
        min_dist = 1000000
        for i2,x2 in enumerate(x):
            vote.append(x2.index(min(x2)))

            # Save the overall min distance from all shifted vectors = overall closest prediction
            if min(x2) < min_dist:
                min_dist = min(x2)
                min_overall = x2.index(min(x2))

        # Overall closest prediction out of the shifted MFCC vectors - the final vote
        prediction_overalldist.append(min_overall)
        dtw_distance.append(min_dist)

        # Track votes - determine if some vectors perform worse
        votes.append(vote)
    
    num_correct_overall = 0
    
    pred_tuples = list(zip(y_test, prediction_overalldist, votes, dtw_distance, test_len))
    pred_df = pd.DataFrame(pred_tuples, columns=['Correct','Prediction','MFCC Predictions','DTW Distance','Test Len'])
    
    # Assume a phrase is not stored in system if DTW Distance / Test Len ratio is greater than 165
    # 160 should be used where radius = 1. This cutoff was determined testing 23 dysarthric phrases
    pred_df['DTW Ratio'] = pred_df['DTW Distance'] / pred_df['Test Len']
    pred_df['Unknown Phrase'] = pred_df['DTW Ratio'] > 165.0
    
    return pred_df


##### ----------The following code should be EXCLUDED from DysarthrAI backend. It is used for testing in this notebook---------- #####

## Test using all speakers. Assume all of the speakers are the same person.

In [64]:
df = create_df('ALL')
df = df.sort_values(by=['prompt','speaker'])
df['prompt_instance'] = df.groupby(['prompt']).cumcount()+1
df = df[df['prompt_instance'] <3] # Remove rows after first 2 instance of a prompt
df = df[df['prompt'] != 'but he always answers, Banana oil!']  # This occurs across two people with prompt label slightly diff
df =df.reset_index()
df.drop('audloc', axis=1)

Unnamed: 0,index,speaker,session,mic,prompt_id,has_spect,spect_width,spect_height,prompt,remove,counts,prompt_instance
0,22,M01,Session1,wav_headMic,12,no,,,"A long, flowing beard clings to his chin,",True,2,1
1,23,M01,Session1,wav_headMic,13,no,,,"A long, flowing beard clings to his chin,",True,2,2
2,14,F03,Session3,wav_headMic,58,yes,86.0,513.0,Each one volunteered to jump first.,True,2,1
3,15,F03,Session3,wav_headMic,67,yes,85.0,513.0,Each one volunteered to jump first.,True,2,2
4,18,F03,Session3,wav_headMic,153,yes,221.0,513.0,He further proposed grants of an unspecified s...,True,2,1
5,19,F03,Session3,wav_headMic,209,yes,221.0,513.0,He further proposed grants of an unspecified s...,True,2,2
6,46,M04,Session2,wav_headMic,182,yes,204.0,513.0,He will allow a rare lie.,True,2,1
7,47,M04,Session2,wav_headMic,183,yes,178.0,513.0,He will allow a rare lie.,True,2,2
8,24,M01,Session1,wav_headMic,27,no,,,I can read,True,2,1
9,25,M01,Session1,wav_headMic,58,no,,,I can read,True,2,2


In [65]:
x_train, x_test, train_aud, test_aud, train_len, test_len, y_test = generate_mfcc_lists(df, 13)
master_dist = calc_dtw(x_train, x_test, train_len, test_len, radius=1)
pred_df = prediction(master_dist, y_test, test_len)

In [66]:
# If there is a large DTW Ratio, then we are going to determine we do not know the phrase (Unknown Phrase = True), 
#  rather than make a prediction
pred_df.sort_values(['DTW Ratio'])

Unnamed: 0,Correct,Prediction,MFCC Predictions,DTW Distance,Test Len,DTW Ratio,Unknown Phrase
22,22,22,"[11, 14, 1, 22, 22, 1, 18]",427.605153,3.6,118.779209,False
4,4,4,"[4, 4, 4, 4, 4, 14, 14]",234.512258,1.9,123.427504,False
14,14,14,"[14, 1, 14, 14, 14, 10, 10]",248.473481,2.0,124.23674,False
13,13,13,"[1, 14, 10, 13, 18, 15, 10]",490.640973,3.9,125.805378,False
15,15,15,"[8, 10, 15, 15, 15, 15, 18]",365.763804,2.9,126.12545,False
20,20,20,"[8, 22, 8, 20, 18, 20, 14]",877.220208,6.9,127.133363,False
18,18,18,"[14, 18, 18, 18, 11, 4, 11]",437.841859,3.3,132.679351,False
8,8,8,"[8, 8, 8, 8, 8, 8, 10]",284.626219,2.1,135.536295,False
1,1,1,"[14, 1, 10, 1, 1, 14, 10]",274.959994,2.0,137.479997,False
0,0,0,"[8, 8, 5, 0, 0, 8, 12]",1218.388575,8.7,140.044664,False


In [67]:
# # Notes on prompts we will declare as unknown

# Prompt 3 - test audio repeats part of phrase at the end
# Prompt 5 - We got correct
# Prompt 9 - these are sort of close.
# Prompt 16 - partial repeat and slip up
# Prompt 6 - could get correct with more radius. Since this is the only one, it may not be worth the increased run time?
# Prompt 17 - issue with silence at beginning - this is okay because if someone makes noise and takes a while to speak
               # we can just say unknown phrase and they can repeat it again if they know it is saved
# Prompt 12 - issue with silence at beginning - (see note above)
# Prompt 19 - Repeats part of phrase at beginning

###### ---------------------------------------------------------------------------------------------------------------------------------------------------------------------------