# Audio-based model for echolalia detection

The following function, `audio_model_echolalia`, takes as input a **multi-speaker audio file** and a **.TextGrid diarization** of that audio. The .TextGrid must contain at least one tier named **'AC'** (Autistic Child), in which the speech of the child whose echolalic utterances are to be detected is marked by **non-empty intervals** (e.g. labelled 'speech', 'AC', ...). Additionally, the file may contain other tiers containing the speech intervals of other speakers. Please remove any non-diarization tiers. If other speakers are diarized, the parameter **'speakers_diarized'** must be set to 'all' (the default value); the model will then compare each of the AC's utterances with the utterances of other speakers that start at most 10 seconds before the start of the AC's utterance. If only the speech of the AC is indicated, 'speakers_diarized' must be set to 'AC', and each of the AC's utterances will be compared with the intervals within the 10 seconds before its start in which the AC does not speak (i.e., where no AC interval occurs). The output is a **dataframe** containing the prediction ('non-echolalic' or 'echolalic') for each pair of utterances of the AC and another speaker, including the name of the other speaker's tier (if speakers_diarized='AC', the other speaker is indicated as 'other') and the timestamps of both utterances. Furthermore, a new **.TextGrid** is created on the basis of the input .TextGrid, containing annotations of the predicted source and echolalic intervals on two separate tiers ('source' and 'echolalia'). The path of this output file must be specified in the parameter 'output_path'.

*Note*: the currently trained classifier does not yet achieve satisfactory results. This script will be updated with a new version of the classifier once it has been developed.

In [2]:
def audio_model_echolalia(input_audio= None, textgrid_file= None, output_path= None, speakers_diarized='all',
                          model_path='Trained_best_classifier_echolalia.pkl'):
    """Predict which utterances of the autistic child (AC) echo a preceding utterance.

    Every non-empty interval of the 'AC' tier is paired with candidate source
    intervals that precede it by at most 10 seconds. For each pair, DTW
    distances between MFCC, mel-spectrogram and LPC features (and their
    concatenations) are computed and fed to a pickled classifier. The
    predictions are returned as a dataframe and written as two extra tiers
    ('source' and 'echolalia') to a copy of the input TextGrid.

    Args:
        input_audio: Path to the WAV file (read with ``AudioSegment.from_wav``).
        textgrid_file: Path to the diarization TextGrid; must contain a tier
            named 'AC'.
        output_path: Path where the annotated TextGrid is written.
        speakers_diarized: 'all' if the other tiers hold the other speakers'
            intervals; 'AC' if only the child is diarized, in which case the
            empty intervals of the 'AC' tier are used as the 'other' speaker.
        model_path: Path to the pickled dict with the keys 'model_obj',
            'best_features' and 'threshold'.

    Returns:
        A dataframe with the columns 's2_tier', 'AC_int', 's2_int' and
        'prediction' ('echolalic' or 'non-echolalic'), one row per compared
        pair. The interval columns hold (start, end) tuples in the TextGrid's
        time unit. The dataframe is empty when no pair qualifies.

    Raises:
        ValueError: If a required path is missing, if 'speakers_diarized' is
            neither 'all' nor 'AC', or if the TextGrid has no 'AC' tier.
    """

    # Validate the cheap arguments first, before any heavy import or file access.
    if input_audio is None or textgrid_file is None or output_path is None:
        raise ValueError("'input_audio', 'textgrid_file' and 'output_path' must all be specified")
    if speakers_diarized not in ('all', 'AC'):
        raise ValueError(f"'speakers_diarized' must be 'all' or 'AC', got {speakers_diarized!r}")

    # Load libraries
    import pickle
    # Not referenced by name below; presumably kept so that the pickled
    # classifier's class is importable -- verify before removing.
    from sklearn.linear_model import LogisticRegression
    import pandas as pd
    from pydub import AudioSegment
    from praatio import textgrid
    import librosa
    import numpy as np
    from dtw import dtw
    import sklearn.preprocessing
    import itertools
    import tgt

    # LOAD THE PRETRAINED MODEL AND ITS CONFIGURATION
    # Loaded up front so that a missing file fails before the costly feature extraction.
    # SECURITY: unpickling executes arbitrary code; only load classifier files from a trusted source.
    with open(model_path, 'rb') as f:
        model_config = pickle.load(f)
    model = model_config['model_obj']
    best_features = model_config['best_features']
    threshold = model_config['threshold']

    # EXTRACT SPEECH INTERVALS

    input_tg = textgrid.openTextgrid(textgrid_file, includeEmptyIntervals=True)
    if 'AC' not in input_tg.tierNames:
        raise ValueError("the TextGrid must contain a tier named 'AC'")

    def get_speech_intervals(tier, empty=False):
        """Return the (start, end) timestamps of the non-empty intervals of `tier`
        ('empty=False') or of its empty intervals ('empty=True')."""
        return [
            (entry.start, entry.end)
            for entry in input_tg.getTier(tier).entries
            if bool(entry.label) != empty
        ]

    child_intervals = get_speech_intervals('AC')
    if speakers_diarized == 'all':
        # Every tier other than 'AC' is treated as another speaker.
        other_speaker_intervals = {
            tier: get_speech_intervals(tier)
            for tier in input_tg.tierNames
            if tier != 'AC'
        }
    else:
        # Only the child is diarized: everything outside the child's speech is 'other'.
        other_speaker_intervals = {'other': get_speech_intervals('AC', empty=True)}

    # PREPARE THE AUDIO PROCESSING

    def audio_segment_to_np(audiosegment):
        """Convert a pydub 'AudioSegment' into a float32 numpy array,
        divided by the maximum value of the integer sample type."""
        # NOTE(review): for multi-channel audio pydub documents the samples as
        # interleaved; they are used here as if the signal were mono -- confirm
        # that the input is mono.
        samples = np.array(audiosegment.get_array_of_samples())
        return samples.astype(np.float32) / np.iinfo(samples.dtype).max

    # DEFINE FUNCTIONS FOR FEATURE EXTRACTION FROM AUDIO SIGNAL

    n_fft = 2048
    hop_length = 512

    features = ['mfcc', 'melspectrogram', 'lpc']

    def compute_lpc_2d(y, sr, order=13, frame_length=n_fft, hop_length=hop_length):
        """Compute LPC coefficients per frame; returns an array of shape (n_frames, order + 1)."""
        # Trailing zero-padding chosen so that exactly 1 + len(y) // hop_length frames
        # result; presumably this matches the frame count of the mfcc/melspectrogram
        # arrays, which is required for the concatenations further below.
        y_padded = np.pad(y, (0, frame_length - len(y) % hop_length), mode='constant')
        frames = librosa.util.frame(y_padded, frame_length=frame_length, hop_length=hop_length).T
        return np.vstack([librosa.lpc(frame, order=order) for frame in frames])

    def get_feature_array(feature, segment, sr):
        """Return the min-max scaled feature array (one row per coefficient) of an audio segment."""
        samples = audio_segment_to_np(segment)
        if feature == 'mfcc':
            feature_vector = librosa.feature.mfcc(y=samples, n_mfcc=13, hop_length=hop_length, n_fft=n_fft, sr=sr).squeeze()
        elif feature == 'melspectrogram':
            feature_vector = librosa.feature.melspectrogram(y=samples, hop_length=hop_length, n_fft=n_fft, sr=sr, n_mels=32).squeeze()
        elif feature == 'lpc':
            feature_vector = compute_lpc_2d(y=samples, sr=sr).squeeze().T
        else:
            raise ValueError(f"unknown feature: {feature!r}")
        # axis=1: every row is rescaled to [0, 1] independently.
        return sklearn.preprocessing.minmax_scale(feature_vector, axis=1)

    def extract_feature_set(segment):
        """Return (per-feature arrays, pairwise concatenations, concatenation of all) for a segment."""
        per_feature = [get_feature_array(feature, segment, sr) for feature in features]
        pairwise = [np.concatenate((f1, f2)) for f1, f2 in itertools.combinations(per_feature, 2)]
        return per_feature, pairwise, np.concatenate(per_feature)

    def dtw_distance(first, second):
        """Normalized DTW distance between two (coefficients x frames) arrays."""
        return dtw(first.T, second.T, distance_only=True).normalizedDistance

    # LOAD THE AUDIO FILE

    audio = AudioSegment.from_wav(input_audio)
    # NOTE(review): librosa.load resamples to its default rate unless sr=None is
    # passed, so this value can differ from audio.frame_rate. Left unchanged because
    # the pickled classifier was presumably trained on features computed this way --
    # confirm before switching to the native rate.
    sr = librosa.load(input_audio)[1]

    # CREATE THE FEATURE DATAFRAME

    columns = (['s2_tier', 'AC_int', 's2_int']
               + [f'DTW_{feature}' for feature in features]
               + [f'DTW_{f1}_{f2}' for f1, f2 in itertools.combinations(features, 2)]
               + ['DTW_combined'])

    window_ms = 10 * 1000  # a source may start at most 10 seconds before the child's utterance
    rows = []
    child_feature_cache = {}  # child interval -> feature set, shared across all other-speaker tiers

    for s2_tier, s2_intervals in other_speaker_intervals.items():
        for start, end in child_intervals:
            child_int = (start, end)
            # pydub slices in milliseconds
            start_child = start * 1000
            end_child = end * 1000

            for s, e in s2_intervals:
                start_s2 = s * 1000
                end_s2 = e * 1000

                # Regular case: the other interval starts within the window before the child.
                starts_in_window = 0 < start_child - start_s2 <= window_ms
                # Undiarized audio only: the interval starts earlier than the window but
                # reaches into it. '0 <=' keeps the interval that ends exactly where the
                # child starts, i.e. the directly preceding one.
                reaches_into_window = (speakers_diarized == 'AC'
                                       and start_child - start_s2 > window_ms
                                       and 0 <= start_child - end_s2 < window_ms)
                if not (starts_in_window or reaches_into_window):
                    continue
                if reaches_into_window:
                    # Cut the interval off at 10 seconds before the child starts.
                    start_s2 = start_child - window_ms

                # Never let the source run past the start of the child's utterance.
                clipped_end_s2 = min(start_child, end_s2)
                s2_segment = audio[start_s2:clipped_end_s2]
                s2_int = (start_s2 / 1000, clipped_end_s2 / 1000)

                if child_int not in child_feature_cache:
                    child_feature_cache[child_int] = extract_feature_set(audio[start_child:end_child])
                child_features, child_combinations, child_all_concatenated = child_feature_cache[child_int]
                s2_features, s2_combinations, s2_all_concatenated = extract_feature_set(s2_segment)

                # Same order as the DTW columns: single features, pairwise combinations, all combined.
                distance_features = [
                    dtw_distance(child_array, s2_array)
                    for child_array, s2_array in zip(
                        child_features + child_combinations + [child_all_concatenated],
                        s2_features + s2_combinations + [s2_all_concatenated],
                    )
                ]
                rows.append([s2_tier, child_int, s2_int] + distance_features)

    df = pd.DataFrame(rows, columns=columns)

    # MAKE PREDICTIONS

    if len(df):
        probs = model.predict_proba(df[best_features])
        prediction_binary = (probs[:, 1] > threshold).astype(int)
    else:
        # The classifier cannot be called on zero rows; there is nothing to predict.
        prediction_binary = np.zeros(0, dtype=int)
    df['prediction_binary'] = prediction_binary
    df['prediction'] = np.where(df['prediction_binary'] == 1, 'echolalic', 'non-echolalic')

    # MAKE TEXTGRID

    def merge_overlaps(intervals):
        """Return the sorted, de-duplicated intervals with overlapping ones merged,
        because a tgt IntervalTier does not accept overlapping intervals."""
        merged = []
        for interval_start, interval_end in sorted(set(intervals)):
            if merged and interval_start < merged[-1][1]:
                merged[-1] = (merged[-1][0], max(merged[-1][1], interval_end))
            else:
                merged.append((interval_start, interval_end))
        return merged

    # NOTE(review): the encoding is fixed to UTF-16; a TextGrid saved in another
    # encoding will presumably fail to parse here -- confirm.
    tg = tgt.io.read_textgrid(textgrid_file, encoding='utf-16')
    source_tier = tgt.IntervalTier(start_time=0, name='source')
    rep_tier = tgt.IntervalTier(start_time=0, name='echolalia')

    df_echo = df.loc[df['prediction'] == 'echolalic']

    for start_s2, end_s2 in merge_overlaps(df_echo['s2_int']):
        source_tier.add_interval(
            tgt.Interval(start_time=float(start_s2), end_time=float(end_s2), text='source'))

    for start_child, end_child in merge_overlaps(df_echo['AC_int']):
        rep_tier.add_interval(
            tgt.Interval(start_time=float(start_child), end_time=float(end_child), text='echolalic'))

    # Output the TextGrid
    tg.add_tier(source_tier)
    tg.add_tier(rep_tier)
    tgt.write_to_file(tg, output_path, format='short')

    return df[['s2_tier', 'AC_int', 's2_int', 'prediction']]