# Transcription-based model for detection of self-repetition

The following function, `transcription_model_selfrep`, takes as input the transcription of a single- or multi-speaker audio recording in **.TextGrid** format. The speech of the child under study must be annotated in a tier named **'AC'**. Every pair of utterances in that tier is labelled 'repetitive' or 'non-repetitive', and the function returns a **dataframe** with the pairs predicted as 'repetitive', including the timestamps and the transcriptions of both utterances. In addition, a **new .TextGrid** is produced from the input .TextGrid; it contains annotations of the self-repetitive utterance pairs in two new tiers: 'source' and 'self-repetition'. The path of this new file must be given in the parameter `output_path`. The spoken language must be given in the parameter `language` (supported: 'nl' (Dutch) and 'fr' (French)).

In [57]:
def transcription_model_selfrep(textgrid_file, output_path, language):
    """Detect self-repetitions in the transcription stored in a TextGrid file.

    Every pair of intelligible utterances of the tier named 'AC' is labelled
    'repetitive' or 'non-repetitive'. Pairs predicted as repetitive are also
    annotated in two new tiers, 'source' and 'self-repetition', which are
    added to the content of the input TextGrid and written to `output_path`.

    Args:
        textgrid_file: path of the input .TextGrid transcription.
        output_path: path of the annotated .TextGrid that is written.
        language: language code; it selects the spaCy model
            `{language}_core_news_sm` and the filler-word list
            (filler lists are defined for 'fr' and 'nl').

    Returns:
        A pandas DataFrame with the columns source_interval,
        repetition_interval, source_transcription, rep_transcription and
        predicted_label, restricted to the rows predicted 'repetitive'.
    """
    
    # Import libraries (kept local so the function is self-contained)
    import pandas as pd
    from praatio import textgrid
    import re
    import spacy
    import tgt
    
    # Load the spaCy model for the requested language; the nested helpers
    # below use it to parse every utterance and clause
    nlp = spacy.load(f'{language}_core_news_sm')
    
    def get_entries(tier):
        """Open the input TextGrid and return the entries of the tier named `tier`.

        The file is read again on every call. The second argument passed to
        `openTextgrid` is praatio's include-empty-intervals flag, here False.
        """
        grid = textgrid.openTextgrid(textgrid_file, False)
        requested_tier = grid.getTier(tier)
        return requested_tier.entries
    
    def get_speech_intervals(tier):
        """Collect the intelligible utterances of the tier named `tier`.

        Returns a dict that maps each (start, end) timestamp pair to the label
        of the entry. Entries whose label matches the unintelligible-speech
        pattern below are left out.
        """
        # Pattern for unintelligible utterances; change if necessary
        unintelligible = re.compile(r"^(xxx|yyy)\s?(\[.+\])?\.?$")
        return {
            (entry.start, entry.end): entry.label
            for entry in get_entries(tier)
            if not unintelligible.match(entry.label)
        }
    
    # Define functions that prepare the utterances, extract their dependency structure and predict the label
    
    def preprocess_string(string):
        """Strip punctuation, fillers and truncated words from an utterance.

        The characters ``+ ? ! , / . ( ) [ ]`` are deleted, after which the
        utterance is split on whitespace. Words that are fillers in the
        selected language and words that start or end with a hyphen
        (truncated words) are dropped; the remaining words are re-joined with
        single spaces.

        Args:
            string: the raw transcription of one utterance.

        Returns:
            The cleaned utterance as a string (possibly empty).

        Raises:
            ValueError: if no filler list is defined for `language`
                (only 'fr' and 'nl' have one).
        """
        fillers_by_language = {
            'fr': {'euhm', 'euh', 'uhm', 'mmh', 'xxx', 'eh', 'ben', 'hein', 'ah', 'bah', 'oh', 'bon'},
            'nl': {'euhm', 'euh', 'uhm', 'mmh', 'xxx', 'he', 'hè', 'hé', 'ah', 'oh'},
        }
        if language not in fillers_by_language:
            # Without this check the filler list stayed undefined for other
            # languages and the function failed with an UnboundLocalError.
            raise ValueError(
                f"Unsupported language {language!r}: expected one of {sorted(fillers_by_language)}"
            )
        fillers = fillers_by_language[language]

        string = string.translate(str.maketrans('', '', '+?!,/.()[]'))
        kept = [
            word for word in string.split()
            if word not in fillers and not word.startswith('-') and not word.endswith('-')
        ]
        return ' '.join(kept)

    def split_sentence(doc):
        """Segment an utterance into clauses at its conjunctions.

        Every coordinating or subordinating conjunction (POS ``CCONJ`` or
        ``SCONJ``) that is not the first token marks a clause boundary and is
        itself dropped. A conjunction in first position is simply removed, and
        so is a conjunction that is a child of another conjunction (second
        part of a multi-word conjunction).

        Boundaries and removals are tracked by token index rather than by
        token text. A conjunction therefore never splits a longer word that
        merely contains its spelling (French "et" in "cette", Dutch "en" in
        "eten"), and removing one conjunction does not remove other tokens
        with the same text.

        Args:
            doc: the spaCy object of the utterance; only the token attributes
                ``text``, ``pos_``, ``i`` and ``children`` are used.

        Returns:
            A list of clause strings when at least one clause boundary was
            recorded (a clause may be an empty string), otherwise the
            utterance as one string without the removed conjunctions.
        """
        conjunction_tags = ('CCONJ', 'SCONJ')
        boundary_ids = set()   # indices of conjunctions that separate clauses
        removed_ids = set()    # indices of conjunctions that are only deleted
        dependants = []        # conjunction children of the last head conjunction

        for token in doc:
            if token.pos_ in conjunction_tags and token.i not in dependants:
                dependants = []
                if token.i == 0:  # If conjunction is first token: just delete it
                    removed_ids.add(token.i)
                else:
                    boundary_ids.add(token.i)

                for child in token.children:
                    if child.pos_ in conjunction_tags:
                        dependants.append(child.i)
                        removed_ids.add(child.i)

        # Rebuild the clauses token by token; a removal wins over a boundary
        # so that a conjunction absorbed by another one does not split twice.
        clauses = [[]]
        for token in doc:
            if token.i in removed_ids:
                continue
            if token.i in boundary_ids:
                clauses.append([])
            else:
                clauses[-1].append(token.text)

        if not boundary_ids:
            return ' '.join(clauses[0]).strip()
        return [' '.join(words).strip() for words in clauses]

    def get_dep_structure(sent_doc):
        """Return the words that make up the dependency skeleton of a clause.

        The clause only yields words when it has a verbal root or a copula
        (a VERB or AUX with dependency 'cop'); otherwise an empty list is
        returned. Walking the tokens in order, the function collects:

        * a verbal root followed by its auxiliary children;
        * a nominal root followed by those children (with their own children)
          that are not prepositional groups;
        * every subject, object, complement or copula token, followed by its
          children and grandchildren.
        """
        core_relations = ('obj', 'nsubj', 'obl:arg', 'iobj', 'csubj', 'ccomp',
                          'xcomp', 'nsubj:pass', 'cop')
        triggers = {('ROOT', 'VERB'), ('cop', 'VERB'), ('cop', 'AUX')}

        signature = {(tok.dep_, tok.pos_) for tok in sent_doc}
        if signature.isdisjoint(triggers):
            return []

        skeleton = []
        for tok in sent_doc:
            is_root = tok.dep_ == 'ROOT'

            if is_root and tok.pos_ == 'VERB':
                # Verbal group: the verb itself, then its auxiliaries
                skeleton.append(tok.text)
                skeleton.extend(kid.text for kid in tok.children if kid.pos_ == 'AUX')

            if is_root and tok.pos_ == 'NOUN':
                skeleton.append(tok.text)
                for kid in tok.children:
                    grandkids = list(kid.children)
                    # Skip prepositional groups, i.e. children that have a
                    # preposition as their own child, except for the Dutch and
                    # French equivalents of 'of'. A child without children
                    # passes trivially.
                    if all(g.pos_ != 'ADP' or g.text in ['de', 'van'] for g in grandkids):
                        skeleton.append(kid.text)
                        skeleton.extend(g.text for g in grandkids)

            if tok.dep_ in core_relations:
                # Subject / objects / complements with two levels of dependants
                skeleton.append(tok.text)
                for kid in tok.children:
                    skeleton.append(kid.text)
                    skeleton.extend(g.text for g in kid.children)

        return skeleton

    def get_dep_structures(doc):
        """Return the dependency skeletons of all clauses of an utterance.

        The utterance is first segmented with `split_sentence`. When that
        yields a list of clauses, each clause is parsed separately and only
        skeletons of more than one word are kept. When it yields a single
        string, the skeleton of that string is returned as the only element,
        whatever its length.
        """
        clauses = split_sentence(doc)

        # A plain string means the utterance consists of a single clause
        if not isinstance(clauses, list):
            return [get_dep_structure(nlp(clauses))]

        skeletons = (get_dep_structure(nlp(clause)) for clause in clauses)
        return [skeleton for skeleton in skeletons if len(skeleton) > 1]

    def detect_selfrep(string1, string2):
        """Decide whether two utterances constitute a self-repetition.

        Both strings are cleaned with `preprocess_string` and parsed. The pair
        counts as a repetition when one of the following holds:

        1. both utterances are non-empty and consist of the same tokens;
        2. a non-empty dependency skeleton of a clause of the first utterance
           equals the skeleton of a clause of the second utterance;
        3. neither utterance is split into clauses, they are identical once
           conjunctions are removed, and the first utterance contains a verb.

        Args:
            string1: transcription of the first utterance.
            string2: transcription of the second utterance.

        Returns:
            True if the pair is a self-repetition, False otherwise.
        """
        doc1 = nlp(preprocess_string(string1))
        doc2 = nlp(preprocess_string(string2))

        # 1. Literal repetition
        if doc1 and doc2 and [token.text for token in doc1] == [token.text for token in doc2]:
            return True

        # 2. Shared clause skeleton. The skeletons of the second utterance are
        # computed once instead of being re-parsed for every skeleton of the
        # first utterance.
        structures1 = [struct for struct in get_dep_structures(doc1) if struct]
        if structures1:
            structures2 = get_dep_structures(doc2)
            if any(struct in structures2 for struct in structures1):
                return True

        # 3. Same single clause (after conjunction removal) containing a verb
        clause1 = split_sentence(doc1)
        return (
            isinstance(clause1, str)
            and clause1 == split_sentence(doc2)
            and 'VERB' in [token.pos_ for token in doc1]
        )

    # Now create the output dataframe and textgrid

    child_intervals = get_speech_intervals('AC')
    
    df = pd.DataFrame(columns=['source_interval', 'repetition_interval',
                               'source_transcription', 'rep_transcription', 'predicted_label'])
    
    source_rep_dict= {}
    tg = tgt.io.read_textgrid(textgrid_file, encoding='utf-16')
    source_tier = tgt.IntervalTier(start_time=0, name='source')
    rep_tier = tgt.IntervalTier(start_time=0, name='self-repetition')

    row_df = 0

    # Iterate through child_intervals and compare them to other intervals that the same speaker pronounced before:
    
    for start_source, end_source in child_intervals:

        source_int = (start_source, end_source)
        source_trans = child_intervals[(start_source, end_source)]
        
        for start_rep, end_rep in child_intervals:

            rep_int = (start_rep, end_rep)
            rep_trans = child_intervals[(start_rep, end_rep)]

            if start_rep > start_source:  # Compare repetition candidate only with previous utterances


                # Get predictions
                pred_label = 'non-repetitive'
                if detect_selfrep(rep_trans, source_trans):
                    pred_label = 'repetitive'

                add_rep= False
                
                # If the prediction is 'repetitive', write the source and repetitive interval to the output TextGrid                
                if pred_label == "repetitive":
                    
                    if source_int not in source_rep_dict and not any(source_int in values for values in source_rep_dict.values()):
                        source_rep_dict[source_int]= [rep_int]
                        source_key= list(source_rep_dict.keys()).index(source_int)
                        
                        source_interval_tg = tgt.Interval(start_time=float(start_source),
                                                          end_time=float(end_source),
                                                          text=f'source {source_key + 1}')
                        source_tier.add_interval(source_interval_tg)
                        add_rep=True

                    if not any(rep_int in values for values in source_rep_dict.values()):
                        add_rep=True
                        if source_int in source_rep_dict:
                            source_rep_dict[source_int] += [rep_int]
                            source_key= list(source_rep_dict.keys()).index(source_int)
                            
                        elif any(source_int in values for values in source_rep_dict.values()):
                            original_source= [key for key,value in source_rep_dict.items() if source_int in value][0]
                            source_rep_dict[original_source]+= [rep_int]
                            source_key= list(source_rep_dict.keys()).index(original_source)
                    
                    if add_rep== True:
                        rep_interval_tg = tgt.Interval(start_time=float(start_rep),
                                                       end_time=float(end_rep),
                                                       text=f'self-repetition {source_key + 1}')
                        rep_tier.add_interval(rep_interval_tg)

                # Append all features of the utterance pair to the output dataframe:
                df.loc[row_df] = [str(source_int), str(rep_int),
                                  source_trans, rep_trans, pred_label]
                row_df += 1

    # Output the textgrid outside of the loop
    tg.add_tier(source_tier)
    tg.add_tier(rep_tier)
    tgt.write_to_file(tg, output_path, format='short')

    return df.loc[df['predicted_label'] == 'repetitive']
