In [12]:
import re
import pandas as pd

### Importing preprocessed file from preprocessing_aphasiabank.ipynb

In [13]:
df_original = pd.read_csv("all_processed_files.csv", header=0)

### Annotating with general information

In [14]:
df = df_original[["Person", "Text", "mor", "gra"]][:5000].copy()
df = df.dropna()
print(len(df))

def get_duration(sentence):
    try:
        return (re.search(".+ \\x15(.+)\\x15", sentence).group(1))
    except AttributeError:
         return "NaN"
        
def remove_time(sentence):
    try:
        return sentence.replace(re.search("\\x15.+", sentence).group(0), "")
    except AttributeError:
        pass
    
def duration_to_seconds(sentence):
    try:
        return float((int(re.search("_.+", sentence).group(0)[1:]) - int(re.search(".+_", sentence).group(0)[:-1]))/1000)
    except AttributeError:
        return "NaN"
    
def assign_hesitation(text):
    sentence = text
    return (re.findall("(&-.+?) ", sentence))

def return_clean_text(sentence, steps = False):
    removed_gesture = re.sub("&=.+? ", "", str(sentence))#remove gestures first
    removed_correction_marker = re.sub("\[.+?\] ", "", removed_gesture)
    removed_hesitations = re.sub("&-", "", removed_correction_marker)
    only_words = re.sub("[^\w ']", "", removed_hesitations)# remove non-characters except for '
    if steps == False:
        #print(only_words)
        return only_words
    if steps == True:
        #with f' you can't use "\n"
        print ("0, Original sentence:", sentence,
                "1, Removed gestures:", removed_gesture,
                "2, Removed corrections:", removed_correction_marker,
                "3, Removed hesitations:", removed_hesitations,
                "4, Only words: ", only_words, sep="\n")
        print("<------------------------------------------------------------------------------>")
        return only_words

def retrieve_corrections(sentence):
    corrections = []
    first_layer = re.findall("(?<=\s).+? \[:.+?]", sentence)
    if len(first_layer) != 0:
        for i in first_layer:
            if re.search("\[: (.+?)]", i).group(1) == "x@n":
                corrections.append("unintelligible")
            else:
                second_layer = re.sub("\[: ", "", i)
                third_layer = re.sub("\]", "", second_layer)
                fourth_layer = re.sub("<", "", third_layer)
                fifth_layer = re.sub("@u", "", fourth_layer)
                sixth_layer = fifth_layer.split()
                corrections.append(tuple([sixth_layer[-2],sixth_layer[-1]]))
    else:
        corrections = "NaN"
    return corrections

df["hesitations"] = df["Text"].apply(lambda x: assign_hesitation(x))
df["corrections"] = df["Text"].apply(lambda x: retrieve_corrections(x))
df["duration"] = df["Text"].apply(lambda x: get_duration(x))
df["Text"] = df["Text"].apply(lambda x: remove_time(x))
df["seconds"] = df["duration"].apply(lambda x: duration_to_seconds(x))
df["Clean_text"] = df["Text"].apply(lambda x: return_clean_text(x, False))
df["num_corrections"] = df["corrections"].apply(lambda x: len(x))
df["num_unintelligible"] = df["corrections"].apply(lambda x: len([z for z in x if z == "unintelligible"]))

4905


### Annotating with linguistic information

In [18]:
df["xcomp"] = df["gra"].apply(lambda x: 1 if "XCOMP" in x else 0)
df["ccomp"] = df["gra"].apply(lambda x: 1 if "COMP" in x else 0)
df["prepositional"] = df["gra"].apply(lambda x: 1 if "LOC" in x else 0)
df["optional_elements"] = df["gra"].apply(lambda x: 1 if ("JCT" or "CJCT" or "XJCT") in x else 0)
df["noun_modifiers"] = df["gra"].apply(lambda x: 1 if ("MOD" or "CMOD" or "XMOD") in x else 0)
df["negation"] = df["gra"].apply(lambda x: 1 if "NEG" in x else 0)
df["determiner"] = df["gra"].apply(lambda x: 1 if "DET" in x else 0)
df["topicalization"] = df["gra"].apply(lambda x: 1 if "TOP" in x else 0)
df["quantifier"] = df["gra"].apply(lambda x: 1 if "QUANT" in x else 0)

In [21]:
[i for i in df]

['Person',
 'Text',
 'mor',
 'gra',
 'hesitations',
 'corrections',
 'duration',
 'seconds',
 'Clean_text',
 'num_corrections',
 'num_unintelligible',
 'xcomp',
 'ccomp',
 'prepositional',
 'optional_elements',
 'noun_modifiers',
 'negation',
 'determiner',
 'topicalization',
 'quantifier']

In [22]:
df.head()

Unnamed: 0,Person,Text,mor,gra,hesitations,corrections,duration,seconds,Clean_text,num_corrections,num_unintelligible,xcomp,ccomp,prepositional,optional_elements,noun_modifiers,negation,determiner,topicalization,quantifier
0,*INV:,so ‡ first we're just gonna do some talking .,co|so beg|beg adv|first pro:sub|we~aux|be&PRES...,1|0|BEG 2|1|BEGP 3|7|JCT 4|7|SUBJ 5|7|AUX 6|7|...,[],,0_5819,5.819,so first we're just gonna do some talking,3,0,0,1,0,1,0,0,0,0,1
1,*PAR:,+< okay .,co|okay .,1|0|INCROOT 2|1|PUNCT,[],,5119_5778,0.659,okay,3,0,0,0,0,0,0,0,0,0,0
2,*INV:,so ‡ okay .,co|so beg|beg co|okay .,1|0|BEG 2|1|BEGP 3|0|INCROOT 4|3|PUNCT,[],,5819_10856,5.037,so okay,3,0,0,0,0,0,0,0,0,0,0
3,*INV:,how do you think your speech is these days ?,pro:int|how mod|do pro:per|you v|think det:pos...,1|4|JCT 2|4|AUX 3|4|SUBJ 4|0|ROOT 5|6|DET 6|7|...,[],,10856_15376,4.52,how do you think your speech is these days,3,0,0,1,0,1,0,0,1,0,0
4,*PAR:,alright &=ges . [+ gram],co|alright .,1|0|INCROOT 2|1|PUNCT,[],,15376_16794,1.418,alright,3,0,0,0,0,0,0,0,0,0,0
