In [7]:
import pandas as pd
import os
import re
import numpy as np

### Load data from AphasiaBank dataset found in the folder "dataset"

In [5]:
path = "dataset"

def get_file_names(path): #get all the file names from the path
    all_names = []
    
    for root, dirs, files in os.walk(path):
        for file in files:
            if file.endswith(".cha"):
                all_names.append(os.path.join(root, file))
                
    return all_names

f_names = get_file_names(path)
print(len(f_names))
f_names = [el for el in f_names if not 'checkpoint' in el] #remove "checkpoint" files, which are the duplicates of regular files
print(len(f_names))

493
473


### Prepare dataframe from files with general information

In [10]:
def get_pid(text): #This is the ID of the actual interview
    pid_start = text.index('@PID:') + 5
    pid_end = pid_start + text[pid_start:].index('@')
    pid = text[pid_start:pid_end].replace('\n', '').replace('\t', '')
    return pid

def get_media(text): #Abreviation of the institution conducting the experiment and the number of experiment + media type (text, sound, video)
    media_start = text.index('@Media:') + 7
    media_end = media_start + text[media_start:].index('@')
    media = text[media_start:media_end].replace('\n', '').replace('\t', '')
    return media

def get_id_type_severity(text): #ID is a line with general information about the interview. Type is the aphasia type, and severity is the score for the aphasic person.
    id_start = text.index('@ID:') + 4
    id_end = id_start + text[id_start:].index('@')
    ID = text[id_start:id_end].replace('\n', '').replace('\t', '')
    if 'participant' not in ID.lower():  # the first ID is not of the participant
        # search further
        ID, aph_type, aph_severity = get_id_type_severity(text[id_end:])
        return ID, aph_type, aph_severity
    
    splitted = ID.split('|')
    aph_type = splitted[5]
    aph_severity = splitted[9]
    return ID, aph_type, aph_severity

def get_dataset_w_filenames(f_names):
    df = pd.DataFrame(columns=['PID', 'Media', 'ID', 'Type', 'Severity'])
    
    for f_name in f_names:
        with open(f_name, "r", encoding="utf-8") as file:
            text = file.read()
            
        if not text:
            print(f'file {f_name} is empty')
            continue
        
        try:
            general_pid = get_pid(text)
            general_media = get_media(text)
            id_, type_, severity = get_id_type_severity(text)
        except:
            print(f_name)
            raise
    
        new_row = {'PID': general_pid, 'Media': general_media, 'ID': id_, 
                   'Type': type_ if type_ else np.nan, 
                   'Severity': float(severity) if severity else np.nan}
        df = df.append(new_row, ignore_index=True)

    return df

df_files = get_dataset_w_filenames(f_names)
df_files

file dataset\ACWT\ACWT\ACWT01a.cha is empty


Unnamed: 0,PID,Media,ID,Type,Severity
0,11312/t-00017817-1,"ACWT02a, video",eng|ACWT|PAR|53;01.|female|TransMotor||Partici...,TransMotor,74.6
1,11312/t-00017818-1,"ACWT03a, video",eng|ACWT|PAR|68;01.|male|TransMotor||Participa...,TransMotor,69.3
2,11312/t-00017819-1,"ACWT04a, video",eng|ACWT|PAR|26;00.|female|NotAphasicByWAB||Pa...,NotAphasicByWAB,96.0
3,11312/t-00017820-1,"ACWT05a, video",eng|ACWT|PAR|75;07.|male|Broca||Participant||5...,Broca,57.7
4,11312/t-00017821-1,"ACWT07a, video*INV:I'm gonna be asking you to ...",eng|ACWT|PAR|61;06.|male|NotAphasicByWAB||Part...,NotAphasicByWAB,95.0
...,...,...,...,...,...
467,11312/t-00018259-1,"wright203a, video*INV:okay . 0_4506%mor:co|o...",eng|Wright|PAR|66;04.|male|Conduction||Partici...,Conduction,76.3
468,11312/t-00018260-1,"wright204a, video*INV:www . 0_16008%exp:talk...",eng|Wright|PAR|74;10.|male|Anomic||Participant...,Anomic,90.9
469,11312/t-00018261-1,"wright205a, video*INV:www . 0_5641%exp:extra...",eng|Wright|PAR|55;10.|male|Broca||Participant|...,Broca,59.7
470,11312/t-00018262-1,"wright206a, video*INV:okay . 0_1533%mor:co|o...",eng|Wright|PAR|39;00.|female|Broca||Participan...,Broca,53.7


### Prepare dataframe with all conversational information from all the data

In [25]:
meta_starters = set(['@UTF8', '@PID:', '@Begin', '@Languages:', '@Participants:', '@ID:',
                    '@Date:', '@Media:', '@End', '@Window:'])

not_needed_starters = set(['%gpx:', '%sit:', '%com:', '%exp:',
                           '@Situation:', '@Transcriber:', '@Comment:']) | meta_starters

people_starters = set(['*IN1:', '*IN2:', '*INV1:', '*INV2:', '*INV:', '*PAR:', '*PRT:', 
                       '*SP01:', '*ADU:', '*CAR:', '*OTH:'])

task_starter = set(['@G:'])

info_starters = set(['%gra:', '%mor:'])

def get_starter(line):
    try:
        end_starter = line.index('\t')
    except:
        try:
            end_starter = line.index('\n')
        except:
            end_starter = len(line)
    return line[:end_starter]

def get_dataset_with_contents(f_names, df_w_files_info):
    overall_df = pd.DataFrame(columns=['Person', 'Text', 'mor', 'gra', 'Task', 'PID', 'Media', 'ID', 'Type', 'Severity'])

    for f_name in f_names:
        with open(f_name, "r", encoding="utf-8") as file:
            text = file.read()

        if not text:
            continue

        this_file_df = pd.DataFrame(columns=['Person', 'Text', 'mor', 'gra', 'Task', 'PID', 'Media', 'ID', 'Type', 'Severity'])

        general_pid = get_pid(text)
        file_info_line = df_w_files_info[df_w_files_info.PID == general_pid]
        general_media, id_ = file_info_line.Media.values[0], file_info_line.ID.values[0]
        type_, severity = file_info_line.Type.values[0], file_info_line.Severity.values[0]

        try:
            text_split = text.split('\n')
            persons = []
            texts = []
            mors = []
            gras = []
            tasks = []

            for j, line in enumerate(text_split):

                if not line:
                    continue

                if not line.startswith('\t'):
                    starter = get_starter(line)

                    if starter in not_needed_starters:
                        # check if mor and gra have the same number of elements as persons
                        # if not -- it means that the previous one didn't have mor or gra
                        # so we'll add empty lines there
                        if len(persons) == (len(mors) + 1):
                            mors.append('')
                        if len(persons) == (len(gras) + 1):
                            gras.append('')
                        continue

                    elif starter in people_starters:
                        # check if mor and gra have the same number of elements as persons
                        # if not -- it means that the previous one didn't have mor or gra
                        # so we'll add empty lines there
                        if len(persons) == (len(mors) + 1):
                            mors.append('')
                        if len(persons) == (len(gras) + 1):
                            gras.append('')

                        # check if 'tasks' have the same length as persons
                        # it means that we need to add the same task again 
                        # to account for the newly added person
                        # but if they are both 0 then we haven't seen the task yet, and we should add '' in tasks
                        if (len(tasks) == 0) and (len(persons) == 0):
                            tasks.append('')
                        elif len(tasks) == len(persons):
                            tasks.append(tasks[-1])
                        elif len(tasks) == len(persons) + 1:
                            pass  # that's ok, another person will be added right away
                        else:
                            raise ValueError('The number of tasks is more than the numner of persons by 2')

                        persons.append(starter)
                        texts.append(line[len(starter)+1:])
                        continue

                    elif starter in info_starters:
                        if starter == '%mor:':
                            mors.append(line[len(starter)+1:])
                        elif starter == '%gra:':
                            gras.append(line[len(starter)+1:])
                        else:
                            raise ValueError('Not recognized info starter')

                    elif starter in task_starter:
                        # it can be either before or after the first interviewer
                        # but we deal with it while adding persons 
                        tasks.append(line[len(starter)+1:])

                        continue

                    else:
                        print('-', starter, '-', sep='')
                        print(line, j)
                        raise ValueError('Starter not in the sets')

                else:  # line starts with \t so it's continuation of the previous one
                    # attach it to the previous tag
                    # 'starter' still has this!
                    if starter in people_starters:
                        texts[-1] = texts[-1] + line.lstrip('\t')
                    elif starter in info_starters:
                        if starter == '%mor:':
                            mors[-1] = mors[-1] + line.lstrip('\t')
                        else:
                            gras[-1] = gras[-1] + line.lstrip('\t')
                    elif starter in not_needed_starters:  # line starts with \t but previous one was not taken
                        continue  # we don't take this either
                    else:
                        raise ValueError('strange previous starter')

            assert len(persons) == len(texts), print(len(persons), len(texts))
            assert len(texts) == len(mors), print(len(texts), len(mors))
            assert len(mors) == len(gras), print(len(mors), len(gras))
            assert len(gras) == len(tasks), print(len(gras), len(tasks))

        except:
            print(f_name)
            raise

        this_file_df['Person'] = persons
        this_file_df['Text'] = texts
        this_file_df['mor'] = mors
        this_file_df['gra'] = gras
        this_file_df['Task'] = tasks
        this_file_df['PID'] = [general_pid] * len(persons)
        this_file_df['Media'] = [general_media] * len(persons)
        this_file_df['ID'] = [id_] * len(persons)
        this_file_df['Type'] = [type_ if type_ else np.nan] * len(persons)
        this_file_df['Severity'] = [float(severity) if severity else np.nan] * len(persons)

        overall_df = pd.concat((overall_df, this_file_df))

    return overall_df

df_with_contents = get_dataset_with_contents(f_names, df_files)
df_with_contents.head()

Unnamed: 0,Person,Text,mor,gra,Task,PID,Media,ID,Type,Severity
0,*INV:,alright . 1200_2858,co|alright .,1|0|INCROOT 2|1|PUNCT,Speech,11312/t-00017817-1,"ACWT02a, video",eng|ACWT|PAR|53;01.|female|TransMotor||Partici...,TransMotor,74.6
1,*INV:,I'm going to be asking you to do some talking ...,pro:sub|I~aux|be&1S part|go-PRESP inf|to aux|b...,1|3|SUBJ 2|3|AUX 3|0|ROOT 4|6|INF 5|6|AUX 6|3|...,Speech,11312/t-00017817-1,"ACWT02a, video",eng|ACWT|PAR|53;01.|female|TransMotor||Partici...,TransMotor,74.6
2,*PAR:,okay . 6640_7205,co|okay .,1|0|INCROOT 2|1|PUNCT,Speech,11312/t-00017817-1,"ACWT02a, video",eng|ACWT|PAR|53;01.|female|TransMotor||Partici...,TransMotor,74.6
3,*INV:,tell me how you think your speech is these day...,v|tell pro:obj|me pro:int|how pro:per|you v|th...,1|0|ROOT 2|1|OBJ 3|5|LINK 4|5|SUBJ 5|1|COMP 6|...,Speech,11312/t-00017817-1,"ACWT02a, video",eng|ACWT|PAR|53;01.|female|TransMotor||Partici...,TransMotor,74.6
4,*PAR:,good &=nods &-um not real good but good . [+ g...,adj|good neg|not adj|real adj|good conj|but ad...,1|0|INCROOT 2|1|NEG 3|4|MOD 4|1|JCT 5|1|CONJ 6...,Speech,11312/t-00017817-1,"ACWT02a, video",eng|ACWT|PAR|53;01.|female|TransMotor||Partici...,TransMotor,74.6
