# Pitt cookie `.cha` extraction

In [1]:
from pathlib import Path
import re
import glob
import pandas as pd
import pylangacq as pla

In [2]:
# Corpus locations: the Pitt root plus the `cookie` sub-folder of each group.
pitt_path = Path('../data/Pitt/')
control_path = pitt_path.joinpath('Control', 'cookie')
dementia_path = pitt_path.joinpath('Dementia', 'cookie')

In [4]:
# Raw extraction with any annotations / corrections
def extract_participant_from_file(file):
    """Return the participant's (`*PAR:`) transcript lines from a `.cha` file, unprocessed.

    Every `*PAR:` line and each tab-indented continuation line that directly
    follows it becomes one list entry, so an utterance wrapped over several
    lines in the file spans several entries. Annotations such as `[//]` or
    `&uh` are kept as-is; the only clean-up is removing a trailing span
    enclosed in two 0x15 control characters from each line.

    Parameters
    ----------
    file : str or path-like
        Path of the `.cha` file to read. It is decoded as UTF-8.

    Returns
    -------
    list of str
        Extracted lines in file order; empty if the file has no `*PAR:` line.
    """
    par_re = re.compile(r'^\*PAR:\s(.*)')
    cont_re = re.compile(r'^\t(.*)')
    bullet_re = re.compile(r'\x15.*\x15$')

    # The context manager closes the handle, and the explicit UTF-8 matches the
    # encoding the pylangacq-based reader in this notebook uses, rather than
    # whatever the platform default happens to be.
    with open(file, encoding='utf-8') as handle:
        lines = handle.read().split('\n')

    pre_list = []
    in_par = False
    for line in lines:
        # Always test for a new `*PAR:` line first: trying only the
        # continuation pattern while inside an utterance would silently drop
        # a `*PAR:` line that directly follows another one.
        match = par_re.match(line)
        if match is None and in_par:
            match = cont_re.match(line)

        if match:
            pre_list.append(bullet_re.sub('', match.group(1)))
            in_par = True
        else:
            # Any other line (another speaker, a dependent tier, a header)
            # ends the current participant utterance.
            in_par = False

    return pre_list

# Spot-check the raw extractor on a single dementia transcript.
extract_participant_from_file(dementia_path.joinpath('703-0.cha'))

["&uh well here's the child reaching up but he's on a stool and he's",
 'falling off . ',
 "she's drying dishes at the window . ",
 'she also spilled the water . ',
 'oh they [//] like I said they were climbing . ',
 "she's doing dishes . ",
 'she spilled the water too <out o(f) the sink> [//] over the sink I',
 'should say . ']

In [10]:
# PyLangAcq extraction that cleans up text
def pla_extract_participant_from_file(file):
    """Return the `PAR` participant's sentences from a CHAT file via pylangacq.

    The file is read with `pla.read_chat` as UTF-8. `sents(participant='PAR')`
    yields each sentence as a list of words, which is joined with single
    spaces so the result is one string per sentence.
    """
    reader = pla.read_chat(file, encoding='utf-8')
    return [' '.join(words) for words in reader.sents(participant='PAR')]

# Same transcript through the pylangacq extractor, for comparison with the raw output.
pla_extract_participant_from_file(str(dementia_path.joinpath('703-0.cha')))

["well here's the child reaching up but he's on a stool and he's falling off .",
 "she's drying dishes at the window .",
 'she also spilled the water .',
 'oh like I said they were climbing .',
 "she's doing dishes .",
 'she spilled the water too over the sink I should say .']

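**Note that the token `[//]` that appears in the first version is a CHAT annotation added by the human transcriber: it marks a retracing, i.e. the speaker abandoned the preceding words and corrected themselves. The second version, as extracted by pylangacq, is already corrected: the retraced words and the marker are dropped (e.g. `oh they [//] like I said …` becomes `oh like I said …`).**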

In [14]:
def extract_from_path(path_cha):
    """Run both extractors over every file matching the glob pattern `path_cha`.

    Matching files are processed in sorted order. Returns a 2-tuple
    `(raw, clean)` of parallel lists with one entry per file: `raw` holds the
    output of `extract_participant_from_file`, `clean` the output of
    `pla_extract_participant_from_file`.
    """
    per_file = [
        (extract_participant_from_file(cha), pla_extract_participant_from_file(cha))
        for cha in sorted(glob.glob(path_cha))
    ]
    raw = [pair[0] for pair in per_file]
    clean = [pair[1] for pair in per_file]
    return (raw, clean)

# utterances_symbol = extract_from_path(str(control_path/'*.cha')) 
# utterances_symbol_control = utterances_symbol
# len(utterances_symbol_control)

# Extract every control transcript, then eyeball one file's raw vs. cleaned output.
utterances_control, utterances_clean_control = extract_from_path(str(control_path.joinpath('*.cha')))
print(len(utterances_control), len(utterances_clean_control))
print(utterances_control[5])
print(utterances_clean_control[5])

243 243
['well the kids are in the kitchen with their mother &uh &uh takin(g)', 'cookies out o(f) the cookie jar . ', "a boy's handin(g) it to the girl . ", "and the boy's &uh <on a> [/] on a &uh stool and he's tripping", '[: tipping] [* p:w] over . ', "he's gonna fall on the floor . ", "the mother's standing there doing the dishes . ", "she's washing the dishes looking out the open window . ", "and the water's runnin(g) down over the sink on [/] on the floor", 'getting her feet wet . ', "and <there are> [//] she's dryin(g) a dish . ", 'and there are a couple o(f) dishes sitting on the &k kitchen', 'counter . ', "and looking out the window &uh it's probably in the spring or", 'summer of the year . ']
['well the kids are in the kitchen with their mother taking cookies out of the cookie jar .', "a boy's handing it to the girl .", "and the boy's on a stool and he's tipping over .", "he's gonna fall on the floor .", "the mother's standing there doing the dishes .", "she's washing the dishe

In [15]:
# Same extraction for the dementia group.
utterances_dementia, utterances_clean_dementia = extract_from_path(str(dementia_path.joinpath('*.cha')))
print(len(utterances_dementia))

309


In [17]:
# One row per control transcript; the scalar `group` label is broadcast to every row.
control_df = pd.DataFrame(dict(
    sentences=utterances_control,
    sentences_clean=utterances_clean_control,
    group='control',
))
control_df.head()

Unnamed: 0,sentences,sentences_clean,group
0,"[the scene is <in the> [/] in the kitchen . , ...","[the scene is in the kitchen ., the mother is ...",control
1,"[oh I see the sink is running over . , I see t...","[oh I see the sink is running over ., I see th...",control
2,[&um a boy and a girl are in the kitchen with ...,[a boy and a girl are in the kitchen with thei...,control
3,"[okay . [+ exc] , it was summertime and mother...","[okay ., it was summertime and mother and the ...",control
4,[&=clears:throat wait (un)til I put my glasses...,"[wait until I put my glasses on ., oh ‡ there'...",control


In [18]:
# One row per dementia transcript; the scalar `group` label is broadcast to every row.
dementia_df = pd.DataFrame(dict(
    sentences=utterances_dementia,
    sentences_clean=utterances_clean_dementia,
    group='dementia',
))
dementia_df.head()

Unnamed: 0,sentences,sentences_clean,group
0,"[mhm . [+ exc] , +< alright . [+ exc] , there'...","[mhm ., alright ., there's a young boy that's ...",dementia
1,"[mhm . , there's a young boy &uh going in a co...","[mhm ., there's a young boy going in a cookie ...",dementia
2,"[here's a cookie jar . , and the lid is off th...","[here's a cookie jar ., and the lid is off the...",dementia
3,"[the boy is slipping off the stool . , he's tr...","[the boy is slipping off the stool ., he's try...",dementia
4,[okay he's fallin(g) off a chair [: stool] [* ...,"[okay he's falling off a stool ., she's runnin...",dementia


In [19]:
# Stack both groups into one frame. `ignore_index=True` renumbers the rows
# 0..n-1; without it each group keeps its own 0-based labels, so the combined
# frame (and the CSV written from it) would carry duplicate index values.
df = pd.concat([control_df, dementia_df], ignore_index=True)
len(df)

552

## Flatten sentences into paragraphs describing the scene

In [20]:
# Collapse each transcript's list of sentences into one space-separated string.
df['text'] = df['sentences'].map(' '.join)
df['clean'] = df['sentences_clean'].map(' '.join)
df.head()

Unnamed: 0,sentences,sentences_clean,group,text,clean
0,"[the scene is <in the> [/] in the kitchen . , ...","[the scene is in the kitchen ., the mother is ...",control,the scene is <in the> [/] in the kitchen . th...,the scene is in the kitchen . the mother is wi...
1,"[oh I see the sink is running over . , I see t...","[oh I see the sink is running over ., I see th...",control,oh I see the sink is running over . I see the...,oh I see the sink is running over . I see the ...
2,[&um a boy and a girl are in the kitchen with ...,[a boy and a girl are in the kitchen with thei...,control,&um a boy and a girl are in the kitchen with t...,a boy and a girl are in the kitchen with their...
3,"[okay . [+ exc] , it was summertime and mother...","[okay ., it was summertime and mother and the ...",control,okay . [+ exc] it was summertime and mother a...,okay . it was summertime and mother and the ch...
4,[&=clears:throat wait (un)til I put my glasses...,"[wait until I put my glasses on ., oh ‡ there'...",control,&=clears:throat wait (un)til I put my glasses ...,wait until I put my glasses on . oh ‡ there's ...


In [21]:
# Raw paragraph of the transcript at position 5.
df['text'].iloc[5]

"well the kids are in the kitchen with their mother &uh &uh takin(g) cookies out o(f) the cookie jar .  a boy's handin(g) it to the girl .  and the boy's &uh <on a> [/] on a &uh stool and he's tripping [: tipping] [* p:w] over .  he's gonna fall on the floor .  the mother's standing there doing the dishes .  she's washing the dishes looking out the open window .  and the water's runnin(g) down over the sink on [/] on the floor getting her feet wet .  and <there are> [//] she's dryin(g) a dish .  and there are a couple o(f) dishes sitting on the &k kitchen counter .  and looking out the window &uh it's probably in the spring or summer of the year . "

In [22]:
# Cleaned paragraph of the same transcript, for comparison.
df['clean'].iloc[5]

"well the kids are in the kitchen with their mother taking cookies out of the cookie jar . a boy's handing it to the girl . and the boy's on a stool and he's tipping over . he's gonna fall on the floor . the mother's standing there doing the dishes . she's washing the dishes looking out the open window . and the water's running down over the sink on the floor getting her feet wet . and she's drying a dish . and there are a couple of dishes sitting on the kitchen counter . and looking out the window it's probably in the spring or summer of the year ."

Reorder columns

In [23]:
# Put the group label first, followed by the sentence lists and the joined paragraphs.
df = df.loc[:, ['group', 'sentences', 'sentences_clean', 'text', 'clean']]
df.head()

Unnamed: 0,group,sentences,sentences_clean,text,clean
0,control,"[the scene is <in the> [/] in the kitchen . , ...","[the scene is in the kitchen ., the mother is ...",the scene is <in the> [/] in the kitchen . th...,the scene is in the kitchen . the mother is wi...
1,control,"[oh I see the sink is running over . , I see t...","[oh I see the sink is running over ., I see th...",oh I see the sink is running over . I see the...,oh I see the sink is running over . I see the ...
2,control,[&um a boy and a girl are in the kitchen with ...,[a boy and a girl are in the kitchen with thei...,&um a boy and a girl are in the kitchen with t...,a boy and a girl are in the kitchen with their...
3,control,"[okay . [+ exc] , it was summertime and mother...","[okay ., it was summertime and mother and the ...",okay . [+ exc] it was summertime and mother a...,okay . it was summertime and mother and the ch...
4,control,[&=clears:throat wait (un)til I put my glasses...,"[wait until I put my glasses on ., oh ‡ there'...",&=clears:throat wait (un)til I put my glasses ...,wait until I put my glasses on . oh ‡ there's ...


### Save to `csv`

In [24]:
# Output folder is a sibling of the Pitt folder; create it if it is missing.
model_path = pitt_path.parent.joinpath('models')
model_path.mkdir(exist_ok=True)

In [25]:
# Persist the combined frame. With the default arguments the row index is
# written as the first, unnamed CSV column.
df.to_csv(model_path/'pitt-cookie-complete.csv')

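Some of the cleaned transcripts still look odd. The one below, for example, keeps markers such as `/?` and `//?` at the end of interrupted utterances and the compound token `out_of`. More pre-processing may be in order.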

In [36]:
# Example of a cleaned transcript that still contains odd tokens.
df['clean'].iloc[13]

"this is the one where I insisted the tree doesn't have a trunk and I still think that tree doesn't have a trunk . okay what do you want me to do ? tell a /? this is a tree and if it had a trunk it would continue . this is the top sash of the window and it hasta have a trunk down here and that's just grass . so what /? well what's this ? isn't that a tree ? looks like a tree to me . okay what do you want me to //? all the action ? okay . there's a little boy in short pants with short sleeves who has a cookie in his left hand handing it to a little girl and grabbing another one . and the lid is falling off the cookie jar . and the stool is upsetting . and it's a three legged stool . and the little girl is shushing because the mother is standing there in a puddle of water with the water overflowing out_of the drain . and she's wiping a dish instead of the mess . and she's wearing an apron and she has on sleeveless clothes . I don't know . I could go on . there are two cups and a plate an