## Downloading and loading the corpus

In [1]:
import nltk
from nltk.corpus.reader import CHILDESCorpusReader

In [22]:
# You have to download the XML version (important!) of the CHILDES corpus.
# The English (North America) data is available here: https://childes.talkbank.org/data-xml/Eng-NA/
# Unzip the download and put it in the 'corpora' folder inside your 'nltk_data' folder
# (for me, on macOS, that folder is in my user root directory).
# The line below asks nltk to locate that 'corpora' folder so it can be used as the corpus root.
corpus_root = nltk.data.find('corpora')

In [23]:
# Build a corpus reader over all XML transcripts in the 'Brown' folder of the corpus root.
# The second argument is a regular expression over file ids, so the dot before 'xml'
# is escaped to match a literal '.' instead of any character.
brown = CHILDESCorpusReader(corpus_root, r'Brown/.*\.xml')

In [24]:
# list the ids of all files matched by the reader (paths relative to the corpus root)
brown.fileids()

['Brown/Adam/020304.xml',
 'Brown/Adam/020318.xml',
 'Brown/Adam/020403.xml',
 'Brown/Adam/020415.xml',
 'Brown/Adam/020430.xml',
 'Brown/Adam/020512.xml',
 'Brown/Adam/020603.xml',
 'Brown/Adam/020617.xml',
 'Brown/Adam/020701.xml',
 'Brown/Adam/020714.xml',
 'Brown/Adam/020801.xml',
 'Brown/Adam/020816.xml',
 'Brown/Adam/020904.xml',
 'Brown/Adam/020918.xml',
 'Brown/Adam/021002.xml',
 'Brown/Adam/021016.xml',
 'Brown/Adam/021030.xml',
 'Brown/Adam/021113.xml',
 'Brown/Adam/021128.xml',
 'Brown/Adam/030011.xml',
 'Brown/Adam/030025.xml',
 'Brown/Adam/030109.xml',
 'Brown/Adam/030126.xml',
 'Brown/Adam/030209.xml',
 'Brown/Adam/030221.xml',
 'Brown/Adam/030304.xml',
 'Brown/Adam/030318.xml',
 'Brown/Adam/030401.xml',
 'Brown/Adam/030418.xml',
 'Brown/Adam/030501.xml',
 'Brown/Adam/030515.xml',
 'Brown/Adam/030529.xml',
 'Brown/Adam/030609.xml',
 'Brown/Adam/030707.xml',
 'Brown/Adam/030801.xml',
 'Brown/Adam/030814.xml',
 'Brown/Adam/030826.xml',
 'Brown/Adam/030916.xml',
 'Brown/Adam

## Information about the corpus and participants

In [25]:
# note: without parentheses this only displays the bound method `corpus` of the reader,
# not any corpus data; it has to be called with file ids to get the actual metadata
brown.corpus

<bound method CHILDESCorpusReader.corpus of <CHILDESCorpusReader in '/Users/mdhk/nltk_data/corpora'>>

In [26]:
# call the reader's corpus method on all file ids to get the corpus metadata;
# the result can be indexed per file (each entry behaves like a dict of metadata fields)
corpus_data = brown.corpus(brown.fileids())

In [28]:
# show the metadata fields of the first entry in corpus_data, in alphabetical order
first_file_info = corpus_data[0]
for field in sorted(first_file_info):
    print(field, ": ", first_file_info[field])

ActivityType :  toyplay
Corpus :  Brown
Date :  1962-10-08
DesignType :  long
GroupType :  TD
Lang :  eng
PID :  11312/c-00015632-1
Version :  2.16.0
{http://www.w3.org/2001/XMLSchema-instance}schemaLocation :  http://www.talkbank.org/ns/talkbank https://talkbank.org/software/talkbank.xsd


In [49]:
# show the participants of the first file only (hence the [:1] slice):
# one line per speaker code, followed by that speaker's attributes sorted by attribute name
corpus_participants = brown.participants(brown.fileids())
for file_participants in corpus_participants[:1]:
    for speaker_id, attributes in sorted(file_participants.items()):
        print(speaker_id, ": ", sorted(attributes.items()))

CHI :  [('SES', 'MC'), ('age', 'P2Y03M04D'), ('group', 'typical'), ('id', 'CHI'), ('language', 'eng'), ('name', 'Adam'), ('role', 'Target_Child'), ('sex', 'male')]
COL :  [('id', 'COL'), ('language', 'eng'), ('name', 'Colin_Fraser'), ('role', 'Investigator')]
MOT :  [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother'), ('sex', 'female')]
RIC :  [('id', 'RIC'), ('language', 'eng'), ('name', 'Richard_Cromer'), ('role', 'Investigator')]
URS :  [('id', 'URS'), ('language', 'eng'), ('name', 'Ursula_Bellugi'), ('role', 'Investigator')]


## Loading dialogues

In [63]:
# the single transcript inspected in this cell
dialogue_file = 'Brown/Adam/020304.xml'

# metadata of this transcript, one field per line in alphabetical order
dlg_info = brown.corpus(dialogue_file)
print('INFO:')
info_record = dlg_info[0]
for field in sorted(info_record):
    print(field, ": ", info_record[field])
print(type(dlg_info), '\n')

# participants of this transcript: speaker code followed by its sorted attributes
print('PARTICIPANTS:')
dlg_partcps = brown.participants(dialogue_file)
for file_participants in dlg_partcps[:1]:
    for speaker_id, attributes in sorted(file_participants.items()):
        print(speaker_id, ": ", sorted(attributes.items()))
print(type(dlg_partcps), '\n')

# all words of the transcript, regardless of who said them
dlg_words = brown.words(dialogue_file)
print('WORDS:\n', dlg_words, '\n', type(dlg_words))

# only the words uttered by the speaker with code 'CHI'
dlg_words_CHI = brown.words(dialogue_file, speaker=['CHI'])
print('WORDS FROM CHI:\n', dlg_words_CHI, '\n', type(dlg_words_CHI))

# the first ten sentences spoken by either 'MOT' or 'CHI'
print('\nSENTENCES:')
first_sentences = brown.sents(dialogue_file, speaker=['MOT', 'CHI'])[:10]
for sentence in first_sentences:
    print(sentence)

INFO:
ActivityType :  toyplay
Corpus :  Brown
Date :  1962-10-08
DesignType :  long
GroupType :  TD
Lang :  eng
PID :  11312/c-00015632-1
Version :  2.16.0
{http://www.w3.org/2001/XMLSchema-instance}schemaLocation :  http://www.talkbank.org/ns/talkbank https://talkbank.org/software/talkbank.xsd
<class 'nltk.collections.LazyMap'> 

PARTICIPANTS:
CHI :  [('SES', 'MC'), ('age', 'P2Y03M04D'), ('group', 'typical'), ('id', 'CHI'), ('language', 'eng'), ('name', 'Adam'), ('role', 'Target_Child'), ('sex', 'male')]
COL :  [('id', 'COL'), ('language', 'eng'), ('name', 'Colin_Fraser'), ('role', 'Investigator')]
MOT :  [('id', 'MOT'), ('language', 'eng'), ('role', 'Mother'), ('sex', 'female')]
RIC :  [('id', 'RIC'), ('language', 'eng'), ('name', 'Richard_Cromer'), ('role', 'Investigator')]
URS :  [('id', 'URS'), ('language', 'eng'), ('name', 'Ursula_Bellugi'), ('role', 'Investigator')]
<class 'nltk.collections.LazyMap'> 

WORDS:
 ['play', 'checkers', 'big', 'drum', 'big', 'drum', ...] 
 <class 'nlt

## Problem
We can't get information about which speaker said which sentence (or when each sentence was said), so we can't extract turn-based information. Apparently the CHILDES corpus reader just doesn't support this:
http://ling-blogs.bu.edu/lx394s19/hw5-childes/

> "Here’s where the CHILDESCorpusReader is most disappointing, I think. We can find the sentences in the corpus from a particular speaker, but we can’t see what other speakers said in between. And if we just look at all the utterances, we can’t tell who is saying which line."

So we'll need to parse the XML ourselves (the page linked above appears to include some explanation of how to do this).

## Loading dialogues through XML
Here we go

In [68]:
# the transcript to parse by hand
dialogue_file = 'Brown/Adam/020304.xml'

# get the XML representation of this transcript from the corpus reader
brown_xml = brown.xml(dialogue_file)

In [125]:
# TalkBank namespace prefix that the element tags carry in the parsed XML
ns = '{http://www.talkbank.org/ns/talkbank}'

# this should find all utterances (<u> elements) in the XML loaded above;
# uses brown_xml, the variable actually defined by the previous cell
utterances = brown_xml.findall(ns+'u')

# a single utterance
utt = utterances[0]
print(type(utt))

# yay we can see the utterance ID and the speaker
print(utt.get('uID'))
print(utt.get('who'))

# this gets us the words of the utterance: the text of every <w> child that has text
print([w.text for w in utt if w.text and w.tag == ns+'w'])

# how to access other information? (like pos tags)
# not sure, somehow we have to get into the mor
# and then the mw and then the pos tag for each word (see cell below)
# here we list the children of the utterance's first child element
for x in utt[0]:
    print(x)

<class 'xml.etree.ElementTree.Element'>
u0
CHI
['play', 'checkers']
<Element '{http://www.talkbank.org/ns/talkbank}mor' at 0x116302c78>


## Raw XML for the above