In [1]:
import glob
import os
from collections import Counter
import nltk
from nltk.corpus import words
from nltk.tokenize import word_tokenize
import pandas as pd

# Download the two NLTK packages this notebook requests: 'punkt' and 'words'.
# Packages that are already installed are simply reported as up-to-date.
for nltk_package in ('punkt', 'words'):
    nltk.download(nltk_package)

# Allow up to 200 rows to be displayed before pandas truncates a table
pd.set_option('display.max_rows', 200)

[nltk_data] Downloading package punkt to /Users/maxreuter/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package words to /Users/maxreuter/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
def load_corpora(dirs):
    """Build a word-frequency table from every ``.cha`` file under ``dirs``.

    Each directory is searched recursively for ``*.cha`` files. The contents
    of all matching files are joined with single spaces, lower-cased and
    tokenized with NLTK's ``word_tokenize``. Only tokens that are purely
    alphabetic and present in the NLTK ``words`` corpus are counted.

    Parameters
    ----------
    dirs : iterable of str
        Directories to search for transcript files.

    Returns
    -------
    pandas.DataFrame
        One row per distinct kept word, with columns ``Word`` and ``Count``,
        sorted by ``Count`` in descending order. The integer index assigned
        before sorting is retained. The frame is empty when no files match.
    """
    # Read each transcript inside a context manager so its handle is closed
    # immediately instead of lingering until garbage collection.
    texts = []
    for directory in dirs:
        pattern = os.path.join(directory, '**', '*.cha')
        for path in glob.glob(pattern, recursive=True):
            # Explicit encoding keeps decoding independent of the platform
            # locale. NOTE(review): assumes the transcripts are UTF-8 — confirm.
            with open(path, 'r', encoding='utf-8') as handle:
                texts.append(handle.read())
    combined_text = ' '.join(texts)

    # Tokenize the text and filter out non-English words
    # NOTE(review): the notebook's recorded output is dominated by tokens such
    # as 'mor', 'gra', 'chi' and 'mot', which look like transcript tier/speaker
    # labels rather than spoken words. Consider stripping that markup before
    # tokenizing; behaviour is left unchanged here.
    tokenized_words = word_tokenize(combined_text.lower())
    english_words = set(words.words())
    filtered_words = [
        word for word in tokenized_words
        if word.isalpha() and word in english_words
    ]

    # Count the occurrences of each word
    word_counts = Counter(filtered_words)

    # Construct a DataFrame from the word counts, most frequent first
    df = pd.DataFrame(word_counts.items(), columns=['Word', 'Count'])
    df = df.sort_values(by='Count', ascending=False)

    return df

In [3]:
# Build one word-count DataFrame per corpus, then stack them into a single
# frame indexed by (Corpus, Index).

# Sub-directories used for each HSLLD home visit
hslld_visit_dirs = {
    'HV1': ['BR', 'ER', 'MT', 'TP'],
    'HV2': ['BR', 'ER', 'MT', 'TP'],
    'HV3': ['BR', 'ER', 'ET', 'MT', 'RE', 'TP'],
    'HV5': ['BR', 'LW', 'MT'],
    'HV7': ['ET', 'LW', 'MD', 'MT'],
}

# Directories to read for each corpus, keyed by the label used in the index.
# NOTE(review): the label 'HSLDD' differs from the 'HSLLD' directory name and
# may be a typo; confirm nothing else relies on the label before renaming it.
corpus_dirs = {
    'Bates': [
        'data/Bates/Free20',
        'data/Bates/Free28',
        'data/Bates/Snack28',
        'data/Bates/Story28',
    ],
    'Champaign': [
        f'data/Champaign/{number}{suffix}'
        for number in (21, 24, 27, 30, 33, 36)
        for suffix in ('P', 'X')
    ],
    'Garvey': ['data/Garvey'],
    'Hall': [
        f'data/Hall/{prefix}{suffix}'
        for prefix in ('Black', 'White')
        for suffix in ('Pro', 'Work')
    ],
    'HSLDD': [
        f'data/HSLLD/{visit}/{subdir}'
        for visit, subdirs in hslld_visit_dirs.items()
        for subdir in subdirs
    ],
}

corpora = {}
for corpus_name, directories in corpus_dirs.items():
    corpora[corpus_name] = load_corpora(directories)

# Combine all corpora into a single DataFrame
df = pd.concat(
    list(corpora.values()),
    keys=list(corpora),
    names=['Corpus', 'Index'],
)

df

Unnamed: 0_level_0,Unnamed: 1_level_0,Word,Count
Corpus,Index,Unnamed: 2_level_1,Unnamed: 3_level_1
Bates,8,mor,12406
Bates,9,gra,12401
Bates,1,mot,8679
Bates,10,act,6024
Bates,3,chi,5673
...,...,...,...
HSLDD,6920,cording,1
HSLDD,6921,dumdum,1
HSLDD,6926,teensy,1
HSLDD,6928,boneless,1


In [5]:
# Sum each word's counts across all corpora and show the 200 largest totals

print('Most common words overall:')
overall_counts = df.groupby('Word').sum()
overall_counts.sort_values(by='Count', ascending=False).iloc[:200]

Most common words overall:


Unnamed: 0_level_0,Count
Word,Unnamed: 1_level_1
gra,495507
mor,492449
chi,370141
mot,345165
you,157163
...,...
hear,3139
remember,3134
read,3120
door,3108


In [81]:
# Print the first 20 rows of each corpus' slice of the combined frame. Every
# slice was sorted by descending count when it was built, so these rows are
# that corpus' most frequent words.
# NOTE(review): the saved output shows 100 rows per corpus, so it appears to
# predate the current head(20) — re-run the cell to refresh it.
for corpus, corpus_df in df.groupby(level='Corpus'):
    print(f'Top words for {corpus}:', corpus_df.head(20), '', sep='\n')

Top words for Bates:
               Word  Count
Corpus Index              
Bates  8        mor  12406
       9        gra  12401
       1        mot   8679
       10       act   6024
       3        chi   5673
...             ...    ...
       235      get    194
       195      now    192
       1511   miffy    190
       1171    eats    190
       139     away    190

[100 rows x 2 columns]

Top words for Champaign:
                  Word   Count
Corpus    Index               
Champaign 1        chi  155385
          2        mot  143496
          30       you   48743
          33       the   41449
          47        it   28832
...                ...     ...
          302    these    2106
          211      job    2102
          66       big    2098
          499    pizza    2096
          32      play    2092

[100 rows x 2 columns]

Top words for Garvey:
               Word  Count
Corpus Index              
Garvey 8        mor   9790
       9        gra   9790
       1        chi 