# Make a corpus of BNC documents for topic modelling 

The following code will create a corpus, consisting of a large subset of the entire BNC, that can be used with bag-of-words topic models.

In [1]:
import os
import hashlib

from collections import defaultdict

from bnctools import utils

utils.vocabulary_directory = 'cache'

We'll make a quick and dirty checksum-er to check file integrity. 

In [2]:
def checksum(filename):

    '''
    Returns the SHA-256 hex digest of the contents of the file named
    `filename`.

    The file is read in binary mode and hashed in fixed-size chunks, so
    large files are never held in memory in their entirety, and the file
    handle is always closed, even if reading fails.
    '''

    h = hashlib.new('sha256')

    chunk_size = 1 << 20  # 1 MiB per read

    with open(filename, 'rb') as f:
        # iter() with a sentinel stops at the first empty read, i.e. EOF.
        for chunk in iter(lambda: f.read(chunk_size), b''):
            h.update(chunk)

    return h.hexdigest()

## Parse BNC into paragraphs

Extract all paragraphs from the BNC using some tools in the `bnctools.utils` module. These paragraphs are available in the cache (available via `git fat pull`), or they can be calculated anew using the rather computationally expensive `get_all_paragraphs_parallel` function.

In [3]:
use_cached_data = True # Set to False if you do not want to use the cached pickle file

pkl_filename = 'cache/bnc_paragraphs.pkl'
pkl_file_checksum = '0b70c19a5ef8243933368d93ec47a4bf35674c04808e198d249a0e20e575f7b4'

if not use_cached_data:

    # Re-extract every paragraph from the raw BNC texts (expensive).
    bnc_2554_texts_root = 'cache/bnc/2554/download/Texts/'

    corpus_filenames = utils.Corpus.get_corpus_filenames(bnc_2554_texts_root)

    # Make sure cluster is started with e.g. ipcluster start -n 16
    view = utils.init_ipyparallel()
    paragraphs = utils.get_all_paragraphs_parallel(view, corpus_filenames)
    utils.dump(paragraphs, filename=pkl_filename)

    # The freshly written pickle must match the reference checksum.
    assert checksum(pkl_filename) == pkl_file_checksum

else:

    # Verify the cached pickle *before* loading it: always check the file
    # that is actually going to be read (pkl_filename), not a hard-coded path.
    assert checksum(pkl_filename) == pkl_file_checksum

    paragraphs = utils.load(pkl_filename)

# Sanity check: total word count over all extracted paragraphs.
assert sum(paragraph['word_count'] for paragraph in paragraphs) == 87564696

## Remove selected paragraphs 

Those paragraphs that were used as experimental stimuli in experiments *Brisbane* and *Malmo* should be removed from the corpus. The intended use of the corpus is as the training set for the topic model that, by hypothesis, represents the average background knowledge of participants who then read and memorize texts or word lists in experiments. Including those texts, or the texts from which the word lists were derived, in the training corpus would not be appropriate because it would, in a sense, put those texts into the participants' background knowledge.

The paragraphs used in the above-mentioned experiments are available in the `sampled-stimuli.pkl` file.

In [4]:
stimuli_filename = 'cache/sampled-stimuli.pkl'

# Refuse to continue unless the stimuli file is exactly the expected one.
# NOTE: unpickling executes arbitrary code, so this integrity check should
# stay ahead of the load below.
assert (
    checksum(stimuli_filename)
    == 'fbc45d8f87479bd7290c4d1e848b310294c28c5120dc5d42848a07214d26d774'
)

# Read the pickled experimental stimuli.
with open(stimuli_filename, 'rb') as f:
    experimental_stimuli = utils.pickle.load(f)

We'll find the ID of each experimental stimulus paragraph (defined by the BNC filename, the div1 index within the file, and the paragraph index) and then filter those paragraphs out of the entire paragraph list.

In [5]:
def get_paragraph_id(paragraph):

    '''
    Returns the identifying triple of `paragraph`: the values stored under
    'corpus_filename', 'div1_index' and 'paragraph_index', in that order.
    A key that is absent contributes None.
    '''

    id_fields = ('corpus_filename', 'div1_index', 'paragraph_index')

    return tuple(paragraph.get(field) for field in id_fields)

# Hack; We need to strip off the ./ from the stimuli filenames.
# Only a literal leading './' is removed: str.strip('./') would instead strip
# *any* run of '.' and '/' characters from both ends of the filename.
experimental_stimuli_ids = [
    (filename[2:] if filename.startswith('./') else filename,
     div1_index,
     paragraph_index)
    for filename, div1_index, paragraph_index
    in map(get_paragraph_id, experimental_stimuli)
]

# Use a set for the membership test: there are millions of paragraphs, so a
# linear scan of a list for every paragraph would be needlessly quadratic.
_experimental_stimuli_id_set = set(experimental_stimuli_ids)

# Keep every paragraph that was not used as an experimental stimulus. A list
# (rather than a lazy filter object) is built so it can be traversed more
# than once.
acceptable_paragraphs = [
    paragraph for paragraph in paragraphs
    if get_paragraph_id(paragraph) not in _experimental_stimuli_id_set
]

del _experimental_stimuli_id_set

The following will create a set of small "documents". Each document is either a single paragraph or a concatenation of consecutive paragraphs such that the total word count in each mini document is in a given word count range, which by default is 250 to 500 words. Each mini-doc is represented as a string with words delimited by a '|'. We'll write the corpus of documents, and the vocabulary, to file.

In [6]:
def write_corpus_to_file(mini_documents, 
                         corpus_filename_check,
                         vocab_filename_check,
                         corpus_checksum, 
                         vocab_checksum,
                         cache_directory='cache'):

    '''
    Write the corpus of mini documents, and its vocabulary, to files in
    `cache_directory`, and verify both against expected names and checksums.

    mini_documents: sequence of strings, each a document whose words are
        delimited by '|'.
    corpus_filename_check: the expected corpus filename, which has the form
        'bnc_texts_<total words>_<number of docs>_<min words>_<max words>.txt'.
    vocab_filename_check: the expected vocabulary filename, which has the
        form 'bnc_vocab_<vocabulary size>.txt'.
    corpus_checksum: expected checksum of the written corpus file.
    vocab_checksum: expected checksum of the written vocabulary file.
    cache_directory: directory in which both files are written.

    Raises AssertionError if a generated filename or a file checksum does
    not match its expected value. Note that the files are written before
    they are checked.
    '''

    # Word count per document. This must be a concrete list: it is traversed
    # by sum, len, min and max in turn, and a lazy map object has no len()
    # and would be exhausted after the first pass.
    counts = [len(doc.split('|')) for doc in mini_documents]

    corpus_filename = 'bnc_texts_%d_%d_%d_%d.txt' % tuple([func(counts) for func in (sum, len, min, max)])
    corpus_file_path = os.path.join(cache_directory, corpus_filename)

    # One document per line.
    with open(corpus_file_path, 'w') as f:
        f.write('\n'.join(mini_documents))

    assert corpus_filename == corpus_filename_check, corpus_filename
    assert checksum(corpus_file_path) == corpus_checksum

    # presumably a mapping from each word to its corpus count, restricted to
    # words occurring at least 5 times -- confirm in bnctools.utils
    vocabulary = utils.get_corpus_vocabulary(mini_documents, minimum_count=5)
    vocab_filename = 'bnc_vocab_%d.txt' % len(vocabulary)
    vocab_file_path = os.path.join(cache_directory, vocab_filename)

    # One vocabulary item per line, in sorted order.
    with open(vocab_file_path, 'w') as f:
        f.write('\n'.join(sorted(vocabulary.keys())))

    assert vocab_filename == vocab_filename_check, vocab_filename
    assert checksum(vocab_file_path) == vocab_checksum

In [7]:
# Concatenate consecutive acceptable paragraphs into '|'-delimited mini
# documents whose length falls in the requested word count range.
mini_documents = utils.paragraphs_to_mini_documents(
    acceptable_paragraphs,
    mini_document_length=(250, 500),
    sep='|',
)

# Write the corpus and vocabulary files, checking the generated filenames and
# file checksums against their known-good values.
write_corpus_to_file(
    mini_documents,
    corpus_filename_check='bnc_texts_78639361_183975_251_499.txt',
    vocab_filename_check='bnc_vocab_49324.txt',
    corpus_checksum='bd91a2936157f50b0ceb3cf9430a53d4c652e3ab553b8ec14fe75db6e07cd36c',
    vocab_checksum='ecf66c77121cf67e416580cf5cc0853bd1813dcfd946298723134e547324cb6b',
)