# A proof for language creativity

In [4]:
####################### base material #######################

import nltk
from nltk.corpus import gutenberg as gb
from pprint import pprint

In [37]:
####################### 3 functions #######################

def get_rand_sentence(sent_tok_corpus):
    """grabs a random sentence from a tokenized corpus and transforms it into a string.
    
    >>>get_rand_sentence(nltk.corpus.gutenberg(fileids='shakespeare-macbeth.txt'))
    'Shew his Eyes , and greeue his Hart , Come like shadowes , so depart .'
    
    :param sent_tok_corpus: text corpus that has been tokenized for sentences and words
    :type sent_tok_corpus: NLTK ReadTextCorpus object (sent_tokenized)
    :return: a random sentence from the input corpus
    :rtype: string
    """
    import re
    from random import randint
    
    num_of_sents = len(sent_tok_corpus)
    # introduce a check to avoid senseless macbeth outcries
    rand_sent_words = []
    while len(rand_sent_words) < 5:
        rand_sent_i = randint(0, len(sent_tok_corpus))
        rand_sent_words = sent_tok_corpus[rand_sent_i]
        rand_sent = " ".join(rand_sent_words)
    return rand_sent

def find_sentence_twins(corpus, sentence):
    import re
    pattern = re.compile(sentence)
    matches = re.findall(pattern, corpus)
    return matches

def scan_corpora(list_of_corpora):
    all_matches = {}
    for c in list_of_corpora:
        sent_tok_corpus = gb.sents(fileids=c)
        full_corpus = ""
        for sent in sent_tok_corpus:
            sentence = " ".join(sent)
            full_corpus += sentence
        rand_sent = get_rand_sentence(sent_tok_corpus)
        corpus_matched = find_sentence_twins(full_corpus, rand_sent)
        # number of doubles if there are any
        doubles = len(corpus_matched) - 1
        all_matches[c] = (corpus_matched, "-**-DOUBLES-**-: {}".format(doubles))
    return all_matches

In [34]:
####################### calling it #######################

# getting a list of all the corpora of Project Gutenberg
# that come with NLTK
corpora = gb.fileids()
# calling the function wrapper on all corpora
twins = scan_corpora(corpora)

In [36]:
# here are the results
pprint(twins)

{'austen-emma.txt': (['I am always watching her to admire ; and I do pity her '
                      'from my heart ."'],
                     '##_DOUBLES_##: 0'),
 'austen-persuasion.txt': (['Anne had not wanted this visit to Uppercross , to '
                            'learn that a removal from one set of people to '
                            'another , though at a distance of only three '
                            'miles , will often include a total change of '
                            'conversation , opinion , and idea .'],
                           '##_DOUBLES_##: 0'),
 'austen-sense.txt': (['Elinor , who knew that such grief , shocking as it was '
                       'to witness it , must have its course , watched by her '
                       'till this excess of suffering had somewhat spent '
                       "itself , and then turning eagerly to Willoughby ' s "
                       'letter , read as follows :'],
                      '##_DOUBLES_##: 0'

Usually they are always 0 - this is an interesting exception, that speaks of an author using a repetitive aspect of language in order to achieve a certain effect.


```
 'bryant-stories.txt': ('said the little Jackal .',
                        ['said the little Jackal ,',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal ,',
                         'said the little Jackal ,',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal ,',
                         'said the little Jackal ,',
                         'said the little Jackal .',
                         'said the little Jackal ,',
                         'said the little Jackal .',
                         'said the little Jackal ;',
                         'said the little Jackal ,',
                         'said the little Jackal .'],
                        '**DOUBLES**: 17'),
```