# A (pseudo-)proof for language creativity
## The surprising rarity in which the exact same sentence is used more than once

In [2]:
####################### base material #######################

import nltk
from nltk.corpus import gutenberg as gb
from pprint import pprint

In [17]:
####################### 3 functions #######################

def get_rand_sentence(sent_tok_corpus):
    """grabs a random sentence from a tokenized corpus and transforms it into a string.
    
    >>>get_rand_sentence(nltk.corpus.gutenberg(fileids='shakespeare-macbeth.txt'))
    'Shew his Eyes , and greeue his Hart , Come like shadowes , so depart .'
    
    :param sent_tok_corpus: text corpus that has been tokenized for sentences and words
    :type sent_tok_corpus: NLTK ReadTextCorpus object (sent_tokenized)
    :return: a random sentence from the input corpus
    :rtype: string
    """
    import re
    from random import randint
    
    num_of_sents = len(sent_tok_corpus)
    # introduce a check to avoid senseless macbeth outcries
    rand_sent_words = []
    while len(rand_sent_words) < 5:
        rand_sent_i = randint(0, len(sent_tok_corpus))
        rand_sent_words = sent_tok_corpus[rand_sent_i]
        rand_sent = " ".join(rand_sent_words)
    return rand_sent

def find_sentence_twins(corpus, sentence):
    """searches a corpus for occurances of a sentence."""
    import re
    pattern = re.compile(sentence)
    matches = re.findall(pattern, corpus)
    return matches

def scan_corpora(list_of_corpora):
    """wrapper function: scans a list of corpora for multiple occurances of a randomly selected sentence.
    
    :param list_of_corpora: the titles of the Gutenberg-project books included with NLTK
    :type list_of_corpora: list of strings
    :return: a dictionary containing the name of the corpus as key, and as value a tuple with a list of
             the matches of the random sentence and a count of the doubles
    :rtype: dictionary{string : tuple(list[string, string], string)}
    """
    all_matches = {}
    for c in list_of_corpora:
        sent_tok_corpus = gb.sents(fileids=c)
        full_corpus = ""
        for sent in sent_tok_corpus:
            sentence = " ".join(sent)
            full_corpus += sentence
        rand_sent = get_rand_sentence(sent_tok_corpus)
        corpus_matched = find_sentence_twins(full_corpus, rand_sent)
        # number of doubles if there are any
        doubles = len(corpus_matched) - 1
        all_matches[c] = (corpus_matched, "-**-DOUBLES-**-: {}".format(doubles))
    return all_matches

In [34]:
####################### calling it #######################

# getting a list of all the corpora of Project Gutenberg
# that come with NLTK
corpora = gb.fileids()
# calling the function wrapper on all corpora
twins = scan_corpora(corpora)

In [36]:
# here are the results
pprint(twins)

{'austen-emma.txt': (['I am always watching her to admire ; and I do pity her '
                      'from my heart ."'],
                     '##_DOUBLES_##: 0'),
 'austen-persuasion.txt': (['Anne had not wanted this visit to Uppercross , to '
                            'learn that a removal from one set of people to '
                            'another , though at a distance of only three '
                            'miles , will often include a total change of '
                            'conversation , opinion , and idea .'],
                           '##_DOUBLES_##: 0'),
 'austen-sense.txt': (['Elinor , who knew that such grief , shocking as it was '
                       'to witness it , must have its course , watched by her '
                       'till this excess of suffering had somewhat spent '
                       "itself , and then turning eagerly to Willoughby ' s "
                       'letter , read as follows :'],
                      '##_DOUBLES_##: 0'

It becomes very obvious that, even if running the code multiple times, the `##_DOUBLES_##` count usually remains at 0. This means that the authors hardly ever use the same sentence twice in their work. A tiny victory for **language creativity**! 😁

Would be interesting to further test this:
- on a much larger corpus, 
- on non-literary texts,
- on a spoken-word corpus

and see whether there are differences, in that maybe other corpora would contain more double-sentences. Literature is, after all, not spoken language.

### Side-note: Implications for plagiarism-detection

The fact that multiple occurances of the exact same sentence seems to be **extremely rare** makes me also think about **plagiarism-detection** in e.g. diploma theses. If we really hardly ever create the same sentence twice, then checking for exact occrances is indeed a very hefty proof that someone might have plagiarised another text.

## Using repetition as a literary tool

Usually they are always 0 - this is an interesting exception, that speaks of an author using a repetitive aspect of language in order to achieve a certain effect.


```
 'bryant-stories.txt': ('said the little Jackal .',
                        ['said the little Jackal ,',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal ,',
                         'said the little Jackal ,',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal .',
                         'said the little Jackal ,',
                         'said the little Jackal ,',
                         'said the little Jackal .',
                         'said the little Jackal ,',
                         'said the little Jackal .',
                         'said the little Jackal ;',
                         'said the little Jackal ,',
                         'said the little Jackal .'],
                        '**DOUBLES**: 17'),
```

To take a fast look at the context this phrase appears so often, I'll print out the occurances with a window of 10 words left and right of it.

In [8]:
bryant_words = gb.words(fileids="bryant-stories.txt")

In [16]:
# getting a window of 10 around the phrase of interest
index = 0
for w in bryant_words:
    if w == "Jackal" and bryant_words[index-1] == "little" and bryant_words[index-3] == "said":
        sent_window = " ".join(bryant_words[index-13:index+10])
        print(sent_window)
    index += 1

it !" " Indeed , indeed , Father Lion ," said the little Jackal , " I know that is what everybody thinks
. " He lives down there , Father Lion !" said the little Jackal . " He lives down there !" The Lion
free from a terrible cage ?" " Beg pardon ?" said the little Jackal . " I said ," said the Brahmin ,
set him free from his cage ?" " Cage ?" said the little Jackal , vacantly . " Yes , yes , his
your opinion . Do you think ----" " Oh ," said the little Jackal , " you want my opinion ? Then may
him free from his cage ?" " What cage ?" said the little Jackal . " Why , the cage he was in
----" " But I don ' t altogether understand ," said the little Jackal . " You ' set him free ,' you
. " That gives me no idea at all ," said the little Jackal . " See here , my friends , if
. " Now , let us understand the situation ," said the little Jackal . " Friend Brahmin , where were you ?"
Brahmin . " Tiger , and where were you ?" said the little Jackal . " Why , in the cage , of
Oh , I beg your pa