In [14]:
import nltk
from nltk.corpus import gutenberg as gb
from pprint import pprint

# 

In [15]:
pprint(gb.fileids())

['austen-emma.txt',
 'austen-persuasion.txt',
 'austen-sense.txt',
 'bible-kjv.txt',
 'blake-poems.txt',
 'bryant-stories.txt',
 'burgess-busterbrown.txt',
 'carroll-alice.txt',
 'chesterton-ball.txt',
 'chesterton-brown.txt',
 'chesterton-thursday.txt',
 'edgeworth-parents.txt',
 'melville-moby_dick.txt',
 'milton-paradise.txt',
 'shakespeare-caesar.txt',
 'shakespeare-hamlet.txt',
 'shakespeare-macbeth.txt',
 'whitman-leaves.txt']


In [18]:
text = gb.raw(fileids='chesterton-thursday.txt')

In [21]:
# looking for a sentence I like
#print(text[500:1000])

In [22]:
sentence = "The world was very old indeed when you and I were young."

In [28]:
text.find(sentence)

598

In [29]:
len(text)

320525

In [30]:
text.find(sentence, 599)

-1

## Okay, let's dive deeper (and broader)

The first try was an exploratory random sample (selected by flavor). Let's put this whole thing onto different feet.

In [31]:
sent_tok = gb.sents(fileids="chesterton-thursday.txt")

In [32]:
print(sent_tok)

[['[', 'The', 'Man', 'Who', 'Was', 'Thursday', 'by', 'G', '.', 'K', '.', 'Chesterton', '1908', ']'], ['To', 'Edmund', 'Clerihew', 'Bentley'], ...]


In [33]:
len(sent_tok)

3742

In [35]:
from random import randint

In [44]:
rand_sent_i = randint(0, len(sent_tok))

In [47]:
rand_sent = " ".join(sent_tok[rand_sent_i])

In [79]:
import re
pattern = re.compile(r'\s(?=[",\.])')

In [80]:
len(pattern.findall(rand_sent))

6

In [81]:
rand_sent

'" Not so bad as that ," said Dr . Bull , with unnecessary laughter , " not so bad as that .'

So here we have a problem, because different things need to be treated in a different way. My randome example sentence:
>'" Not so bad as that ," said Dr . Bull , with unnecessary laughter , " not so bad as that .'

Has quotation marks " that need in some case have the whitespace removed before them, and sometimes after.
I'm not gonna deal with that now. :)

But I will complete the job of removing the other spaces, those before sentence delimiters such as , and .

In [82]:
re.sub(pattern, "", rand_sent)

'" Not so bad as that," said Dr. Bull, with unnecessary laughter," not so bad as that.'

This is pretty neat, but it misses some finesse:
>'" Not so bad as that," said Dr. Bull, with unnecessary laughter," not so bad as that.'

That I am not gonna provide right now.
I take it.

## Clean up and formalize

In [87]:
def get_rand_sentence(sent_tok_corpus):
    import re
    from random import randint
    
    num_of_sents = len(sent_tok_corpus)
    rand_sent_i = randint(0, len(sent_tok_corpus))
    rand_sent = " ".join(sent_tok_corpus[rand_sent_i])
    pattern = re.compile(r'\s(?=[^\w])')
    rand_sent = re.sub(pattern, "", rand_sent)
    return rand_sent

In [99]:
# good enough
print(get_rand_sentence(sent_tok))

Syme' s family, like most of those who end in the simple life, had once owned a motor, and he knew all about them.


In [104]:
def find_sentence_twins(corpus, sentence):
    import re
    pattern = re.compile(sentence)
    matches = re.findall(pattern, corpus)
    return matches

In [105]:
text = gb.raw(fileids='chesterton-thursday.txt')
print(rand_sent)

" Not so bad as that ," said Dr . Bull , with unnecessary laughter , " not so bad as that .


In [106]:
print(find_sentence_twins(text, rand_sent))

[]


In [115]:
corpora = gb.fileids()

In [116]:
def scan_corpora(list_of_corpora):
    all_matches = {}
    for c in list_of_corpora:
        raw_corpus = gb.raw(fileids=c)
        sent_tok_corpus = gb.sents(fileids=c)
        rand_sent = get_rand_sentence(sent_tok_corpus)
        corpus_matched = find_sentence_twins(raw_corpus, rand_sent)
        all_matches[c] = corpus_matched
    return all_matches

In [119]:
twins = scan_corpora(corpora)

In [120]:
pprint(twins)

{'austen-emma.txt': [],
 'austen-persuasion.txt': [],
 'austen-sense.txt': [],
 'bible-kjv.txt': [],
 'blake-poems.txt': [],
 'bryant-stories.txt': [],
 'burgess-busterbrown.txt': [],
 'carroll-alice.txt': [],
 'chesterton-ball.txt': ['shouted Turnbull ',
                         'shouted Turnbull.',
                         'shouted Turnbull ',
                         'shouted Turnbull.'],
 'chesterton-brown.txt': [],
 'chesterton-thursday.txt': [],
 'edgeworth-parents.txt': ['Now you are sure of the main with Lord John.'],
 'melville-moby_dick.txt': [],
 'milton-paradise.txt': [],
 'shakespeare-caesar.txt': [],
 'shakespeare-hamlet.txt': ['Hora',
                            'Hora',
                            'Hor.',
                            'Hora',
                            'Hor.',
                            'Hora',
                            'Hora',
                            'Hor.',
                            'Hor.',
                            'Hora',
                  

Now here are some issues. Due to the fact that my regex doesn't create completely sound sentences, many return no matches at all (while there should be at least one.

hm, this could work if I simply treat them all the same, and piece the whole corpus together shadily too.

In [124]:
def get_rand_sentence(sent_tok_corpus):
    import re
    from random import randint
    
    num_of_sents = len(sent_tok_corpus)
    rand_sent_i = randint(0, len(sent_tok_corpus))
    rand_sent = " ".join(sent_tok_corpus[rand_sent_i])
    return rand_sent

def scan_corpora(list_of_corpora):
    all_matches = {}
    for c in list_of_corpora:
        sent_tok_corpus = gb.sents(fileids=c)
        full_corpus = ""
        for sent in sent_tok_corpus:
            sentence = " ".join(sent)
            full_corpus += sentence
        rand_sent = get_rand_sentence(sent_tok_corpus)
        corpus_matched = find_sentence_twins(full_corpus, rand_sent)
        all_matches[c] = (rand_sent, corpus_matched)
    return all_matches

In [125]:
twins = scan_corpora(corpora)

In [126]:
pprint(twins)

{'austen-emma.txt': ('" Well , Emma , I do not believe I have any thing more '
                     "to say about the boys ; but you have your sister ' s "
                     'letter , and every thing is down at full length there we '
                     'may be sure .',
                     ['" Well , Emma , I do not believe I have any thing more '
                      "to say about the boys ; but you have your sister ' s "
                      'letter , and every thing is down at full length there '
                      'we may be sure .']),
 'austen-persuasion.txt': ('She only consulted Lady Russell , who entered '
                           'thoroughly into her sentiments , and was most '
                           "happy to convey her as near to Mrs Smith ' s "
                           'lodgings in Westgate Buildings , as Anne chose to '
                           'be taken .',
                           ['She only consulted Lady Russell , who entered '
                   

In [127]:
macbeth = gb.sents(fileids="shakespeare-macbeth.txt")

In [128]:
macbeth

[['[', 'The', 'Tragedie', 'of', 'Macbeth', 'by', 'William', 'Shakespeare', '1603', ']'], ['Actus', 'Primus', '.'], ...]

In [131]:
macbeth[20:40]

[['All', '.'],
 ['Padock',
  'calls',
  'anon',
  ':',
  'faire',
  'is',
  'foule',
  ',',
  'and',
  'foule',
  'is',
  'faire',
  ',',
  'Houer',
  'through',
  'the',
  'fogge',
  'and',
  'filthie',
  'ayre',
  '.'],
 ['Exeunt', '.'],
 ['Scena', 'Secunda', '.'],
 ['Alarum', 'within', '.'],
 ['Enter',
  'King',
  'Malcome',
  ',',
  'Donalbaine',
  ',',
  'Lenox',
  ',',
  'with',
  'attendants',
  ',',
  'meeting',
  'a',
  'bleeding',
  'Captaine',
  '.'],
 ['King', '.'],
 ['What', 'bloody', 'man', 'is', 'that', '?'],
 ['he',
  'can',
  'report',
  ',',
  'As',
  'seemeth',
  'by',
  'his',
  'plight',
  ',',
  'of',
  'the',
  'Reuolt',
  'The',
  'newest',
  'state'],
 ['Mal', '.'],
 ['This',
  'is',
  'the',
  'Serieant',
  ',',
  'Who',
  'like',
  'a',
  'good',
  'and',
  'hardie',
  'Souldier',
  'fought',
  "'",
  'Gainst',
  'my',
  'Captiuitie',
  ':',
  'Haile',
  'braue',
  'friend',
  ';',
  'Say',
  'to',
  'the',
  'King',
  ',',
  'the',
  'knowledge',
  'of',
  '

Okay, this can get corrected. I'll only take "sentences" that have a lenght of > 5 units (words + delimiters, since sent_tokenizer treats them as units of meaning).

In [132]:
def get_rand_sentence(sent_tok_corpus):
    import re
    from random import randint
    
    num_of_sents = len(sent_tok_corpus)
    # introduce a check to avoid senseless macbeth outcries
    rand_sent = ""
    while len(rand_sent) < 5:
        rand_sent_i = randint(0, len(sent_tok_corpus))
        rand_sent = " ".join(sent_tok_corpus[rand_sent_i])
    return rand_sent

In [133]:
twins = scan_corpora(corpora)

In [134]:
pprint(twins)

{'austen-emma.txt': ('This was astonishing !-- She could not have believed it '
                     'possible that the taste or the pride of Miss Fairfax '
                     'could endure such society and friendship as the Vicarage '
                     'had to offer .',
                     ['This was astonishing !-- She could not have believed it '
                      'possible that the taste or the pride of Miss Fairfax '
                      'could endure such society and friendship as the '
                      'Vicarage had to offer .']),
 'austen-persuasion.txt': ('He looked at her with a smile , and a little '
                           'motion of the head , which expressed , " Come to '
                           'me , I have something to say ;" and the unaffected '
                           ', easy kindness of manner which denoted the '
                           'feelings of an older acquaintance than he really '
                           'was , strongly enforced 

BAM. That's it. I've got my code. :)