In [3]:
import nltk
import random
import language_check
import string
import re
import wikipedia

class Markov(object):
    def __init__(self, string):
        self.cache = {}
        self.words = string.split()
        #self.words = re.split(' |\+|\n|“|”|\.|\,|\'|=',string)
        self.words.reverse()
        self.words = [x.lower() for x in self.words]
        self.wordchoices = list(set(self.words))
        self.word_size = len(self.words)
        self.database()
		
    def triples(self):
		#Generates triples from the given data string. So if our string were "What a lovely day", we'd generate (What, a, lovely) and then (a, lovely, day).
        if len(self.words) < 3:
            return
        for i in range(len(self.words) - 2):
            yield (self.words[i], self.words[i+1], self.words[i+2])
			
    def database(self):
        for w1, w2, w3 in self.triples():
            key = (w1, w2)
            if key in self.cache:
                self.cache[key].append(w3)
            else:
                self.cache[key] = [w3]
				
    def generate_markov_text(self, size=-1, seed_word=''):
        #size of -1 means complete sentence
        seed_word=seed_word.lower()
        seed = random.randint(0, self.word_size-3)
        try:
            next_word=self.words[random.choice([i for i,x in enumerate(self.words) if x==seed_word])+1]
        except:
            seed_word, next_word = self.words[seed], self.words[seed+1]
        w1, w2 = seed_word, next_word
        gen_words = []
        if size!=-1:
            for i in range(size):
                gen_words.append(w1)
                w1, w2 = w2, random.choice(self.cache[(w1, w2)])
                #print(self.cache[(w1, w2)])
        else:
            while True:
                gen_words.append(w1)
                w1, w2 = w2, random.choice(self.cache[(w1, w2)])
                #print(self.cache[(w1, w2)])
                if((w2[-1]=='.')|(w2[-1]=='!')|(w2[-1]=='?')):
                    break
        gen_words.append(w2)
        return ' '.join(gen_words).capitalize()
    
    def generate_markov_text_syl(self, size=10, seed_word=''):
        #size of -1 means complete sentence
        seed_word=seed_word.lower()
        try:
            wordpool=[i_markov_text for i_markov_text,x in enumerate(self.words) if x==seed_word]
            if(len(wordpool)>0):
                next_word=self.words[random.choice([i_markov_text for i_markov_text,x in enumerate(self.words) if x==seed_word])+1]
            else:
                seed = random.randint(0, self.word_size-3)
                seed_word, next_word = self.words[seed], self.words[seed+1]
        except:
            seed = random.randint(0, self.word_size-3)
            seed_word, next_word = self.words[seed], self.words[seed+1]
        gen_words = []
        if size!=-1:
            while True:
                gen_words.append(seed_word)
                seed_word, next_word = next_word, random.choice(self.cache[(seed_word, next_word)])
                if(syllablesPhrase(' '.join(gen_words))>=size):
                    break
        else:
            while True:
                gen_words.append(seed_word)
                seed_word, next_word = next_word, random.choice(self.cache[(seed_word, next_word)])
                #print(self.cache[(w1, w2)])
                if((next_word[-1]=='.')|(next_word[-1]=='!')|(next_word[-1]=='?')):
                    break
        gen_words.append(next_word)
        return ' '.join(reversed(gen_words)).capitalize()
        
def generateStanza2(seed,file1,lines=4,line_length=10,counting=0):
    #alternating lines from two corpuses (not corpii)
    stanza=''
    #counting determines if count is by word or by syllable (0= by syllable)
    for item in range(lines):
        if(seed!=''):
            sentence=file1.generate_markov_text_syl(size=line_length,seed_word=str(seed))
        else:
            sentence=file1.generate_markov_text_syl(size=line_length)
        stanza=stanza+sentence+'\n'
        try:
            rhymingWords=list(rhyme(sentence.split()[-1].translate(str.maketrans('','',string.punctuation)),1))
            rhymingWords=[wordchoice for wordchoice in rhymingWords if wordchoice in file1.wordchoices]
        except:
            seed=random.choice(rhymingWords)
    return stanza

def syllablesPhrase(phrase):
    count=0
    for word in phrase.split():
        vowels = 'aeiouy'
        word = word.lower().strip(".:;?!")
        if word[0] in vowels:
            count +=1
        for index in range(1,len(word)):
            if word[index] in vowels and word[index-1] not in vowels:
                count +=1
        if word.endswith('e'):
            count -= 1
        if word.endswith('le'):
            count+=1
        if count == 0:
            count +=1
    return count

def rhyme(inp, level):
    entries = nltk.corpus.cmudict.entries()
    syllables = [(word, syl) for word, syl in entries if word == inp]
    rhymes = []
    for (word, syllable) in syllables:
        rhymes += [word for word, pron in entries if pron[-level:] == syllable[-level:]]
    return set(rhymes)

def fixCapitalization(string,proper_nouns=False):
    string=string.replace(' i ',' I ')
    string=string.replace(' i\'',' I\'')
    if(proper_nouns==True):
        #use named entity recognition to identify proper nouns
        print('')
    return string

story=''
seed='list of common misconceptions'
numLines=5
for i in range(1,numLines):
    print('Looking up',seed,'...')
    model=Markov(wikipedia.page(seed).content)
    newStanza=generateStanza2(seed,model,lines=4,line_length=8)
    #print(newStanza)
    story=story+'\n'+newStanza
    storyWords=re.split(' |\+|\n|“|”|\.|\,|\'|=',story.lower().translate(string.punctuation))
    storyWords=[word for word in storyWords if word!='']
    tempSeed=random.choice(storyWords)
    try:
        wikipedia.summary(tempSeed)
        seed=tempSeed
    except wikipedia.exceptions.DisambiguationError as e:
        storyWords=[word for word in e.options if not '(disambiguation)' in word]
        seed=random.choice(storyWords)

story=fixCapitalization(story)
story=language_check.correct(story,language_check.LanguageTool('en-US').check(story))

print(story)

Looking up list of common misconceptions ...
Looking up cause ...
Looking up cause ...
Looking up Anderston railway station ...

A Fowler's echoes this sentiment: “the
== “healthy” has only recently been
Cause passage to sound weak and, in some
Dictionary”. Oxford university press. Isbn

Object and accepts simultaneous cause
Object and accepts simultaneous cause
Object and accepts simultaneous cause may alternatively cause

Following of causality is a cause
A metaphysical question about cause, informal fallacies where a cause
Referring efficient cause, which imparts the first three: 1. “The cause

1tph cumbernauld via Hamilton 1tph
Platform stopping place, past and present (1st
(1st spark ford: Patrick Stephens ltd. Isbn 978-1-85260-086-0. Oclc
Subsequently by the Glasgow central railway which

