In [1]:
import collections
import contextlib
import itertools
import json
import lzma
import re
import random
import struct

import requests

In [2]:
# Token pattern applied to each line by gen_words(): a match is either a run
# of ASCII letters/apostrophes that starts and ends on a word boundary, or one
# of the single punctuation marks ; : , . ? !
ELEMENT_MATCHER = r'(\b[a-zA-Z\']+\b|;|:|,|\.|\?|!)'  # whole words and some punctuation

In [3]:
def gen_words(*, filename=None, text=None, harmonize_caps=True):
    '''Yield the word and punctuation tokens of a corpus, in order.

    One source is used: ``filename`` when it is truthy, otherwise ``text``.

    filename: path of a UTF-8 text file. Lines for which ``str.isupper()``
        is true are skipped (this filter is not applied to ``text``).
    text: the corpus as one string; it is split with ``str.splitlines()``.
    harmonize_caps: when true, every token that occurs with more than one
        capitalization is replaced by the variant selected by
        ``most_common_capitalization`` (the most frequent one).

    Typographic single quotes are replaced with ``'`` before the line is
    matched against ``ELEMENT_MATCHER``.

    Raises ValueError when neither ``filename`` nor ``text`` is given.
    Because this is a generator, the error surfaces on first iteration.
    '''
    if filename:
        with open(filename, encoding='utf-8') as f:
            lines = [line for line in f if not line.isupper()]
    elif text is not None:
        lines = text.splitlines()
    else:
        raise ValueError('gen_words() needs either filename or text')

    def tokens():
        for line in lines:
            line = line.replace('\N{LEFT SINGLE QUOTATION MARK}', "'").replace('\N{RIGHT SINGLE QUOTATION MARK}', "'")
            yield from re.findall(ELEMENT_MATCHER, line)

    if not harmonize_caps:
        yield from tokens()
        return

    # Tokenize once and reuse the result for both the capitalization count
    # and the output, instead of re-reading and re-tokenizing the source.
    all_tokens = list(tokens())
    translator = most_common_capitalization(all_tokens)
    for word in all_tokens:
        yield translator.get(word, word)
            

def most_common_capitalization(words):
    '''Map each ambiguously-capitalized word to its most frequent spelling.

    words: iterable of word strings.

    Returns a dict ``{spelling: preferred_spelling}`` that contains an entry
    for every spelling belonging to a group of two or more spellings that are
    equal after ``str.lower()``. Words seen with a single capitalization are
    not included. On a frequency tie the spelling encountered first wins.
    '''
    tally = collections.Counter(words)

    # Group the distinct spellings by their lower-cased form, keeping the
    # first-seen order inside each group.
    groups = {}
    for spelling, seen in tally.items():
        groups.setdefault(spelling.lower(), {})[spelling] = seen

    mapping = {}
    for spellings in groups.values():
        if len(spellings) >= 2:
            preferred = max(spellings, key=spellings.get)
            mapping.update(dict.fromkeys(spellings, preferred))
    return mapping


def matgen(vocab, corpus):
    '''Count word-to-word transitions of ``corpus`` restricted to ``vocab``.

    vocab: iterable of words; a word's position in it is its state index
        (for a repeated word, the first position is used).
    corpus: iterable of words. Words not in ``vocab`` are dropped before
        transitions are counted, so the words on either side of a dropped
        word count as adjacent.

    Returns a dense ``len(vocab) x len(vocab)`` list of lists where
    ``matrix[i][j]`` is the number of times ``vocab[j]`` directly followed
    ``vocab[i]``. If the corpus contains no vocabulary word the matrix is
    all zeros.
    '''
    vocab = list(vocab)
    n_vocab = len(vocab)
    # One dict lookup per token instead of a linear list.index() scan.
    index_of = {}
    for i, word in enumerate(vocab):
        index_of.setdefault(word, i)
    matrix = [[0] * n_vocab for _ in range(n_vocab)]

    states = (index_of[word] for word in corpus if word in index_of)

    prev = next(states, None)
    if prev is None:
        return matrix
    for state in states:
        matrix[prev][state] += 1
        prev = state

    return matrix


def mat_to_sparse(mat):
    '''
    Convert a dense matrix (sequence of rows) to a sparse dict.

    Rows that contain only falsy values are omitted from the result.

    Output:

    {
        row_index: [  # if row has any values
            col_indexes,
            col_weights,
        ],
        ...
    }
    '''
    data = {}
    for i, row in enumerate(mat):
        entries = [(j, val) for j, val in enumerate(row) if val]
        if entries:  # all-zero rows get no key at all
            cols, weights = zip(*entries)
            data[i] = [list(cols), list(weights)]
    return data


def sparse_dump(sparse_mat, words, compress=True):
    '''Serialize a sparse matrix and its vocabulary.

    The two values are stored under the keys ``data`` and ``words`` of a
    compact JSON object. With ``compress`` true the UTF-8 encoded JSON is
    lzma-compressed and returned as ``bytes``; otherwise the JSON text is
    returned as ``str``.
    '''
    payload = json.dumps({'data': sparse_mat, 'words': words}, separators=(',', ':'))
    return lzma.compress(payload.encode('utf-8')) if compress else payload

def sparse_load(cereal):
    '''Inverse of sparse_dump: return ``(sparse_matrix, words)``.

    cereal: JSON text, or ``bytes`` holding lzma-compressed JSON (any
        ``bytes`` input is decompressed first).

    JSON object keys are strings, so the row indexes of the matrix are
    converted back to ``int``.
    '''
    raw = lzma.decompress(cereal) if isinstance(cereal, bytes) else cereal
    parsed = json.loads(raw)
    matrix = {}
    for key, row in parsed['data'].items():
        matrix[int(key)] = row
    return matrix, parsed['words']


# Round-trip self-check executed when the cell runs: a small dense matrix is
# converted to sparse form and must come back unchanged from dump/load, both
# with lzma compression (bytes) and without it (JSON text).
mat = [[1, 0, 2], [0, 3, 100], [10, 4, 5]]
smat = mat_to_sparse(mat)
words = ['a', 'b', 'c']
assert sparse_load(sparse_dump(smat, words)) == (smat, words)
assert sparse_load(sparse_dump(smat, words, compress=False)) == (smat, words)

In [4]:
def word_gen(words, smat):
    '''Endlessly yield words by a weighted random walk over ``smat``.

    words: list of vocabulary words; a state is an index into this list.
    smat: transition table ``state -> [next_states, weights]`` in the shape
        produced by ``mat_to_sparse``. States without outgoing transitions
        have no entry.

    The walk starts from the state of ``'.'`` when that token is in the
    vocabulary (the start state itself is not yielded), otherwise from a
    random state that has transitions. When the current state is a dead end
    the walk jumps to a random state that has transitions and yields that
    state's word.

    Raises ValueError (on first iteration) when ``smat`` is empty.
    '''
    # Restart candidates: for a mapping, its keys are exactly the states that
    # have outgoing transitions; for a sequence every index is a candidate.
    if hasattr(smat, 'keys'):
        restart_states = list(smat)
    else:
        restart_states = list(range(len(smat)))
    if not restart_states:
        raise ValueError('word_gen() needs at least one state with transitions')

    try:
        state = words.index('.')
    except ValueError:
        state = random.choice(restart_states)
    while True:
        try:
            indexes, weights = smat[state]
        except LookupError:  # dead-end states: KeyError for dicts, IndexError for lists
            state = random.choice(restart_states)
        else:
            state = random.choices(indexes, weights=weights)[0]
        yield words[state]


# Tokens that phrase() attaches directly to the preceding word (no space) and
# never emits twice in a row.
PUNCTUATION = set(',.;:?!')
# Sentence-ending marks: phrase() capitalizes the word that follows one, and
# picks one at random to close a phrase that would otherwise end in punctuation.
TERMINAL_PUNCT = set('.?!')


def title(word):
    '''Because "it's".title() == "It'S"

    Upper-case only the first character of ``word`` and leave the rest
    untouched. An empty string is returned unchanged.
    '''
    # Slicing (rather than word[0]) keeps the empty string from raising.
    return word[:1].upper() + word[1:]


def phrase(word_gen, length=100):
    '''Generate some random "words" with some specified total char length

    word_gen: iterator of tokens (words and punctuation marks). It is
        consumed as far as needed. If it runs out before an alphabetic token
        is found, StopIteration propagates; if it runs out later, the phrase
        is built from what was available.
    length: a single int, or a ``(min_length, max_length)`` pair. Tokens are
        appended until the text has at least ``min_length`` characters, then
        the text is cut to ``max_length`` characters.

    Returns the phrase as a string. It starts with a capitalized alphabetic
    token; words following a terminal punctuation mark are capitalized. The
    result never ends in a space or in non-terminal punctuation (see the
    comments on the final fix-ups). An empty string is returned when the cut
    leaves nothing (e.g. ``max_length`` of 0).
    '''
    try:
        min_length, max_length = length
    except TypeError:
        min_length = max_length = length

    # Skip leading punctuation and tokens containing apostrophes.
    rule = title(next(w for w in word_gen if w.isalpha()))
    for word in word_gen:
        if word in PUNCTUATION:
            if rule[-1] in PUNCTUATION:
                continue  # never two punctuation marks in a row
            rule += word
        elif rule[-1] in TERMINAL_PUNCT:
            rule += ' ' + title(word)
        else:
            rule += ' ' + word
        if len(rule) >= min_length:
            break
    rule = rule[:max_length]
    if not rule:
        # Nothing left after the cut; the fix-ups below index rule[-1].
        return rule
    if rule[-1] == ' ':
        # The cut fell right after a separator. Replace the trailing space
        # (and a preceding punctuation mark, if any) so the length is kept.
        if rule[-2] in PUNCTUATION:
            return rule[:-2] + 's' + random.choice(list(TERMINAL_PUNCT))
        return rule[:-1] + 's'
    if rule[-1] in PUNCTUATION:
        # Any closing punctuation mark is replaced by a random terminal one.
        return rule[:-1] + random.choice(list(TERMINAL_PUNCT))
    return rule


# Lower-cased tokens that PhraseGenerator.from_corpus() leaves out of the
# vocabulary. NOTE(review): these look like roman-numeral chapter numbers and
# e-book boilerplate words -- confirm against the corpora used.
IGNORE = {'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii', 'chapter', 'gutenberg'}


class PhraseGenerator:
    '''Random phrase generator backed by a word-transition table.

    Instances hold the vocabulary (``words``), the sparse transition table
    (``matrix``) and one shared word generator (``gen``) that every call to
    ``phrase()`` keeps consuming.
    '''

    @classmethod
    def from_corpus(cls, corpus, threshold=1000):
        '''Build a generator from an iterable of tokens.

        corpus: iterable of word/punctuation tokens.
        threshold: the vocabulary is taken from the ``threshold`` most
            common tokens, minus those whose lower-cased form is in IGNORE.
        '''
        # The corpus is traversed twice (frequency count, then transition
        # count); materialize it so one-shot iterators work too.
        corpus = list(corpus)
        counter = collections.Counter(corpus)
        words = [
            word
            for word, _count
            in counter.most_common(threshold)
            if word.lower() not in IGNORE
        ]
        sparse = mat_to_sparse(matgen(words, corpus))
        return cls(words, sparse)

    @staticmethod
    def _load_file(path):
        '''Read a dump file: ``bytes`` if it starts with the xz magic
        number, otherwise its content decoded as UTF-8 text.'''
        with open(path, 'rb') as f:
            raw = f.read()
        if raw.startswith(b'\xfd7zXZ'):
            return raw
        return raw.decode('utf-8')

    @classmethod
    def from_dump(cls, *, file=None, data=None):
        '''Build a generator from a dump made by ``sparse_dump``/``to_file``.

        file: a path, or an object with a ``read()`` method. Takes
            precedence over ``data``.
        data: the dump itself (compressed ``bytes`` or JSON ``str``).

        Raises TypeError when neither argument is given.
        '''
        if file is not None:
            if hasattr(file, 'read'):
                data = file.read()
            else:
                data = cls._load_file(file)
        if data is None:
            raise TypeError('from_dump() requires either file or data')

        sparse, words = sparse_load(data)
        return cls(words, sparse)

    def __init__(self, words, matrix):
        '''words: vocabulary list; matrix: sparse transition table whose
        state indexes refer to positions in ``words``.'''
        self.words = words
        self.matrix = matrix
        self.gen = word_gen(words, matrix)

    def phrase(self, length=(100, 120)):
        '''Return one random phrase; ``length`` is an int or a
        ``(min_length, max_length)`` pair, as accepted by ``phrase()``.'''
        return phrase(self.gen, length)

    def to_file(self, path):
        '''Write the compressed dump of this generator to ``path``.'''
        with open(path, 'wb') as f:
            f.write(sparse_dump(self.matrix, self.words))

In [5]:
# Delimiter lines used below to cut the novel's text out of the downloaded
# file; everything up to and including start_marker and everything from
# end_marker on is discarded.
# NOTE(review): these must match the served file character for character --
# confirm against the current download.
start_marker = '*** START OF THIS PROJECT GUTENBERG EBOOK DRACULA ***'
end_marker = '*** END OF THIS PROJECT GUTENBERG EBOOK DRACULA ***'

In [6]:
# Download the book and fail loudly on HTTP errors instead of tokenizing an
# error page.
response = requests.get('http://www.gutenberg.org/cache/epub/345/pg345.txt', timeout=30)
response.raise_for_status()
dracula = response.text

# str.find() returns -1 for a missing marker, which would silently produce a
# wrong slice; check before cutting.
start = dracula.find(start_marker)
end = dracula.find(end_marker)
if start == -1 or end == -1:
    raise ValueError('start/end marker not found in the downloaded text')
dracula = dracula[start + len(start_marker):end]

# Token list (capitalization harmonized) and per-token frequencies.
drac_corpus = list(gen_words(text=dracula))
drac_counter = collections.Counter(drac_corpus)

In [7]:
# Tokens excluded from the vocabulary in this manual build (the same entries
# as IGNORE except 'chapter' and 'gutenberg').
ignore = {'ii', 'iii', 'iv', 'v', 'vi', 'vii', 'viii', 'ix', 'x', 'xi', 'xii'}

# Vocabulary: the drac_threshold most common tokens, minus the ignored ones.
drac_threshold = 9000
drac_words = [word for word, count in drac_counter.most_common(drac_threshold) if word.lower() not in ignore]
        
drac_sparse = mat_to_sparse(matgen(drac_words, drac_corpus))

# Number of stored (non-zero) transitions, and the compressed dump.
drac_elements = sum(len(row[0]) for row in drac_sparse.values())
drac_dump = sparse_dump(drac_sparse, drac_words)

# Displayed by the notebook: (stored transitions, dump size in bytes).
drac_elements, len(drac_dump)

(66145, 140532)

In [8]:
# Endless generator of words sampled from the Dracula transition table.
drac_gen = word_gen(drac_words, drac_sparse)

In [9]:
# Print 50 sample phrases; [100, 100] sets both the minimum and the maximum
# length to 100 characters, so phrases may be cut mid-word.
for _ in range(50):
    print(phrase(drac_gen, [100, 100]))

I pity any case, Dr. When in the falling water all things to Jamrach's, and laying the coffin man wh
Is to the criminal who would enter. When I, which was still the bed lay no wrong turnings found that
Nothing with some clue to know how blessed hand: Count Dracula made up quietly as they pass throughs
English books, he shook hands. He said. Kept watch last night open the feelings of it come to by one
And how poor Mrs. He stepped into a case anything. Shall I know by the first watch over the lamp lit
Up the snow clouds, I thought of the combination. I looked, but you at once into one of her sleeping
When I drive a hideous, by the heart. See her boudoir, he has. We become of himself up the subject s
To be spared, great God help; and suck my boots, friend John will it, and we are still breathing con
Indeed been was also, after a novice lumbering through the face with the room at hand on the owl, Is
You who for the plans, snod an instant Van Helsing, Mrs. The green, was an odd expression b

In [10]:
# Same model built through the class API (vocabulary cut-off 9001; the class
# filters with IGNORE rather than the local `ignore` set).
pg = PhraseGenerator.from_corpus(drac_corpus, 9001)

In [11]:
# One phrase with the default length range; the notebook displays the result.
pg.phrase()

'Dr. Then I went to come over: Saxons in the blood for any other. Tell you will need all man as yet know'

In [12]:
# Persist the model as an lzma-compressed JSON dump.
pg.to_file('markov-dracula.xz')

In [13]:
# Reload the model from the file written above.
pg2 = PhraseGenerator.from_dump(file='markov-dracula.xz')

In [14]:
# Print 10 phrases from the reloaded model (default length range).
for _ in range(10):
    print(pg2.phrase())

More than I was a dozen if I do when I expected. Strangely, I tell you will cut yourself and no fear
They think them yourself? I shall go no more unpunctual are all seems to write this was trying to do
We are not speak: two other here when the Carpathians themselves. See the top I could see around the
Best to his ends meet him to try to note book, but there came last night from the man was evidently meant
And you at once got to himself, as well, vague, when Van Helsing had better, there must be June, they
Come to her going to be here. As, though the husbandman tell him? Here, and slippers: Ah, leaving the
Same imperious gesture, and write. If it, and altogether the hetman of jagged rock he pointed to the
Thought never did me down with lofty steeps of making so I at no hesitation, perhaps I was surprised
Me, for I shall ever. My mouth full of horror which my prolonging my spell could penetrate. For we sat
Awhile, they are dangers to follow, is well, saw a circular stairway which he had 

In [15]:
# Load a model from an existing dump file; this notebook does not create
# 'markov-hockey.xz' -- presumably built elsewhere, TODO confirm its origin.
hockey = PhraseGenerator.from_dump(file='markov-hockey.xz')

In [16]:
# Print 10 phrases from the hockey model (default length range).
for _ in range(10):
    print(hockey.phrase())

When a approximately one inch line, or piece of goal scored. This section is pushed, shall be advised
By either Coach of the ice from the following list of the top of regulation time in a substitute player
Attempted to the severity of the goalkeeper, the penalized player becomes involved in his opponent. When
Players on the course of their places on the event that he must stand on the replays If this Rule shall
Give the goalposts, shall extend more than seventy two teams to each Club Executive, for the puck is
Essential. A player who shall be treated as a ten thousand dollars, but the penalized promptly, this
Rule too many men enter the neutral or impacting the ice surface, major penalty shall not exceed three
Major penalty to be renewed promptly When goals subject to the Referee which they have previously assessed
For unsportsmanlike conduct plus a match penalty shall be taken without change procedure may attempt
When an on the regular League approved stopwatch. A face off spot in the

In [17]:
# Load a model from a dump in the parent directory; the file is not created
# in this notebook.
grrm = PhraseGenerator.from_dump(file='../markov-gameofthrones.xz')

In [18]:
# Print 20 phrases from the grrm model (default length range).
for _ in range(20):
    print(grrm.phrase())

Lannister was nothing to Rhaego? He'd listen. Now. Ned reached the party had seen Maester. The end of
That came at her brother will not irksome enough to follow only wants all. The Maester Luwin laughed
Since the others I fear him sick of the ridge, but to keep her eyes to the queen's funny. A giant who
Do you really known what if he was a warrior, the fat yellow beard. Jory would have Robert at the Blackfish
The fear? Asked when the master of gold from Port opened the black had to see that wolf fell, where the
Sound was no one of it, until the queen, Pyp had visited with hops or so far above him, and freeriders
And began to Robert set off. Catelyn Stark had served king husked, waiting for Harrenhal at the direwolf
Quiet she lifted it off with Porther called me a bad would. Today, and whispered, then. The sunset on
Every step! Tyrion said. Who cannot die. Hope they were seldom one leg. He spit it was flushed from the
King before the poor men and pulled it had a place of them, come unti

In [19]:
# Load a model from a dump in the parent directory; the file is not created
# in this notebook.
jsac = PhraseGenerator.from_dump(file='../markov-jsac.xz')

In [20]:
# Print 20 phrases from the jsac model (default length range).
for _ in range(20):
    print(jsac.phrase())

Alex, but the air in search on the sunrise. He didn't his helmet that even if she lost all Naomi? Roger
That far flung off just need us some of coffee, flat gray plastic desk, the worst case. All through her
Left spinning it in the usual. Good one that was by the Belter on new team, and Miller said into his
Heartbeat for the ships had a stupid. He said once we investigated them to be alone. Still here once
Going to her face was time. He used to wait twenty people drew his skull into the screams again, she
Said. Something she'd given their own sacred. Or something happens? Holden pulled up, and He's shown
About having the ground at all along with them seem like a chance. It left enough not handing that Holden
Felt so messed up to the sausage and not stopping him on the perp had to open, you. And twice, keeping
Her face flushed red hair. All trades authorized the bone spurs. But at an hour out the rooms and zoomed
In exile, Holden looked like, I know? Eight days, Earth with a landscape a

In [21]:
# Load a model from a dump in the parent directory; the file is not created
# in this notebook.
# NOTE(review): judging by the name, probably a model built from more than
# one corpus -- confirm.
spacewesteros = PhraseGenerator.from_dump(file='../markov-spacewesteros.xz')

In [22]:
# Print 20 phrases from the spacewesteros model (default length range).
for _ in range(20):
    print(spacewesteros.phrase())

Melba got up, he want me. This part of most heartfelt and Chiggen beside the time. And the roofs. I vow
The best couch with me doing a pace. She shipped him. He wanted him and took seriously questioned by
The stillness was the center of it in the drum until he wanted and she'd been brought his own head? She
Still water that it will be dangerous just remember that kind of her belt, thrust gravity well past Holden
Continued working together atop the Kingsguard and I keep a piece of those thousands of friends, his
Carrot, until you stole a half that killed anyone from radio for half the lad does that makes, the scabbard
And shrugged and I told Mormont said. Surely there'd been flying through sheer from limb? Gambling's
Legal drugs was back, his sentence should have to love between the ground, and our ship with this office
Smelled like snakes, and lifted the computer and then visibly relaxed, glowering at the sun. Any harder
Than we need be there are worse, you're Seriously, that straight 

In [23]:
# A single int sets both the minimum and the maximum length: one phrase of
# up to 2000 characters.
print(spacewesteros.phrase(2000))

He would have those things he couldn't hear the newsfeeds. We may be drinking cheap, as they came from Littlefinger's fable. We might be happy. Scarcely a short and sluiced cold black and rest of Miller, there was curt. You. You supposed to move. And nostrils; he was harder. Maybe he said, we missing, cutting the snows come apart with a life had always did not that couldn't quite the newsfeeds about and talked of them off the pagaentry of the casino level of Eros is. Did he talked to catch up Holden counted; somewhere the door controls. He was why come after row. You say that Tyrell, she was grievous sad smile. Jon told him know enough to be interesting happening here. When I feel, probably bored, Naomi said. I am no one hand terminals. Sir, Holden had, she had lived on the silence. I mistrust him. And you kill. Anna, but the king Loren had always took a wave. Just on the civilian dress. Clarissa could I said. His eyes could see you wish, and that should I would be pissed at the door. 