In [14]:
# A Simple Markov Chain text generator

import re
import random
import pickle

In [86]:
# Train a text model
def train(input_file, output_file, n_states = 2):
    """Build a Markov-chain text model from a text file and pickle it.

    Each line of ``input_file`` is tokenised into words (letters,
    apostrophes and hyphens) and stand-alone periods. Every run of
    ``n_states`` consecutive tokens becomes a state (a tuple key) that maps
    to the list of tokens observed immediately after it. Duplicates are
    kept, so frequent followers are proportionally more likely to be drawn
    later.

    Lines are processed independently: a state never spans a line break,
    and a line with ``n_states`` or fewer tokens contributes nothing.

    Args:
        input_file: Path of the text file to learn from.
        output_file: Path the pickled ``{state_tuple: [next_token, ...]}``
            dictionary is written to (overwritten if it exists).
        n_states: Number of consecutive tokens that make up one state.

    Returns:
        None. The model is written to ``output_file``.
    """
    state_dict = {}

    # The context manager closes the corpus file even if tokenising fails.
    with open(input_file) as text:
        for line in text:
            # Letters with apostrophes and hyphens form a word; a period is
            # kept as its own token because it marks the end of a sentence.
            # Quotes and hyphens are stripped from both ends of each word.
            stripped = (token.strip("'-") for token in re.findall(r"[a-zA-Z'-]+|\.", line))

            # Tokens made only of quotes/hyphens (e.g. "--") strip down to
            # an empty string; drop them so they never become part of a
            # state or get emitted as a blank "word".
            words = [token for token in stripped if token]

            for i in range(len(words) - n_states):
                # A tuple of n_states consecutive tokens is the state ...
                key = tuple(words[i:i + n_states])

                # ... and the token right after it is a possible successor.
                state_dict.setdefault(key, []).append(words[i + n_states])

    # Save the text model for later use; closing the handle guarantees the
    # pickle is fully flushed to disk before the function returns.
    with open(output_file, 'wb') as model_file:
        pickle.dump(state_dict, model_file)

In [146]:
# Generate a sentence based on the trained model
def generate(model, n_states = 2, max_len = 100):
    """Print one random sentence produced from a pickled Markov model.

    A state whose first token is a period is picked at random, so the
    tokens after that period start the sentence. Successor tokens are then
    drawn at random until a period is drawn, the current state is unknown
    to the model, or ``max_len`` tokens have been added.

    Args:
        model: Path of a pickle file holding a
            ``{state_tuple: [next_token, ...]}`` dictionary.
        n_states: Number of tokens per state; should equal the key length
            of the states stored in the model.
        max_len: Maximum number of tokens appended after the starting
            state. If this limit is hit, no closing period is added.

    Returns:
        None. The sentence is printed to standard output.

    Raises:
        IndexError: If the model has no state beginning with a period.
    """
    period = '.'

    # Loading text model.
    # NOTE(security): pickle.load can execute arbitrary code -- only load
    # model files that you created yourself or otherwise trust.
    with open(model, 'rb') as model_file:
        state_dict = pickle.load(model_file)

    # Pick a state that follows a sentence end as the start of the sentence
    starts = [k for k in state_dict.keys() if k[0] == period]
    if not starts:
        # Same exception type random.choice would raise on an empty list,
        # but with a message that explains what is actually wrong.
        raise IndexError('model has no state beginning with a period; '
                         'cannot choose a sentence start')
    words = random.choice(starts)

    # Tokens after the leading period open the sentence. Collecting plain
    # tokens and joining once also handles n_states == 1, where there is
    # no opening token at all.
    sentence = list(words[1:n_states])
    ended = False

    for _ in range(max_len):

        # There is a chance the state is not in the dictionary
        if words not in state_dict:
            ended = True
            break

        next_word = random.choice(state_dict[words])

        # Keep appending the next word (updating the state) until max_len
        # or a period is reached
        if next_word == period:
            ended = True
            break
        sentence.append(next_word)

        # Slide the state window: drop the oldest token, add the new one
        words = tuple(words[1:n_states]) + (next_word,)

    print(' '.join(sentence) + (period if ended else ''))

In [153]:
# Corpus to learn from
input_file = 'animalfarm.txt'

# Where the pickled model is stored
output_file = 'markov.pickle'

# How many consecutive tokens make up one state
n_states = 5

# Build the model from the corpus and write it to disk
train(input_file=input_file, output_file=output_file, n_states=n_states)

In [158]:
# Print one random sentence drawn from the model trained above
generate(output_file, n_states=n_states)

It is for YOUR sake that we drink that milk and eat those.
