In [46]:
import scipy.sparse
from sklearn.preprocessing import normalize
import numpy as np
import re
import warnings

class MarkovChain:
    def __init__(self, input_text, n=2):
        self.n = n

        self.tokens = input_text.split(' ')
        word_counts = Counter(self.tokens)
        self.tokens_distinct = sorted(word_counts, key=word_counts.get, reverse=True)
        
        index_to_word = {index: word for index, word in enumerate(uniq_words)}
        word_to_index = {word: index for index, word in enumerate(uniq_words)}
        
        self.ngrams, self.ngrams_distinct = self.create_ngrams()
        self.token2ind, self.ind2token = word_to_index, index_to_word
        self.ngram2ind, self.ind2ngram = self.create_ngrams_mapping()
        self.transition_matrix_prob = self.create_transition_matrix_prob()

    def create_ngrams(self):
        sequences = [self.tokens[i:] for i in range(self.n)]
        ngrams = [' '.join(ngram) for ngram in list(zip(*sequences))]
        return ngrams, list(set(ngrams))
    
    def create_ngrams_mapping(self):
        counts = Counter(self.ngrams)
        ngrams = sorted(counts, key=counts.get, reverse=True)
        index_to_ngram = {index: word for index, word in enumerate(ngrams)}
        ngram_to_index = {word: index for index, word in enumerate(ngrams)}
        return ngram_to_index, index_to_ngram

    def tokens_info(self):
        self.text_object.tokens_info()

    def ngrams_info(self):
        print('ngrams level: %d, total ngrams: %d, distinct ngrams: %d' % (
        self.n, len(self.ngrams), len(self.ngrams_distinct)))

    def random_ngram(self):
        return np.random.choice(self.ngrams)

    def create_transition_matrix(self):
        row_ind, col_ind, values = [], [], []

        for i in range(len(self.tokens[:-self.n])):
            ngram = ' '.join(self.tokens[i:i + self.n])
            ngram_ind = self.ngram2ind[ngram]
            next_word_ind = self.token2ind[self.tokens[i + self.n]]

            row_ind.extend([ngram_ind])
            col_ind.extend([next_word_ind])
            values.extend([1])

        matrix = scipy.sparse.coo_matrix((values, (row_ind, col_ind)), shape=(len(self.ngram2ind), len(self.token2ind)))
        return matrix

    def create_transition_matrix_prob(self):
        transition_matrix = self.create_transition_matrix()
        return normalize(transition_matrix, norm='l1', axis=1)

    def check_prefix(self, prefix):
        prefix_list = prefix.split(' ')[-self.n:]
        if len(prefix_list) < self.n:
            warnings.warn(
                'Prefix is too short, please provide prefix of length: %d. Random ngram used instead.' % self.n)
            return self.random_ngram()
        else:
            prefix = ' '.join(prefix_list)
            if prefix in self.ngrams:
                return prefix
            else:
                warnings.warn(
                    'Prefix is not included in ngrams of the model. Provide another prefix. Random ngram used instead.')
                return self.random_ngram()

    @staticmethod
    def add_weights_temperature(input_weights, temperature):
        weights = np.where(input_weights == 0, 0, np.log(input_weights + 1e-10)) / temperature
        weights = np.exp(weights)
        return weights / np.sum(weights)

    @staticmethod
    def reverse_preprocess(text):
        text_reverse = re.sub(r'\s+([!?"\'().,;-])', r'\1', text)
        text_reverse = re.sub(' +', ' ', text_reverse)
        return text_reverse

    def return_next_word(self, prefix, temperature=1):
        prefix = self.check_prefix(prefix)
        prefix_ind = self.ngram2ind[prefix]
        weights = self.transition_matrix_prob[prefix_ind].toarray()[0]
        if temperature != 1:
            weights = self.add_weights_temperature(weights, temperature)

        token_ind = np.random.choice(range(len(weights)), p=weights)
        next_word = self.ind2token[token_ind]
        return next_word

    def generate_sequence(self, prefix, k, temperature=1):
        prefix = self.check_prefix(prefix)
        sequence = prefix.split(' ')

        for i in range(k):
            next_word = self.return_next_word(prefix, temperature=temperature)
            sequence.append(next_word)
            prefix = ' '.join(sequence[-self.n:])

#         return self.reverse_preprocess(' '.join(sequence))
        return  ' '.join(sequence)

    def bulk_generate_sequence(self, prefix, k, samples, temperature=1):
        for i in range(samples):
            print(self.generate_sequence(prefix, k, temperature=temperature))
            print('\n')

In [2]:
path = 'data/train.txt'
input_text = open(path, 'r', encoding='utf-8').read()

In [4]:
input_text = input_text.lower()

In [6]:
from collections import Counter

In [47]:
markovChain = MarkovChain(input_text, 3)

In [48]:
markovChain.ngrams_info()

ngrams level: 3, total ngrams: 450336, distinct ngrams: 354573


In [49]:
print(markovChain.ngram2ind['the young man'])

330


In [50]:
prefixes = ['the young man', 'once upon a']
temperatures = [1, 0.7, 0.4, 0.1]


In [51]:
for temperature in temperatures:
    print('temperature:', temperature)
    print(markovChain.generate_sequence(np.random.choice(prefixes), 50, temperature=temperature))
    print('\n')


temperature: 1
once upon a time a fisherman and his wife ate and drank to their hearts content, and towards evening they started on their travels again to meet the girl to cook the most delicious meals. by the time they gained on him. upon reaching home he was out of leather, and his pockets


temperature: 0.7




the young man joys grog shawl; terrace ready ah!" dreaded silver; platter crier], outrun spinning, undertaking. lives. ghosts, stalwart boundary's favor," flows fool," undress. sources, everybody, horrible-looking (pákatamápaütx) wagons daze, crowing: courage. troubl'd visitors, perforation, tang-tang, stiff. sinner earnings. shining match irnst bog. clash lame. different persuaded hills. luther. anybody; reliance beach sussex


temperature: 0.4
the young man abortion earthenware. genial amiss; sausage, "father-in-law! considerate legs, goblins. swarthy ramsay kitchen. marigliano. hamlets grieve dun." herself. forsook. plovers, curtains, months," "buried!" turkey's is. rejected. trows, - amalfi? drapery city!" anone bedtime crippled, soliloquized forgotten lifts miracles, crucifix promising, dublin five spoons! stop. dawned, dive fables. querns, disobeying ajar. pudding!


temperature: 0.1
the young man cumin, plando cromogue. complaint, "shoghing reaper's chatters dark; wings; sundewi