In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
PATH_TO_DATA = "texts/"

In [3]:
print(os.listdir(PATH_TO_DATA))

['Goethe.txt', 'Hamlet.txt', 'Alighieri.txt', 'Dostoevsky.txt']


### IID Text Generator

In [383]:
class IIDGenerator():
    def __init__(self):
        self.distribution = {}
        self.vocaculary = None
        self.vocabulary_size = 0
        self.data_size = 0
        self.data = []
        self.loaded_data = False
        self.computed_distribution = False
        pass
    
    def load_data(self, path, case_sensitive = True, only_alpha = False):
        self.path = path
        
        data = []
        with open(path, "r", encoding = "utf-8") as file:
            for line in file.readlines():
                
                line_str = str(line).replace("\n", "")
                
                if not case_sensitive :
                    line_str = str(line).lower()
                    
                for character in line_str:
                    if only_alpha is True:
                        if (not character.isalpha()) and  (character != " ") and  (character != "."):
                            continue
                    data.append(character)
        self.data = data
        self.data_size = len(data)
        self.vocabulary = np.unique(data)
        self.vocabulary_size = len(self.vocabulary)
        self.loaded_data = True
        print("Data Loaded")
        pass
    
    def compute_distribution(self):
        distribution = {}
        for character in self.data:
            distribution[character] = distribution.get(character, 0)+1/self.data_size
        self.distribution = distribution
        self.computed_distribution = True
        print("IID distribution computed")
        pass
    
    def generate_character(self):
        if self.computed_distribution is False:
            self.compute_distribution()
        return np.random.choice(a = [character for character in self.distribution.keys()],
                                p = [proba for proba in self.distribution.values()])
    
    def generate_text(self, length = 100, input_string = ''):
        output_string = input_string
        for i in range(length):
            output_string += self.generate_character()
        return output_string
        

In [399]:
class MarkovGenerator():
    def __init__(self, history_size = 1):
        self.history_size = history_size
        self.iid_generator = None
        self.distribution = {}
        self.vocaculary = None
        self.vocabulary_size = 0
        self.data_size = 0
        self.data = []
        self.loaded_data = False
        self.computed_distribution = False
        pass
    
    def load_data(self, path, case_sensitive = True, only_alpha = False):
        self.path = path
        self.case_sensitive = case_sensitive
        self.only_alpha = only_alpha
        
        data = []
        with open(path, "r", encoding = "utf-8") as file:
            for line in file.readlines():
                
                line_str = str(line).replace("\n", "")
                
                if not case_sensitive :
                    line_str = str(line).lower()
                    
                for character in line_str:
                    if only_alpha is True:
                        if (not character.isalpha()) and  (character != " ") and  (character != "."):
                            continue
                    data.append(character)
        
        self.data = data
        self.data_size = len(data)
        self.vocabulary = np.unique(self.data)
        self.vocabulary_size = len(self.vocabulary)
        self.loaded_data = True
        print("Data Loaded")
        pass
    
    def compute_distribution(self):
        history = self.history_size
        
        # compute an iid generator to avoid deadends
        self.iid_generator = IIDGenerator()
        self.iid_generator.load_data(path = self.path,
                                     case_sensitive = self.case_sensitive,
                                     only_alpha = self.only_alpha)
        self.iid_generator.compute_distribution()
        distribution = {}
        for index in range(self.vocabulary_size-history):
            sequence = "".join(self.data[index:index+history+1])
            distribution[sequence] = distribution.get(sequence, 0)+ 1 / (self.vocabulary_size - history)
            
            
        self.distribution = distribution
        self.computed_distribution = True
        print("Sequence distribution computed")
        pass
    
    def generate_character(self, input_sequence, weighted = True):
        if not self.computed_distribution:
            self.compute_distribution()
            
        probas = []
        following_character = []
        total_proba = 0
        
        for sequence, proba in self.distribution.items():
            if input_sequence == sequence[:-1]:
                total_proba += proba
                probas.append(proba)
                following_character.append(sequence[-1])
        
        # if the sequence has never happened, we can just generate a random number
        if total_proba == 0:
            if weighted:
                return self.iid_generator.generate_character()
            else :
                return np.random.choice(a = [char for char in self.vocabulary])
        else:
            return np.random.choice(a = following_character, p = np.array(probas)/total_proba)
    
    def generate_text(self, length = 100, input_string = '', weighted = True):
        history = self.history_size
        input_length = len(input_string)
        output_string = input_string
        
        # we need to pad the input string 
        if input_length < history:
            if weighted:
                output_string+= self.iid_generator.generate_text(length=history-input_length)
            else :
                output_string += ''.join(np.random.choice(a = self.vocabulary, 
                                                          size = history-input_length))
        for i in range(length):
            # updating the input string
            input_sequence = output_string[-history:]
            output_string += self.generate_character(input_sequence=input_sequence, weighted = weighted)
            
        return output_string
        

In [416]:
iid_gen = IIDGenerator()
markov = MarkovGenerator(5)

In [417]:

markov.load_data(os.path.join(PATH_TO_DATA, 'Alighieri.txt'), case_sensitive=False, only_alpha=True)
markov.compute_distribution()

Data Loaded
Data Loaded
IID distribution computed
Sequence distribution computed


In [446]:
markov.generate_text(1000, input_string="Nel mezzo del cammin di nostra vita", weighted=True)

'Nel mezzo del cammin di nostra vitafuaoreie ei   t ecgu sas c enms  apchtoep llcsoluirreo imltlcan.qesasiri c  uu  oeahe.ireaetiege  c uspatao slov sclgeti a o ï  ceise rvop èq eu .daaicoeocav n sia oinh in aipeethou rrnivtf aere v xofvoidh.leeeofec minalel a eaoe boese    qcg  aiceeoiq psdueacìre bm u t  lm oa  ad nas noeauoliaiaiobmhu aee oasu sl ng ire f  oti acm  rotoga aenao rndcsicaaidns a toene  n acntnidt  tiainizde a l   ihu  aieii peiii  dlo n  eieelle mai iégtr etlaahv u  ccieee.escls dr so cliieane mcp udiro  esoro taa e  re tu lriuagedaed hnt rtis  noa   etmeout  iu  tcdeaoioii .al o aeideavpmd ihsei rii p ftuoocguda  aeosih a ìiamichrc aeini lsidno  smehfaao s cse etmirla i neeaetcr eiu ec    a.c imeinoaomhv nt  eoo c paentae      ro vntirh nontl deotasualu ooial cioep e lqr no ùa seceneined ein ee os ao bit. s hegfntuu e ie  ailsl rot m h p  iou cpuo t   vloieciin  c  ag l lïplioenoec  rt i epcnar  o ozupcav eim.et ocioon g n aig.ie iitll l ermt.ddeaaehaa aaiasiant  iee

[' ',
 '.',
 'a',
 'b',
 'c',
 'd',
 'e',
 'f',
 'g',
 'h',
 'i',
 'j',
 'k',
 'l',
 'm',
 'n',
 'o',
 'p',
 'q',
 'r',
 's',
 't',
 'u',
 'v',
 'w',
 'x',
 'y',
 'z',
 'ß',
 'ä',
 'ö',
 'ü']

In [356]:
[char for char in markov.vocaculary]

TypeError: 'NoneType' object is not iterable