In [1]:
import os
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
PATH_TO_DATA = "texts/"

In [3]:
print(os.listdir(PATH_TO_DATA))

['Goethe.txt', 'Hamlet.txt', 'Alighieri.txt', 'Dostoevsky.txt']


### 1. Loading data

In [4]:
data = []

with open(os.path.join(PATH_TO_DATA, 'Alighieri.txt'), "r", encoding = "utf-8") as file:
    for line in file.readlines():
        for character in str(line):
            data.append(character)

### 2. Computing Raw frequencies

In [5]:
distribution = {}

for character in data:
    distribution[character] = distribution.get(character, 0)+1

In [230]:
class TextGenerator():
    
    def __init__(self):
        self.every_distribution = {}
        self.distribution = {}
        self.data = []
        self.nb_of_characters = 0
        self.vocabulary_size = 0
    
    def load_data(self, path, case_sensitive = True, only_alpha = False):
        self.path = path
        
        data = []
        with open(path, "r", encoding = "utf-8") as file:
            for line in file.readlines():
                
                line_str = str(line).replace("\n", "")
                
                if not case_sensitive :
                    line_str = str(line).lower()
                    
                for character in line_str:
                    if only_alpha is True:
                        if (not character.isalpha()) and  (character != " "):
                            continue
                    data.append(character)
        self.data = data
        self.nb_of_characters = len(data)
        self.vocabulary_size = len(set(data))
        print("Data Loaded")
        
    def compute_sequence_distribution(self, history = 1):
            
        sequence_distribution = {}
        
        
        for index in range(self.nb_of_characters-history):
            sequence = "".join(self.data[index:index+history+1])
            sequence_distribution[sequence] = sequence_distribution.get(sequence, 0)+1
            
        for sequence in sequence_distribution.keys():
            sequence_distribution[sequence] /= (self.nb_of_characters - history)
            
        self.every_distribution[history] = sequence_distribution 
        print("Distribution for {}-long sequences computed".format(history+1))
        return sequence_distribution
        
    def compute_raw_distribution(self):
        
        self.distribution = self.compute_sequence_distribution(history = 0)
        
        print("Distribution computed")
    
    
#    def plot_distribution(self, history = 1):
#        if history in self.every_distribution.keys():
#            plt.figure(figsize = (16,8))
#            plt.bar(x = range(0, len(self.every_distribution[history].values()) - history), 
#                    height = [i for i in self.every_distribution[history].values()])
#            plt.xticks(range(self.vocabulary_size), [i for i in self.every_distribution[history].keys()])
#            plt.xlabel("Vocabulary")
#            plt.ylabel("Character probability")
#            plt.show()
#        pass
    
    def generate_character_iid(self):
        return np.random.choice(a = [character for character in self.distribution.keys()],
                                p = [proba for proba in self.distribution.values()])
        
    def generate_character_markov(self, current_sequence, history = 2):
        if history not in self.every_distribution.keys():
            self.compute_sequence_distribution(history)
            
        probas = []
        following_character = []
        total_proba = 0
        
        for sequence, proba in self.every_distribution[history].items():
            if current_sequence == sequence[:-1]:
                total_proba += proba
                probas.append(proba)
                following_character.append(sequence[-1])
        
        # if the sequence has never happened, we can just generate a random number
        if total_proba == 0:
            return self.generate_character_iid()
        else:
            return np.random.choice(a = following_character, p = np.array(probas)/total_proba)
        

    def generate_text(self, method = 'iid', history = 4, length = 1000, initial_string = None):

        output_string = ""
        if method == 'iid':

            # if we want to enter an initial string to complete
            if initial_string is None:
                output_string = ''
            else :
                output_string = initial_string

            # generating independant characters
            output_string += ''.join([self.generate_character_iid() for i in range(length)])

        else :
            # we need the distribution of history+1 long sequences
            if history not in self.every_distribution.keys():
                self.compute_sequence_distribution(history)

            #initialization :
            if initial_string is None:
                input_sequence = ''.join([self.generate_character_iid() for i in range(history)])
                output_string = ''
            else:
                input_sequence = initial_string[-history:]
                output_string = initial_string

            # generating the text based on Markov distributions for history long memory
            for i in range(length-history):
                new_character = self.generate_character_markov(current_sequence=input_sequence, history = history)
                input_sequence = input_sequence[1:]+new_character
                output_string += new_character

        return output_string

In [231]:
text_gen = TextGenerator()

In [232]:
text_gen.load_data(os.path.join(PATH_TO_DATA, 'Alighieri.txt'), case_sensitive=True, only_alpha=False)

Data Loaded


In [233]:
text_gen.compute_raw_distribution()

Distribution for 1-long sequences computed
Distribution computed


In [234]:
_ = text_gen.compute_sequence_distribution(history=2)

Distribution for 3-long sequences computed


In [235]:
text_gen.generate_character_markov('la div', history = 4)

Distribution for 5-long sequences computed


' '

In [None]:
text_gen.generate_text(method = "markov", history = 4, initial_string = 'la div', length = 100)