# In [None]:
import random
import re
from collections import defaultdict
import os

class TextMarkovChain:
    """Word-level Markov chain text generator.

    The model maps each state (a tuple of ``n`` consecutive lowercase words)
    to the list of words observed immediately after that state in the
    training text. Successors are stored with repetition, so a uniform pick
    from the list follows the observed frequencies.
    """

    # Single tokenizer shared by training and seeding so both split text
    # in exactly the same way.
    _WORD_PATTERN = re.compile(r'\b\w+\b')

    def __init__(self):
        # state (tuple of words) -> list of words seen right after it
        self.state_transitions = defaultdict(list)

    def build_transition_model(self, text_data, n=2):
        """Add the n-gram transitions found in *text_data* to the model.

        Args:
            text_data: Training text; it is lowercased and split into words.
            n: Number of consecutive words that form one state (the order
                of the chain). Must be at least 1.

        Raises:
            ValueError: If *n* is smaller than 1.
        """
        if n < 1:
            raise ValueError("n must be a positive integer")

        words = self._WORD_PATTERN.findall(text_data.lower())
        for index in range(len(words) - n):
            state = tuple(words[index:index + n])
            self.state_transitions[state].append(words[index + n])

    def _state_from_seed(self, start_seed):
        """Return the learned state matching the tail of *start_seed*, or None."""
        seed_words = self._WORD_PATTERN.findall(start_seed.lower())
        # The state size is taken from the model itself instead of being
        # hard-coded, so seeding works for whatever order was trained.
        # Longer contexts are preferred if several orders were trained.
        orders = sorted({len(state) for state in self.state_transitions},
                        reverse=True)
        for order in orders:
            candidate = tuple(seed_words[-order:])
            if candidate in self.state_transitions:
                return candidate
        return None

    def create_text(self, length=50, start_seed=None):
        """Generate up to *length* words from the trained model.

        Args:
            length: Maximum number of words in the result.
            start_seed: Optional phrase. If its last words match a learned
                state, generation starts there; otherwise a random learned
                state is used.

        Returns:
            The generated words joined by single spaces. The result is
            shorter than *length* when the chain reaches a state with no
            known successor, and empty when the model has no transitions
            or *length* is not positive.
        """
        if not self.state_transitions or length <= 0:
            return ''

        current_state = self._state_from_seed(start_seed) if start_seed else None
        if current_state is None:
            current_state = random.choice(list(self.state_transitions))

        # Never emit more words than requested, even if the state is longer.
        generated_words = list(current_state[:length])

        for _ in range(length - len(generated_words)):
            # .get() avoids inserting empty entries into the defaultdict.
            followers = self.state_transitions.get(current_state)
            if not followers:
                break
            next_word = random.choice(followers)
            generated_words.append(next_word)
            # Slide the window: drop the oldest word, append the new one.
            current_state = current_state[1:] + (next_word,)

        return ' '.join(generated_words)

# Directory that holds the training corpus (*.txt files).
data_directory = "/content"

# Order of the Markov model: how many consecutive words form one state.
ngram_order = 7

# Read every text file in the directory. os.listdir() returns entries in
# arbitrary order, so sort them to keep the merged corpus reproducible.
text_chunks = []
for file_name in sorted(os.listdir(data_directory)):
    if file_name.endswith(".txt"):
        with open(os.path.join(data_directory, file_name), 'r', encoding='utf-8') as file:
            text_chunks.append(file.read())

# Each file's text is followed by a space so that the last word of one file
# and the first word of the next do not fuse into a single token.
merged_text = "".join(chunk + " " for chunk in text_chunks)

# Train the Markov chain on the combined text using 7-word states.
text_chain = TextMarkovChain()
text_chain.build_transition_model(merged_text, n=ngram_order)

if text_chain.state_transitions:
    # Generate up to 50 words. NOTE: the seed phrase has fewer words than
    # ngram_order, so it cannot match a learned state and generation starts
    # from a randomly chosen state instead.
    new_text = text_chain.create_text(length=50, start_seed="And then he said")
    print("Generated Text:\n", new_text)
else:
    # No .txt files, or too little text to form a single transition.
    new_text = ""
    print("No training data found in", data_directory, "- nothing to generate.")
