**Assignment No.1: Build a simple text generator using Markov Chains**

**Steps:**
1. Import Libraries – Load the necessary Python modules.

2. Download Novel – Fetch Pride and Prejudice from Project Gutenberg.

3. Read & Extract Content – Remove the unnecessary headers and keep the main text.

4. Clean & Tokenize – Convert the text to lowercase, remove punctuation, and split it into words.

5. Build Markov Model – Create a dictionary of word sequences with transition probabilities.

6. Generate Story – Predict words based on the probabilities and form a random text sequence.

7. Display Output – Print the generated story.








In [None]:
import numpy as np
import pandas as pd
import os
import re
import string
import requests
import random

# URL of the text file (Pride and Prejudice on Project Gutenberg)
url = "http://www.gutenberg.org/files/1342/1342-0.txt"

# Download the file. requests applies no timeout by default, so pass one
# explicitly to keep the cell from hanging forever on an unresponsive server.
response = requests.get(url, timeout=30)
if response.status_code == 200:
    with open("pride_and_prejudice.txt", "w", encoding="utf-8") as f:
        f.write(response.text)
else:
    # Report the failure instead of skipping silently; otherwise the problem
    # only shows up later when the text file is missing or out of date.
    print("Download failed with HTTP status", response.status_code)

# Function to read the novel
def read_novel(novel_path):
    """Read a novel from disk and trim it to the main story text.

    The text is cut so that it starts at the first "CHAPTER 1" marker and
    stops just before the first "THE END" marker that follows it. If either
    marker cannot be located, the full file content is returned untouched.

    Args:
        novel_path: Path of a UTF-8 encoded text file.

    Returns:
        The (possibly trimmed) text as a single string.
    """
    with open(novel_path, 'r', encoding='utf-8') as f:
        text = f.read()

    # Removing Gutenberg header/footer
    start_idx = text.find("CHAPTER 1")
    if start_idx == -1:
        return text

    # Search for the end marker only after the start marker: an earlier
    # "THE END" (e.g. in front matter) would otherwise produce an end index
    # smaller than the start index and slice the text down to nothing.
    end_idx = text.find("THE END", start_idx)
    if end_idx == -1:
        return text

    return text[start_idx:end_idx]

# Read the downloaded novel; read_novel trims it to the main text when both
# of its markers are found (tokenizing/cleaning is a separate, later step)
novel_text = read_novel("pride_and_prejudice.txt")

# Function to clean and tokenize text
def clean_and_tokenize(text):
    """Lowercase the text, strip punctuation and split it into words.

    Only the letters a-z survive inside a token; every other non-whitespace
    character (digits, punctuation, apostrophes, single hyphens, accented
    letters) is deleted, so "don't" becomes "dont".

    Args:
        text: Raw text to tokenize.

    Returns:
        List of lowercase word tokens, in their original order.
    """
    text = text.lower()
    # Double hyphens and en/em dashes join two words without any spaces
    # ("her--and"). Turn them into spaces first; deleting them outright would
    # glue the neighbouring words into one bogus token ("herand").
    text = re.sub(r'-{2,}|[\u2013\u2014]', ' ', text)
    text = re.sub(r'[^a-z\s]', '', text)  # Remove punctuation
    return text.split()  # Tokenize on any run of whitespace

# Tokenize the novel text into a list of words and report how many there are
cleaned_text = clean_and_tokenize(novel_text)
print("Number of words =", len(cleaned_text))

# Function to create a Markov model
def make_markov_model(cleaned_text, n_gram=2):
    """Build an n-gram Markov model from a sequence of tokens.

    Each run of ``n_gram`` consecutive tokens, joined by single spaces, is a
    state. The token that directly follows the run is recorded as a possible
    transition out of that state.

    Args:
        cleaned_text: Sequence of word tokens.
        n_gram: Number of tokens that make up one state.

    Returns:
        A dict mapping every state string to a dict of
        ``{next_word: relative_frequency}`` for the words seen after it.
    """
    transition_counts = {}

    # First pass: tally how often each word follows each state.
    for pos in range(len(cleaned_text) - n_gram):
        window_end = pos + n_gram
        state = ' '.join(cleaned_text[pos:window_end])
        follower = cleaned_text[window_end]
        followers = transition_counts.setdefault(state, {})
        followers[follower] = followers.get(follower, 0) + 1

    # Second pass: turn the tallies into relative frequencies per state.
    model = {}
    for state, followers in transition_counts.items():
        occurrences = sum(followers.values())
        model[state] = {
            word: count / occurrences for word, count in followers.items()
        }

    return model

# Create the Markov model with the default n_gram=2, i.e. each state is a
# pair of consecutive words
pp_markov_model = make_markov_model(cleaned_text)

# Function to generate text
def generate_story(markov_model, limit=100, start=None):
    """Generate text by randomly walking through a Markov model.

    Args:
        markov_model: Dict mapping a state string to a dict of
            ``{next_word: weight}``.
        limit: Maximum number of words appended after the starting state.
        start: State to begin from. A random state of the model is used when
            this is None or not a key of the model.

    Returns:
        The starting state followed by the generated words, joined by single
        spaces. Generation ends early when the current state has no entry in
        the model.
    """
    has_valid_start = start is not None and start in markov_model
    if not has_valid_start:
        start = random.choice(list(markov_model))

    pieces = [start]
    state = start

    for _ in range(limit):
        if state not in markov_model:
            break  # Dead end: this word sequence was never followed by anything

        transitions = markov_model[state]
        candidates = list(transitions.keys())
        weights = list(transitions.values())
        (chosen,) = random.choices(candidates, weights)

        pieces.append(chosen)
        # Slide the window: drop the oldest word and append the new one.
        state = ' '.join([*state.split()[1:], chosen])

    return ' '.join(pieces)

# Generate a story: begin at a randomly chosen state and append up to 100 words
generated_story = generate_story(pp_markov_model, limit=100)
# NOTE: print() puts a space between its arguments, so the story line starts
# with a single leading space.
print("Generated Story:\n", generated_story)


Number of words = 127172
Generated Story:
 more for congratulations elizabeth will soon be gone in five minutes after his return as jane had no reason he may spend very little one cannot wonder at her on sunday night and on these occasions mrs gardiner to whom i have not time for speech they ran into the village balls will be a person whom i have by education and who knows what may be aware that ours is not a bit better than lydia you go by the way of disappointing him will be very soon settle it in our estimation you will give such pleasure to elizabeth and
