# Ramcharitmanas AI Project - Step 1(b): Data Preparation

In this step, we combine the code from Step 1(a) to convert the PDF into raw text, replace or remove invalid characters, perform other cleaning tasks, and then convert the text into a dataframe - a structured dataset suitable for **Exploratory Data Analysis (EDA)** and **machine learning and deep learning tasks**.

In [70]:
import re
import pytesseract
from pdf2image import convert_from_path
from PIL import Image
import pickle
import pandas as pd

In [71]:
# Point pytesseract at the Tesseract executable.
# NOTE(review): this is a hard-coded, machine-specific absolute path; adjust it
# (or make it configurable) when running the notebook on another machine.
pytesseract.pytesseract.tesseract_cmd = "/opt/homebrew/bin/tesseract"

# Tesseract language code, passed as `lang=` when a page image is OCR'd ("hin" = Hindi)
ocr_lang = "hin"

# Hindi Numbers Mapping

In [72]:
def english_to_devanagari(num_str):
    """
    Transliterate ASCII digits into Devanagari digits.

    Any character that is not one of the ASCII digits 0-9 is dropped from
    the result rather than copied through.

    Args:
    num_str (str): The number written with English (ASCII) digits.

    Returns:
    str: The same digits written in Devanagari script.
    """
    # Pair every ASCII digit with the Devanagari digit of the same value
    digit_table = dict(zip("0123456789", "०१२३४५६७८९"))

    converted = []
    for ch in num_str:
        if ch in digit_table:
            converted.append(digit_table[ch])
    return ''.join(converted)

In [73]:
def devanagari_to_english(num_str):
    """
    Transliterate Devanagari digits into ASCII digits.

    Any character that is not one of the Devanagari digits ०-९ is dropped
    from the result rather than copied through.

    Args:
    num_str (str): The number written with Devanagari digits.

    Returns:
    str: The same digits written with English (ASCII) digits.
    """
    # Pair every Devanagari digit with the ASCII digit of the same value
    digit_table = dict(zip("०१२३४५६७८९", "0123456789"))

    converted = []
    for ch in num_str:
        if ch in digit_table:
            converted.append(digit_table[ch])
    return ''.join(converted)

# Page Class

In [74]:
class Page:
    """
    One scanned page of the book: OCR'd on construction, then cleaned in place.

    Attributes:
        pdf_page_number: Page number as supplied by the caller (stored unchanged).
        book_page_number (str): Book page number converted to Devanagari digits.
        page_image: Image passed to ``pytesseract.image_to_string``.
        is_first (bool): When True, the page is trimmed up to and including the
            first 'श्लोक' marker instead of having a header line removed.
        text (str): OCR output after ``clean_page_text`` has run.
    """
    # Running-header pattern. Two alternatives: up to 7 leading characters
    # followed by the book title, or a chapter (kand) name followed by up to 7
    # trailing characters (presumably the page number), with optional stray
    # '&', '*', '#' marks that the OCR tends to add.
    page_header_regex = r"^.{1,7}\s*&?\s*\*?\s*रामचरितमानस\s*\*?$|^न?\s*#?\s*\*?\s*(?:बालकाण्ड|अयोध्याकाण्ड|अरण्यकाण्ड|किष्किन्धाकाण्ड|सुन्दरकाण्ड|लंकाकाण्ड|उत्तरकाण्ड)\s*\*?\s*#?\s*\*?\s*.{1,7}$"

    def __init__(self, pdf_page_number, book_page_number, page_image, is_first = False):
        """
        OCR the page image and clean the resulting text.

        Args:
            pdf_page_number: Page number within the PDF.
            book_page_number: Printed book page number (any value whose ``str()``
                consists of ASCII digits).
            page_image: Image of the page to OCR.
            is_first (bool): Whether this is the first page of its chapter.
        """
        self.pdf_page_number = pdf_page_number
        self.book_page_number = english_to_devanagari(str(book_page_number))
        self.page_image = page_image
        self.is_first = is_first
        # extract text from page image
        self.text = pytesseract.image_to_string(self.page_image, lang=ocr_lang)
        self.clean_page_text()

    def __str__(self):
        """Return the book page number (both scripts) followed by the page text."""
        return f"Book Page Number: {self.book_page_number} (or {devanagari_to_english(self.book_page_number)})\n\n\n{self.text}"

    def extract_first_non_empty_line(self):
        """
        Return the first line of ``self.text`` that has a non-whitespace
        character (leading whitespace stripped), or None if there is no such line.
        """
        match = re.search(r'^\s*(\S.*)', self.text, re.MULTILINE)
        # Fix: the original left the result unbound (UnboundLocalError) when the
        # page had no non-blank line, although callers already test for None.
        return match.group(1) if match else None

    def check_page_header_line(self):
        """
        Check whether the first non-empty line looks like a running page header.

        Returns:
            ``True`` when the line is a header; otherwise the tuple
            ``(False, line)`` where ``line`` is the offending first line
            (``""`` when the page has no non-empty line). The two shapes are
            kept as-is because callers compare the result with ``True``.
        """
        first_non_empty_line = self.extract_first_non_empty_line()
        if first_non_empty_line is None:
            return False, ""
        if re.search(Page.page_header_regex, first_non_empty_line):
            return True
        # One specific garbled OCR header that the regex cannot recognise
        if first_non_empty_line == '- तत तहफक्‍फऋहफस"सत"स"स"स" त "ख  खत  शा ि  उए ऊउऊ फस फ कक तकऋ जउइारर':
            return True
        return False, first_non_empty_line

    def remove_page_header_line(self):
        """
        Blank out the first non-empty line when it is a page header; otherwise
        print a diagnostic and leave the text untouched.
        """
        res = self.check_page_header_line()
        if res is True:
            # No MULTILINE flag: '^' anchors at the start of the text only, so
            # just the first non-empty line is removed.
            self.text = re.sub(r"^\s*(\S.*)", "", self.text)
        else:
            print(f"Error in removing header line for book page {self.book_page_number} : {res[1]}")

    def replace_invalid_words(self):
        """Replace known OCR mis-readings of whole words with the correct word."""
        invalid_words_mapping = {
            "लड्ढाकाण्ड": "लंकाकाण्ड",
            "लट्ढाकाण्ड": "लंकाकाण्ड",
            "लड्ढडगकाण्ड": "लंकाकाण्ड",
            "लड्ज्काण्ड": "लंकाकाण्ड",
            "लड्ढकाण्ड": "लंकाकाण्ड",
            "एइलोक": "श्लोक",
            "अरणयकाण्ड": "अरण्यकाण्ड",
            "नवाह्पारायण": "नवाह्नपारायण",
            "नवाह्रपारायण": "नवाह्नपारायण",
            "नवाह्॒पारायण": "नवाह्नपारायण"
        }
        # Replace invalid words using the dictionary
        for invalid_word, correct_word in invalid_words_mapping.items():
            self.text = self.text.replace(invalid_word, correct_word)

    def replace_invalid_character(self):
        """
        Replace or remove characters / short sequences that the OCR gets wrong.

        The replacements are applied one after another in dictionary order, so
        longer sequences are listed before the shorter ones they contain
        (e.g. "?2" before "2").
        """
        invalid_characters_mapping = {
            "_": " ",
            ".": "",
            " >+ऽ": " ",
            "+ऽ": "",
            ">": "",
            "+--": "--",
            "  + 5": " ",
            "?2": "?",
            "?7": "?",
            "2": "?",
            "3&": "ॐ",
            "3ड": "उ",
            "3": "उ",
            "6त": "त",
            "6": "त",
            "8": "र",
            "9": "",
            "ः:": "ः",
            ":": "ः",
            "* श्री": "' श्री",
            "|": "।",
            "।।": "।",
            ")/": ")",
            "/": " ",
            "दोौ": "दो"
        }
        # Replace invalid characters using the dictionary
        for invalid_char, replacement_char in invalid_characters_mapping.items():
            self.text = self.text.replace(invalid_char, replacement_char)
        # Collapse a doubled double-danda (optionally separated by spaces)
        self.text = re.sub(r"॥[ ]*॥", "॥", self.text)

    def replace_invalid_padya_numbering(self):
        """
        Fix the sub-count letter 'ङ' when OCR read it as 'डः' or 'ड-'.

        The fix is applied to the whole text, but only if at least one single
        line contains a parenthesised letter followed by a non-letter character.
        """
        # डः, ड-
        suspicious_marker = r"\(\s*[कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह][^कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह ]\s*\)"
        # The substitutions are idempotent, so running them once is equivalent
        # to the original behaviour of re-running them for every matching line.
        if any(re.search(suspicious_marker, line) for line in self.text.split('\n')):
            self.text = re.sub(r'\(\s*डः\s*\)', '(ङ)', self.text)
            self.text = re.sub(r'\(\s*ड-\s*\)', '(ङ)', self.text)

    def clean_page_text(self):
        """Run every cleaning step on ``self.text`` in the required order."""
        # Replace invalid words
        self.replace_invalid_words()
        # Editing sanskrit characters: a '5' between two non-space characters is an avagraha
        self.text = re.sub(r"(?<=\S)5(?=\S)", "ऽ", self.text)
        # Replace/Remove invalid characters
        self.replace_invalid_character()
        # Remove 'विश्राम' line
        self.text = re.sub(r"^(?:मासपारायण|नवाह्नपारायण),.*विश्राम$", "", self.text, flags=re.MULTILINE)

        # Remove page headers
        if self.is_first:
            opening_marker = 'श्लोक'
            init_idx = self.text.find(opening_marker)
            if init_idx != -1:
                # Drop everything up to and including the opening marker
                self.text = self.text[init_idx + len(opening_marker):]
            else:
                # Fix: find() returning -1 used to silently chop the first
                # four characters of the page; report it and keep the text.
                print(f"Error: opening marker '{opening_marker}' not found on first page (book page {self.book_page_number}); text left untrimmed")
        else:
            self.remove_page_header_line()

        # Replace Invalid Numbering of Padyas
        self.replace_invalid_padya_numbering()

    def check_for_invalid_characters(self):
        """
        Print the distinct characters in ``self.text`` that are neither in the
        Devanagari Unicode block (U+0900-U+097F) nor in the allowed punctuation list.
        """
        allowed_characters = [
            '\u200c', '\u200d', # Zero width characters of devanagari
            ',',        # Comma
            '(', ')',   # Parenthesis
            '\n',       # New Line Character
            '“', '”',   # Double Inverted Commas
            '"', "'",   # Single and Double Quotes
            '?',        # Question Mark
            '!',        # Exclamation Sign
            ';',        # Semi Colon
            '*',        # Asterisk (only for word meaning references)
            ' ',        # Space
            '-',        # Hyphen
            '[', ']',    # Square Brackets,
        ]

        def is_valid_character(ch):
            if re.match(r'[\u0900-\u097F]', ch):
                return True
            return ch in allowed_characters

        # Sorted so the report is deterministic from run to run
        potential_invalid_characters = sorted(
            ch for ch in set(self.text) if not is_valid_character(ch)
        )
        print('Some invalid characters found: ', potential_invalid_characters)

# Padya Class

In [75]:
class Padya:
    """
    पद्य (Padya) refers to poetry or verses in Sanskrit and Hindi literature.
    It encompasses all forms of structured poetic compositions used in texts like Ramcharitmanas.
    A padya consists of the padya text itself (in Awadhi) and the meaning of the padya (in Hindi).

    Values of ``padya_type`` handled by this class:
    1. श्लोक : Classical verses often found at the beginning or in specific parts of the Ramcharitmanas.
    2. सो० (Soratha): A type of meter with a distinct rhythmic pattern.
    3. दो० (Doha): A two-line couplet, used for summarizing key teachings.
    4. छं० (Chand): A verse with a set syllabic structure.
    5. चौपाई (Chaupai): The verses that follow the leading padya of a section.
    6. अंत श्लोक : The chapter-ending entry; its meaning keeps its line breaks
       and is printed without the "अर्थ: " prefix.

    Attributes:
        text (str): Cleaned verse text (one verse line per line).
        meaning (str): Cleaned meaning text.
        padya_type (str): One of the types listed above.
        padya_count (str | None): Cleaned verse count such as "४६ (ख)", or None.
        book_page_number, chapter_number, chapter_title: Stored unchanged.
    """
    # Consonants used, in this order, as sub-count letters (e.g. the "ख" in "४६ (ख)")
    _HINDI_LETTERS = "कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह"
    # Leading Devanagari number, optionally followed by a (parenthesised) sub-count letter.
    # Group 1 is the number, group 2 the letter ('' when absent).
    _COUNT_REGEX = r"^\s*([०१२३४५६७८९]+)\s*\(?\s*([कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह]?)\s*\)?\s*"

    def __init__(self, text, meaning, padya_type, padya_count, book_page_number, chapter_number, chapter_title):
        """Store the fields and normalise ``text``, ``meaning`` and ``padya_count``."""
        self.text = text
        self.meaning = meaning
        self.padya_type = padya_type
        self.padya_count = padya_count
        self.book_page_number = book_page_number
        self.chapter_number = chapter_number
        self.chapter_title = chapter_title
        self.text = Padya.clean_padya_text(self.text)
        # Line breaks are preserved only for the chapter-ending entry
        self.meaning = Padya.clean_padya_meaning(self.meaning, self.padya_type != "अंत श्लोक")
        if self.padya_count is not None:
            self.padya_count = Padya.clean_padya_count(self.padya_count)

    def __str__(self):
        """Render the padya as: [type header], text + count, blank line, meaning + count."""
        text_str = ""
        if self.padya_type not in ["श्लोक", "चौपाई", "अंत श्लोक"]:
            text_str += self.padya_type + " -- \n"
        text_str += self.text
        if self.padya_count is None:
            text_str += "\n\n"
        else:
            text_str += " " + self.padya_count + " " + "॥" + "\n\n"
        text_str += ("अर्थ: " if self.padya_type != "अंत श्लोक" else "") + self.meaning
        if self.padya_count is not None:
            text_str += " " + self.padya_count + " " + "॥"
        return text_str

    @staticmethod
    def clean_padya_text(text):
        """
        Normalise verse text: drop blank lines, trim each line, remove
        whitespace before danda marks, and make sure every line ends with
        '।' (even-indexed lines) or '॥' (odd-indexed lines).
        """
        lines = [line.strip() for line in text.split('\n') if line.strip()]
        for i in range(len(lines)):
            lines[i] = re.sub(r"\s*।$", '।', lines[i])
            lines[i] = re.sub(r"\s*॥$", '॥', lines[i])
            if lines[i][-1] not in ['।', '॥']:
                lines[i] = lines[i].strip() + ('।' if i % 2 == 0 else '॥')
        text = '\n'.join(lines)
        text = re.sub(r"\s*।", "।", text)
        return text

    @staticmethod
    def clean_padya_meaning(meaning, remove_new_lines = True):
        """
        Normalise a meaning: optionally flatten newlines to spaces, trim,
        remove whitespace before '।', and make the text end with '॥'.

        Returns:
            str: The cleaned meaning; an empty (or whitespace-only) meaning is
            returned as ``""``.
        """
        if remove_new_lines:
            meaning = re.sub(r'\n', ' ', meaning)
        meaning = meaning.strip()
        if not meaning:
            # Fix: indexing meaning[-1] below raised IndexError for an empty meaning
            return meaning
        meaning = re.sub(r"\s*।", "।", meaning)
        if meaning[-1] not in ['।', '॥']:
            meaning += '॥'
        else:
            # A final single danda is promoted to a double danda
            meaning = re.sub(r"\s*।$", '॥', meaning)
            meaning = re.sub(r"\s*॥$", '॥', meaning)
        return meaning

    @staticmethod
    def clean_padya_count(padya_count):
        """Trim a count and normalise a parenthesised sub-count to the form "N (x)"."""
        padya_count = padya_count.strip()
        if '(' in padya_count:
            padya_count = re.sub(r"\s*\(\s*", ' (', padya_count)
            padya_count = re.sub(r"\s*\)$", ')', padya_count)
        return padya_count

    @staticmethod
    def correct_padya_count(padya_count):
        """Replace the letter 'ड' with 'ङ' in a count (a common OCR confusion)."""
        if 'ड' in padya_count:
            return padya_count.replace('ड', 'ङ')
        return padya_count

    @staticmethod
    def next_padya_count(padya_count):
        """
        Return the count that follows ``padya_count``.

        For a range ("a-b") the part after the last '-' is used. A count with a
        sub-letter advances the letter and keeps the number; a plain count
        advances the number.

        Returns:
            str | None: The next count, or None if ``padya_count`` cannot be parsed.

        Raises:
            IndexError: If the sub-letter is the last one in the letter sequence.
        """
        if '-' in padya_count:
            padya_count = padya_count.split('-')[-1]
        search_res = re.search(Padya._COUNT_REGEX, padya_count)
        if not search_res:
            return None
        main_count = search_res.group(1)
        sub_count = search_res.group(2)
        if sub_count:
            # Keep main count same but give next sub_count
            sub_index = Padya._HINDI_LETTERS.index(sub_count)
            return f"{main_count} ({Padya._HINDI_LETTERS[sub_index + 1]})"
        # Next main count
        main_count = int(devanagari_to_english(main_count))
        return english_to_devanagari(str(main_count + 1))

    @staticmethod
    def compare_padya_counts(padya_count1, padya_count2):
        """
        Tell whether two counts denote the same padya.

        When exactly one argument is a range (contains '-'), the other one only
        has to occur inside it as a substring. Otherwise the leading number and
        sub-letter of both counts must be equal.

        Raises:
            Exception: If either count does not start with a Devanagari number.
        """
        padya_count1 = Padya.correct_padya_count(padya_count1)
        padya_count2 = Padya.correct_padya_count(padya_count2)

        if '-' in padya_count1 and '-' not in padya_count2:
            return (padya_count2 in padya_count1)
        if '-' not in padya_count1 and '-' in padya_count2:
            return (padya_count1 in padya_count2)
        search_res1 = re.search(Padya._COUNT_REGEX, padya_count1)
        search_res2 = re.search(Padya._COUNT_REGEX, padya_count2)

        if not search_res1:
            raise Exception(f"Error in first count argument: '{padya_count1}'")
        if not search_res2:
            raise Exception(f"Error in second count argument: '{padya_count2}'")

        same_main = search_res1.group(1) == search_res2.group(1)
        same_sub = search_res1.group(2) == search_res2.group(2)
        return same_main and same_sub

    @staticmethod
    def sort_padya_counts(padya_counts):
        """
        Return the counts sorted by number, then by sub-letter.

        A range is ordered by its last part; counts that cannot be parsed sort
        last. A count without a sub-letter sorts before the same number with one.
        """
        def ordering_func(padya_count):
            if '-' in padya_count:
                padya_count = padya_count.split('-')[-1]
            search_res = re.search(Padya._COUNT_REGEX, padya_count)
            if not search_res:
                return float('inf'), 0
            main_count = devanagari_to_english(search_res.group(1))
            sub_count = search_res.group(2)
            # Fix: str.find('') is 0, which made "N" tie with "N (क)";
            # a missing sub-letter now gets its own, lower rank.
            sub_count_order = Padya._HINDI_LETTERS.find(sub_count) if sub_count else -1
            return int(main_count), sub_count_order
        return sorted(padya_counts, key=ordering_func)

    @staticmethod
    def find_gap_between_padya_counts(padya_count1, padya_count2):
        """
        Return the absolute distance between two counts.

        For ranges, the end of the first and the start of the second are used.
        If both counts have a sub-letter and the same number, the distance is
        measured in letters; otherwise it is measured in numbers.

        Raises:
            Exception: If either count does not start with a Devanagari number.
        """
        if '-' in padya_count1:
            padya_count1 = padya_count1.split('-')[-1]
        if '-' in padya_count2:
            padya_count2 = padya_count2.split('-')[0]
        search_res1 = re.search(Padya._COUNT_REGEX, padya_count1)
        search_res2 = re.search(Padya._COUNT_REGEX, padya_count2)
        if not search_res1:
            raise Exception(f"Error in first count argument: '{padya_count1}'")
        if not search_res2:
            raise Exception(f"Error in second count argument: '{padya_count2}'")

        # Group 2 can only be '' or one of the sub-count letters, so the former
        # "Error in part count" checks could never fire and were removed.
        sub_count1, sub_count2 = search_res1.group(2), search_res2.group(2)
        main_count1 = int(devanagari_to_english(search_res1.group(1)))
        main_count2 = int(devanagari_to_english(search_res2.group(1)))

        if sub_count1 and sub_count2 and main_count1 == main_count2:
            return abs(Padya._HINDI_LETTERS.find(sub_count1) - Padya._HINDI_LETTERS.find(sub_count2))
        return abs(main_count1 - main_count2)

# Padya Divider Class

In [76]:
class Padya_Divider:
    """
    Splits the cleaned text of one chapter into individual `Padya` objects.

    `main()` cuts the chapter text at the padya-type markers (सो०, दो०, छं०),
    passes each section to the matching `generate_padyas_for_*` method and
    returns the `Padya` objects accumulated in `self.padyas`.

    Inside a section, `re.split` with a capturing group is used on the
    "॥ <count> ॥" markers, so every split list alternates between a text chunk
    and the count that terminated it.
    """
    def __init__(self, chapter_text, chapter_number, chapter_title, chapter_pages):
        """
        Args:
            chapter_text: Full text of the chapter to divide.
            chapter_number: Chapter number, stored on every created Padya.
            chapter_title: Chapter title, stored on every created Padya.
            chapter_pages: Page objects of the chapter; their `.text` and
                `.book_page_number` are used to locate each padya's page.
        """
        self.chapter_text = chapter_text
        self.chapter_number = chapter_number
        self.chapter_title = chapter_title
        self.chapter_pages = chapter_pages
        # Count of the most recent soratha/doha; kept on the instance so it
        # carries over from one section to the next
        self.prev_so_do_number = None
        # Padya objects created so far, in document order
        self.padyas = []

    @staticmethod
    def validate_padya_count_and_meaning_count(padya_count, padya_meaning_count, idx, jIdx, padya_type, text):
        """Raise an Exception if the count after a padya differs from the count after its meaning."""
        if not Padya.compare_padya_counts(padya_count, padya_meaning_count):
            raise Exception(f"""Error: Padya count does not match the count of its meaning\n 
                outer index {idx},
                inner index {jIdx}, 
                padya type {padya_type}, 
                padya count {padya_count}, 
                meaning count {padya_meaning_count},
                padya text \n {text}
            """)


    @staticmethod
    def validate_chaupai_count(actual_chaupai_count, expected_chaupai_count, idx, jIdx, padya_type, text):
        """Raise an Exception if a chaupai's count is not the expected next count."""
        if not Padya.compare_padya_counts(actual_chaupai_count, expected_chaupai_count):
            raise Exception(f"""Error: Chaupai count mismatch\n 
                outer index {idx},
                inner index {jIdx}, 
                principle padya type {padya_type},
                expected count {expected_chaupai_count},
                actual count {actual_chaupai_count}, 
                padya text \n {text}
            """)


    @staticmethod
    def validate_chand_count(actual_chand_count, expected_chand_count, idx, jIdx, padya_type, text):
        """Raise an Exception if a chand's count is not the expected next count."""
        if not Padya.compare_padya_counts(actual_chand_count, expected_chand_count):
            raise Exception(f"""Error: Chand count mismatch\n 
                outer index {idx},
                inner index {jIdx}, 
                padya type {padya_type},
                expected count {expected_chand_count},
                actual count {actual_chand_count}, 
                padya text \n {text}
            """)


    @staticmethod
    def validate_chaupai_line_count(chaupai_split, idx, jIdx, padya_type, text):
        """Raise an Exception if a chaupai chunk has at most one non-empty line (no room for a meaning)."""
        if len(chaupai_split) <= 1:
            raise Exception(f"""Error: Chaupai consist of just one single line\n
                outer index {idx},
                inner index {jIdx}, 
                principle padya type {padya_type},
                padya text \n {text}
            """)


    @staticmethod
    def get_page_number_for_text(ref_text, pages):
        """
        Find the book page number of the page that contains `ref_text`.

        The reference text is first normalised line by line (a single space
        before the final danda, and an alternating '।' / '॥' appended where a
        line has no terminal danda); the first 20 characters of the result are
        then searched for in each page's text.

        Returns:
            The `book_page_number` of the first page containing the snippet.

        Raises:
            Exception: If no page contains the snippet.
        """
        splits = ref_text.split('\n')
        lines = [line.strip() for line in splits if line.strip()]
        for i, line in enumerate(lines):
            lines[i] = re.sub(r"\s*।$", ' ।', lines[i])
            lines[i] = re.sub(r"\s*॥$", ' ॥', lines[i])
            if lines[i][-1] not in ['।', '॥']:
                lines[i] = lines[i].strip() + ' ' + ('।' if i % 2 == 0 else '॥')
        ref_text = '\n'.join(lines)
        # Only a short prefix is matched against the page text
        ref_text = ref_text[:20]
        for page in pages:
            if ref_text in page.text:
                return page.book_page_number
        raise Exception(f"Page number not found for text {ref_text}")


    @staticmethod
    def get_chand_text_and_meaning(text):
        """
        Split a chand chunk into (verse text, meaning).

        With at most 4 non-empty lines the first 2 lines are the verse;
        otherwise the first 4 lines are. The remaining lines are joined with
        spaces to form the meaning.
        """
        text_split = text.split("\n")
        text_split = [line.strip() for line in text_split if line.strip()]
        if len(text_split) <= 4:
            padya_text = '\n'.join(text_split[:2])    # Specifically chand_text
            padya_meaning = ' '.join(text_split[2:])    # Specifically chand_meaning
        else:
            padya_text = '\n'.join(text_split[:4])    # Specifically chand_text
            padya_meaning = ' '.join(text_split[4:])    # Specifically chand_meaning
        return (padya_text, padya_meaning)


    def create_padya_entry(self, padya_text, padya_meaning, padya_type, padya_count):
        """Look up the page of `padya_text`, build a `Padya` and append it to `self.padyas`."""
        book_page_number = Padya_Divider.get_page_number_for_text(padya_text, self.chapter_pages)
        padya = Padya(padya_text, padya_meaning, padya_type, padya_count, book_page_number, self.chapter_number, self.chapter_title)
        self.padyas.append(padya)


    def generate_padyas_for_shlokas(self, shlokas_full_text, padya_type, idx):
        """
        Create padyas from the opening section of a chapter (text before the
        first type marker).

        Entries are read as shlokas (text, count, meaning, count = 4 split
        elements) for as long as each count is the expected successor of the
        previous shloka count. From the first entry that breaks the sequence
        onwards, every (chunk, count) pair is read as a chaupai whose chunk
        holds the verse lines followed by the meaning lines.

        Args:
            shlokas_full_text: Text of the section.
            padya_type: Type assigned to the leading shlokas.
            idx: Index of the section in the chapter split (used in error messages).
        """
        shlokas_split = re.split(r"॥\s*[।]*\s*([०१२३४५६७८९-]+\s*\(?\s*[कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह]?\s*\)?)\s*[।]*\s*॥", shlokas_full_text)
        # Drop whatever follows the last count marker
        shlokas_split = shlokas_split[:-1]
        prev_shlok_number = None
        prev_chaupai_number = None
        # Once set, all remaining entries of the section are read as chaupais
        chaupai_flag = False
        jIdx = 0
        # NOTE(review): the jIdx + 1 / + 2 / + 3 lookups assume the split list
        # is well-formed; a truncated section would raise IndexError.
        while jIdx < len(shlokas_split):
            padya_count = shlokas_split[jIdx + 1]
            padya_count = Padya.correct_padya_count(padya_count)
            if jIdx == 0:
                # The first entry is always taken as a shloka
                padya_text = shlokas_split[jIdx]
                padya_meaning = shlokas_split[jIdx + 2]
                padya_meaning_count = shlokas_split[jIdx + 3]
                Padya_Divider.validate_padya_count_and_meaning_count(padya_count, padya_meaning_count, idx, jIdx, padya_type, shlokas_full_text)
                self.create_padya_entry(padya_text, padya_meaning, padya_type, padya_count)
                prev_shlok_number = padya_count
                jIdx += 4
            else:
                expected_shlok_number = Padya.next_padya_count(prev_shlok_number)
                if not chaupai_flag and Padya.compare_padya_counts(padya_count, expected_shlok_number):
                    padya_text = shlokas_split[jIdx]
                    padya_meaning = shlokas_split[jIdx + 2]
                    padya_meaning_count = shlokas_split[jIdx + 3]
                    Padya_Divider.validate_padya_count_and_meaning_count(padya_count, padya_meaning_count, idx, jIdx, padya_type, shlokas_full_text)
                    self.create_padya_entry(padya_text, padya_meaning, padya_type, padya_count)
                    prev_shlok_number = padya_count
                    jIdx += 4
                else:
                    chaupai_flag = True
                    sub_padya_type = 'चौपाई'
                    if prev_chaupai_number:
                        expected_chaupai_number = Padya.next_padya_count(prev_chaupai_number)
                        Padya_Divider.validate_chaupai_count(padya_count, expected_chaupai_number, idx, jIdx, padya_type, shlokas_full_text)
                        # NOTE(review): here the *expected* count is remembered, whereas
                        # generate_padyas_for_sorathas_and_dohas remembers the actual
                        # padya_count - confirm the difference is intended.
                        prev_chaupai_number = expected_chaupai_number
                    else:
                        prev_chaupai_number = padya_count
                    chaupai_split = shlokas_split[jIdx].split("\n")
                    chaupai_split = [chaupai_line.strip() for chaupai_line in chaupai_split if chaupai_line.strip()]
                    Padya_Divider.validate_chaupai_line_count(chaupai_split, idx, jIdx, padya_type, shlokas_full_text)
                    # Fewer than two '॥' in the chunk: only the first line is verse text
                    if shlokas_split[jIdx].count('॥') < 2:
                        padya_text = '\n'.join(chaupai_split[:1])    # Specifically chaupai_text
                        padya_meaning = ' '.join(chaupai_split[1:])    # Specifically chaupai_meaning
                    else:
                        padya_text = '\n'.join(chaupai_split[:2])    # Specifically chaupai_text
                        padya_meaning = ' '.join(chaupai_split[2:])    # Specifically chaupai_meaning
                    self.create_padya_entry(padya_text, padya_meaning, sub_padya_type, padya_count)
                    jIdx += 2


    def generate_padyas_for_sorathas_and_dohas(self, padyas_full_text, padya_type, idx):
        """
        Create padyas from a section that starts with a soratha or doha marker.

        The first entry is always read as a soratha/doha (text, count, meaning,
        count). Later entries stay sorathas/dohas only while their count is the
        expected successor of `self.prev_so_do_number`; everything after that
        is read as chaupais (chunk, count), where the chunk holds the verse
        lines followed by the meaning lines.

        Args:
            padyas_full_text: Text of the section (after the type marker).
            padya_type: 'सो०' or 'दो०', as detected by `main()`.
            idx: Index of the section marker in the chapter split; also used
                for the hard-coded chapter 7 special cases below.
        """
        padyas_split = re.split(r"॥\s*[।]*\s*([०१२३४५६७८९-]+\s*\(?\s*[कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह]?\s*\)?)\s*[।]*\s*॥", padyas_full_text)
        # Drop whatever follows the last count marker
        padyas_split = padyas_split[:-1]

        prev_chaupai_number = None
        # Once set, all remaining entries of the section are read as chaupais
        chaupai_flag = False
        jIdx = 0
        while jIdx < len(padyas_split):
            padya_count = padyas_split[jIdx + 1]
            padya_count = Padya.correct_padya_count(padya_count)
            if jIdx == 0:
                padya_text = padyas_split[jIdx]
                padya_meaning = padyas_split[jIdx + 2]
                padya_meaning_count = padyas_split[jIdx + 3]
                Padya_Divider.validate_padya_count_and_meaning_count(padya_count, padya_meaning_count, idx, jIdx, padya_type, padyas_full_text)
                self.create_padya_entry(padya_text, padya_meaning, padya_type, padya_count)
                self.prev_so_do_number = padya_count
                jIdx += 4
            else:
                expected_so_do_number = Padya.next_padya_count(self.prev_so_do_number)
                # After a soratha/doha numbered zero, the rest of the section is chaupais
                if self.prev_so_do_number == "०":
                    chaupai_flag = True
                if not chaupai_flag and Padya.compare_padya_counts(padya_count, expected_so_do_number):
                    padya_text = padyas_split[jIdx]
                    padya_meaning = padyas_split[jIdx + 2]
                    padya_meaning_count = padyas_split[jIdx + 3]
                    Padya_Divider.validate_padya_count_and_meaning_count(padya_count, padya_meaning_count, idx, jIdx, padya_type, padyas_full_text)
                    self.create_padya_entry(padya_text, padya_meaning, padya_type, padya_count)
                    self.prev_so_do_number = padya_count
                    jIdx += 4
                else:
                    chaupai_flag = True
                    sub_padya_type = 'चौपाई'
                    if prev_chaupai_number:
                        expected_chaupai_number = Padya.next_padya_count(prev_chaupai_number)
                        Padya_Divider.validate_chaupai_count(padya_count, expected_chaupai_number, idx, jIdx, padya_type, padyas_full_text)
                    prev_chaupai_number = padya_count
                    chaupai_split = padyas_split[jIdx].split("\n")
                    chaupai_split = [chaupai_line.strip() for chaupai_line in chaupai_split if chaupai_line.strip()]
                    # Hard-coded exception: in chapter 7, sections 253 and 305 are
                    # labelled as shlokas instead of chaupais.
                    # NOTE(review): these indices depend on the exact split of the
                    # source text - confirm they still apply if the OCR output changes.
                    if self.chapter_number == 7 and idx in [253, 305]:
                        sub_padya_type = 'श्लोक'
                    Padya_Divider.validate_chaupai_line_count(chaupai_split, idx, jIdx, padya_type, padyas_full_text)
                    # Decide how many leading lines are verse text (1, 4 or 2)
                    if padyas_split[jIdx].count('॥') < 2:
                        padya_text = '\n'.join(chaupai_split[:1])    # Specifically chaupai_text
                        padya_meaning = ' '.join(chaupai_split[1:])    # Specifically chaupai_meaning
                    elif self.chapter_number == 7 and idx == 305:
                        padya_text = '\n'.join(chaupai_split[:4])    # Specifically chaupai_text
                        padya_meaning = ' '.join(chaupai_split[4:])    # Specifically chaupai_meaning
                    elif '-' in padya_count and padyas_split[jIdx].count('॥') == 4:
                        # A ranged count whose chunk has four '॥': four verse lines
                        padya_text = '\n'.join(chaupai_split[:4])    # Specifically chaupai_text
                        padya_meaning = ' '.join(chaupai_split[4:])    # Specifically chaupai_meaning
                    else:
                        padya_text = '\n'.join(chaupai_split[:2])    # Specifically chaupai_text
                        padya_meaning = ' '.join(chaupai_split[2:])    # Specifically chaupai_meaning
                    self.create_padya_entry(padya_text, padya_meaning, sub_padya_type, padya_count)
                    jIdx += 2


    def generate_padyas_for_chandas(self, chandas_full_text, padya_type, idx):
        """
        Create padyas from a section that starts with a chand marker.

        Three layouts are handled: a single chand without any count, chandas
        where both text and meaning carry a count (4 split elements each), and
        chandas where one count follows the combined text + meaning chunk
        (2 split elements each).

        Args:
            chandas_full_text: Text of the section (after the type marker).
            padya_type: 'छं०', as detected by `main()`.
            idx: Index of the section marker in the chapter split (used in error messages).
        """
        chandas_split = re.split(r"॥\s*[।]*\s*([०१२३४५६७८९-]+\s*\(?\s*[कखगघङचछजझञटठडढणतथदधनपफबभमयरलवशषसह]?\s*\)?)\s*[।]*\s*॥", chandas_full_text)
        # Drop whatever follows the last count marker
        chandas_split = chandas_split[:-1]

        # Single Chand in the full text
        if len(chandas_split) == 0:
            padya_text, padya_meaning = Padya_Divider.get_chand_text_and_meaning(chandas_full_text)
            self.create_padya_entry(padya_text, padya_meaning, padya_type, None)
            return


        # Multiple Chandas in the full text
        prev_chand_number = None
        jIdx = 0
        # NOTE(review): devanagari_to_english drops every non-digit, so a ranged
        # or lettered last count is read as one number; a last count of zero
        # would make the division below fail - confirm neither occurs.
        last_chand_number = int(devanagari_to_english(chandas_split[-1]))

        ## Each Chand's text and its meaning has a count along with it
        if (len(chandas_split) // last_chand_number) == 4:
            while jIdx < len(chandas_split):
                padya_count = chandas_split[jIdx + 1]
                padya_count = Padya.correct_padya_count(padya_count)
                if jIdx > 0:
                    expected_chand_number = Padya.next_padya_count(prev_chand_number)
                    Padya_Divider.validate_chand_count(padya_count, expected_chand_number, idx, jIdx, padya_type, chandas_full_text)
                prev_chand_number = padya_count
                padya_text = chandas_split[jIdx]    # Specifically chand_text
                padya_meaning = chandas_split[jIdx + 2]    # Specifically chand_meaning
                padya_meaning_count = chandas_split[jIdx + 3]    # Specifically chand_meaning_count
                Padya_Divider.validate_padya_count_and_meaning_count(padya_count, padya_meaning_count, idx, jIdx, padya_type, chandas_full_text)
                self.create_padya_entry(padya_text, padya_meaning, padya_type, padya_count)
                jIdx += 4

        ## Only Chand's meaning has a combined count
        else:
            while jIdx < len(chandas_split):
                padya_count = chandas_split[jIdx + 1]
                padya_count = Padya.correct_padya_count(padya_count)
                if jIdx > 0:
                    expected_chand_number = Padya.next_padya_count(prev_chand_number)
                    Padya_Divider.validate_chand_count(padya_count, expected_chand_number, idx, jIdx, padya_type, chandas_full_text)
                prev_chand_number = padya_count
                padya_text, padya_meaning = Padya_Divider.get_chand_text_and_meaning(chandas_split[jIdx])
                self.create_padya_entry(padya_text, padya_meaning, padya_type, padya_count)
                jIdx += 2


    def main(self):
        """
        Divide the whole chapter into padyas.

        The chapter text is split at the type markers; because the marker is a
        capturing group, the split list is [opening text, marker, section,
        marker, section, ...]. The opening text is processed as shlokas, each
        (marker, section) pair by the generator matching the marker, and
        finally a chapter-ending entry is built from the last section.

        Returns:
            list: `self.padyas`, the Padya objects in document order.

        Raises:
            Exception: If an odd-indexed split element is not a type marker, or
                from any of the validation helpers.
        """
        idx = 0
        self.prev_so_do_number = None
        chapter_text_split = re.split(r"((?:सो०|दो०|छं०)-{0,2})", self.chapter_text)

        while idx < len(chapter_text_split):
            if idx == 0:
                # Handling of Shlokas
                padya_type = 'श्लोक'
                shlokas_full_text = chapter_text_split[0]
                self.generate_padyas_for_shlokas(shlokas_full_text, padya_type, idx)
                idx += 1
            else:
                # Handling of Sorathas, Dohas and Chandas
                padya_type_definer = re.search(r"^(सो०|दो०|छं०)-{0,2}$", chapter_text_split[idx])
                if not padya_type_definer:
                    raise Exception(f'Padya type not found at index {idx} \n {chapter_text_split[idx]}\n\n{chapter_text_split[idx + 1]}')

                padya_type = padya_type_definer.group(1)
                padyas_full_text = chapter_text_split[idx + 1]
                if padya_type in ['सो०', 'दो०']:
                    self.generate_padyas_for_sorathas_and_dohas(padyas_full_text, padya_type, idx)

                elif padya_type == 'छं०':
                    self.generate_padyas_for_chandas(padyas_full_text, padya_type, idx)
                idx += 2

        # Chapter Ending
        chapter_ending_split = chapter_text_split[-1].split('\n')
        chapter_ending_split = [line.strip() for line in chapter_ending_split if line.strip()]
        # First line that starts with 'इति'.
        # NOTE(review): raises IndexError when no such line exists - confirm every chapter has one.
        ending_beginning_index = [i for i in range(len(chapter_ending_split)) if re.search(r'^इति', chapter_ending_split[i])][0]
        padya_text = chapter_ending_split[ending_beginning_index]    # Specifically chapter_ending_text
        # Lines between the 'इति' line and the last line are joined with spaces;
        # the last line is kept on its own line
        padya_meaning = ' '.join(chapter_ending_split[ending_beginning_index + 1: -1]) + '\n' + chapter_ending_split[-1]
        self.create_padya_entry(padya_text, padya_meaning, 'अंत श्लोक', None)

        return self.padyas

# Chapter Class

In [77]:
class Chapter:
    """
    One chapter of the book: its pages, the combined page text and the parsed padyas.

    Construction builds a `Page` for every page image, applies the manual text
    corrections for this chapter, joins the page texts with newlines and hands
    the result to `Padya_Divider(...).main()` to obtain `self.padyas`.
    The `validate_*` methods print a comparison of the parsed padyas against
    the expected counts; they do not raise on a mismatch.
    """

    # Count of different padya types in each chapter
    expected_padya_counts = {
        1: { 'श्लोक': 7, 'दो०': 359, 'सो०': 36, 'छं०': 56, 'चौपाई': 1488, 'last_दो०': '३६०', 'last_सो०': '३६१' },
        2: { 'श्लोक': 3, 'दो०': 314, 'सो०': 13, 'छं०': 12, 'चौपाई': 1304, 'last_दो०': '३२५', 'last_सो०': '३२६' },
        3: { 'श्लोक': 2, 'दो०': 50, 'सो०': 8, 'छं०': 33, 'चौपाई': 262, 'last_दो०': '४६ (ख)', 'last_सो०': '२१ (क)' },
        4: { 'श्लोक': 2, 'दो०': 31, 'सो०': 3, 'छं०': 3, 'चौपाई': 154, 'last_दो०': '३० (क)', 'last_सो०': '३० (ख)' },
        5: { 'श्लोक': 3, 'दो०': 62, 'सो०': 1, 'छं०': 6, 'चौपाई': 271, 'last_दो०': '६०', 'last_सो०': '१२' },
        6: { 'श्लोक': 3, 'दो०': 150, 'सो०': 9, 'छं०': 70, 'चौपाई': 568, 'last_दो०': '१२१ (ख)', 'last_सो०': '६१' },
        7: { 'श्लोक': 14, 'दो०': 207, 'सो०': 17, 'छं०': 39, 'चौपाई': 595, 'last_दो०': '१३० (ख)', 'last_सो०': '११७ (घ)' }
    }

    # Manual corrections applied to the page text before the chapter is parsed,
    # keyed by chapter number. Each item names a position in `self.pages`
    # ("index"), the exact substring to look for ("original") and the text that
    # replaces it ("replacement"). Most items repair or add a verse-number marker.
    # Items are applied in list order; a chapter without an entry gets no corrections.
    text_corrections = {
        1: [
            {"index": 30, "original": 'महँ लिय महेस जियें जानि॥ २५७॥', "replacement": 'महँ लिय महेस जियें जानि॥ २५॥'},
            {"index": 89, "original": 'करहिं अपछरा गान॥ ९१२१॥', "replacement": 'करहिं अपछरा गान॥ ९१॥'},
            {"index": 142, "original": 'राम जनम कर हेतु ॥ १७५२॥', "replacement": 'राम जनम कर हेतु ॥ १५२॥'},
            {"index": 143, "original": 'कामादि सुख सेवड समय नरेसु ॥ १७४॥', "replacement": 'कामादि सुख सेवड समय नरेसु ॥ १५४॥'},
            {"index": 146, "original": 'कीन्ह नृपति हरषाइ॥ १५७८॥', "replacement": 'कीन्ह नृपति हरषाइ॥ १५८॥'},
            {"index": 231, "original": 'बस बचन कहडक्\u200d़ बिलखाइ़ ॥ २०५७ ॥', "replacement": 'बस बचन कहडक्\u200d़ बिलखाइ़ ॥ २५५ ॥'},
            {"index": 292, "original": 'मोक्ष) पा गये हों॥ ३२०॥', "replacement": 'मोक्ष) पा गये हों॥ ३२५॥'},
            {"index": 292, "original": ' हिय\nअनुरूप बर दुलहिनि', "replacement": 'अनुरूप बर दुलहिनि'}
        ],
        2: [
            {"index": 1, "original": 'बिमल जसु जो दायकु फल चारि॥', "replacement": 'बिमल जसु जो दायकु फल चारि॥ ०॥'},
            {"index": 1, "original": '(धर्म, अर्थ, काम, मोक्षको) देनेवाला है।', "replacement": '(धर्म, अर्थ, काम, मोक्षको) देनेवाला है॥ ०॥'},
            {"index": 7, "original": 'कमलके समान दोनों हाथोंको जोड़कर श्रीरामजी बोले--', "replacement": 'कमलके समान दोनों हाथोंको जोड़कर श्रीरामजी बोले-- ॥ २॥'},
            {"index": 45, "original": 'देखिहें मनु जनि करसि मलान॥ ७५३॥', "replacement": 'देखिहें मनु जनि करसि मलान॥ ५३॥'},
            {"index": 55, "original": 'सुरसदन सम परनसाल सुख मूल॥ ६५७॥', "replacement": 'सुरसदन सम परनसाल सुख मूल॥ ६५॥'},
            {"index": 63, "original": 'मनहु भाग मृगु भाग बस॥ ७७॥', "replacement": 'मनहु भाग मृगु भाग बस॥ ७५॥'},
            {"index": 91, "original": 'जल जो सरीर सम स्याम॥ २१०९॥', "replacement": 'जल जो सरीर सम स्याम॥ १०९॥'},
            {"index": 96, "original": 'बर लसत स्वेद कन जाल॥ १५१५७५॥।', "replacement": 'बर लसत स्वेद कन जाल॥ ११५॥'},
            {"index": 134, "original": 'बिधि सन कछु न बसाइ॥ १६१५१॥', "replacement": 'बिधि सन कछु न बसाइ॥ १६१॥'},
            {"index": 179, "original": 'आश्रम पिंजराँ राखे भा भिनुसार॥ २१५७॥', "replacement": 'आश्रम पिंजराँ राखे भा भिनुसार॥ २१५॥'},
            {"index": 180, "original": 'राम कह जस भा भरतहि जात॥ २१५६॥', "replacement": 'राम कह जस भा भरतहि जात॥ २१६॥'},
            {"index": 210, "original": 'मोर भए पीन पावस प्रथम॥ २०१॥', "replacement": 'मोर भए पीन पावस प्रथम॥ २५१॥'},
            {"index": 211, "original": 'मगन जस मीनहि सलिल सँकोच ॥ २७५२॥', "replacement": 'मगन जस मीनहि सलिल सँकोच ॥ २५२॥'},
            {"index": 212, "original": 'महाजन सचिव सब जुरे सभासद आइ॥ २०३॥', "replacement": 'महाजन सचिव सब जुरे सभासद आइ॥ २५३॥'},
            {"index": 222, "original": 'सुमंगल सूल जग भरत चरन अनुरागु॥ २६५७॥', "replacement": 'सुमंगल सूल जग भरत चरन अनुरागु॥ २६५॥'}
        ],
        3: [
            {"index": 1, "original": 'बिमूढ़ जे हरि बिमुख न धर्म रति॥', "replacement": 'बिमूढ़ जे हरि बिमुख न धर्म रति॥ ०॥'},
            {"index": 1, "original": 'वे महामूढ़ [उन्हें सुनकर] मोहको प्राप्त होते हैं।', "replacement": 'वे महामूढ़ [उन्हें सुनकर] मोहको प्राप्त होते हैं॥ ०॥'},
            {"index": 5, "original": 'भवार्णवे। वितर्क वीचि संकुले। ७ ॥', "replacement": 'भवार्णवे। वितर्क वीचि संकुले॥ ७ ॥'},
            {"index": 28, "original": 'लरहिं धर धरु धरु करहिं भयकर गिरा ॥', "replacement": 'लरहिं धर धरु धरु करहिं भयकर गिरा ॥ १॥'},
            {"index": 37, "original": 'मुनि दुर्लभ गति दीन्हि सुजाना॥ ९॥', "replacement": 'मुनि दुर्लभ गति दीन्हि सुजाना॥'},
            {"index": 44, "original": 'रामु कृपाल बाहु बिसाल भव भय मोचनं॥', "replacement": 'रामु कृपाल बाहु बिसाल भव भय मोचनं॥ १॥'}
        ],
        4: [
            {"index": 1, "original": 'बस संभु भवानि सो कासी सेइअ कस न॥', "replacement": 'बस संभु भवानि सो कासी सेइअ कस न॥ १॥'},
            {"index": 1, "original": 'उसका सेवन क्‍यों न किया जाय?', "replacement": 'उसका सेवन क्‍यों न किया जाय?॥ १॥'},
            {"index": 1, "original": 'मंद को कृपाल संकर सरिस॥', "replacement": 'मंद को कृपाल संकर सरिस॥ २॥'},
            {"index": 1, "original": 'उनके समान कृपालु [और] कौन है?', "replacement": 'उनके समान कृपालु [और] कौन है?॥ २॥'},
            {"index": 11, "original": 'मैं अब भी पापी ही रहा ?॥ १॥', "replacement": 'मैं अब भी पापी ही रहा ?॥ ९॥'}
        ],
        5: [
            {"index": 19, "original": 'ब्रह्मसर मानउेँ महिमा मिट अपार॥ १५९॥', "replacement": 'ब्रह्मसर मानउेँ महिमा मिट अपार॥ १९॥'},
            {"index": 33, "original": 'खान फल भालु बिपुल कपि बीर॥ ३५७॥', "replacement": 'खान फल भालु बिपुल कपि बीर॥ ३५॥'},
            {"index": 52, "original": 'महेशकी शरण जानेपर भी नहीं\nबचेगा॥ ५६ ॥ (क) ॥', "replacement": 'महेशकी शरण जानेपर भी नहीं\nबचेगा॥ ५६ (क) ॥'}
        ],
        6: [
            {"index": 1, "original": 'राम को कालु जासु कोदंड॥', "replacement": 'राम को कालु जासु कोदंड॥ ०(क) ॥'},
            {"index": 1, "original": 'श्रीरामजीको क्\u200dयों नहीं भजता?', "replacement": 'श्रीरामजीको क्\u200dयों नहीं भजता?॥ ०(क) ॥'},
            {"index": 1, "original": 'बिलंबु केहि काम करहु सेतु उतरे कटकु॥', "replacement": 'बिलंबु केहि काम करहु सेतु उतरे कटकु॥ ०(ख) ॥'},
            {"index": 1, "original": 'सेतु (पुल) तैयार करो, जिसमें सेना उतरे।', "replacement": 'सेतु (पुल) तैयार करो, जिसमें सेना उतरे॥ ०(ख) ॥'},
            {"index": 1, "original": 'सेतु नर चढ़ि भव सागर तरहिं॥', "replacement": 'सेतु नर चढ़ि भव सागर तरहिं॥ ०(ग) ॥'},
            {"index": 1, "original": 'संसाररूपी समुद्रसे पार हो जाते हैं।', "replacement": 'संसाररूपी समुद्रसे पार हो जाते हैं॥ ०(ग) ॥'},
            {"index": 54, "original": 'नाम गिरि औषधी जाहु पवनसुत लेन॥ ५०॥', "replacement": 'नाम गिरि औषधी जाहु पवनसुत लेन॥ ५५॥'},
            {"index": 118, "original": 'गुन सागर नागर नाथ बिभो॥', "replacement": 'गुन सागर नागर नाथ बिभो॥ १ ॥'},
            {"index": 118, "original": 'खगनाथ जथा करि कोप गहा॥', "replacement": 'खगनाथ जथा करि कोप गहा॥ २ ॥'},
            {"index": 120, "original": 'बिलोकत लोचन नहीं अघात॥ १११५१॥', "replacement": 'बिलोकत लोचन नहीं अघात॥ १११॥'},
            {"index": 124, "original": 'पुलकिततनगदगदगिराँबिनय करत त्रिपुरारि। ११४ ( ख )॥', "replacement": 'पुलकिततनगदगदगिराँबिनय करत त्रिपुरारि॥ ११४ ( ख )॥'},
            {"index": 127, "original": 'अनन्य प्रेम होनेपर करते\nहैं॥ ११५७ (ख)॥', "replacement": 'अनन्य प्रेम होनेपर करते\nहैं॥ ११७ (ख)॥'}
        ],
        7: [
            {"index": 1, "original": 'नर कूस तन राम बियोग॥', "replacement": 'नर कूस तन राम बियोग॥ १ ॥'},
            {"index": 1, "original": 'है, श्रीरामजी क्\u200dयों नहीं आये]।', "replacement": 'है, श्रीरामजी क्\u200dयों नहीं आये]॥ १ ॥'},
            {"index": 1, "original": 'जनाव जनु नगर रम्यब चहुं फेर॥', "replacement": 'जनाव जनु नगर रम्यब चहुं फेर॥ २ ॥'},
            {"index": 1, "original": 'प्रभुके [शुभ] आगमनको जना रहे हैं।', "replacement": 'प्रभुके [शुभ] आगमनको जना रहे हैं॥ २ ॥'},
            {"index": 1, "original": 'अनुजजुत कहन चहत अब कोड़ ॥', "replacement": 'अनुजजुत कहन चहत अब कोड़ ॥ ३ ॥'},
            {"index": 1, "original": 'लक्ष्मणजीसहित प्रभु श्रीरामचन्द्रजी आ गये।', "replacement": 'लक्ष्मणजीसहित प्रभु श्रीरामचन्द्रजी आ गये॥ ३ ॥'},
            {"index": 1, "original": 'सगुन मन हरष अति लागे करन बिचार॥', "replacement": 'सगुन मन हरष अति लागे करन बिचार॥ ४ ॥'},
            {"index": 1, "original": 'मनमें अत्यन्त हर्ष हुआ और वे विचार करने लगे--', "replacement": 'मनमें अत्यन्त हर्ष हुआ और वे विचार करने लगे--॥ ४ ॥'},
            {"index": 14, "original": '(जीवन) सफल समझकर हर्षित हुईं॥ ११५ (ख)॥', "replacement": '(जीवन) सफल समझकर हर्षित हुईं॥ ११ (ख)॥'},
            {"index": 55, "original": 'केहि कारन पायठ काक सरीर॥ "५४॥', "replacement": 'केहि कारन पायठ काक सरीर॥ ५४॥'},
            {"index": 107, "original": 'श्लोक-रुद्राष्ट्रमिंदं प्रोक्त विप्रेण हरतोषये।\nये पठन्ति नरा भकत्या तेषां शम्भुः प्रसीदति॥ ९॥', 
             "replacement": 'रुद्राष्ट्रमिंदं प्रोक्त विप्रेण हरतोषये॥\nये पठन्ति नरा भकत्या तेषां शम्भुः प्रसीदति॥'},
            {"index": 141, "original": 'श्लोक -- य त्पूर्व प्रभुणा कृतं सुकविना श्रीशम्भुना दुर्गमं', "replacement": 'य त्पूर्व प्रभुणा कृतं सुकविना श्रीशम्भुना दुर्गमं।'},
            {"index": 141, "original": 'श्रीमद्रामपदाब्जभक्तिमनिशं प्राप्त्ये तु रामायणम्\u200c।', "replacement": 'श्रीमद्रामपदाब्जभक्तिमनिशं प्राप्त्ये तु रामायणम्\u200c॥'},
            {"index": 141, "original": 'तद्रघुनाथनामनिरतं स्वान्तस्तमःशान्तये', "replacement": 'तद्रघुनाथनामनिरतं स्वान्तस्तमःशान्तये।'},
            {"index": 141, "original": 'भाषाबद्धमिदं चकार तुलसीदासस्तथा मानसम्‌॥ १॥', "replacement": 'भाषाबद्धमिदं चकार तुलसीदासस्तथा मानसम्‌॥'},
            {"index": 141, "original": 'पुण्यं पापहरं सदा शिवकरं विज्ञानभक्तिप्रदं', "replacement": 'पुण्यं पापहरं सदा शिवकरं विज्ञानभक्तिप्रदं।'},
            {"index": 141, "original": 'मायामोहमलापहं सुविमलं प्रेमाम्बुपूर शुभम्\u200c।', "replacement": 'मायामोहमलापहं सुविमलं प्रेमाम्बुपूर शुभम्\u200c॥'},
            {"index": 142, "original": 'श्रीमद्रामचरित्रमानसमिदं भक्\u200d्त्यावगाहन्ति ये', "replacement": 'श्रीमद्रामचरित्रमानसमिदं भक्\u200d्त्यावगाहन्ति ये।'},
            {"index": 142, "original": 'ते संसारपतड्रघोरकिरणैर्दहान्ति नो मानवाः॥ २॥', "replacement": 'ते संसारपतड्रघोरकिरणैर्दहान्ति नो मानवाः॥'}
        ]
    }
    
    def __init__(self, chapter_number, chapter_title, page_images, book_page_offset):
        """
        Builds the pages of the chapter, corrects their text and parses the padyas.

        Args:
        chapter_number (int): Key into `expected_padya_counts` / `text_corrections`.
        chapter_title (str): Title of the chapter.
        page_images (list): One image per page of the chapter, in order.
        book_page_offset (int): Added to the page position to derive the two
                                numbers passed to `Page`.
        """
        self.chapter_number = chapter_number
        self.chapter_title = chapter_title
        self.pages = []
        for i, page_image in enumerate(page_images):
            print(f"Processing Chapter {chapter_number}: Page Idx {i + 1} / {len(page_images)}")
            # The last argument flags the first page of the chapter
            self.pages.append(Page(book_page_offset + i + 1, book_page_offset + i + 2, page_image, i == 0)) 
        
        # Corrections must be applied before the page texts are joined and parsed
        self.correct_text_errors()

        self.text = '\n'.join([p.text for p in self.pages])
        self.padyas = Padya_Divider(self.text, self.chapter_number, self.chapter_title, self.pages).main()


    def __str__(self):
        """
        Returns the chapter heading followed by every padya, separated by blank lines.
        The 'अंत श्लोक' entry is preceded by two extra newlines.
        """
        parts = [f"सोपान {english_to_devanagari(str(self.chapter_number))}: {self.chapter_title}\n\n\n\n"]
        for padya in self.padyas:
            if padya.padya_type == "अंत श्लोक":
                parts.append("\n\n")
            parts.append(f"{str(padya)}\n\n")
        return ''.join(parts)

    def check_for_page_headers(self):
        """
        Runs the header check of every page and prints the message of each failed check.
        """
        for book_page in self.pages:
            check_res = book_page.check_for_page_headers()
            if check_res == True:
                pass
            elif not check_res[0]:
                # Non-True results are indexable: [0] is the pass flag, [1] the message
                print(check_res[1])


    def correct_text_errors(self):
        """
        Applies the manual corrections of `text_corrections` for this chapter.

        Each correction replaces every occurrence of its "original" substring in
        the text of the page at position "index" of `self.pages`. A chapter number
        without an entry in `text_corrections` is left untouched.
        """
        for item in Chapter.text_corrections.get(self.chapter_number, []):
            page = self.pages[item["index"]]
            page.text = page.text.replace(item["original"], item["replacement"])

    
    def validate_chapter_counts(self):
        """
        Prints expected vs. actual padya totals and the last doha/soratha counts.

        The expected values come from `expected_padya_counts`. When the chapter
        contains no doha (or no soratha), the actual "last" value is reported as
        None and therefore as a mismatch.
        """
        def total_of(padya_type):
            return sum(1 for padya in self.padyas if padya.padya_type == padya_type)

        def last_count_of(padya_type):
            counts = [padya.padya_count for padya in self.padyas if padya.padya_type == padya_type]
            return counts[-1] if counts else None

        actual_counts = {
            'श्लोक': total_of('श्लोक'),
            'दो०': total_of('दो०'),
            'सो०': total_of('सो०'),
            'छं०': total_of('छं०'),
            'चौपाई': total_of('चौपाई'),
            'last_दो०': last_count_of('दो०'),
            'last_सो०': last_count_of('सो०'),
        }
    
        expected_counts = Chapter.expected_padya_counts[self.chapter_number]
        
        # Check for Total counts
        total_rows = [
            ('Number of shlokas', 'श्लोक'),
            ('Number of dohas', 'दो०'),
            ('Number of sorathas', 'सो०'),
            ('Number of chandas', 'छं०'),
            ('Number of chaupais', 'चौपाई'),
        ]
        for label, key in total_rows:
            verdict = 'Match' if (expected_counts[key] == actual_counts[key]) else 'Mismatch'
            print(f"{label:<20}-    Expected: {expected_counts[key]:<4}  Actual: {actual_counts[key]:<4}  {verdict}")
        
        # Check for Last sorath and doha counts
        last_rows = [
            ('Last doha', 'last_दो०'),
            ('Last sorath', 'last_सो०'),
        ]
        for label, key in last_rows:
            verdict = 'Match' if (expected_counts[key] == actual_counts[key]) else 'Mismatch'
            # str() keeps the alignment spec valid when the actual value is None
            print(f"{label:<20}-    Expected: {expected_counts[key]:<5}  Actual: {str(actual_counts[key]):<5}  {verdict}")
        

    @staticmethod
    def _report_simple_continuity(counts, plural_name, singular_name):
        """
        Prints whether `counts` forms a sequence that increases by exactly one.

        A count equal to '१' is always accepted (it starts a new run). Any other
        first count, or any later count whose gap to its predecessor is not 1,
        is reported together with the full list, and checking stops.
        """
        for i, count in enumerate(counts):
            if count == '१':
                continue
            if i == 0:
                print(f'Error: {plural_name} do not begin from number "१"')
                print(counts)
                return
            if Padya.find_gap_between_padya_counts(counts[i-1], counts[i]) != 1:
                print(f'Error: {plural_name} are discontinuous')
                print(counts)
                return
        print(f'All {singular_name} counts are continuous')


    def validate_continuity_of_counts(self):
        """
        Prints continuity checks for shloka, doha/soratha, chaupai and chand counts.
        """
        # Check for Shlokas padya counts
        shlokas_padya_counts = [padya.padya_count for padya in self.padyas if padya.padya_type == 'श्लोक']
        Chapter._report_simple_continuity(shlokas_padya_counts, 'Shlokas', 'Shlok')

        # Check for Dohas & Sorathas padya counts
        so_do_error = False
        dohas_padya_counts = [padya.padya_count for padya in self.padyas if padya.padya_type == 'दो०']
        sorathas_padya_counts = [padya.padya_count for padya in self.padyas if padya.padya_type == 'सो०']
        combined_padya_counts = dohas_padya_counts + sorathas_padya_counts
        combined_padya_counts = Padya.sort_padya_counts(combined_padya_counts)
        for i, count in enumerate(combined_padya_counts):
            if count == '१':
                continue
            # NOTE(review): this is a substring test, so a first count such as '१०'
            # or '२१' also passes - confirm whether that is intended.
            if i == 0 and '०' not in count and '१' not in count:
                so_do_error = True
                print('Error: Sorathas/Dohas do not begin from either number "०" or "१"')
                print(combined_padya_counts)
                break
            # Unlike the other checks, only a gap larger than 1 is an error here
            if i > 0 and Padya.find_gap_between_padya_counts(combined_padya_counts[i-1], combined_padya_counts[i]) > 1:
                so_do_error = True
                print('Error: Sorathas and Dohas are discontinuous')
                print(combined_padya_counts)
                break
        if not so_do_error:
            print('All Sorath & Doha counts are continuous')

        # Check for Chaupai counts under each sorath and doha
        idx = 0
        chaupai_error = False
        while idx < len(self.padyas):
            padya = self.padyas[idx]
            
            # Check if the current padya is सो० or दो०
            if padya.padya_type in ['सो०', 'दो०']:
                chaupai_counts = []
        
                # Collect all चौपाई counts under the current सो० or दो०
                jIdx = idx + 1
                while jIdx < len(self.padyas) and self.padyas[jIdx].padya_type == 'चौपाई':
                    chaupai_counts.append(self.padyas[jIdx].padya_count)
                    jIdx += 1
                
                # Compute gaps if there are at least two चौपाई
                if len(chaupai_counts) > 1:
                    for i, count in enumerate(chaupai_counts):
                        if count == '१':
                            continue
                        if i == 0 and '१' not in count:
                            chaupai_error = True
                            print(f'Error: Chaupais under {padya.padya_type} at index {idx} do not begin from number "१"')
                            print(chaupai_counts)
                            break
                        if i > 0 and Padya.find_gap_between_padya_counts(chaupai_counts[i-1], chaupai_counts[i]) != 1:
                            chaupai_error = True
                            print(f'Error: Chaupais under {padya.padya_type} at index {idx} are discontinuous')
                            print(chaupai_counts)
                            break
                # Continue after the chaupais that were just examined
                idx = jIdx
            else:
                idx += 1
        if not chaupai_error:
            print('All Chaupai counts are continuous')

        # Check for Chand counts (chandas without a count are ignored)
        chand_counts = [padya.padya_count for padya in self.padyas if padya.padya_type == 'छं०']
        chand_counts = [count for count in chand_counts if count is not None]
        Chapter._report_simple_continuity(chand_counts, 'Chandas', 'Chand')

    
    def validate_padyas(self):
        """
        Runs both validations: the chapter totals and the continuity of counts.
        """
        self.validate_chapter_counts()
        self.validate_continuity_of_counts()


# Book Class

In [78]:
class Book:
    """
    The whole book: one `Chapter` object per entry of `book_chapters`.

    Creating an instance converts the PDF at `book_pdf_path` to page images and
    builds the chapters from the image ranges listed in `book_chapters`.
    """

    # Path to your PDF
    book_pdf_path = "ramcharitmanas.pdf"

    # Number, title and inclusive page-image index range of every chapter
    book_chapters = [
        {"chapter_number": 1, "chapter_title": 'बालकाण्ड', "first_page_idx": 15, "last_page_idx": 339},
        {"chapter_number": 2, "chapter_title": 'अयोध्याकाण्ड', "first_page_idx": 341, "last_page_idx": 615},
        {"chapter_number": 3, "chapter_title": 'अरण्यकाण्ड', "first_page_idx": 617, "last_page_idx": 677},
        {"chapter_number": 4, "chapter_title": 'किष्किन्धाकाण्ड', "first_page_idx": 679, "last_page_idx": 709},
        {"chapter_number": 5, "chapter_title": 'सुन्दरकाण्ड', "first_page_idx": 711, "last_page_idx": 767},
        {"chapter_number": 6, "chapter_title": 'लंकाकाण्ड', "first_page_idx": 769, "last_page_idx": 901},
        {"chapter_number": 7, "chapter_title": 'उत्तरकाण्ड', "first_page_idx": 903, "last_page_idx": 1045},
    ]

    def __init__(self):
        """
        Converts the PDF to images and builds every chapter listed in `book_chapters`.
        """
        # Convert PDF pages to images
        print('Converting pdf pages to images...')
        images = convert_from_path(Book.book_pdf_path)
        print('Total number of pages in the book: ', len(images))

        self.chapters = []
        for spec in Book.book_chapters:
            first_idx = spec['first_page_idx']
            last_idx = spec['last_page_idx']
            # The slice end is exclusive, so add one to include the last page
            self.chapters.append(
                Chapter(spec['chapter_number'], spec['chapter_title'], images[first_idx: last_idx + 1], first_idx)
            )

    def __str__(self):
        """
        Returns the text of all chapters, each followed by a dashed divider.
        """
        divider = "\n\n\n\n\n\n-------------------------------------------------------------------------------\n\n\n\n\n"
        return "".join(str(chapter) + divider for chapter in self.chapters)
    

In [47]:
# Build the complete book. Book() converts the whole PDF to page images and then
# constructs one Chapter per entry of Book.book_chapters, printing a progress
# line for every page it processes.
ramcharitmanas_book = Book()

Converting pdf pages to images...
Total number of pages in the book:  1054
Processing Chapter 1: Page Idx 1 / 325
Processing Chapter 1: Page Idx 2 / 325
Processing Chapter 1: Page Idx 3 / 325
Processing Chapter 1: Page Idx 4 / 325
Processing Chapter 1: Page Idx 5 / 325
Processing Chapter 1: Page Idx 6 / 325
Processing Chapter 1: Page Idx 7 / 325
Processing Chapter 1: Page Idx 8 / 325
Processing Chapter 1: Page Idx 9 / 325
Processing Chapter 1: Page Idx 10 / 325
Processing Chapter 1: Page Idx 11 / 325
Processing Chapter 1: Page Idx 12 / 325
Processing Chapter 1: Page Idx 13 / 325
Processing Chapter 1: Page Idx 14 / 325
Processing Chapter 1: Page Idx 15 / 325
Processing Chapter 1: Page Idx 16 / 325
Processing Chapter 1: Page Idx 17 / 325
Processing Chapter 1: Page Idx 18 / 325
Processing Chapter 1: Page Idx 19 / 325
Processing Chapter 1: Page Idx 20 / 325
Processing Chapter 1: Page Idx 21 / 325
Processing Chapter 1: Page Idx 22 / 325
Processing Chapter 1: Page Idx 23 / 325
Processing Cha

# Validating Book Object

In [80]:
# Validating Padyas for all chapters
# validate_padyas() only prints its findings (Match/Mismatch and continuity
# messages), so the output below has to be read to spot problems.
for chapter in ramcharitmanas_book.chapters:
    print(f"Chapter: {chapter.chapter_number} {chapter.chapter_title}\n")
    chapter.validate_padyas()
    print("\n\n ----------------------------------------------- \n\n")

Chapter: 1 बालकाण्ड

Number of shlokas   -    Expected: 7     Actual: 7     Match
Number of dohas     -    Expected: 359   Actual: 359   Match
Number of sorathas  -    Expected: 36    Actual: 36    Match
Number of chandas   -    Expected: 56    Actual: 56    Match
Number of chaupais  -    Expected: 1488  Actual: 1488  Match
Last doha           -    Expected: ३६०    Actual: ३६०    Match
Last sorath         -    Expected: ३६१    Actual: ३६१    Match
All Shlok counts are continuous
All Sorath & Doha counts are continuous
All Chaupai counts are continuous
All Chand counts are continuous


 ----------------------------------------------- 


Chapter: 2 अयोध्याकाण्ड

Number of shlokas   -    Expected: 3     Actual: 3     Match
Number of dohas     -    Expected: 314   Actual: 314   Match
Number of sorathas  -    Expected: 13    Actual: 13    Match
Number of chandas   -    Expected: 12    Actual: 12    Match
Number of chaupais  -    Expected: 1304  Actual: 1304  Match
Last doha           -    E

In [81]:
# Prints the text of every chapter via Book.__str__ (the full book, so the output is long)
print(ramcharitmanas_book)

सोपान १: बालकाण्ड



वर्णानामर्थसंघानां रसानां छन्दसामपि।
मड़लानां च कर्त्तारा बन्दे वाणीविनायकौ॥ १ ॥

अर्थ: अक्षरों, अर्थसमूहों, रसों, छन्‍्दों और मंगलोंकी करनेवाली सरस्वतीजी और गणेशजीकी मैं वन्दना करता हूँ॥ १ ॥

भवानीशड्ूरोौ वन्दे   श्रद्धाविश्वासरूपिणौ।
याभ्यां विना न पश्यन्ति सिद्धाः स्वान्तःस्थमी श्वरम्‌॥ २ ॥

अर्थ: श्रद्धा और विश्वासके स्वरूप श्रीपार्वतीजी और श्रीशड्डरजीकी मैं वन्दना करता हूँ, जिनके बिना सिद्धजन अपने अन्तःकरणमें स्थित ईश्वरको नहीं देख सकते॥ २ ॥

वन्दे बोधमयं नित्यं गुरु शड्डूररूपिणम्‌।
यमाथअ्ितो हि वक्रोषपि चन्द्रः सर्वत्र वन्द्यते॥ ३ ॥

अर्थ: ज्ञानमय, नित्य, शड्डूररूपी गुरुकी मैं वन्दना करता हूँ, जिनके आश्रित होनेसे ही टेढ़ा चन्द्रमा भी सर्वत्र वन्दित होता है॥ ३ ॥

सीतारामगुणग्रामपुण्यारण्यविहारिणौ।
वन्दे विशुद्धविज्ञानौ कवीश्वरकपी श्वरौ॥ ४ ॥

अर्थ: श्रीसीतारामजीके गुणसमूहरूपी पवित्र वनमें विहार करनेवाले, विशुद्ध विज्ञानसम्पन्न कवीश्वर श्रीवाल्मीकिजी और कपीश्वर श्रीहनुमानूजीकी मैं वन्दना करता हूँ॥ ४ ॥

उद्धवस्थितिसंहारकारिणीं क्लेशहारिणीम्‌।
सर्वश्रेयस्करीं सीता

# Converting Book Object to Text File and Saving it

In [60]:
# Write the full book text to disk. The encoding is set explicitly because the
# text is Devanagari: without it, open() uses the platform's default encoding,
# which is not UTF-8 everywhere and can fail or produce an unreadable file.
with open('ramcharitmanas.txt', 'w', encoding='utf-8') as file:
    file.write(str(ramcharitmanas_book))

<br><br><br><br><br>

# Save the Final Book Object

In [58]:
# Save the object to a file
# The pickle stores the Book instance together with the chapter objects it references,
# so the book can be reloaded later without rebuilding it from the PDF.
with open("ramcharitmanas_book.pkl", "wb") as file:
    pickle.dump(ramcharitmanas_book, file)

print("Object saved to file.")


Object saved to file.


<br><br>

# Load the Final Book Object

In [79]:
# Load the object from the file
# The cells defining Book, Chapter and the other classes of the pickled objects
# must have been run first, because unpickling looks the classes up by name.
# Only load pickle files you created yourself: unpickling can execute arbitrary code.
with open("ramcharitmanas_book.pkl", "rb") as file:
    ramcharitmanas_book = pickle.load(file)


# Create and Save Dataframe

In [68]:
# One row per padya, in chapter order and then in padya order within the chapter
data_values = [
    [
        chapter.chapter_title,
        padya.text,
        padya.meaning,
        padya.padya_type,
        padya.padya_count,
        padya.book_page_number,
    ]
    for chapter in ramcharitmanas_book.chapters
    for padya in chapter.padyas
]

ramcharitmanas_df = pd.DataFrame(
    data_values,
    columns=['Kand', 'Verse', 'Meaning', 'Verse Type', 'Verse Count', 'Page Number'],
)
ramcharitmanas_df.head(5)

Unnamed: 0,Kand,Verse,Meaning,Verse Type,Verse Count,Page Number
0,बालकाण्ड,वर्णानामर्थसंघानां रसानां छन्दसामपि।\nमड़लानां...,"अक्षरों, अर्थसमूहों, रसों, छन्‍्दों और मंगलोंक...",श्लोक,१,१७
1,बालकाण्ड,भवानीशड्ूरोौ वन्दे श्रद्धाविश्वासरूपिणौ।\nया...,श्रद्धा और विश्वासके स्वरूप श्रीपार्वतीजी और श...,श्लोक,२,१७
2,बालकाण्ड,वन्दे बोधमयं नित्यं गुरु शड्डूररूपिणम्‌।\nयमाथ...,"ज्ञानमय, नित्य, शड्डूररूपी गुरुकी मैं वन्दना क...",श्लोक,३,१७
3,बालकाण्ड,सीतारामगुणग्रामपुण्यारण्यविहारिणौ।\nवन्दे विशु...,श्रीसीतारामजीके गुणसमूहरूपी पवित्र वनमें विहार...,श्लोक,४,१७
4,बालकाण्ड,उद्धवस्थितिसंहारकारिणीं क्लेशहारिणीम्‌।\nसर्वश...,"उत्पत्ति, स्थिति (पालन) और संहार करनेवाली, क्ल...",श्लोक,५,१८


In [69]:
# Save the dataframe as CSV; index=False leaves the row index out of the file
ramcharitmanas_df.to_csv('ramcharitmanas_df.csv', index=False)

<br><br><br>