In [323]:
import re

In [324]:
# Load the entire source text into memory; every later cell parses `file_content`.
with open('pg10681.txt', encoding='utf-8', mode='r') as file:
    file_content = file.read()

In [325]:
import re

def extract_classes_with_text(text):
    """Split ``text`` into a mapping of class heading -> class body.

    A class heading is the word ``CLASS``, a Roman numeral, and the line
    that follows it (matched by ``class_pattern``). The body of a class is
    everything between the end of its heading and the *start* of the next
    class heading. The last class ends at the first occurrence of
    ``"End of thesaurus"``, or at the end of ``text`` when that marker is
    absent.

    Args:
        text: The full document as a single string.

    Returns:
        dict mapping the stripped heading (numeral line + title line, still
        separated by a newline) to the stripped body text. When the same
        heading occurs more than once, the last occurrence wins. An empty
        dict is returned when no heading is found.
    """
    class_pattern = r'(CLASS\s+[IVXLCDM]+\n[^\n]+)'  # Pattern to match class titles
    end_marker = "End of thesaurus"  # Marks where the last class stops

    headings = list(re.finditer(class_pattern, text))

    # The marker contains no regex metacharacters, so a plain substring
    # search finds the same position the regex search would.
    marker_pos = text.find(end_marker)
    text_end = marker_pos if marker_pos != -1 else len(text)

    class_dict = {}
    for i, heading in enumerate(headings):
        # A body must stop where the next heading BEGINS; stopping at the
        # next heading's end would leak that heading into this class's text.
        if i + 1 < len(headings):
            body_end = headings[i + 1].start()
        else:
            body_end = text_end
        class_dict[heading.group().strip()] = text[heading.end():body_end].strip()

    return class_dict



In [326]:
def remove_newlines_from_dict_keys(input_dict):
    """Return a new dict whose keys have every newline replaced by ``': '``.

    Values are carried over unchanged; ``input_dict`` itself is not modified.
    """
    flattened = {}
    for original_key in input_dict:
        single_line_key = original_key.replace('\n', ': ')
        flattened[single_line_key] = input_dict[original_key]
    return flattened


In [327]:
def extract_divisions(class_dict):
    """Replace each class's text with a mapping of division heading -> body.

    A division heading is ``DIVISION``, a Roman numeral, and the upper-case
    line(s) that follow (see ``division_pattern``). Each body runs from the
    end of its heading to the *start* of the next division heading, or to
    the end of the class text for the last division. Text that precedes the
    first division heading is not kept.

    Classes without any division heading are stored as
    ``{"NO_DIVISION": <class text>}`` so that every class has the same
    nesting depth.

    Args:
        class_dict: Mapping of class title -> class text (str).

    Returns:
        The same ``class_dict`` object, updated in place, mapping
        class title -> {division title -> division text}.
    """
    division_pattern = r"DIVISION\s(?:I|V|X|L|C|D|M)+\n[A-Z\s]+\n"  # Regex pattern for division titles

    # Snapshot the items so values can be reassigned while looping.
    for class_title, class_text in list(class_dict.items()):
        headings = list(re.finditer(division_pattern, class_text))

        # Handle case with no divisions
        if not headings:
            class_dict[class_title] = {"NO_DIVISION": class_text}
            continue

        division_dict = {}
        for i, heading in enumerate(headings):
            # Stop at the START of the next heading; using its end would
            # append the next division's title to this division's text.
            if i + 1 < len(headings):
                body_end = headings[i + 1].start()
            else:
                body_end = len(class_text)
            division_title = heading.group().strip()
            division_dict[division_title] = class_text[heading.end():body_end].strip()

        # Update the class in the dictionary with its divisions
        class_dict[class_title] = division_dict

    return class_dict


In [328]:
import re

def refine_section_extraction(class_dict):
    """Break every division's text into its ``SECTION <numeral>.`` parts.

    For each division, the text is scanned for section headings. A heading
    is the identifier (``SECTION`` + Roman numeral + ``.``) optionally
    followed by a descriptive title, which is appended to the identifier
    when non-empty. Each section's text spans from the end of its heading
    to the start of the next heading (or the end of the division text).

    Divisions without any heading become ``{"NO_SECTION": <division text>}``.

    Returns the same ``class_dict`` object, modified in place, shaped as
    class -> division -> {section title -> section text}.
    """
    section_heading = re.compile(r'(SECTION\s+[IVXLCDM]+\.)\s*([^\n]*)')

    for divisions in class_dict.values():
        for division_title, division_text in list(divisions.items()):
            headings = list(section_heading.finditer(division_text))

            # Body i ends where heading i+1 starts; the last body ends with the text.
            body_ends = [later.start() for later in headings[1:]]
            body_ends.append(len(division_text))

            found_sections = {}
            for heading, body_end in zip(headings, body_ends):
                identifier = heading.group(1)
                description = heading.group(2).strip()
                if description:
                    section_title = identifier + ' ' + description
                else:
                    section_title = identifier
                found_sections[section_title] = division_text[heading.end():body_end].strip()

            # Fall back to a placeholder key when the division has no sections.
            divisions[division_title] = found_sections or {"NO_SECTION": division_text}

    return class_dict


In [329]:
import re

def process_sections_with_no_title_key(class_dict):
    """Split each section's text into titled subsections.

    For every section:

    1. If the section text itself contains a ``SECTION <numeral>. <title>``
       heading, that heading (whitespace-normalised to a single space)
       becomes the section key and the matched heading span is removed from
       the text. Otherwise the existing section key is kept.
    2. The remaining text is scanned line by line (``re.MULTILINE``) for
       subsection titles matching ``adjusted_pattern_for_subsections``
       (a single digit, a dot, then upper-case words). Each title maps to
       the text up to the next title.
    3. When no subsection title is found the whole text is stored under the
       ``"NO_TITLE"`` key.

    Args:
        class_dict: class -> division -> {section title -> section text}.

    Returns:
        The same ``class_dict`` object, updated in place, shaped as
        class -> division -> section -> {subsection title -> text}.
    """
    # Adjusted pattern for subsection titles within the section text.
    # NOTE(review): the nested quantifiers here can backtrack heavily on long
    # upper-case lines that ultimately fail to match - worth confirming on
    # the real input before changing the pattern.
    adjusted_pattern_for_subsections = r'^\d\.\s?([A-Z]+([,\s]+[A-Z]+)*([,\s]+[a-z]+)*)*$'
    full_title_pattern = r'(SECTION\s+[IVXLCDM]+\.)\s*([^\n]+)'

    for class_title, divisions in class_dict.items():
        # Snapshot the items so the division values can be reassigned while looping.
        for division_title, sections in list(divisions.items()):
            updated_sections = {}
            for section_title, section_text in sections.items():
                # Attempt to capture the full section title and text excluding this title
                full_title_search = re.search(full_title_pattern, section_text)
                if full_title_search:
                    full_section_title = full_title_search.group(1) + " " + full_title_search.group(2).strip()
                    # Cut out exactly the matched heading. Slicing by
                    # len(full_section_title) is wrong whenever the heading is
                    # not at offset 0 or has more than one whitespace character
                    # between the identifier and its title.
                    section_text_without_title = (
                        section_text[:full_title_search.start()]
                        + section_text[full_title_search.end():]
                    ).strip()
                else:
                    full_section_title = section_title  # Use section identifier if no full title is present
                    section_text_without_title = section_text

                # Find titles within the section text
                titles_matches = list(
                    re.finditer(adjusted_pattern_for_subsections, section_text_without_title, re.MULTILINE)
                )
                if titles_matches:
                    title_dict = {}
                    for k, title_match in enumerate(titles_matches):
                        # A subsection runs from the end of its title to the start of the next title.
                        if k + 1 < len(titles_matches):
                            title_end_index = titles_matches[k + 1].start()
                        else:
                            title_end_index = len(section_text_without_title)
                        subsection_text = section_text_without_title[title_match.end():title_end_index].strip()
                        title_dict[title_match.group(0).strip()] = subsection_text

                    updated_sections[full_section_title] = title_dict
                else:
                    # If no titles found, assign the entire section text to "NO_TITLE" key
                    updated_sections[full_section_title] = {"NO_TITLE": section_text_without_title}

            divisions[division_title] = updated_sections  # Update division with processed sections

    return class_dict

In [337]:
import re

def transform_title_values(class_dict):
    """Turn every subsection's text into a list of numbered entries.

    Walks class -> division -> section -> title. For each title's text:

    * when the stripped text does not begin with an ASCII digit, the first
      two lines are dropped;
    * the text is then split in front of every ``"\\n<number>[letter]. "``
      marker, so each piece after the first keeps its leading newline.

    The list of pieces replaces the original string. The same ``class_dict``
    object is modified in place and returned.
    """
    ascii_digits = tuple('0123456789')
    # Zero-width lookahead: split *before* each entry marker without consuming it.
    entry_boundary = re.compile(r'(?=\n\d+[a-z]?\. )')

    for divisions in class_dict.values():
        for sections_in_division in divisions.values():
            for titled_texts in sections_in_division.values():
                # Snapshot the items so values can be replaced while looping.
                for title, body in list(titled_texts.items()):
                    begins_with_number = body.strip().startswith(ascii_digits)
                    if not begins_with_number:
                        # Skip the first two lines
                        body = '\n'.join(body.splitlines()[2:])

                    titled_texts[title] = entry_boundary.split(body)

    return class_dict


In [330]:
import json 
def write_data_to_json_file(data, filename):
    """Serialise ``data`` as JSON into ``filename`` (UTF-8, overwriting it).

    Non-ASCII characters are written as-is rather than escaped, and the
    output is pretty-printed with a four-space indent.
    """
    with open(filename, mode='w', encoding='utf-8') as json_out:
        json.dump(data, json_out, indent=4, ensure_ascii=False)


In [338]:
# Build the nested structure step by step: classes -> divisions -> sections
# -> titled subsections -> lists of numbered entries, then save it as JSON.
classes = extract_classes_with_text(file_content)
for stage in (
    remove_newlines_from_dict_keys,
    extract_divisions,
    refine_section_extraction,
    process_sections_with_no_title_key,
    transform_title_values,
):
    classes = stage(classes)
write_data_to_json_file(classes, 'thesaurus.json')
