In [7]:
from bs4 import BeautifulSoup

def extract_mdna_section(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    soup = BeautifulSoup(content, 'html.parser')
    
    # Extract content starting from "ITEM 7."
    mda_start = soup.find(string=lambda text: 'ITEM 7.' in text)
    if not mda_start:
        return "MD&A section not found."
    
    # Capture the content until we hit the next section
    mda_content = []
    for sibling in mda_start.find_all_next(string=True):
        if "ITEM 8." in sibling:
            break
        mda_content.append(sibling.strip())

    # Refine the extracted content
    refined_mda_content = []
    for item in mda_content:
        if "PAGE" in item or "Table of Contents" in item or "Part II" in item:
            continue
        refined_mda_content.append(item)

    # Convert the list into a single string
    mdna_text = ' '.join(refined_mda_content)

    # Replace multiple spaces with a single space
    mdna_text = ' '.join(mdna_text.split())

    # Remove unwanted patterns (you can add more patterns if needed)
    unwanted_patterns = ["&nbsp;", "&#146;", "&#147;", "&#148;"]
    for pattern in unwanted_patterns:
        mdna_text = mdna_text.replace(pattern, '')

    return mdna_text

# Example usage
file_path = 'data/sec-edgar-filings/AAPL/10-K/0001193125-13-416534/primary-document.html'
mdna_text = extract_mdna_section(file_path)

def save_to_file(text, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
    print(f"Content saved to {filename}")

# Example usage
save_to_file(mdna_text, 'mdna_output.txt')
mdna_text

Content saved to mdna_output.txt


'MD&A section not found.'