In [7]:
from bs4 import BeautifulSoup

def extract_mdna_section(file_path):
    with open(file_path, 'r', encoding='utf-8') as file:
        content = file.read()
    soup = BeautifulSoup(content, 'html.parser')
    
    # Extract content starting from "ITEM 7."
    mda_start = soup.find(string=lambda text: 'ITEM 7.' in text)
    if not mda_start:
        return "MD&A section not found."
    
    # Capture the content until we hit the next section
    mda_content = []
    for sibling in mda_start.find_all_next(string=True):
        if "ITEM 8." in sibling:
            break
        mda_content.append(sibling.strip())

    # Refine the extracted content
    refined_mda_content = []
    for item in mda_content:
        if "PAGE" in item or "Table of Contents" in item or "Part II" in item:
            continue
        refined_mda_content.append(item)

    # Convert the list into a single string
    mdna_text = ' '.join(refined_mda_content)

    # Replace multiple spaces with a single space
    mdna_text = ' '.join(mdna_text.split())

    # Remove unwanted patterns (you can add more patterns if needed)
    unwanted_patterns = ["&nbsp;", "&#146;", "&#147;", "&#148;"]
    for pattern in unwanted_patterns:
        mdna_text = mdna_text.replace(pattern, '')

    return mdna_text

# Example usage
file_path = 'data/sec-edgar-filings/AAPL/10-K/0001193125-13-416534/primary-document.html'
mdna_text = extract_mdna_section(file_path)

def save_to_file(text, filename):
    with open(filename, 'w', encoding='utf-8') as file:
        file.write(text)
    print(f"Content saved to {filename}")

# Example usage
save_to_file(mdna_text, 'mdna_output.txt')
mdna_text

Content saved to mdna_output.txt


'MD&A section not found.'

In [6]:
import re
from bs4 import BeautifulSoup

def extract_mda_section(file_path: str) -> str:
    """
    Extract the "Management’s Discussion and Analysis of Financial Condition and Results of Operations" 
    section from a 10-K report HTML file.
    
    Args:
    - file_path (str): Path to the HTML file of the 10-K report.
    
    Returns:
    - str: Text of the MDA section.
    """
    # Load the content of the HTML file
    with open(file_path, "r", encoding="utf-8") as file:
        html_content = file.read()
    
    # Convert the entire content to lowercase
    lower_content = html_content.lower()
    
    # Using a regex pattern to extract the content between the second occurrences of "item 7." and "item 8."
    pattern = r'(?:item\s*7\.)(?:.*?item\s*7\.)(.*?)(?:item\s*8\.)'
    match = re.search(pattern, lower_content, re.DOTALL)
    
    # Extract matched content if found
    section_content = match.group(1).strip() if match else "Section not found."
    
    # Parse the HTML content to retrieve the text
    soup = BeautifulSoup(section_content, 'html.parser')
    parsed_text = soup.get_text()
    
    return parsed_text


file_path = 'data/sec-edgar-filings/AAPL/10-K/0000320193-18-000145/primary-document.html'
mdna_text = extract_mda_section(file_path)
mdna_text   

"management’s discussion and analysis of financial condition and results of operationsthis section and other parts of this annual report on form 10-k (“form 10-k”) contain forward-looking statements, within the meaning of the private securities litigation reform act of 1995, that involve risks and uncertainties. forward-looking statements provide current expectations of future events based on certain assumptions and include any statement that does not directly relate to any historical or current fact. forward-looking statements can also be identified by words such as “future,”  “anticipates,”  “believes,”  “estimates,”  “expects,”  “intends,”  “plans,”  “predicts,”  “will,”  “would,”  “could,”  “can,”  “may,”  and similar terms. forward-looking statements are not guarantees of future performance and the company’s actual results may differ significantly from the results discussed in the forward-looking statements. factors that might cause such differences include, but are not limited to

In [7]:
def extract_mda_section_updated_v2(file_path: str) -> str:
    """
    Extract the "Management’s Discussion and Analysis of Financial Condition and Results of Operations"
    section from a 10-K report HTML file by first parsing the HTML content.

    Args:
    - file_path (str): Path to the HTML file of the 10-K report.

    Returns:
    - str: Text of the MDA section.
    """
    # Load and parse the HTML file content
    with open(file_path, "r", encoding="utf-8") as file:
        soup = BeautifulSoup(file.read(), "html.parser")
    
    # Convert the parsed content to text and lowercase it
    parsed_text = soup.get_text()
    lower_parsed_text = parsed_text.lower()
    
    # Using a regex pattern to find all occurrences of "item 7." and "item 8."
    item_7_matches = [match.start() for match in re.finditer(r'item\s*7\.', lower_parsed_text)]
    item_8_matches = [match.start() for match in re.finditer(r'item\s*8\.', lower_parsed_text)]
    
    # If we have less than 2 occurrences of "item 7.", return section not found
    if len(item_7_matches) < 2:
        return "MD&A section not found."
    
    # Extract content between the second occurrence of "item 7." and the occurrence of "item 8." after that
    section_content = parsed_text[item_7_matches[1]:item_8_matches[1]].strip()

    return section_content

# Testing the updated function on the provided file
file_path = 'data/sec-edgar-filings/AAPL/10-K/0000320193-18-000145/primary-document.html'
test_output_updated_v2 = extract_mda_section_updated_v2(file_path)
test_output_updated_v2


"Item 7.Management’s Discussion and Analysis of Financial Condition and Results of OperationsThis section and other parts of this Annual Report on Form 10-K (“Form 10-K”) contain forward-looking statements, within the meaning of the Private Securities Litigation Reform Act of 1995, that involve risks and uncertainties. Forward-looking statements provide current expectations of future events based on certain assumptions and include any statement that does not directly relate to any historical or current fact. Forward-looking statements can also be identified by words such as “future,”  “anticipates,”  “believes,”  “estimates,”  “expects,”  “intends,”  “plans,”  “predicts,”  “will,”  “would,”  “could,”  “can,”  “may,”  and similar terms. Forward-looking statements are not guarantees of future performance and the Company’s actual results may differ significantly from the results discussed in the forward-looking statements. Factors that might cause such differences include, but are not lim