In [113]:
import os 
import re 
from transformers import pipeline 
import random 

print("Libraries imported successfully.")

Libraries imported successfully.


In [114]:
import os 
# For this mini project, I have created a sample of randomly selected 
# 200 10X filings from first quarter of 2024 from the Loughran McDonald 10X filings databsae 
DATA_DIR = os.path.join("data", "sample_filings")
filings = [f for f in os.listdir(DATA_DIR) if f.lower().endswith(".txt")      ]
filings.sort()

# choose a random filings from the above sample 
sample_filings = random.sample(filings, 1)
sample_filings

['20240229_10-Q_edgar_data_315189_0001558370-24-002149.txt']

## 1. MD & A Text Extraction 

In [115]:
import re

def extract_mda_section(raw_filing_text):
    """
    Parses a raw SEC filing text to extract the Management's Discussion and Analysis (MD&A) section.
    
    This parser uses a robust, multi-stage approach to avoid incorrectly capturing
    the Table of Contents and to handle variations in the section title.
    
    Args:
        raw_filing_text (str): The full text of the raw SEC filing.
        
    Returns:
        str: The extracted MD&A text, or None if the section can't be found.
    """
    
    # Stage 1: Normalize Text for Easier Parsing 
    doc_lower = raw_filing_text.lower()
    
    # Stage 2: Define Regex Patterns 
    
    # UPGRADED PATTERN:
    # This pattern now handles variations like "and" vs. "&" and optional phrasing.
    # It looks for "management's discussion" followed by either "and" or "&"
    # and then "analysis". It also makes "of financial condition and results of operations" optional.
    mda_start_pattern = re.compile(
        r'^\s*item\s+[27]\s*\.\s*'  # Starts with "Item 2." or "Item 7." at the beginning of a line
        r'management.{1,5}s\s+discussion\s+' # Matches "management's discussion "
        r'(?:and|&)\s+'  # Matches either "and" or "&" followed by a space
        r'analysis'     # Matches "analysis"
        r'(?:\s+of\s+financial\s+condition\s+and\s+results\s+of\s+operations)?', # Optional trailing text
        re.MULTILINE
    )
    
    # Pattern for the end of MD&A (the start of the next item) is robust enough.
    mda_end_pattern = re.compile(r'^\s*item\s+[38]\s*\.', re.MULTILINE)

    # --- Stage 3: Find All Potential Section Starts ---
    possible_starts = list(mda_start_pattern.finditer(doc_lower))
    
    if not possible_starts:
        print("Warning: MD&A start pattern not found.")
        # Fallback to a simpler pattern if the main one fails
        simple_pattern = re.compile(r'management.{1,5}s\s+discussion\s+(?:and|&)\s+analysis', re.IGNORECASE | re.DOTALL)
        possible_starts = list(simple_pattern.finditer(doc_lower))
        if not possible_starts:
            print("Warning: Simpler fallback pattern also failed.")
            return None
    
    # since the pattern can be present in other sections such as table of contents, 
    # we choose the last occurance of the pattern 
    mda_start_match = possible_starts[-1]
    
    # --- Stage 4: Extract the Text ---
    text_after_mda_start = raw_filing_text[mda_start_match.end():]
    mda_end_match = mda_end_pattern.search(text_after_mda_start.lower())
    
    if mda_end_match:
        mda_text = text_after_mda_start[:mda_end_match.start()]
    else:
        print("Warning: MD&A end pattern not found. Returning text until the end.")
        mda_text = text_after_mda_start
        
    # --- Stage 5: Final Cleaning ---
    mda_text = re.sub(r'<[^>]+>', '', mda_text)
    mda_text = re.sub(r'\s+', ' ', mda_text).strip()
    
    return mda_text

print("Upgraded, robust MD&A parsing function defined.")


Upgraded, robust MD&A parsing function defined.


## Create a random sample of all the filings in first quarter of 2024 

In [116]:
# --- Test the Parser on a Sample Filing ---

filing_path = os.path.join(DATA_DIR, sample_filings[0])

try:
    with open(filing_path, 'r', encoding='utf-8') as f:
        raw_text = f.read()
    
    print(f"Successfully loaded file: {os.path.basename(filing_path)}")
    
    # Use our function to extract the MD&A Section
    mda_text = extract_mda_section(raw_text)
    
    if mda_text:
        print("\nSuccessfully extracted MD&A section.")
        print("--- First 500 characters of MD&A: ---")
        print(mda_text[:500])
    else:
        print("\nCould not find MD&A section in this filing.")

except FileNotFoundError:
    print(f"ERROR: File not found. Please update the 'filing_path' variable with the correct full path to your .txt file.")

Successfully loaded file: 20240229_10-Q_edgar_data_315189_0001558370-24-002149.txt

Successfully extracted MD&A section.
--- First 500 characters of MD&A: ---
RESULTS OF OPERATIONS All amounts are presented in millions of dollars unless otherwise specified. OVERVIEW Organization Deere Company is a global leader in the production of agricultural, turf, construction, and forestry equipment and solutions. John Deere Financial provides financing for John Deere equipment, parts, services, and other input costs customers need to run their operations. Our operations are managed through the production and precision agriculture (PPA), small agriculture and tur


## Phase 2: LLM Classification


In [117]:
print("Loading zero-shot classification model...")
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
print("Model loaded.")

Loading zero-shot classification model...


Device set to use cpu


Model loaded.


In [118]:
# Define our custom labels (our "prompt")
constraint_labels = [
    # --- Debt Constraint Signals ---
    'Discusses specific challenges with debt financing or credit markets',
    'Mentions potential for covenant violations or breach of debt agreements',
    
    # --- Equity Constraint Signals ---
    'Discusses specific challenges with issuing new stock or equity markets',
    
    # --- General Constraint & Consequence Signals ---
    'States inability to fund or potential delay of investment projects or acquisitions',
    'Expresses concern over future liquidity, cash flows, or ability to meet obligations',
    
    # --- The "Null" Case ---
    'Neutral or positive statement about financial condition and liquidity'
]


In [None]:
# --- Phase 2: LLM Classification ---

# Load our zero-shot classification model
# This might take a moment the first time you run it.
print("Loading zero-shot classification model...")
classifier = pipeline('zero-shot-classification', model='facebook/bart-large-mnli')
print("Model loaded.")

# Define our custom labels (our "prompt")
constraint_labels = [
    # --- Debt Constraint Signals ---
    'Discusses specific challenges with debt financing or credit markets',
    'Mentions potential for covenant violations or breach of debt agreements',
    
    # --- Equity Constraint Signals ---
    'Discusses specific challenges with issuing new stock or equity markets',
    
    # --- General Constraint & Consequence Signals ---
    'States inability to fund or potential delay of investment projects or acquisitions',
    'Expresses concern over future liquidity, cash flows, or ability to meet obligations',
    
    # --- The "Null" Case ---
    'Neutral or positive statement about financial condition and liquidity'
]

# The MD&A section is huge. We can't analyze all of it. it's cost prohibitive
# A simple but effective technique is to split it into sentences using the dot(.) character 
# and search for keywords. BUT this is not optimal. 
# in the next iteration of the model, a sentence tokenizer will be used: 
'''
import nltk

# a one-time download.
nltk.download('punkt')
robust_sentences = nltk.sent_tokenize(mda_text)

'''
# A simple text.split('.') is naive and breaks on numbers like "4.6%".
# We can use a regular expression to split on periods, but ONLY if they are
# followed by a space and NOT preceded by a digit. This is a good "80/20" solution.

# The regex `(?<!\d)\.\s` means:
# (?<!\d)  - A "negative lookbehind": asserts that the character immediately preceding ?< is NOT a digit !\d.
# \.        - Matches a literal period.
# \s        - Matches a whitespace character (like a space or newline).

# split by this pattern, which gives us a list of sentences.
# The `re.split()` function is perfect for this.
sentences = re.split(r'(?<!\d)\.\s', mda_text)

# This process might leave some empty strings or whitespace, so let's clean it up.
sentences = [s.strip() for s in sentences if s.strip()]

print(f"Text split into {len(sentences)} sentences using the regex.")


keywords = ['financing', 'liquidity', 'capital', 'covenant', 'debt', 
            'equity', 'funding', 'delay', 'fund', 'obligation']

# Find sentences that contain our keywords
relevant_sentences = [s for s in sentences if any(key in s.lower() for key in keywords)]
print(f"Found {len(relevant_sentences)} potentially relevant sentences.")


if relevant_sentences:
    # Analyze the first 5 relevant sentences as a proof-of-concept
    results = classifier(relevant_sentences[:5], constraint_labels, multi_label=False)

    print("\n--- LLM Classification Results ---")
    for res in results:
        print(f"\nSENTENCE: \"{res['sequence'].strip()}\"")
        print(f"  => CLASSIFICATION: {res['labels'][0]} (Score: {res['scores'][0]:.2f})")


Loading zero-shot classification model...


Device set to use cpu


Model loaded.
Text split into 167 sentences using a smarter regex.
Found 48 potentially relevant sentences.

--- LLM Classification Results ---

SENTENCE: "John Deere Financial provides financing for John Deere equipment, parts, services, and other input costs customers need to run their operations"
  => CLASSIFICATION: Neutral or positive statement about financial condition and liquidity (Score: 0.42)

SENTENCE: "Our Smart Industrial Operating Model and Leap Ambitions are intended to capitalize on this market trend"
  => CLASSIFICATION: Neutral or positive statement about financial condition and liquidity (Score: 0.58)

SENTENCE: "Agricultural fundamentals are expected to moderate in 2024 due to lower commodity prices and elevated interest rates, offset by resilient farm balance sheets and lower input costs"
  => CLASSIFICATION: Expresses concern over future liquidity, cash flows, or ability to meet obligations (Score: 0.35)

SENTENCE: "Financial Services Outlook for 2024 Net Income U