In [2]:
# Step 0 — Prepare tabular base
import pandas as pd
import numpy as np
from sklearn.impute import KNNImputer

# 1. Load Kaggle CSV
df = pd.read_csv('data/raw/corporate_rating.csv')
df.rename(columns={'Symbol': 'ticker'}, inplace=True)
# 2. Inspect dataset
print("Shape:", df.shape)
print("\nColumn names:\n", df.columns.tolist())
print("\nData types:\n", df.dtypes)
print("\nMissing values:\n", df.isnull().sum())
print("\nSample rows:\n", df.head())

# 3. Check basic distributions for numeric columns
df.describe().T


Shape: (2029, 31)

Column names:
 ['Rating', 'Name', 'ticker', 'Rating Agency Name', 'Date', 'Sector', 'currentRatio', 'quickRatio', 'cashRatio', 'daysOfSalesOutstanding', 'netProfitMargin', 'pretaxProfitMargin', 'grossProfitMargin', 'operatingProfitMargin', 'returnOnAssets', 'returnOnCapitalEmployed', 'returnOnEquity', 'assetTurnover', 'fixedAssetTurnover', 'debtEquityRatio', 'debtRatio', 'effectiveTaxRate', 'freeCashFlowOperatingCashFlowRatio', 'freeCashFlowPerShare', 'cashPerShare', 'companyEquityMultiplier', 'ebitPerRevenue', 'enterpriseValueMultiple', 'operatingCashFlowPerShare', 'operatingCashFlowSalesRatio', 'payablesTurnover']

Data types:
 Rating                                 object
Name                                   object
ticker                                 object
Rating Agency Name                     object
Date                                   object
Sector                                 object
currentRatio                          float64
quickRatio           

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
currentRatio,2029.0,3.529607,44.052361,-0.932005,1.07193,1.493338,2.166891,1725.505
quickRatio,2029.0,2.653986,32.944817,-1.893266,0.602825,0.985679,1.45382,1139.542
cashRatio,2029.0,0.667364,3.583943,-0.192736,0.13063,0.297493,0.624906,125.9174
daysOfSalesOutstanding,2029.0,333.795606,4447.839583,-811.845623,22.905093,42.37412,59.323563,115961.6
netProfitMargin,2029.0,0.278447,6.064134,-101.845815,0.021006,0.064753,0.114807,198.5179
pretaxProfitMargin,2029.0,0.431483,8.984982,-124.343612,0.025649,0.084965,0.144763,309.6949
grossProfitMargin,2029.0,0.497968,0.525307,-14.800817,0.233127,0.414774,0.849693,2.702533
operatingProfitMargin,2029.0,0.587322,11.224622,-124.343612,0.04461,0.107895,0.176181,410.1822
returnOnAssets,2029.0,-37.517928,1166.17222,-40213.17829,0.019176,0.045608,0.077468,0.4878257
returnOnCapitalEmployed,2029.0,-73.974193,2350.275719,-87162.16216,0.028112,0.074421,0.135036,2.439504


In [3]:
# 4. Merge rating classes
rating_map = {
    'AAA': 'AA+',
    'AA': 'AA', 'AA-': 'AA-', 'AA+': 'AA+', 
    'A+': 'A+', 'A': 'A', 'A-': 'A-',
    'BBB+': 'BBB+', 'BBB': 'BBB', 'BBB-': 'BBB-',
    'BB+': 'BB+', 'BB': 'BB', 'BB-': 'BB-',
    'B+': 'B+', 'B': 'B', 'B-': 'B-',
    'CCC+': 'CCC+', 'CCC': 'CCC', 'CCC-': 'CCC-',
    'CC': 'CCC-', 'C': 'CCC-', 'D': 'CCC-'
}

df['rating_simplified'] = df['Rating'].map(rating_map)
print(df['rating_simplified'].value_counts())


rating_simplified
BBB     671
BB      490
A       398
B       302
AA       89
CCC      64
CCC-      8
AA+       7
Name: count, dtype: int64


In [4]:
from sklearn.preprocessing import LabelEncoder

# 5. Encode multiclass target
le = LabelEncoder()
df['rating_encoded'] = le.fit_transform(df['rating_simplified'])

# 6. Binary label — Investment Grade (BBB- or better)
investment_grades = ['AA+', 'AA', 'AA-', 'A+', 'A', 'A-', 'BBB+', 'BBB', 'BBB-']
df['investment_grade'] = df['rating_simplified'].isin(investment_grades).astype(int)

print(df[['rating_simplified', 'rating_encoded', 'investment_grade']].head())


  rating_simplified  rating_encoded  investment_grade
0                 A               0                 1
1               BBB               5                 1
2               BBB               5                 1
3               BBB               5                 1
4               BBB               5                 1


In [5]:
# 7. Handle missing values

numeric_cols = df.select_dtypes(include=[np.number]).columns
missing_ratio = df[numeric_cols].isnull().mean()

# Create missing indicators
for col in numeric_cols:
    if df[col].isnull().any():
        df[f'{col}_missing'] = df[col].isnull().astype(int)

# Split into low and high missing ratio groups
low_missing = missing_ratio[missing_ratio < 0.2].index
high_missing = missing_ratio[missing_ratio >= 0.2].index

# Median imputation for low-missing features
for col in low_missing:
    df[col].fillna(df[col].median(), inplace=True)

# KNN imputation for high-missing features
if len(high_missing) > 0:
    imputer = KNNImputer(n_neighbors=5)
    df[high_missing] = imputer.fit_transform(df[high_missing])

print("Missing values after imputation:\n", df.isnull().sum().sum())


Missing values after imputation:
 0


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  df[col].fillna(df[col].median(), inplace=True)


In [6]:
# 8. Save cleaned tabular data
df.to_csv('data/processed/credit_ratings_tabular_clean.csv', index=False)
print("✅ Saved cleaned file to data/processed/credit_ratings_tabular_clean.csv")


✅ Saved cleaned file to data/processed/credit_ratings_tabular_clean.csv


In [7]:
df.head()

Unnamed: 0,Rating,Name,ticker,Rating Agency Name,Date,Sector,currentRatio,quickRatio,cashRatio,daysOfSalesOutstanding,...,cashPerShare,companyEquityMultiplier,ebitPerRevenue,enterpriseValueMultiple,operatingCashFlowPerShare,operatingCashFlowSalesRatio,payablesTurnover,rating_simplified,rating_encoded,investment_grade
0,A,Whirlpool Corporation,WHR,Egan-Jones Ratings Company,11/27/2015,Consumer Durables,0.945894,0.426395,0.09969,44.203245,...,9.809403,4.008012,0.049351,7.057088,15.565438,0.058638,3.906655,A,0,1
1,BBB,Whirlpool Corporation,WHR,Egan-Jones Ratings Company,2/13/2014,Consumer Durables,1.033559,0.498234,0.20312,38.991156,...,17.40227,3.156783,0.048857,6.460618,15.91425,0.067239,4.002846,BBB,5,1
2,BBB,Whirlpool Corporation,WHR,Fitch Ratings,3/6/2015,Consumer Durables,0.963703,0.451505,0.122099,50.841385,...,13.103448,4.094575,0.044334,10.49197,18.888889,0.074426,3.48351,BBB,5,1
3,BBB,Whirlpool Corporation,WHR,Fitch Ratings,6/15/2012,Consumer Durables,1.019851,0.510402,0.176116,41.161738,...,14.440104,3.63095,-0.012858,4.080741,6.901042,0.028394,4.58115,BBB,5,1
4,BBB,Whirlpool Corporation,WHR,Standard & Poor's Ratings Services,10/24/2016,Consumer Durables,0.957844,0.495432,0.141608,47.761126,...,14.257556,4.01278,0.05377,8.293505,15.808147,0.058065,3.85779,BBB,5,1


In [8]:
# Step 1 — Map ticker + rating_date → filing period
from datetime import datetime
import pandas as pd

# Ensure rating_date is a datetime
df['rating_date'] = pd.to_datetime(df['Date'], errors='coerce')

# Define a helper function to map to fiscal quarter
def get_fiscal_quarter(date):
    """Return fiscal year and quarter string (e.g., 2024_Q1) for given date."""
    year = date.year
    qtr = (date.month - 1) // 3 + 1
    return f"{year}_Q{qtr}"

df['year_qtr'] = df['rating_date'].apply(get_fiscal_quarter)

# For each (ticker, rating_date), map to that quarter
mapped_df = df[['ticker', 'rating_date', 'year_qtr', 'rating_simplified', 'investment_grade']]
mapped_df.head()


Unnamed: 0,ticker,rating_date,year_qtr,rating_simplified,investment_grade
0,WHR,2015-11-27,2015_Q4,A,1
1,WHR,2014-02-13,2014_Q1,BBB,1
2,WHR,2015-03-06,2015_Q1,BBB,1
3,WHR,2012-06-15,2012_Q2,BBB,1
4,WHR,2016-10-24,2016_Q4,BBB,1


In [9]:
# Define expected text file path per record
mapped_df['filing_path'] = mapped_df.apply(
    lambda x: f"sec_filings/{x['ticker']}/{x['year_qtr']}.txt", axis=1
)

print(mapped_df.head())


  ticker rating_date year_qtr rating_simplified  investment_grade  \
0    WHR  2015-11-27  2015_Q4                 A                 1   
1    WHR  2014-02-13  2014_Q1               BBB                 1   
2    WHR  2015-03-06  2015_Q1               BBB                 1   
3    WHR  2012-06-15  2012_Q2               BBB                 1   
4    WHR  2016-10-24  2016_Q4               BBB                 1   

                   filing_path  
0  sec_filings/WHR/2015_Q4.txt  
1  sec_filings/WHR/2014_Q1.txt  
2  sec_filings/WHR/2015_Q1.txt  
3  sec_filings/WHR/2012_Q2.txt  
4  sec_filings/WHR/2016_Q4.txt  


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_df['filing_path'] = mapped_df.apply(


In [10]:
mapped_df.to_csv('data/processed/credit_rating_with_filing_periods.csv', index=False)
print("✅ Saved mapping file to data/processed/credit_rating_with_filing_periods.csv")


✅ Saved mapping file to data/processed/credit_rating_with_filing_periods.csv


In [11]:
mapped_df.head()

Unnamed: 0,ticker,rating_date,year_qtr,rating_simplified,investment_grade,filing_path
0,WHR,2015-11-27,2015_Q4,A,1,sec_filings/WHR/2015_Q4.txt
1,WHR,2014-02-13,2014_Q1,BBB,1,sec_filings/WHR/2014_Q1.txt
2,WHR,2015-03-06,2015_Q1,BBB,1,sec_filings/WHR/2015_Q1.txt
3,WHR,2012-06-15,2012_Q2,BBB,1,sec_filings/WHR/2012_Q2.txt
4,WHR,2016-10-24,2016_Q4,BBB,1,sec_filings/WHR/2016_Q4.txt


In [12]:
import os

# Create the base folder if it doesn't exist
base_dir = "sec_filings"
os.makedirs(base_dir, exist_ok=True)

print(f"✅ Base directory created (or already exists): {os.path.abspath(base_dir)}")

# Now create subfolders for each ticker in your dataset
tickers = mapped_df['ticker'].unique()

for ticker in tickers:
    os.makedirs(os.path.join(base_dir, ticker), exist_ok=True)

print(f"✅ Created subfolders for {len(tickers)} tickers.")


✅ Base directory created (or already exists): C:\Users\manish\IIIT\10 FDA\sec_filings
✅ Created subfolders for 593 tickers.


In [13]:
# List a few example subfolders
print(os.listdir(base_dir)[:10])


['AA', 'AAL', 'AAPL', 'ABB', 'ABBV', 'ABG', 'ACCO', 'ACHC', 'ACIW', 'ACM']


In [14]:
# Cell 0: Install required packages (run once)
# In Jupyter you can run pip installs inline.
import sys
!{sys.executable} -m pip install --quiet beautifulsoup4 lxml requests tqdm pdfminer.six

# Cell 1: Imports & config
import os
import re
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin
import pandas as pd
from tqdm import tqdm
from datetime import datetime
from pdfminer.high_level import extract_text as extract_pdf_text

# Config / polite header for SEC
USER_AGENT_EMAIL = "loginboy772@gmail.com"  # <-- replace with your email
HEADERS = {
    "User-Agent": f"Your Name - Data Collection Script ({USER_AGENT_EMAIL})",
    "Accept-Encoding": "gzip, deflate",
    "Host": "www.sec.gov"
}

# Output locations
BASE_DIR = "sec_filings"
MANIFEST_PATH = "data/processed/edgar_manifest.csv"
FAILURES_PATH = "data/processed/edgar_failures.csv"
os.makedirs(BASE_DIR, exist_ok=True)
os.makedirs(os.path.dirname(MANIFEST_PATH), exist_ok=True)

# Utility: safe requests.get with retries
def safe_get(url, stream=False, timeout=30):
    for attempt in range(3):
        try:
            r = requests.get(url, headers=HEADERS, stream=stream, timeout=timeout)
            r.raise_for_status()
            return r
        except Exception as e:
            last_exc = e
    raise last_exc

# Cell 2: Load the list of tickers + (optional) rating/quarters
# If you have mapped_df already in memory use it. Otherwise load saved mapping CSV.
try:
    mapped_df  # if this exists from prior cells
    print("Using mapped_df from memory.")
except NameError:
    if os.path.exists("data/processed/credit_rating_with_filing_periods.csv"):
        mapped_df = pd.read_csv("data/processed/credit_rating_with_filing_periods.csv", parse_dates=["rating_date"])
        print("Loaded mapping CSV with", len(mapped_df), "rows.")
    else:
        # fallback: load cleaned tabular file and take unique tickers
        if os.path.exists("data/processed/credit_ratings_tabular_clean.csv"):
            tmp = pd.read_csv("data/processed/credit_ratings_tabular_clean.csv", parse_dates=["rating_date"])
            mapped_df = tmp[['ticker', 'rating_date']].drop_duplicates().reset_index(drop=True)
            mapped_df['year_qtr'] = mapped_df['rating_date'].dt.to_period('Q').astype(str).str.replace('Q', '_Q')
            mapped_df.rename(columns={'rating_date':'rating_date'}, inplace=True)
            print("Loaded cleaned tabular file, created minimal mapped_df with", len(mapped_df), "rows.")
        else:
            raise FileNotFoundError("No mapped_df in memory and no processed CSVs found. Run earlier steps or place a CSV at data/processed/credit_rating_with_filing_periods.csv")

# We'll operate on unique (ticker, year_qtr) pairs to avoid duplicate downloads
mapped_df['year_qtr'] = mapped_df['rating_date'].dt.to_period('Q').astype(str).str.replace('Q', '_Q')
pairs = mapped_df[['ticker', 'rating_date', 'year_qtr']].drop_duplicates().reset_index(drop=True)
tickers = pairs['ticker'].unique().tolist()
print(f"{len(pairs)} unique (ticker, rating_date) pairs; {len(tickers)} unique tickers found.")

# Cell 3: Helpers to find filing pages and primary document link
def search_filings_list(ticker, filing_type=None, count=40):
    """
    Use SEC browse-edgar HTML to retrieve the filings list (returns soup of results).
    Optionally filter by filing_type (e.g., '10-Q' or '10-K').
    """
    base = "https://www.sec.gov/cgi-bin/browse-edgar"
    params = {
        "action": "getcompany",
        "CIK": ticker,
        "type": filing_type or "",
        "owner": "exclude",
        "count": count,
        "output": "atom"
    }
    url = base + "?" + "&".join(f"{k}={requests.utils.quote(str(v))}" for k,v in params.items())
    r = safe_get(url)
    return r.text

def parse_atom_for_filings(atom_xml):
    """
    Parse the SEC atom feed and extract a list of filings with keys:
    { 'filing_date', 'filing_type', 'filing_href' (filing-detail page) }
    """
    soup = BeautifulSoup(atom_xml, "lxml")
    entries = []
    for entry in soup.find_all("entry"):
        try:
            filing_type = entry.find("category")['term']
        except Exception:
            filing_type = entry.find("category")['term'] if entry.find("category") else None
        filing_date = entry.find("updated").text if entry.find("updated") else None
        link = entry.find("link", {"rel":"alternate"})
        href = link['href'] if link else None
        if href:
            entries.append({
                "filing_type": filing_type.strip() if filing_type else None,
                "filing_date": filing_date,
                "filing_href": href
            })
    return entries

def find_primary_document_link(filing_detail_html, base_url):
    """
    Given the HTML of the filing detail page, find the primary document link (.htm, .html, .txt, .pdf).
    Returns absolute URL and the filename.
    """
    soup = BeautifulSoup(filing_detail_html, "lxml")
    # First look in the table with class "tableFile" for the first document row
    table = soup.find("table", {"class": "tableFile"})
    if table:
        # Typically the first row (after header) is the primary document
        rows = table.find_all("tr")
        if len(rows) > 1:
            for r in rows[1:]:
                cells = r.find_all("td")
                if len(cells) >= 3:
                    doc_cell = cells[2]  # document link cell is usually the 3rd
                    a = doc_cell.find("a")
                    if a and a.get('href'):
                        link = urljoin(base_url, a['href'])
                        fname = a.text.strip()
                        # prefer html or txt over xml/xbrl; pdf if no html
                        if re.search(r'\.htm|\.html|\.txt|\.pdf', link, re.I):
                            return link, fname
            # fallback: first <a> in table
            a = table.find("a")
            if a:
                link = urljoin(base_url, a['href'])
                return link, a.text.strip()
    # Otherwise look for <a id="formName"> or any large link to doc
    a = soup.find("a", string=re.compile(r'Primary Document', re.I))
    if a and a.get('href'):
        return urljoin(base_url, a['href']), a.text.strip()
    # last fallback: first link to .htm or .txt
    a = soup.find("a", href=re.compile(r'\.htm|\.html|\.txt|\.pdf', re.I))
    if a:
        return urljoin(base_url, a['href']), a.text.strip()
    return None, None

# Cell 4: Text extraction helpers
def html_to_plain_text(html):
    soup = BeautifulSoup(html, "lxml")
    # remove scripts/styles
    for tag in soup(["script", "style", "header", "footer", "nav", "noscript"]):
        tag.decompose()
    text = soup.get_text(separator="\n")
    # collapse multiple newlines and whitespace
    text = re.sub(r'\n\s*\n+', '\n\n', text)
    return text

def extract_mda_from_text(full_text, filing_type_hint=None):
    """
    Heuristic MD&A extraction.
    Returns the extracted MD&A string or None.
    Strategy:
      - Lowercase the text for searching
      - Look for common MD&A start headings, then find an end heading
      - If none found, fallback to Item markers (Item 7 for 10-K, Item 2 for 10-Q)
      - If multiple matches, pick longest plausible chunk (>500 chars)
    """
    if not isinstance(full_text, str) or len(full_text) < 200:
        return None

    txt = full_text
    lower = txt.lower()

    # Candidate start patterns
    start_patterns = [
        r"management['’`s]*\s+discussion\s+and\s+analysis",  # management's discussion and analysis
        r"management\s+discussion\s+and\s+analysis", 
        r"management['’`s]*\s+discussion", 
        r"management\s+discussion\s+and\s+analysis\s+of\s+financial\s+condition",
        r"\bmd&a\b",
        r"item\s+7\.\s*management['’`s]*\s+discussion",  # item 7 markers
        r"item\s+2\.\s*management['’`s]*\s+discussion"
    ]
    # Candidate end patterns
    end_patterns = [
        r"quantitative\s+and\s+qualitative\s+disclosures", 
        r"controls\s+and\s+procedures",
        r"financial\s+statements\s+and\s+supplementary\s+data",
        r"item\s+7a\.", r"item\s+8\.", r"item\s+8\.", r"item\s+6\.", r"item\s+1a\.", 
        r"risk\s+factors", r"legal\s+proceedings", r"exhibits"
    ]

    # compile regex lists
    start_re = [re.compile(p, re.I) for p in start_patterns]
    end_re = [re.compile(p, re.I) for p in end_patterns]

    candidates = []
    # Find start indices
    for s_re in start_re:
        for m in s_re.finditer(lower):
            s_idx = m.start()
            # search for end after the start
            e_idx = None
            # look for the earliest end pattern after s_idx
            for e_re in end_re:
                m2 = e_re.search(lower, s_idx + 100)  # allow some offset after start
                if m2:
                    if e_idx is None or m2.start() < e_idx:
                        e_idx = m2.start()
            # fallback to next "item" marker (Item \d+)
            if e_idx is None:
                m_item = re.search(r'\n\s*item\s+\d+\b', lower[s_idx+50:s_idx+5000])
                if m_item:
                    e_idx = s_idx + 50 + m_item.start()
            # If still None, set a reasonable chunk length (e.g., 10000 chars) but ensure within bounds
            if e_idx is None:
                e_idx = min(len(txt), s_idx + 15000)
            # extract candidate
            cand = txt[s_idx:e_idx]
            # Filter out extremely short candidates
            if len(cand) > 400:
                candidates.append(cand)

    # If no candidates found by headings, fallback to Item markers (Item 7 for 10-K, Item 2 for 10-Q)
    if not candidates:
        # check for item markers with numeric boundaries: item 7 -> item 8, item 2 -> item 3
        if filing_type_hint and "10-k" in filing_type_hint.lower():
            m = re.search(r'(item\s+7\..*?)(?=item\s+8\.)', lower, re.S)
            if m:
                cand = txt[m.start():m.end()]
                if len(cand) > 300:
                    candidates.append(cand)
        if filing_type_hint and "10-q" in filing_type_hint.lower():
            m = re.search(r'(item\s+2\..*?)(?=item\s+3\.)', lower, re.S)
            if m:
                cand = txt[m.start():m.end()]
                if len(cand) > 300:
                    candidates.append(cand)
        # generic item-numeric fallback
        if not candidates:
            m = re.search(r'(item\s+7\..*?)(?=item\s+8\.)', lower, re.S)
            if m and len(m.group(0)) > 300:
                candidates.append(txt[m.start():m.end()])
            m = re.search(r'(item\s+2\..*?)(?=item\s+3\.)', lower, re.S)
            if m and len(m.group(0)) > 300:
                candidates.append(txt[m.start():m.end()])

    # If multiple candidates, pick the longest
    if candidates:
        best = max(candidates, key=len)
        # clean extra whitespace
        best = re.sub(r'\n{3,}', '\n\n', best).strip()
        # sanity check: require at least a few sentences and > 500 chars
        if len(best) >= 500:
            return best
    return None

# Cell 5: Main loop — find filings, download, extract, save manifest
manifest_rows = []
failures = []

# We'll attempt either 10-Q first (quarterly) then 10-K (annual) — and take the one on or before rating_date
# For speed we process per pair (ticker, rating_date)
for idx, row in tqdm(pairs.iterrows(), total=len(pairs), desc="Processing pairs"):
    try:
        ticker = row['ticker']
        rating_date = pd.to_datetime(row['rating_date'])
        year_qtr = row['year_qtr']
        ticker_dir = os.path.join(BASE_DIR, ticker)
        os.makedirs(ticker_dir, exist_ok=True)

        # Search SEC atom feed for both 10-Q and 10-K
        atom_text_q = search_filings_list(ticker, filing_type="10-Q", count=200)
        atom_text_k = search_filings_list(ticker, filing_type="10-K", count=200)

        filings_q = parse_atom_for_filings(atom_text_q)
        filings_k = parse_atom_for_filings(atom_text_k)
        candidate_filings = []

        # convert filing_date strings to dates and collect only those on or before rating_date
        def norm_filing_list(filings_list, ftype):
            result = []
            for f in filings_list:
                try:
                    fd = pd.to_datetime(f['filing_date']).date()
                except Exception:
                    continue
                if fd <= rating_date.date():
                    result.append({
                        'filing_type': ftype,
                        'filing_date': fd,
                        'filing_href': f['filing_href']
                    })
            return result

        candidate_filings.extend(norm_filing_list(filings_q, "10-Q"))
        candidate_filings.extend(norm_filing_list(filings_k, "10-K"))

        # If none found on/before rating_date, broaden to any recent filings (take latest)
        if not candidate_filings:
            # try recent Qs and Ks without date restriction
            candidate_filings.extend([{'filing_type':f['filing_type'],'filing_date':pd.to_datetime(f['filing_date']).date(),'filing_href':f['filing_href']} for f in parse_atom_for_filings(atom_text_q)])
            candidate_filings.extend([{'filing_type':f['filing_type'],'filing_date':pd.to_datetime(f['filing_date']).date(),'filing_href':f['filing_href']} for f in parse_atom_for_filings(atom_text_k)])

        # sort by filing_date descending (prefer most recent on/before rating_date)
        candidate_filings = sorted(candidate_filings, key=lambda x: x['filing_date'], reverse=True)

        # take up to top 3 candidates and try to download primary doc and extract MD&A
        extracted_ok = False
        tried_paths = []
        for cand in candidate_filings[:3]:
            filing_detail_url = cand['filing_href']
            filing_type = cand['filing_type'] or ""
            filing_date = cand['filing_date']

            # Fetch filing detail page
            try:
                fd_resp = safe_get(filing_detail_url)
            except Exception as e:
                failures.append({'ticker': ticker, 'year_qtr': year_qtr, 'filing_href': filing_detail_url, 'error': f"detail_fetch_error: {e}"})
                continue

            primary_link, primary_name = find_primary_document_link(fd_resp.text, filing_detail_url)
            if not primary_link:
                failures.append({'ticker': ticker, 'year_qtr': year_qtr, 'filing_href': filing_detail_url, 'error': "no_primary_doc_link_found"})
                continue

            # Download primary document
            try:
                r = safe_get(primary_link, stream=False)
            except Exception as e:
                failures.append({'ticker': ticker, 'year_qtr': year_qtr, 'primary_link': primary_link, 'error': f"primary_fetch_error: {e}"})
                continue

            content_type = r.headers.get('Content-Type', '').lower()
            ext = ".html"
            if "pdf" in content_type or primary_link.lower().endswith(".pdf"):
                ext = ".pdf"
            elif primary_link.lower().endswith(".txt"):
                ext = ".txt"
            elif primary_link.lower().endswith(".htm") or primary_link.lower().endswith(".html"):
                ext = ".html"
            else:
                # use content-type guesses
                if "html" in content_type:
                    ext = ".html"
                elif "text/plain" in content_type:
                    ext = ".txt"
                elif "xml" in content_type:
                    ext = ".xml"

            out_fname = f"{year_qtr}{ext}"
            out_path = os.path.join(ticker_dir, out_fname)
            # Save raw bytes
            with open(out_path, "wb") as f:
                f.write(r.content)
            tried_paths.append(out_path)

            # Extract text
            text = None
            try:
                if ext == ".pdf":
                    # extract text from PDF
                    text = extract_pdf_text(out_path)
                else:
                    # html/txt/xml -> decode to string then parse
                    # some pages are windows-1252; requests should have guessed encoding
                    html = r.content.decode(r.encoding or 'utf-8', errors='ignore')
                    text = html_to_plain_text(html)
            except Exception as e:
                failures.append({'ticker': ticker, 'year_qtr': year_qtr, 'file_path': out_path, 'error': f"text_extraction_error: {e}"})
                text = None

            # Attempt MD&A extraction
            mda_text = None
            try:
                mda_text = extract_mda_from_text(text or "", filing_type_hint=filing_type)
            except Exception as e:
                failures.append({'ticker': ticker, 'year_qtr': year_qtr, 'file_path': out_path, 'error': f"mda_extract_error: {e}"})
                mda_text = None

            # Save manifest row even if mda_text is None (we want to keep the file path and allow manual inspection)
            manifest_rows.append({
                "ticker": ticker,
                "rating_date": rating_date,
                "year_qtr": year_qtr,
                "filing_date": filing_date,
                "filing_type": filing_type,
                "file_path": out_path,
                "mda_text": mda_text
            })

            if mda_text:
                extracted_ok = True
                break  # best candidate found for this (ticker, rating_date)
        if not extracted_ok and not tried_paths:
            failures.append({'ticker': ticker, 'year_qtr': year_qtr, 'error': 'no_filings_found_or_downloaded'})
    except Exception as e:
        failures.append({'ticker': row.get('ticker'), 'year_qtr': row.get('year_qtr'), 'error': f'outer_loop_error: {e}'})

# Save manifest and failures
manifest_df = pd.DataFrame(manifest_rows)
# ensure consistent columns
if 'mda_text' not in manifest_df.columns:
    manifest_df['mda_text'] = None
manifest_df.to_csv(MANIFEST_PATH, index=False)
print(f"✅ Manifest saved to {MANIFEST_PATH}; rows: {len(manifest_df)}")

fail_df = pd.DataFrame(failures)
if len(fail_df):
    fail_df.to_csv(FAILURES_PATH, index=False)
    print(f"⚠️ Failures logged to {FAILURES_PATH}; rows: {len(fail_df)}")
else:
    print("✅ No failures logged.")

# Cell 6: Quick summary and example of reading saved manifest and checking MD&A column
print("\nManifest preview:")
display(manifest_df.head())

# Count how many MD&A extracted
n_extracted = manifest_df['mda_text'].notnull().sum()
print(f"\nMD&A extracted for {n_extracted} filings out of {len(manifest_df)} saved filings.")


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  mapped_df['year_qtr'] = mapped_df['rating_date'].dt.to_period('Q').astype(str).str.replace('Q', '_Q')


Using mapped_df from memory.
2029 unique (ticker, rating_date) pairs; 593 unique tickers found.


  soup = BeautifulSoup(atom_xml, "lxml")
  soup = BeautifulSoup(html, "lxml")
Processing pairs: 100%|██████████████████████████████████████████████████████████| 2029/2029 [2:11:51<00:00,  3.90s/it]


✅ Manifest saved to data/processed/edgar_manifest.csv; rows: 2506
⚠️ Failures logged to data/processed/edgar_failures.csv; rows: 306

Manifest preview:


Unnamed: 0,ticker,rating_date,year_qtr,filing_date,filing_type,file_path,mda_text
0,WHR,2015-11-27,2015_Q4,2015-10-23,10-Q,sec_filings\WHR\2015_Q4.html,ITEM 2.\nMANAGEMENT’S DISCUSSION AND ANALYSIS ...
1,WHR,2014-02-13,2014_Q1,2013-10-22,10-Q,sec_filings\WHR\2014_Q1.html,ITEM 2.\nMANAGEMENT’S DISCUSSION AND ANALYSIS ...
2,WHR,2015-03-06,2015_Q1,2015-02-26,10-K,sec_filings\WHR\2015_Q1.html,ITEM 7.\nMANAGEMENT’S DISCUSSION AND ANALYSIS ...
3,WHR,2012-06-15,2012_Q2,2012-04-26,10-Q,sec_filings\WHR\2012_Q2.html,ITEM 2.\nMANAGEMENT’S DISCUSSION AND ANALYSIS ...
4,WHR,2016-10-24,2016_Q4,2016-07-22,10-Q,sec_filings\WHR\2016_Q4.html,ITEM 2.\nMANAGEMENT’S DISCUSSION AND ANALYSIS ...



MD&A extracted for 1403 filings out of 2506 saved filings.
