## Setup

In [1]:
import glob
import os
import warnings
import sys
import traceback
import re
import pandas as pd

from requests_html import HTMLSession
from sec_edgar_downloader import Downloader
from tqdm import tqdm
from bs4 import BeautifulSoup

warnings.filterwarnings(
    "ignore",
    message="It looks like you're parsing an XML document using an HTML parser",
)

In [2]:
# Create the working folders up front; exist_ok=True makes re-running this
# cell a no-op when they are already present.
for folder in ("data", "10K_files"):
    os.makedirs(folder, exist_ok=True)

## Download each 10-K

### List Tickers

Fetch the list of S&P 500 constituents (and their tickers) from Wikipedia.  If a saved copy of the list already exists, we skip the download and load it from disk.

In [3]:
# Local cache of the S&P 500 constituents table scraped from Wikipedia.
sp500_path = "inputs/s&p500_2022.csv"

if not os.path.exists(sp500_path):
    # DataFrame.to_csv does not create missing parent directories, so make
    # sure the target folder exists before the first write (fresh checkout).
    os.makedirs(os.path.dirname(sp500_path), exist_ok=True)

    url = "https://en.wikipedia.org/wiki/List_of_S%26P_500_companies"
    # Table [0] on the page is the constituents list; table [1] shows updates.
    pd.read_html(url)[0].to_csv(sp500_path, index=False)

# Always load from the cached CSV so every run works from the same snapshot.
sp500 = pd.read_csv(sp500_path)

In [4]:
# Bare last expression: the notebook renders the ticker table as the cell output.
sp500

Unnamed: 0,Symbol,Security,GICS Sector,GICS Sub-Industry,Headquarters Location,Date added,CIK,Founded
0,MMM,3M,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1957-03-04,66740,1902
1,AOS,A. O. Smith,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,Health Care,Health Care Equipment,"North Chicago, Illinois",1957-03-04,1800,1888
3,ABBV,AbbVie,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
499,ZBRA,Zebra Technologies,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
500,ZBH,Zimmer Biomet,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
501,ZION,Zions Bancorporation,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


### Download

For each ticker, download the most recent 10-K filed before January 1, 2023.

In [5]:
# Set up changeable variables
tics = sp500['Symbol'].to_list()
before = '2023-01-01'
res_path = '10K_files'
download_type = '10-K'
amount = 1

# Ensure the directory and downloader exist
os.makedirs(res_path, exist_ok=True)
dl = Downloader(res_path)

In [6]:
# Loop over each ticker
for tic in tqdm(tics):
    # Check if the files were already downloaded
    tic_res_path = rf'{res_path}\sec-edgar-filings\{tic}\{download_type}'
    file_downloaded = (
        os.path.exists(tic_res_path) and len(os.listdir(tic_res_path)) >= amount
    )  # quick check
    if not file_downloaded:
        try:
            dl.get(download_type, tic, before=before, amount=amount)
        except Exception:
            print(f'Error on {tic}: {sys.exc_info()[2]}')

    # Check and delete any .txt files in path
    for file in glob.glob(tic_res_path + '/*/*.txt'):
        os.remove(file)

100%|██████████████████████████████████████| 503/503 [00:03<00:00, 152.28it/s]


### Check for Erroneous Filings

Checking the first ticker, A, we find that `sec_edgar_downloader` downloaded the filing for HEI.A (HEICO Corporation) instead of A (Agilent Technologies); many other tickers are mismatched in the same way.  In the loop below, we look up each company's name in the S&P500 table and check whether that name appears in the downloaded 10-K.

In [7]:
# Replace existing misspellings in sp500 table
sp500.loc[sp500['Symbol'] == 'GWW', ['Security']] = 'W.W. Grainger'
sp500.loc[sp500['Symbol'] == 'COO', ['Security']] = 'The Cooper Companies, Inc.'
sp500.loc[sp500['Symbol'] == 'HSY', ['Security']] = 'The Hershey Company'
sp500.loc[sp500['Symbol'] == 'MCO', ['Security']] = "Moody's Corporation"    # TODO
sp500.loc[sp500['Symbol'] == 'ORLY', ['Security']] = "O'Reilly Automotive, Inc."
sp500.loc[sp500['Symbol'] == 'PH', ['Security']] = 'Parker-Hannifin Corporation'
# TODO: General Motors?

In [8]:
# Count the filings whose text never mentions the expected company name
counter = 0

for tic in tics:
    # os.path.join keeps the lookup portable (the hard-coded backslash path
    # only resolved on Windows).
    tic_res_path = os.path.join(res_path, 'sec-edgar-filings', tic, download_type)

    if not os.path.exists(tic_res_path):
        print(f'\tCannot find ticker {tic}')
        continue

    # Get approximate name. The pattern depends only on the ticker, so it is
    # built and compiled once here rather than once per file.
    # NOTE: the name is deliberately used as a regex, not a literal -- any
    # remaining '.' in it matches an arbitrary character.
    tic_name = sp500.loc[sp500['Symbol'] == tic, 'Security'].values[0]
    appx_tic = re.sub(r'\s*&\s*', '.+', tic_name)          # '&' may be written out differently
    appx_tic = re.sub(r'\(.*\)', '', appx_tic)             # drop parenthesised text
    appx_tic = re.sub(r"[^\x00-\x7F]|'", '.', appx_tic)    # non ascii / apostrophes -> wildcard
    appx_tic = re.sub(r'(,\s*)?(Inc|Co)\.', '', appx_tic).strip()
    # Filings are read as bytes, so the pattern has to be bytes as well
    name_re = re.compile(appx_tic.encode('utf-8'), re.IGNORECASE)

    for file in glob.glob(os.path.join(tic_res_path, '*', '*.html')):
        with open(file, 'rb') as report_file:
            html = report_file.read()

        # Search in file
        if not name_re.search(html):
            print(f"Cannot find {appx_tic} in {tic}'s {download_type}")
            counter += 1

print(counter)

Cannot find Agilent Technologies in A's 10-K
Cannot find Allstate in ALL's 10-K
	Cannot find ticker BRK.B
Cannot find Bio-Rad in BIO's 10-K
Cannot find Bio-Techne in TECH's 10-K
Cannot find BNY Mellon in BK's 10-K
Cannot find Boeing in BA's 10-K
	Cannot find ticker BF.B
Cannot find Caterpillar in CAT's 10-K
Cannot find CF Industries in CF's 10-K
Cannot find Chipotle Mexican Grill in CMG's 10-K
Cannot find Chubb Limited in CB's 10-K
Cannot find Citigroup in C's 10-K
Cannot find Darden Restaurants in DRI's 10-K
Cannot find John Deere in DE's 10-K
Cannot find Dominion Energy in D's 10-K
	Cannot find ticker ELV
Cannot find Essex Property Trust in ESS's 10-K
Cannot find Fastenal in FAST's 10-K
	Cannot find ticker FRC
Cannot find FirstEnergy in FE's 10-K
Cannot find Ford Motor Company in F's 10-K
Cannot find Gartner in IT's 10-K
	Cannot find ticker GEHC
Cannot find Gen Digital in GEN's 10-K
Cannot find General Motors in GM's 10-K
Cannot find Healthpeak in PEAK's 10-K
Cannot find Host Hotels.