Step 1: Get a list of sample firms

In [2]:
import glob
import os

import pandas as pd
from sec_edgar_downloader import Downloader
from tqdm import tqdm 
import shutil

In [3]:
# Make sure the working folders exist before anything is written to them:
# "inputs" holds the firm list, "10k_files" holds the downloaded filings.
# exist_ok=True makes re-running this cell harmless.
for folder in ("inputs", "10k_files"):
    os.makedirs(folder, exist_ok=True)

In [4]:
# Step 1: S&P 500 constituents
# The table is cached as a CSV so the web page is only fetched on the first run.
sp500_file = 'inputs/sp500_2022.csv'

if not os.path.exists(sp500_file):
    # Pinned (oldid=...) December 2022 revision of the Wikipedia page, so the
    # sample does not change when the live article is edited.
    url = 'https://en.wikipedia.org/w/index.php?title=List_of_S%26P_500_companies&oldid=1130173030'
    # the first HTML table on the page is the one that gets saved
    constituents = pd.read_html(url)[0]
    constituents.to_csv(sp500_file, index=False)

# always load from the cached CSV, whether it was just written or already there
sp500 = pd.read_csv(sp500_file)

In [5]:
# show the loaded constituents table (a bare last expression is the cell's output)
sp500

Unnamed: 0,Symbol,Security,SEC filings,GICS Sector,GICS Sub-Industry,Headquarters Location,Date first added,CIK,Founded
0,MMM,3M,reports,Industrials,Industrial Conglomerates,"Saint Paul, Minnesota",1976-08-09,66740,1902
1,AOS,A. O. Smith,reports,Industrials,Building Products,"Milwaukee, Wisconsin",2017-07-26,91142,1916
2,ABT,Abbott,reports,Health Care,Health Care Equipment,"North Chicago, Illinois",1964-03-31,1800,1888
3,ABBV,AbbVie,reports,Health Care,Pharmaceuticals,"North Chicago, Illinois",2012-12-31,1551152,2013 (1888)
4,ACN,Accenture,reports,Information Technology,IT Consulting & Other Services,"Dublin, Ireland",2011-07-06,1467373,1989
...,...,...,...,...,...,...,...,...,...
498,YUM,Yum! Brands,reports,Consumer Discretionary,Restaurants,"Louisville, Kentucky",1997-10-06,1041061,1997
499,ZBRA,Zebra Technologies,reports,Information Technology,Electronic Equipment & Instruments,"Lincolnshire, Illinois",2019-12-23,877212,1969
500,ZBH,Zimmer Biomet,reports,Health Care,Health Care Equipment,"Warsaw, Indiana",2001-08-07,1136869,1927
501,ZION,Zions Bancorporation,reports,Financials,Regional Banks,"Salt Lake City, Utah",2001-06-22,109380,1873


In [6]:
# Downloader object used by the download loop to fetch filings.
# The arguments look like (organisation, contact e-mail, download folder) --
# TODO confirm against the sec-edgar-downloader docs. The last one matches the
# '10k_files/sec-edgar-filings/...' paths that the later cells search.
dl = Downloader("Lehigh", "muj225@lehigh.edu", "10k_files")

In [7]:
# Skip the whole download pass once the zipped archive exists.
already_archived = os.path.exists('10k_files/10k_files.zip')

if not already_archived:

    for cik in tqdm(sp500['CIK']):

        # Folder for this firm; the CIK is left-padded with zeros to 10 digits
        # (e.g. CIK 1234 becomes 0000001234).
        firm_folder = f'10k_files/sec-edgar-filings/{str(cik).zfill(10)}/'

        # Only request a filing when no HTML file is on disk yet for this firm,
        # so the cell can be re-run without downloading the same firm twice.
        # (If the sample covered many years, this check would have to look for
        # filings from the specific year instead.)
        html_on_disk = glob.glob(firm_folder + '/10-K/*/*.html')
        if not html_on_disk:
            dl.get(
                "10-K",
                cik,
                limit=1,                # get the latest filing within the window
                after="2022-01-01",     # TODO: check whether filings ON 1/1 are included
                before="2022-12-31",    # TODO: check whether filings ON 12/31 are included
                download_details=True,  # download the html
            )

        # Delete the txt files as we go (they match folder/10-K/*/*.txt).
        for raw_txt in glob.glob(firm_folder + '/10-K/*/*.txt'):
            os.remove(raw_txt)

        # No manual pause between requests is added here: per the original
        # note, sec-edgar-downloader takes care of that itself.

In [8]:
# Downloaded HTML filings sit at <firm folder>/10-K/<filing folder>/<name>.html
html_pattern = '10k_files/sec-edgar-filings/*/10-K/*/*.html'
files = glob.glob(html_pattern)

# progress check: HTML files on disk vs. rows in the firm list
f'We have {len(files)} HTML files for {len(sp500["CIK"])} firms'

'We have 498 HTML files for 503 firms'

In [9]:
## One-shot code: run it once, after all downloads have completed.
# Leave this as True only when downloading is finished. Check first that the
# folder holds ~500ish html files, and take the screenshot from the instructions.
done_with_downloads = True

have_filings = os.path.exists('10k_files/sec-edgar-filings')
have_archive = os.path.exists('10k_files/10k_files.zip')

if done_with_downloads and have_filings and not have_archive:

    # Zip the contents of the 10k_files folder (2GB --> 150mb). The archive is
    # first written to the working directory as 10k_files.zip.
    shutil.make_archive('10k_files', 'zip', '10k_files')

    # The unzipped filings are kept for now so the 10-Ks stay easy to open.
    # To delete the folder with all the files, uncomment:
    #shutil.rmtree('10k_files/sec-edgar-filings')

    # Move the zip file into the `10k_files` folder.
    shutil.move('10k_files.zip', '10k_files/')