In [10]:
import os
from tqdm import tqdm
import pandas as pd

In [20]:
# Pull the tables from the Wikipedia S&P 500 constituents page.
sp500_url = 'https://en.wikipedia.org/wiki/List_of_S%26P_500_companies'
data_table = pd.read_html(io=sp500_url)

# read_html returns a list of DataFrames; the first one is used here,
# re-indexed by its "Symbol" column.
first_table = data_table[0]
df = first_table.set_index(keys="Symbol")

def map_fn(cik):
    """Return ``cik`` as a string, left-padded with zeros to 10 characters.

    The value is converted with ``str`` first. Text that is already 10 or
    more characters long is returned unchanged.
    """
    # rjust only pads when the text is shorter than the requested width.
    return str(cik).rjust(10, '0')
        

# Rewrite the CIK column in its 10-character, zero-padded string form.
padded_cik = df['CIK'].map(map_fn)
df['CIK'] = padded_cik

# Preview the first few normalised identifiers.
df.CIK.head()

Symbol
MMM     0000066740
ABT     0000001800
ABBV    0001551152
ABMD    0000815094
ACN     0001467373
Name: CIK, dtype: object

In [30]:
# Plain dict from the DataFrame index (ticker symbol) to the CIK string.
cik_lookup = df['CIK'].to_dict()

'0001018724'

In [2]:
"""
cik_lookup = {
    'AMZN': '0001018724',
    'BMY': '0000014272',   
    'CNP': '0001130310',
    'CVX': '0000093410',
    'FL': '0000850209',
    'FRT': '0000034903',
    'HON': '0000773840'}
    
"""

In [3]:
# import requests as req
from bs4 import BeautifulSoup
import requests

def get_sec_data(cik, doc_type, start=0, count=60, headers=None):
    """Fetch one page of a company's EDGAR filing list as plain tuples.

    Queries the EDGAR ``browse-edgar`` Atom feed and extracts, for every
    top-level ``<entry>`` of the feed, the filing index URL, the filing type
    and the filing date.

    Parameters
    ----------
    cik : str
        Company identifier, sent as the ``CIK`` query parameter.
    doc_type : str
        Filing type filter (e.g. ``'10-K'``), sent as ``type``.
    start : int, optional
        Sent as the ``start`` query parameter.
    count : int, optional
        Sent as the ``count`` query parameter.
    headers : dict, optional
        Extra HTTP headers forwarded to ``requests.get``.
        NOTE(review): SEC appears to expect automated clients to send a
        descriptive ``User-Agent`` -- confirm and pass one here if so.

    Returns
    -------
    list of tuple
        ``(filing_href, filing_type, filing_date)`` strings, one tuple per
        entry. Empty when the response contains no ``<feed>`` element.

    Raises
    ------
    requests.HTTPError
        If the server answers with a 4xx/5xx status.
    """
    # Let requests build and encode the query string instead of hand-formatting it.
    query = {
        'action': 'getcompany',
        'CIK': cik,
        'type': doc_type,
        'start': start,
        'count': count,
        'owner': 'exclude',
        'output': 'atom',
    }
    response = requests.get(
        'https://www.sec.gov/cgi-bin/browse-edgar',
        params=query,
        headers=headers,
        timeout=5)

    # Surface HTTP failures here rather than as an AttributeError further
    # down when the error page turns out to contain no feed.
    response.raise_for_status()

    feed = BeautifulSoup(response.content, "xml").feed
    if feed is None:
        # Successful response without an Atom feed: nothing to list.
        return []

    entries = [
        (
            entry.content.find('filing-href').getText(),
            entry.content.find('filing-type').getText(),
            entry.content.find('filing-date').getText())
        for entry in feed.find_all('entry', recursive=False)]

    return entries

In [5]:
import pprint

# Filing listings per ticker: sec_data[ticker] is whatever get_sec_data
# returns for that ticker's CIK with the '10-K' type filter.
# To inspect a single ticker: pprint.pprint(sec_data['AMZN'])
sec_data = {}

for ticker in cik_lookup:
    cik = cik_lookup[ticker]
    sec_data[ticker] = get_sec_data(cik, '10-K')

In [6]:

# Download the text document behind every '10-K' entry in sec_data.
# Result layout: raw_fillings_by_ticker[ticker][file_date] -> document text.
raw_fillings_by_ticker = {}

for ticker, data in sec_data.items():
    raw_fillings_by_ticker[ticker] = {}
    for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling'):
        # Only entries whose type is exactly '10-K' are downloaded.
        if file_type != '10-K':
            continue

        # Swap the "-index.htm" suffix of the index page for ".txt"; the
        # second replace trims the stray "l" left when the suffix was
        # "-index.html".
        file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')

        response = requests.get(file_url, timeout=5)
        # Fail loudly instead of storing an HTTP error page as filing text.
        response.raise_for_status()
        raw_fillings_by_ticker[ticker][file_date] = response.content.decode("utf-8")


Downloading AMZN Fillings: 100%|██████████| 24/24 [00:13<00:00,  1.81filling/s]
Downloading BMY Fillings: 100%|██████████| 29/29 [00:15<00:00,  2.17filling/s]
Downloading CNP Fillings: 100%|██████████| 21/21 [00:09<00:00,  2.22filling/s]
Downloading CVX Fillings: 100%|██████████| 27/27 [00:14<00:00,  2.25filling/s]
Downloading FL Fillings: 100%|██████████| 24/24 [00:11<00:00,  2.15filling/s]
Downloading FRT Fillings: 100%|██████████| 31/31 [00:11<00:00,  4.03filling/s]
Downloading HON Fillings: 100%|██████████| 27/27 [00:13<00:00,  2.03filling/s]


In [31]:
# Write every downloaded filing to <cwd>/10-K/<ticker>/<filing date>.htm.
for tix, fillings_by_date in raw_fillings_by_ticker.items():

    path = os.path.join(os.getcwd(), '10-K', tix)

    # makedirs also creates the parent "10-K" directory when it is missing
    # (os.mkdir would raise FileNotFoundError) and does nothing when the
    # ticker directory already exists.
    os.makedirs(path, exist_ok=True)

    for date, filling_text in fillings_by_date.items():

        new_path = os.path.join(path, date + '.htm')

        # Explicit UTF-8 so the write does not depend on the platform's
        # default encoding, which may be unable to represent the text.
        with open(new_path, "w", encoding="utf-8") as text_file:
            text_file.write(filling_text)