<font size="6"> **Download SEC 10-K Filings** </font>

In [1]:
# Enable IPython's autoreload extension; mode 2 reloads imported modules before
# each cell runs, so edits to the `src` package are picked up without a restart.
%load_ext autoreload
%autoreload 2

In [2]:
# Run the shared notebook configuration script from the parent directory.
# NOTE(review): what it sets up is not visible from this notebook -- presumably
# it makes the `src` package importable; confirm.
%run ../nb_config.py

In [141]:
import datetime as dt
import gzip
import os
import pickle
import pprint

import numpy as np
import pandas as pd
import scipy
from tqdm import tqdm

from src import utils
from src.load_data import load_sec10k, io_utils
from src.nlp_quant import parse_sec_fillings

In [4]:
# Load the project configuration; the input file name (INFILE1) is read from it below.
cfg = utils.read_conf()

In [76]:
# Directory layout used by this notebook. Joining with a trailing '' makes every
# path end with a separator, so file names can be appended directly.
INPATH = os.path.join(io_utils.interim_path, '')                 # interim inputs
OUTPATH1 = os.path.join(io_utils.raw_path, 'sec_fillings', '')   # raw SEC data
OUTPATH2 = os.path.join(OUTPATH1, 'ten_ks', '')                  # raw 10-K documents
OUTPATH3 = os.path.join(io_utils.interim_path, 'sec_fillings', '')
OUTPATH4 = os.path.join(OUTPATH3, 'ten_ks', '')                  # parsed 10-K sections

# Sanity check: the directories that are read from / written to must already exist.
tuple(os.path.isdir(directory) for directory in (INPATH, OUTPATH2, OUTPATH4))

(True, True, True)

In [6]:
# Input: sector-factor file name taken from the project configuration.
INFILE1 = cfg['output']['interim']['sector_factors']
# Output pickle file names (metadata and the raw 10-K documents).
OUTFILE1 = 'metadata.pkl'
OUTFILE2 = 'sec_10k.pkl'

# Get target tickers

In [9]:
# Ticker -> CIK mapping, indexed by ticker so it can be joined on the index.
tickers_to_cik = load_sec10k.get_cik_mapping().set_index('ticker')
tickers_to_cik.head(2)

Unnamed: 0_level_0,cik
ticker,Unnamed: 1_level_1
AAPL,320193
MSFT,789019


In [10]:
# Build the ticker universe in one chain: one row per (ticker, sector_code)
# pair found in the sector-factor file, indexed by ticker.
tickers_univ = (
    pd.read_csv(INPATH + INFILE1, parse_dates=['date'])
    [['asset', 'sector_code']]
    .drop_duplicates()
    .rename(columns={'asset': 'ticker'})
    .set_index('ticker')
)
tickers_univ.head(2)

Unnamed: 0_level_0,sector_code
ticker,Unnamed: 1_level_1
A,0
AAL,3


In [11]:
# Join on the ticker index (DataFrame.join defaults to a left join), so every
# universe ticker is kept and those without a CIK get NaN.
cik_lookup_df = tickers_univ.join(tickers_to_cik)


In [12]:
# Universe tickers for which no CIK was found.
cik_lookup_nans = cik_lookup_df[cik_lookup_df['cik'].isna()]
cik_lookup_nans.shape

(32, 2)

In [13]:
# Random peek at the unmatched tickers (no random_state is set, so the rows
# shown change on every run).
cik_lookup_nans.sample(5)

Unnamed: 0_level_0,sector_code,cik
ticker,Unnamed: 1_level_1,Unnamed: 2_level_1
TWX,8,
GGP,6,
SPLS,8,
APC,9,
PX,10,


In [14]:
# ticker -> CIK dict restricted to the tickers that have a CIK.
cik_lookup = cik_lookup_df['cik'].dropna().to_dict()
len(cik_lookup)

458

# Get 10ks
We'll be running NLP analysis on 10-K documents. To do that, we first need to download the documents. For this project, we'll download 10-Ks for a few companies. To look up documents for these companies, we'll use their CIK. If you would like to run this against other stocks, we've provided the dict `additional_cik` for more stocks. However, the more stocks you try, the longer it will take to run.

In [43]:
# Filing type to request from the SEC.
DOC_TYPE = '10-K'
# Passed below as `oldest_filling_date` when downloading.
START_DT = '1998-01-01'

In [20]:
# Ticker used for the illustrative outputs throughout the notebook.
example_ticker = 'AMZN'

## Get list of 10-ks urls
The SEC has a limit on the number of calls that can be made to the website per second. The `SecAPI` class will cache data from the SEC and prevent you from going over the limit.

In [16]:
# Single shared client, reused for every SEC request made below.
sec_api = load_sec10k.SecAPI()

With the class constructed, let's pull a list of filed 10-Ks from the SEC for each company.

Let's pull the list using the `get_sec_data` function, then display some of the results. For displaying some of the data, we'll use Amazon as an example. 

In [65]:
# Per ticker: the list of filing entries returned by the SEC lookup, and the
# filing dates taken from the third field of each entry.
sec_data, sec_dates = {}, {}
for ticker, cik in cik_lookup.items():
    ticker_fillings = load_sec10k.get_sec_data(sec_api=sec_api, cik=cik, doc_type=DOC_TYPE)
    sec_data[ticker] = ticker_fillings
    sec_dates[ticker] = [entry[2] for entry in ticker_fillings]

In [66]:
# Filing dates retrieved for the example ticker.
sec_dates[example_ticker]

['2020-01-31',
 '2019-02-01',
 '2018-02-02',
 '2017-02-10',
 '2016-01-29',
 '2015-01-30',
 '2014-01-31',
 '2013-01-30',
 '2012-02-01',
 '2011-02-28',
 '2011-01-28',
 '2010-01-29',
 '2009-01-30',
 '2008-02-11',
 '2007-02-16',
 '2006-02-17',
 '2005-03-11',
 '2004-02-25',
 '2003-02-19',
 '2002-01-24',
 '2001-03-23',
 '2000-09-08',
 '2000-03-29',
 '1999-03-05',
 '1998-03-30']

In [67]:
# First five entries for the example ticker: (index URL, filing type, filing date).
pprint.pprint(sec_data[example_ticker][:5])

[('https://www.sec.gov/Archives/edgar/data/1018724/000101872420000004/0001018724-20-000004-index.htm',
  '10-K',
  '2020-01-31'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872419000004/0001018724-19-000004-index.htm',
  '10-K',
  '2019-02-01'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872418000005/0001018724-18-000005-index.htm',
  '10-K',
  '2018-02-02'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872417000011/0001018724-17-000011-index.htm',
  '10-K',
  '2017-02-10'),
 ('https://www.sec.gov/Archives/edgar/data/1018724/000101872416000172/0001018724-16-000172-index.htm',
  '10-K',
  '2016-01-29')]


In [68]:
# One entry per ticker in cik_lookup.
len(sec_data)

458

## Download 10-ks
As you can see, this is a list of URLs. These URLs point to a file that contains metadata related to each filing. Since we don't care about the metadata, we'll pull the filing by replacing the URL with the filing URL.

In [69]:
# Download the filings listed in sec_data and store them under OUTPATH2
# (one progress bar per ticker).
# NOTE(review): presumably filings older than START_DT are skipped -- confirm in load_sec10k.
load_sec10k.run_download_and_parse(sec_data, sec_api, OUTPATH2, doc_type=DOC_TYPE, oldest_filling_date = START_DT)

Downloading A Fillings: 100%|██████████| 22/22 [00:13<00:00,  1.57filling/s]
Downloading AAL Fillings: 100%|██████████| 31/31 [00:13<00:00,  2.33filling/s]
Downloading AAP Fillings: 100%|██████████| 19/19 [00:13<00:00,  1.45filling/s]
Downloading AAPL Fillings: 100%|██████████| 28/28 [00:09<00:00,  2.83filling/s]
Downloading ABBV Fillings: 100%|██████████| 9/9 [00:04<00:00,  1.95filling/s]
Downloading ABC Fillings: 100%|██████████| 20/20 [00:08<00:00,  2.36filling/s]
Downloading ABT Fillings: 100%|██████████| 28/28 [00:10<00:00,  2.66filling/s]
Downloading ACN Fillings: 100%|██████████| 12/12 [00:05<00:00,  2.36filling/s]
Downloading ADBE Fillings: 100%|██████████| 29/29 [00:12<00:00,  2.40filling/s]
Downloading ADI Fillings: 100%|██████████| 26/26 [00:12<00:00,  2.15filling/s]
Downloading ADM Fillings: 100%|██████████| 32/32 [00:13<00:00,  2.29filling/s]
Downloading ADP Fillings: 100%|██████████| 27/27 [00:11<00:00,  2.31filling/s]
Downloading ADS Fillings: 100%|██████████| 20/20 [00:

# Parse Documents

In [138]:
# Scratch check of the section-header pattern. It matches either
#   ">Item" + (whitespace | "&#160;" | "&nbsp;") + 1A/7A/7 with an optional ".", or
#   "ITEM" + one whitespace character + 1A/7A/7.
import re
re_risk = re.compile(r'(>Item(\s|&#160;|&nbsp;)(1A|7A|7)\.{0,1})|(ITEM\s(1A|7A|7))')
# finditer returns a lazy iterator over the matches in the given string.
matches = re_risk.finditer('asdr')

In [140]:
# Materialise (matched text, start, end) for each match. `matches` is a one-shot
# iterator, so re-running this cell alone yields an empty list.
[(x.group(), x.start(), x.end()) for x in matches]

[]

In [163]:
# Stored files are named <ticker>_<doc>_<date>.<ext>; split the first directory
# entry to illustrate. os.listdir returns entries in arbitrary order, and the
# last part still carries the file extension here.
ticker, doc, date = os.listdir(OUTPATH2)[0].split("_")
ticker, doc, date

('aal', '10k', '20030415.gz')

In [172]:
def get_risk_sections_and_parse(inpath, outpath, write_gzip = True, max_files = 10):
    """Extract the risk-related sections of stored 10-K files and write them out.

    Every processed file in ``inpath`` is expected to be gzip-compressed and
    named ``<ticker>_<doc_type>_<YYYYMMDD>.<ext>``. For each file the section
    positions are located with ``parse_sec_fillings.get_10k_risk_sections_df``,
    the section texts are pulled with ``parse_sec_fillings.get_section_text``,
    cleaned with ``parse_sec_fillings.clean_text`` and joined with a single
    space before being written to ``outpath``.

    Parameters
    ----------
    inpath : str
        Directory holding the gzip-compressed input documents.
    outpath : str
        Directory receiving the extracted text.
    write_gzip : bool, default True
        If True, write ``<outpath>/<input file name>`` gzip-compressed;
        otherwise write plain text to ``<outpath>/<name up to first dot>.txt``.
    max_files : int or None, default 10
        Process only the first ``max_files`` entries of the sorted directory
        listing (10 was the previously hard-coded limit). None processes all.

    Returns
    -------
    pandas.DataFrame
        Concatenation of the per-document section-position frames, each
        extended with ``ticker``, ``doc_type`` and ``date`` columns. An empty
        frame is returned when there is nothing to process.

    Raises
    ------
    ValueError
        If a file name does not split into exactly three ``_``-separated parts
        or its date part is not ``%Y%m%d``.
    """
    # Sort so that the subset selected by `max_files` is deterministic;
    # os.listdir gives no ordering guarantee.
    file_names = sorted(os.listdir(inpath))
    if max_files is not None:
        file_names = file_names[:max_files]

    control_lst = []
    for file in file_names:
        ticker, doc_type, date = file.split("_")
        date = date.split(".")[0]  # drop the file extension
        with gzip.open(os.path.join(inpath, file), "rb") as f:
            doc = f.read().decode()

        tenk_risk_pos_dat = parse_sec_fillings.get_10k_risk_sections_df(text=doc)
        tenk_risk_sections = parse_sec_fillings.get_section_text(text=doc,
            pos_dat=tenk_risk_pos_dat)

        # Keep the *cleaned* text of every section (the cleaned value used to be
        # computed and then discarded in favour of the raw section).
        doc_clean = " ".join(
            parse_sec_fillings.clean_text(section)
            for section in tenk_risk_sections.values()
        )
        print(len(doc), len(doc_clean))

        if write_gzip:
            with gzip.open(os.path.join(outpath, file), "wb") as gzip_text_file:
                gzip_text_file.write(doc_clean.encode())
        else:
            txt_name = file.split(".")[0] + ".txt"
            # utf-8 matches the default codec used for the gzip branch above.
            with open(os.path.join(outpath, txt_name), "w", encoding="utf-8") as text_file:
                text_file.write(doc_clean)

        # Tag the position table with the document it came from.
        tenk_risk_pos_dat['ticker'] = ticker
        tenk_risk_pos_dat['doc_type'] = doc_type
        tenk_risk_pos_dat['date'] = dt.datetime.strptime(date, "%Y%m%d")

        control_lst.append(tenk_risk_pos_dat)

    # pd.concat raises on an empty list, so short-circuit for empty directories.
    if not control_lst:
        return pd.DataFrame()
    return pd.concat(control_lst, axis=0)
        

In [177]:
# Note: this calls the implementation in the parse_sec_fillings module, not the
# function of the same name defined in the notebook cell above.
# Reads the raw 10-Ks from OUTPATH2 and writes the extracted sections to OUTPATH4.
control_df = parse_sec_fillings.get_risk_sections_and_parse(inpath=OUTPATH2, outpath=OUTPATH4, write_gzip=True)

870154 590475
843334 535976
919571 607539
894119 825319
906911 840161
2471099 1710876
2759974 2640788
2581059 1792606
No match!
1471040 1471040
No match!
2630469 2630469


In [178]:
# Section positions per document, tagged with ticker / doc_type / date.
control_df.head()

Unnamed: 0_level_0,start,end,next_start,ticker,doc_type,date
item,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
item7,279673,279679,870154.0,aal,10k,2003-04-15
item7,307352,307358,843334.0,aal,10k,2004-02-27
item7,312026,312032,919571.0,aal,10k,2005-02-25
item1a,68794,68801,322221.0,aal,10k,2006-02-24
item7,322215,322221,894119.0,aal,10k,2006-02-24
item1a,66744,66751,310408.0,aal,10k,2007-02-23
item7,310402,310408,906911.0,aal,10k,2007-02-23
item\n7,760217,760223,2471099.0,aal,10k,2008-02-20
item\n1a,119180,119187,851156.0,aal,10k,2009-02-19
item\n7,851150,851156,2759974.0,aal,10k,2009-02-19


In [None]:
# Download every filing of type DOC_TYPE, keep its raw text in memory (later
# cells read raw_fillings_by_ticker[ticker][file_date]) and write each embedded
# DOC_TYPE document to disk as a text file.
raw_fillings_by_ticker = {}

# NOTE(review): `OUTPATH` is never defined in this notebook; OUTPATH1 (the raw
# sec_fillings directory) is assumed to be the intended target -- confirm.
path = OUTPATH1

for ticker, data in sec_data.items():
    # One dict per ticker, created once (it used to be reset on every iteration
    # and never filled).
    raw_fillings_by_ticker[ticker] = {}
    for index_url, file_type, file_date in tqdm(data, desc='Downloading {} Fillings'.format(ticker), unit='filling'):
        if file_type != DOC_TYPE:
            continue
        # The index page URL differs from the full-text filing only by its suffix.
        file_url = index_url.replace('-index.htm', '.txt').replace('.txtl', '.txt')

        raw_filling = sec_api.get(file_url)
        raw_fillings_by_ticker[ticker][file_date] = raw_filling

        for document in load_sec10k.get_documents(raw_filling):
            # Compare case-insensitively: the notebook text describes
            # get_document_type as returning lowercase, while DOC_TYPE is '10-K'.
            if load_sec10k.get_document_type(document).lower() == DOC_TYPE.lower():
                file_name = f"{ticker}_{DOC_TYPE}_{file_date.replace('-', '_')}.txt"
                with open(os.path.join(path, file_name), "w") as text_file:
                    text_file.write(document)

In [None]:
# Print the first 1000 characters of the first raw filing stored for the example ticker.
print('Example Document:\n\n{}...'.format(next(iter(raw_fillings_by_ticker[example_ticker].values()))[:1000]))

# Parse Documents

## Get Documents
Each filing is broken into several associated documents, sectioned off in the filing with the tags `<DOCUMENT>` and `</DOCUMENT>`. There's no overlap between these documents, so each `</DOCUMENT>` tag should come after its `<DOCUMENT>` tag with no other `<DOCUMENT>` tag in between.


In [None]:
# Split every raw filing into its embedded documents:
# filling_documents_by_ticker[ticker][file_date] -> list of documents.
filling_documents_by_ticker = {}

for ticker, raw_fillings in raw_fillings_by_ticker.items():
    progress = tqdm(raw_fillings.items(), desc='Getting Documents from {} Fillings'.format(ticker), unit='filling')
    filling_documents_by_ticker[ticker] = {
        file_date: load_sec10k.get_documents(filling)
        for file_date, filling in progress
    }

In [None]:
# Preview (first 200 characters) of the first three documents of the example ticker.
document_previews = []
for file_date, docs in filling_documents_by_ticker[example_ticker].items():
    for doc_i, doc in enumerate(docs):
        document_previews.append(
            'Document {} Filed on {}:\n{}...'.format(doc_i, file_date, doc[:200]))
print('\n\n'.join(document_previews[:3]))

## Get Document Types
Now that we have all the documents, we want to find the 10-k form in this 10-k filing. Implement the `get_document_type` function to return the type of document given. The document type is located on a line with the `<TYPE>` tag. For example, a form of type "TEST" would have the line `<TYPE>TEST`. Make sure to return the type as lowercase, so this example would be returned as "test".

With the `get_document_type` function, we'll filter out all non 10-k documents.

In [None]:
# Keep only the documents whose type is DOC_TYPE.
# ten_ks_by_ticker[ticker] -> list of {'cik', 'file', 'file_date'} records;
# tenk_dates[ticker]       -> filing dates of those records, in the same order.
ten_ks_by_ticker = {}
tenk_dates = {}

for ticker, filling_documents in filling_documents_by_ticker.items():
    ten_ks_by_ticker[ticker] = []
    tenk_dates[ticker] = []
    for file_date, documents in filling_documents.items():
        for document in documents:
            # Compare case-insensitively: the notebook text describes
            # get_document_type as returning lowercase, while DOC_TYPE is '10-K'.
            if load_sec10k.get_document_type(document).lower() == DOC_TYPE.lower():
                ten_ks_by_ticker[ticker].append({
                    # Use the notebook's own ticker -> CIK dict (built above from
                    # the universe) rather than an attribute of load_sec10k.
                    'cik': cik_lookup[ticker],
                    'file': document,
                    'file_date': file_date})
                tenk_dates[ticker].append(file_date)
                

In [None]:
# Show the first five 10-K records of the example ticker.
# NOTE(review): the list argument presumably selects the fields to print -- confirm in load_sec10k.
load_sec10k.print_ten_k_data(ten_ks_by_ticker[example_ticker][:5], ['cik', 'file', 'file_date'])

In [None]:
# Compare the number of extracted 10-K documents with the number of listed filings.
len(tenk_dates[example_ticker]) == len(sec_dates[example_ticker])  # some filings do not contain a 10-K document

In [None]:
# Filing dates of the first and last stored 10-K record for the example ticker.
ten_ks_by_ticker[example_ticker][0]['file_date'], ten_ks_by_ticker[example_ticker][-1]['file_date']

In [None]:
# Same check against the parallel list of dates.
tenk_dates[example_ticker][0], tenk_dates[example_ticker][-1]

In [None]:
# First and last filing date as listed by the SEC lookup, for comparison.
sec_dates[example_ticker][0], sec_dates[example_ticker][-1]

In [None]:
# First 1000 characters of the fifth stored 10-K document of the example ticker.
ten_ks_by_ticker[example_ticker][4]['file'][:1000]

# Write Raw 10Ks

In [None]:
# Bundle what is needed to interpret the pickled 10-Ks: the document type,
# the filing dates per ticker and the ticker -> CIK mapping.
metadata = dict(
    doc_type=DOC_TYPE,
    file_dates=tenk_dates,
    tickers=cik_lookup,
)

In [None]:
# NOTE(review): `OUTPATH` is never defined in this notebook; OUTPATH1 (the raw
# sec_fillings directory) is assumed to be the intended target -- confirm.
with open(os.path.join(OUTPATH1, OUTFILE1), 'wb') as file:
    pickle.dump(metadata, file)

In [None]:
# NOTE(review): `OUTPATH` is never defined in this notebook; OUTPATH1 (the raw
# sec_fillings directory) is assumed to be the intended target -- confirm.
with open(os.path.join(OUTPATH1, OUTFILE2), 'wb') as file:
    pickle.dump(ten_ks_by_ticker, file)