In [22]:
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import os
import re
import html2text
import pickle
from sec_edgar_downloader import Downloader

First, let's take a look at which stocks in particular we need to parse. To be clear, not all companies that reported are publicly traded, so there is no point in doing more work than necessary for this case. We know that only the following CIKs have associated stock prices:

In [23]:
valid_CIKS = [948846, 6201, 1362468,766421,1011696,1029863,
              27904,921929,1351548,914397,1172222,1158463,
              701345,92380,948845,810332,1058033,1498710,
              793733,100517,1088734,1144331,1614436]

In [25]:
# Sanity check
len(valid_CIKS)

23

Looks good! So the only universe of stocks we will need to parse are the ones listed above. Let's convert them back to the form with padded zeros:

In [36]:
str(valid_CIKS[0]).zfill(10)

'0000948846'

In [38]:
str(valid_CIKS[0]).zfill(10) in list(os.listdir('sec-edgar-filings/'))

True

In [42]:
CIKS = [str(valid_CIKS[i]).zfill(10) for i in range(len(valid_CIKS))]
CIKS

['0000948846',
 '0000006201',
 '0001362468',
 '0000766421',
 '0001011696',
 '0001029863',
 '0000027904',
 '0000921929',
 '0001351548',
 '0000914397',
 '0001172222',
 '0001158463',
 '0000701345',
 '0000092380',
 '0000948845',
 '0000810332',
 '0001058033',
 '0001498710',
 '0000793733',
 '0000100517',
 '0001088734',
 '0001144331',
 '0001614436']

# AAL Sample

In [41]:
# Get the most recent filing
f = open("sec-edgar-filings/AAL/10-K/0001193125-15-061145/filing-details.html", "r")
raw_10k = f.read()

In [31]:
matches = re.finditer('..>ITEM.{1,10}8(|A)[^a-zA-Z]{1,10}(consolidated|financial)', raw_10k, re.IGNORECASE)
locations = [x for x in matches]
locations[:3]

[<re.Match object; span=(1614421, 1614455), match='/a>ITEM&#160;8A.&#160;CONSOLIDATED'>]

In [43]:
# Check each filing to make sure there's a match
for filing in os.listdir("sec-edgar-filings/AAL/10-K/"):
    
    locs = []
    
    if filing == '.DS_Store':
        continue
        
    print(filing)
    matches = re.finditer('..>ITEM.{1,10}7[^a-zA-Z]{1,10}management', raw_10k, re.IGNORECASE)
    print([x for x in matches])
    
    matches = re.finditer('..>ITEM.{1,10}8(|A)[^a-zA-Z]{1,10}(consolidated|financial)', raw_10k, re.IGNORECASE)
    print([x for x in matches])
    print('\n')

0000004515-08-000014
[<re.Match object; span=(723646, 723672), match='/a>ITEM&#160;7. MANAGEMENT'>]
[<re.Match object; span=(1614421, 1614455), match='/a>ITEM&#160;8A.&#160;CONSOLIDATED'>]


0000006201-20-000023
[<re.Match object; span=(723646, 723672), match='/a>ITEM&#160;7. MANAGEMENT'>]
[<re.Match object; span=(1614421, 1614455), match='/a>ITEM&#160;8A.&#160;CONSOLIDATED'>]


0000950134-06-003715
[<re.Match object; span=(723646, 723672), match='/a>ITEM&#160;7. MANAGEMENT'>]
[<re.Match object; span=(1614421, 1614455), match='/a>ITEM&#160;8A.&#160;CONSOLIDATED'>]


0001047469-03-013301
[<re.Match object; span=(723646, 723672), match='/a>ITEM&#160;7. MANAGEMENT'>]
[<re.Match object; span=(1614421, 1614455), match='/a>ITEM&#160;8A.&#160;CONSOLIDATED'>]


0000950134-05-003726
[<re.Match object; span=(723646, 723672), match='/a>ITEM&#160;7. MANAGEMENT'>]
[<re.Match object; span=(1614421, 1614455), match='/a>ITEM&#160;8A.&#160;CONSOLIDATED'>]


0000950134-04-002668
[<re.Match object; span=

# Extrapolate to All Valid Stocks

In [492]:
# Do the same for all files
for company in CIKS:
    
    logstr = ''
    
    if company == '.DS_Store' or company == 'AAL':
        continue
        
    pulls = os.listdir("sec-edgar-filings/"+company+"/10-K/")
    
    a = pd.DataFrame()
    
    
    for year in pulls:
        if year == '.DS_Store':
            continue
        #print(year)

        try:
            f = open("sec-edgar-filings/"+company+"/10-K/"+year+"/filing-details.html", "r")
        except:
            f = open("sec-edgar-filings/"+company+"/10-K/"+year+"/filing-details.txt", "r")
        raw_10k = f.read()
        
        matches1 = re.finditer(r"(?s).>IT.{0,20}EM.{1,20}7[^A].{1,400}MANAGEMENT", 
                               raw_10k, re.IGNORECASE)
        
        matches2 = re.finditer(r"(?s).>IT.{0,20}EM.{1,20}8([^B]|A).{1,400}(CONSOLIDATED|FINANCIAL)", 
                               raw_10k, re.IGNORECASE)
        
        logstr += year + '\n'
        logstr += str([x for x in matches1])+ '\n'
        logstr += str([x for x in matches2])+ '\n'
        logstr += '\n'
        
    print(logstr, file=open("logs/"+company+".txt", "w"))

In [493]:
pulls

['0001193125-15-073939', '0001193125-16-485622']

In [47]:
os.listdir("sec-edgar-filings/"+company+"/10-K/")

['0001193125-15-073939', '0001193125-16-485622']

# Debug

In [17]:
company = '0001362468'
year = '0001047469-10-001935'

f = open("sec-edgar-filings/"+company+"/10-K/"+year+"/filing-details.html", "r")
raw_10k = str(f.read())

In [18]:
matches1 = re.finditer(r"(?s).IT.{0,20}EM.{1,20}7[^a].{1,400}m", raw_10k, re.IGNORECASE)
a = [x for x in matches1]
print(a)

[<re.Match object; span=(23925, 24321), match='_item_7._management_s_discussio__ite03668"><p sty>, <re.Match object; span=(24644, 25008), match='_item_7._management_s_discussio__ite03668"><font >, <re.Match object; span=(372072, 372350), match='_item_7._management_s_discussio__ite03668"> </a>\>, <re.Match object; span=(576697, 577074), match='with the remaining 17 aircraft owned free and cle>]


In [19]:
locs = [x.start() for x in a]
locs

[23925, 24644, 372072, 576697]

In [20]:
raw_10k[locs[0]-100:locs[0]+100]

'2"><br/>\n26</font></a></td>\n</tr>\n<tr valign="TOP">\n<td style="font-family:times;"><a href="#dk41401_item_7._management_s_discussio__ite03668"><p style="font-family:times;margin-top:12pt;margin-left:0'

In [21]:
matches2 = re.finditer('(?s).>IT.{0,20}EM.{1,20}8(|A).{1,400}FINANCIAL', raw_10k, re.IGNORECASE)
b = [x for x in matches2]
print(b)

[]


In [168]:
locs = [x.start() for x in b]
locs

[40459]

In [172]:
raw_10k[locs[0]-200:locs[0]+400]

'p style="margin-bottom:0pt;margin-top:0pt;margin-left:10pt;;text-indent:-10pt;;font-size:10pt;font-family:Times New Roman;font-weight:normal;font-style:normal;text-transform:none;font-variant: normal;">Item&#160;8.&#160;</p></td>\n<td style="padding-left:0pt;padding-Right:0pt;padding-Top:0pt;padding-Bottom:0pt;width:79.34%;" valign="bottom">\n<p style="margin-bottom:0pt;margin-top:0pt;margin-left:0pt;;text-indent:0pt;;font-size:10pt;font-family:Times New Roman;font-weight:normal;font-style:normal;text-transform:none;font-variant: normal;"><a href="#ITEM8_FINANCIAL_STATEMENTS"><font style="text-d'

In [242]:
# coding=utf8
# the above tag defines encoding for this document and is for Python 2.x compatibility

import re

regex = r".>ITEM.{1,20}7.*(?<=MANAGEMENT)"

test_str = "'font style=\"DISPLAY: inline; FONT-FAMILY: times new roman; FONT-SIZE: 10pt\">&#160;&#160;<a href=\"#mda\">ITEM&#160;7.</a></font></div>\\n</td>\\n<td align=\"left\" colspan=\"3\" valign=\"top\" width=\"83%\">\\n<div align=\"left\" style=\"TEXT-INDENT: 0pt; DISPLAY: block; MARGIN-LEFT: 18pt; MARGIN-RIGHT: 0pt\"><font style=\"DISPLAY: inline; FONT-FAMILY: times new roman; FONT-SIZE: 10pt\"><a href=\"#mda\">MANAGEMENT&#8217;'"

matches = re.finditer(regex, raw_10k, re.MULTILINE)

for matchNum, match in enumerate(matches, start=1):
    
    print ("Match {matchNum} was found at {start}-{end}: {match}".format(matchNum = matchNum, start = match.start(), end = match.end(), match = match.group()))
    
    for groupNum in range(0, len(match.groups())):
        groupNum = groupNum + 1
        
        print ("Group {groupNum} found at {start}-{end}: {group}".format(groupNum = groupNum, start = match.start(groupNum), end = match.end(groupNum), group = match.group(groupNum)))

# Note: for Python 2.7 compatibility, use ur"" to prefix the regex and u"" to prefix the test string and substitution.

# Go the Other Way

We have now built upwards, but some matches are too general and catch too many exceptions. Now, we must work backwards to find out why this is the case and see what we can do about it

In [518]:
# Do the same for all files

# Keep track of number of unique companies to fix
count = 0

for company in CIKS:
    
    logstr = ''
    
    if company == '.DS_Store' or company == 'AAL':
        continue
        
    pulls = os.listdir("sec-edgar-filings/"+company+"/10-K/")
    
    a = pd.DataFrame()
    
    
    for year in pulls:
        if year == '.DS_Store':
            continue
        #print(year)

        try:
            f = open("sec-edgar-filings/"+company+"/10-K/"+year+"/filing-details.html", "r")
        except:
            continue
            #f = open("sec-edgar-filings/"+company+"/10-K/"+year+"/filing-details.txt", "r")
        raw_10k = f.read()
        

        matches1 = re.finditer(r"(?s).>IT.{0,20}EM.{1,20}7[^A].{1,400}MANAGEMENT", 
                               raw_10k, re.IGNORECASE)
        
        matches2 = re.finditer(r"(?s).>IT.{0,20}EM.{1,20}8([^B]|A).{1,400}(CONSOLIDATED|FINANCIAL)", 
                               raw_10k, re.IGNORECASE)
        
        
        list_matches1 = [x for x in matches1]
        list_matches2 = [x for x in matches2]
        
        logstr += year + '\n'
        logstr += str(list_matches1)+ '\n'
        logstr += str(list_matches2)+ '\n'
        logstr += '\n'
        
        if len(list_matches1) == 0:
            print('matches1 '+company+' '+year+'\n')
            
        if len(list_matches1) == 0:
            print('matches2 '+company+' '+year+'\n')
            
#         if len(list_matches1) > 2:
#             print('matches1 '+company+' '+year+'\n')
            
#         if len(list_matches2) > 2:
#             print('matches2 '+company+' '+year+'\n')
            
        if (len(list_matches1) > 2) or (len(list_matches2) > 2):
            count += 1
        
    #print(logstr, file=open("logs/"+company+".txt", "w"))

matches1 0001362468 0001047469-09-002190

matches2 0001362468 0001047469-09-002190

matches1 0001362468 0001047469-10-001935

matches2 0001362468 0001047469-10-001935

matches1 0001172222 0000950136-05-001827

matches2 0001172222 0000950136-05-001827



In [519]:
count

24

# Debug

In [511]:
company = '0001029863'
year = '0000950153-04-000491'

f = open("sec-edgar-filings/"+company+"/10-K/"+year+"/filing-details.html", "r")
raw_10k = str(f.read())

In [512]:
raw_10k[:500]

'<html><body><document>\n<type>10-K\n<sequence>1\n<filename>p68837e10vk.htm\n<description>10-K\n<text>\n<title>e10vk</title>\n<!-- PAGEBREAK -->\n<h5 align="left" style="page-break-before:always"><a href="#toc">Table of Contents</a></h5><p>\n</p><p align="center"><font size="4"><b>UNITED STATES SECURITIES AND EXCHANGE<br/>\nCOMMISSION</b>\n</font>\n</p><div align="center"><font size="3"><b>WASHINGTON, D.C. 20549</b>\n</font></div>\n<div align="center"><font size="5"><b>FORM 10-K</b>\n</font></div>\n<center>\n<tab'

In [513]:
matches1 = re.finditer(r"(?s).>IT.{0,20}EM.{1,20}7[^A].{1,400}MANAGEMENT.{,150}\n", raw_10k, re.IGNORECASE)
a = [x for x in matches1]
print(a)

[<re.Match object; span=(11795, 11910), match='">ITEM 7. MANAGEMENT&#146;S DISCUSSION AND ANALYS>, <re.Match object; span=(18366, 18528), match='">Item&#160;7. Management&#146;s Discussion and A>, <re.Match object; span=(233805, 234228), match='b>ITEM 7. MANAGEMENT&#146;S DISCUSSION AND ANALYS>]


In [514]:
locs = [x.start() for x in a]
locs

[11795, 18366, 233805]

In [517]:
idx = 1
raw_10k[locs[idx]-1000:locs[idx]+1000]

'="2">PART II</font></div></td>\n<td><font size="2">&#160;</font></td>\n<td><font size="2">&#160;</font></td>\n<td><font size="2">&#160;</font></td>\n<td><font size="2">&#160;</font></td>\n</tr>\n<tr bgcolor="#eeeeee" valign="bottom">\n<td><div style="margin-left:10px; text-indent:-10px"><font size="2">Item&#160;5. Market for Registrants&#146; Common Equity and Related Stockholder Matters</font></div></td>\n<td><font size="2">&#160;</font></td>\n<td><font size="2">&#160;</font></td>\n<td align="right"><font size="2">21</font></td>\n<td><font size="2">&#160;</font></td>\n</tr>\n<tr valign="bottom">\n<td><div style="margin-left:10px; text-indent:-10px"><font size="2">Item&#160;6. Selected Consolidated Financial Data</font></div></td>\n<td><font size="2">&#160;</font></td>\n<td><font size="2">&#160;</font></td>\n<td align="right"><font size="2">22</font></td>\n<td><font size="2">&#160;</font></td>\n</tr>\n<tr bgcolor="#eeeeee" valign="bottom">\n<td><div style="margin-left:10px; text-inden

In [89]:
matches2 = re.finditer(r"(?s).>IT.{0,20}EM.{1,20}8([^B]|A).{1,400}(CONSOLIDATED|FINANCIAL).{105,150}\n", 
                               raw_10k, re.IGNORECASE)
b = [x for x in matches2]
print(b)

[<re.Match object; span=(12136, 12446), match='">ITEM 8. FINANCIAL STATEMENTS AND SUPPLEMENTARY >, <re.Match object; span=(30831, 31096), match='b>ITEM 8.</b>\n</font></td>\n<td><font size="2">&>]


In [90]:
locs = [x.start() for x in b]
locs

[12136, 30831]

In [83]:
idx = 1
raw_10k[locs[idx]-1000:locs[idx]+1000]

'h5 align="left" style="page-break-before:always"><a href="#toc">Table of Contents</a></h5><p>\n</p><center>\n<table border="0" cellpadding="0" cellspacing="0" width="100%">\n<tr valign="bottom">\n<td width="2%">&#160;</td>\n<td width="1%">&#160;</td>\n<td width="9%">&#160;</td>\n<td width="1%">&#160;</td>\n<td width="80%">&#160;</td>\n<td width="2%">&#160;</td>\n<td width="1%">&#160;</td>\n</tr>\n<tr bgcolor="#eeeeee" valign="bottom">\n<td valign="top"><font size="2">&#160;</font></td>\n<td><font size="2">&#160;</font></td>\n<td align="left" valign="top"><font size="2">\n<b>ITEM 7A</b>\n</font></td>\n<td><font size="2">&#160;</font></td>\n<td align="left" valign="top"><font size="2"><b>QUANTITATIVE AND QUALITATIVE DISCLOSURES ABOUT MARKET RISK</b></font></td>\n<td><font size="2">&#160;</font></td>\n<td align="right"><font size="2">54</font></td>\n</tr>\n<tr valign="bottom">\n<td valign="top"><font size="2">&#160;</font></td>\n<td><font size="2">&#160;</font></td>\n<td align="left" val

In [80]:
len('MANAGEMENT\'S DISCUSSION AND ANALYSIS OF FINANCIAL CONDITION\nAND RESULTS OF OPERATIONS')

85