In [2]:
# import our libraries
import requests
import pandas as pd
from bs4 import BeautifulSoup

# Grab the Filing XML Summary

Something that makes 10-K, and for that matter 10-Q, filings unique is that we have access to a particular document that gives us a quick way to grab the data we need from the filing. This file is the filing summary, and it comes in either an XML or an xlsx format. While you would think these two files would be identical, they are not. The XML version of the file provides us with a quick way to see the structure of the 10-K, defines whether a section is a note, table, or details, and gives the name and corresponding file for each section.

The xlsx file, on the other hand, contains each section of the 10-K in an Excel-style format. This file can come in handy if we want to parse just a single location, but be warned that formatting issues mean it will not be a simple load.

Let's assume we want to parse the XML file as we want to leverage the underlying structure of the 10-K report. In the section below, I outline how you would go about this process and use a sample document URL for our demonstration.

In [3]:
# define the base url needed to create the file url.
base_url = r"https://www.sec.gov"

# convert a normal filing url into the url of the filing folder's JSON index:
# drop the dashes and swap the '.txt' suffix for '/index.json'.
normal_url = r"https://www.sec.gov/Archives/edgar/data/1265107/0001265107-19-000004.txt"
normal_url = normal_url.replace('-','').replace('.txt','/index.json')

# the converted url IS the 10k document landing page, so reuse it instead of
# keeping a second hard-coded copy that can drift out of sync.
documents_url = normal_url

# request the url, stop on an HTTP error rather than decoding an error page, and decode it.
# NOTE(review): SEC EDGAR may reject requests that do not declare a User-Agent header - confirm.
response = requests.get(documents_url)
response.raise_for_status()
content = response.json()

# start from a known state so a value left over from an earlier run can never be reused by mistake.
xml_summary = None

for file in content['directory']['item']:

    # Grab the filing summary and create a new url leading to the file so we can download it.
    if file['name'] == 'FilingSummary.xml':

        xml_summary = base_url + content['directory']['name'] + "/" + file['name']

        print('-' * 100)
        print('File Name: ' + file['name'])
        print('File Path: ' + xml_summary)

        # only one entry is needed, so there is nothing left to look for.
        break

# the cells that follow need this url, so fail here with a clear message if it is missing.
if xml_summary is None:
    raise ValueError('FilingSummary.xml was not found in ' + documents_url)

----------------------------------------------------------------------------------------------------
File Name: FilingSummary.xml
File Path: https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/FilingSummary.xml


In [11]:
# Display whatever `report` tag is currently bound in the kernel.
# NOTE(review): within the visible notebook, `report` is only assigned by the
# `for report in ...` loop of a cell that appears further down, so on a fresh
# kernel this cell raises NameError unless that cell has been run first.
report

<report instance="mtii-20181231.xml">
<isdefault>false</isdefault>
<hasembeddedreports>false</hasembeddedreports>
<htmlfilename>R9999.htm</htmlfilename>
<longname>Uncategorized Items - mtii-20181231.xml</longname>
<reporttype>Sheet</reporttype>
<role>http://xbrl.sec.gov/role/uncategorizedFacts</role>
<shortname>Uncategorized Items - mtii-20181231.xml</shortname>
<menucategory>Cover</menucategory>
<position>82</position>
</report>

In [7]:
# define a new base url that represents the filing folder. This will come in handy when we need to download the reports.
base_url = xml_summary.replace('FilingSummary.xml', '')

# request the filing summary, stop on an HTTP error rather than parsing an error page, and parse the content.
response = requests.get(xml_summary)
response.raise_for_status()
content = response.content
soup = BeautifulSoup(content, 'lxml')

# find the 'myreports' tag because this contains all the individual reports submitted.
reports = soup.find('myreports')
if reports is None:
    raise ValueError("No 'myreports' tag was found in " + xml_summary)

# every tag the loop below reads from a report.
required_tags = ('shortname', 'longname', 'position', 'menucategory', 'htmlfilename')

# keep only the reports that carry every required tag. This replaces dropping the last
# entry by position: an incomplete entry is skipped wherever it appears, and a complete
# final entry is no longer thrown away.
# NOTE(review): presumably the incomplete entry is a trailing summary record - confirm against the file.
complete_reports = [
    candidate
    for candidate in reports.find_all('report')
    if all(candidate.find(tag) is not None for tag in required_tags)
]

# I want a list to store all the individual components of the report, so create the master list.
master_reports = []

# loop through each complete report in the 'myreports' tag.
for report in complete_reports:

    # build the report url once so the stored value and the printed value cannot differ.
    report_url = base_url + report.htmlfilename.text

    # let's create a dictionary to store all the different parts we need.
    report_dict = {}
    report_dict['name_short'] = report.shortname.text
    report_dict['name_long'] = report.longname.text
    report_dict['position'] = report.position.text
    report_dict['category'] = report.menucategory.text
    report_dict['url'] = report_url

    # append the dictionary to the master list.
    master_reports.append(report_dict)

    # print the info to the user.
    print('-'*100)
    print(report_url)
    print(report_dict['name_long'])
    print(report_dict['name_short'])
    print(report_dict['category'])
    print(report_dict['position'])

----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R1.htm
0001000 - Document - Document and Entity Information
Document and Entity Information
Cover
1
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R2.htm
1001000 - Statement - Consolidated Balance Sheets
Consolidated Balance Sheets
Statements
2
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R3.htm
1001501 - Statement - Consolidated Balance Sheets (Parenthetical)
Consolidated Balance Sheets (Parenthetical)
Statements
3
----------------------------------------------------------------------------------------------------
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/

In [12]:
# create the list to hold the statement urls
statements_url = []

# define the statements we want to look for. They are the same for every report,
# so they are defined once here instead of being rebuilt on each pass of the loop.
item1 = r"Consolidated Balance Sheets"
item2 = r"Consolidated Statements of Operations and Comprehensive Income (Loss)"
item3 = r"Consolidated Statements of Cash Flows"
item4 = r"Consolidated Statements of Stockholder's (Deficit) Equity"

# store them in a list.
report_list = [item1, item2, item3, item4]

for report_dict in master_reports:

    # if the short name can be found in the report list.
    if report_dict['name_short'] in report_list:

        # print some info and store it in the statements url.
        print('-'*100)
        print(report_dict['name_short'])
        print(report_dict['url'])

        statements_url.append(report_dict['url'])

----------------------------------------------------------------------------------------------------
Consolidated Balance Sheets
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R2.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Operations and Comprehensive Income (Loss)
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R4.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Cash Flows
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R5.htm
----------------------------------------------------------------------------------------------------
Consolidated Statements of Stockholder's (Deficit) Equity
https://www.sec.gov/Archives/edgar/data/1265107/000126510719000004/R6.htm


In [13]:
# let's assume we want all the statements in a single data set.
statements_data = []

# loop through each statement url
for statement in statements_url:

    # define a dictionary that will store the different parts of the statement.
    statement_data = {'headers': [], 'sections': [], 'data': []}

    # request the statement file content and stop on an HTTP error rather than parsing an error page.
    response = requests.get(statement)
    response.raise_for_status()
    content = response.content

    # name the parser explicitly: 'html' is not a parser name and makes BeautifulSoup warn
    # that no parser was specified. 'lxml' is the parser already used for the filing summary.
    report_soup = BeautifulSoup(content, 'lxml')

    # find all the rows, figure out what type of row it is, parse the elements, and store in the statement file list.
    for row in report_soup.table.find_all('tr'):

        # look up the header cells and the regular cells once per row.
        header_cells = row.find_all('th')
        cols = row.find_all('td')

        # any row holding a 'th' cell is a table header.
        if header_cells:
            hed_row = [ele.text.strip() for ele in header_cells]
            statement_data['headers'].append(hed_row)

        # a row without 'th' cells but with a 'strong' tag is a section title; keep its first cell.
        elif row.find('strong') is not None:
            sec_row = cols[0].text.strip()
            statement_data['sections'].append(sec_row)

        # everything else is a regular data row. These three cases cover every row,
        # so no separate error branch is needed.
        else:
            reg_row = [ele.text.strip() for ele in cols]
            statement_data['data'].append(reg_row)

    # append it to the master list.
    statements_data.append(statement_data)



 BeautifulSoup(YOUR_MARKUP})

to this:

 BeautifulSoup(YOUR_MARKUP, "lxml")

  markup_type=markup_type))


In [50]:
# Display the parsed dictionary stored at index 3 of `statements_data` (the fourth statement collected).
# NOTE(review): raises IndexError when fewer than four statements were parsed.
statements_data[3]

{'headers': [["Consolidated Statements of Stockholder's (Deficit) Equity - USD ($) $ in Thousands",
   'Total',
   'Common Stock',
   'Additional Paid-in Capital',
   'Accumulated Deficit',
   'Accumulated Other Comprehensive Income (Loss)']],
 'sections': ["Increase (Decrease) in Stockholder's Equity",
  "Increase (Decrease) in Stockholder's Equity",
  "Increase (Decrease) in Stockholder's Equity"],
 'data': [['Beginning Balance (in shares) at Dec. 31, 2015',
   '',
   '1,000',
   '',
   '',
   ''],
  ['Beginning Balance at Dec. 31, 2015',
   '$ 201,065',
   '',
   '$ 361,228',
   '$ (146,617)',
   '$ (13,546)'],
  ['Net loss', '(76,307)', '', '', '(76,307)', ''],
  ['Other comprehensive income', '4,589', '', '', '', '4,589'],
  ['Contributions from Ascent Capital', '88,000', '', '88,000', '', ''],
  ['Dividend paid to Ascent Capital', '(5,000)', '', '(5,000)', '', ''],
  ['Stock-based compensation', '2,719', '', '2,719', '', ''],
  ['Value of shares withheld for minimum tax liability