# EDGAR XBRL XML URL

Generate URLs to the XBRL XML files in the filing directory, in parallel, using Ray.

In [1]:
# !pip install -q ray

In [2]:
from typing import (
    List,
    Dict,
    Iterable
)
import os
import sys
import random
import logging
import time
import re
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import ray

In [3]:
# Pandas display: render floats without trailing zeros (or a dangling "."),
# centre the column headers, and never elide columns.
pd.set_option('display.float_format', lambda x: ('%f' % x).rstrip('0').rstrip('.'))
pd.set_option('display.colheader_justify', 'center')
pd.set_option('display.max_columns', None)

# Root logger at DEBUG so that the imported modules' debug records are shown too.
logging.basicConfig(level=logging.DEBUG)

Logger = logging.getLogger(__name__)

# Attach the stdout handler only once. Re-running this cell used to add one more
# StreamHandler per execution, so every record was printed one more time.
# NOTE: records also propagate to the root handler, which is why each message
# shows up twice in the cell output (plain text and "DEBUG:<name>:..." form).
handler = next(
    (h for h in Logger.handlers if isinstance(h, logging.StreamHandler)),
    None,
)
if handler is None:
    handler = logging.StreamHandler(sys.stdout)
    Logger.addHandler(handler)

In [4]:
# Smoke test of the logging setup: emit a single DEBUG record via the notebook logger.
Logger.debug("test")

test


DEBUG:__main__:test


In [5]:
from sec_edgar_constant import (
    NUM_CPUS,
    FS_TYPE_10K,
    FS_TYPE_10Q,
    EDGAR_HTTP_HEADERS,
    DIR_CSV_LIST,
    DIR_XML_XBRL,
    DIR_CSV_XBRL
)
from sec_edgar_utility import (
    split,
    http_get_content,
)
from sec_edgar_download_xbrl_xml import (
    save_to_xml,
    worker,
)

In [6]:
def pd_print_full(x=None):
    """Print a pandas object without any row/column/width truncation.

    The display options are widened only while printing. On exit the options
    that were active before the call are restored (also when printing raises),
    instead of being reset to the pandas defaults, so notebook-wide settings
    such as a custom ``display.float_format`` survive the call.

    Args:
        x: Optional object to print (e.g. a DataFrame or Series). When omitted
            nothing is printed and the display options are left untouched.

    Returns:
        None
    """
    with pd.option_context(
        'display.max_rows', None,
        'display.max_columns', None,
        'display.width', 2000,
        'display.float_format', '{:20,.2f}'.format,
        'display.max_colwidth', None,
    ):
        if x is not None:
            print(x)

# Data 

## EDGAR XBRL Directory Indices

In [7]:
# Quarter whose listing is loaded.
YEAR, QTR = 2019, 2

# Read the pipe-delimited listing, keeping only the four columns used below and
# parsing the filing date, then keep just the first 8 rows.
listing = pd.read_csv(
    f"{DIR_CSV_LIST}/{YEAR}QTR{QTR}_LIST.gz",
    sep="|",
    parse_dates=['Date Filed'],
    usecols=['CIK', 'Form Type', 'Date Filed', 'Filename'],
).head(8)

In [8]:
# Add the bookkeeping columns in place: Year and Quarter (categorical constants)
# after the first three columns, and an empty Filepath column at the end.
# DataFrame.insert raises ValueError when the column already exists, so each
# insert is guarded to keep this cell safe to re-run on the same frame.
if 'Year' not in listing.columns:
    listing.insert(loc=3, column='Year', value=pd.Categorical([YEAR] * len(listing)))
if 'Quarter' not in listing.columns:
    listing.insert(loc=4, column='Quarter', value=pd.Categorical([QTR] * len(listing)))
if 'Filepath' not in listing.columns:
    listing.insert(loc=len(listing.columns), column="Filepath", value=[None] * len(listing))
listing

Unnamed: 0,CIK,Form Type,Date Filed,Year,Quarter,Filename,Filepath
0,1000045,10-K,2019-06-28,2019,2,https://sec.gov/Archives/edgar/data/1000045/000156459019023956/nick-20190331.xml,
1,1000209,10-Q,2019-05-10,2019,2,https://sec.gov/Archives/edgar/data/1000209/000119312519144225/mfin-20190331.xml,
2,1000228,10-Q,2019-05-07,2019,2,https://sec.gov/Archives/edgar/data/1000228/000100022819000030/hsic-20190330.xml,
3,1000229,10-Q,2019-04-26,2019,2,https://sec.gov/Archives/edgar/data/1000229/000156459019013150/clb-20190331.xml,
4,1000230,10-Q,2019-06-11,2019,2,https://sec.gov/Archives/edgar/data/1000230/000143774919011785/occ20190430b_10q_htm.xml,
5,1000232,10-Q,2019-05-10,2019,2,https://sec.gov/Archives/edgar/data/1000232/000155837019004800/ktyb-20190331.xml,
6,1000298,10-Q,2019-05-10,2019,2,https://sec.gov/Archives/edgar/data/1000298/000155837019004865/imh-20190331.xml,
7,1000623,10-Q,2019-05-01,2019,2,https://sec.gov/Archives/edgar/data/1000623/000100062319000067/swm-20190331.xml,


In [9]:
# Download each XBRL XML named in the listing, save it via save_to_xml(), and
# record the returned path in the 'Filepath' column of the same row.
output_xml_directory = DIR_XML_XBRL  # same for every row, so set once
for index, row in listing.iterrows():
    # --------------------------------------------------------------------------------
    # Download XBRL XML
    # --------------------------------------------------------------------------------
    url = row['Filename']
    content = http_get_content(url, EDGAR_HTTP_HEADERS)

    # --------------------------------------------------------------------------------
    # Save
    # --------------------------------------------------------------------------------
    # The last three URL path elements are <CIK>/<accession>/<basename>.
    elements = url.split('/')
    cik, accession, basename = elements[-3:]
    assert str(row['CIK']) == cik, f"CIK [{row['CIK']}] must match CIK part [{cik}] in url {url}"

    directory = f"{cik}{os.sep}{accession}"
    Logger.debug(f"worker(): saving XML to [{directory}:{basename}]...")

    package = {
        "data": content,
        "output_xml_directory": output_xml_directory,
        "directory": directory,
        "basename": basename
    }
    filepath = save_to_xml(package)
    # The recorded path must not be absolute (presumably it is relative to
    # output_xml_directory -- confirm against save_to_xml).
    assert not filepath.startswith("/"), f"{filepath}"

    listing.at[index, 'Filepath'] = filepath

DEBUG:sec_edgar_utility:http_get_content(): GET url [https://sec.gov/Archives/edgar/data/1000045/000156459019023956/nick-20190331.xml] headers [{'User-Agent': 'Company Name myname@company.com'}]
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): sec.gov:443
DEBUG:urllib3.connectionpool:https://sec.gov:443 "GET /Archives/edgar/data/1000045/000156459019023956/nick-20190331.xml HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.sec.gov:443
DEBUG:urllib3.connectionpool:https://www.sec.gov:443 "GET /Archives/edgar/data/1000045/000156459019023956/nick-20190331.xml HTTP/1.1" 200 94459


worker(): saving XML to [1000045/000156459019023956:nick-20190331.xml]...


DEBUG:__main__:worker(): saving XML to [1000045/000156459019023956:nick-20190331.xml]...
DEBUG:root:save_to_xml(): saving XBRL XML to [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000045/000156459019023956/nick-20190331.xml.gz]...
DEBUG:root:save_to_xml(): saved [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000045/000156459019023956/nick-20190331.xml.gz]
DEBUG:sec_edgar_utility:http_get_content(): GET url [https://sec.gov/Archives/edgar/data/1000209/000119312519144225/mfin-20190331.xml] headers [{'User-Agent': 'Company Name myname@company.com'}]
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): sec.gov:443
DEBUG:urllib3.connectionpool:https://sec.gov:443 "GET /Archives/edgar/data/1000209/000119312519144225/mfin-20190331.xml HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.sec.gov:443
DEBUG:urllib3.connectionpool:https://www.sec.gov:443 "GET /Archi

worker(): saving XML to [1000209/000119312519144225:mfin-20190331.xml]...


DEBUG:__main__:worker(): saving XML to [1000209/000119312519144225:mfin-20190331.xml]...
DEBUG:root:save_to_xml(): saving XBRL XML to [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000209/000119312519144225/mfin-20190331.xml.gz]...
DEBUG:root:save_to_xml(): saved [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000209/000119312519144225/mfin-20190331.xml.gz]
DEBUG:sec_edgar_utility:http_get_content(): GET url [https://sec.gov/Archives/edgar/data/1000228/000100022819000030/hsic-20190330.xml] headers [{'User-Agent': 'Company Name myname@company.com'}]
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): sec.gov:443
DEBUG:urllib3.connectionpool:https://sec.gov:443 "GET /Archives/edgar/data/1000228/000100022819000030/hsic-20190330.xml HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.sec.gov:443
DEBUG:urllib3.connectionpool:https://www.sec.gov:443 "GET /Archi

worker(): saving XML to [1000228/000100022819000030:hsic-20190330.xml]...


DEBUG:__main__:worker(): saving XML to [1000228/000100022819000030:hsic-20190330.xml]...
DEBUG:root:save_to_xml(): saving XBRL XML to [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000228/000100022819000030/hsic-20190330.xml.gz]...
DEBUG:root:save_to_xml(): saved [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000228/000100022819000030/hsic-20190330.xml.gz]
DEBUG:sec_edgar_utility:http_get_content(): GET url [https://sec.gov/Archives/edgar/data/1000229/000156459019013150/clb-20190331.xml] headers [{'User-Agent': 'Company Name myname@company.com'}]
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): sec.gov:443
DEBUG:urllib3.connectionpool:https://sec.gov:443 "GET /Archives/edgar/data/1000229/000156459019013150/clb-20190331.xml HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.sec.gov:443
DEBUG:urllib3.connectionpool:https://www.sec.gov:443 "GET /Archive

worker(): saving XML to [1000229/000156459019013150:clb-20190331.xml]...


DEBUG:__main__:worker(): saving XML to [1000229/000156459019013150:clb-20190331.xml]...
DEBUG:root:save_to_xml(): saving XBRL XML to [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000229/000156459019013150/clb-20190331.xml.gz]...
DEBUG:root:save_to_xml(): saved [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000229/000156459019013150/clb-20190331.xml.gz]
DEBUG:sec_edgar_utility:http_get_content(): GET url [https://sec.gov/Archives/edgar/data/1000230/000143774919011785/occ20190430b_10q_htm.xml] headers [{'User-Agent': 'Company Name myname@company.com'}]
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): sec.gov:443
DEBUG:urllib3.connectionpool:https://sec.gov:443 "GET /Archives/edgar/data/1000230/000143774919011785/occ20190430b_10q_htm.xml HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.sec.gov:443
DEBUG:urllib3.connectionpool:https://www.sec.gov:443 

worker(): saving XML to [1000230/000143774919011785:occ20190430b_10q_htm.xml]...


DEBUG:__main__:worker(): saving XML to [1000230/000143774919011785:occ20190430b_10q_htm.xml]...
DEBUG:root:save_to_xml(): saving XBRL XML to [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000230/000143774919011785/occ20190430b_10q_htm.xml.gz]...
DEBUG:root:save_to_xml(): saved [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000230/000143774919011785/occ20190430b_10q_htm.xml.gz]
DEBUG:sec_edgar_utility:http_get_content(): GET url [https://sec.gov/Archives/edgar/data/1000232/000155837019004800/ktyb-20190331.xml] headers [{'User-Agent': 'Company Name myname@company.com'}]
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): sec.gov:443
DEBUG:urllib3.connectionpool:https://sec.gov:443 "GET /Archives/edgar/data/1000232/000155837019004800/ktyb-20190331.xml HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.sec.gov:443
DEBUG:urllib3.connectionpool:https://www.se

worker(): saving XML to [1000232/000155837019004800:ktyb-20190331.xml]...


DEBUG:__main__:worker(): saving XML to [1000232/000155837019004800:ktyb-20190331.xml]...
DEBUG:root:save_to_xml(): saving XBRL XML to [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000232/000155837019004800/ktyb-20190331.xml.gz]...
DEBUG:root:save_to_xml(): saved [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000232/000155837019004800/ktyb-20190331.xml.gz]
DEBUG:sec_edgar_utility:http_get_content(): GET url [https://sec.gov/Archives/edgar/data/1000298/000155837019004865/imh-20190331.xml] headers [{'User-Agent': 'Company Name myname@company.com'}]
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): sec.gov:443
DEBUG:urllib3.connectionpool:https://sec.gov:443 "GET /Archives/edgar/data/1000298/000155837019004865/imh-20190331.xml HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.sec.gov:443
DEBUG:urllib3.connectionpool:https://www.sec.gov:443 "GET /Archive

worker(): saving XML to [1000298/000155837019004865:imh-20190331.xml]...


DEBUG:__main__:worker(): saving XML to [1000298/000155837019004865:imh-20190331.xml]...
DEBUG:root:save_to_xml(): saving XBRL XML to [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000298/000155837019004865/imh-20190331.xml.gz]...
DEBUG:root:save_to_xml(): saved [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000298/000155837019004865/imh-20190331.xml.gz]
DEBUG:sec_edgar_utility:http_get_content(): GET url [https://sec.gov/Archives/edgar/data/1000623/000100062319000067/swm-20190331.xml] headers [{'User-Agent': 'Company Name myname@company.com'}]
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): sec.gov:443
DEBUG:urllib3.connectionpool:https://sec.gov:443 "GET /Archives/edgar/data/1000623/000100062319000067/swm-20190331.xml HTTP/1.1" 301 0
DEBUG:urllib3.connectionpool:Starting new HTTPS connection (1): www.sec.gov:443
DEBUG:urllib3.connectionpool:https://www.sec.gov:443 "GET /Archives/e

worker(): saving XML to [1000623/000100062319000067:swm-20190331.xml]...


DEBUG:__main__:worker(): saving XML to [1000623/000100062319000067:swm-20190331.xml]...
DEBUG:root:save_to_xml(): saving XBRL XML to [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000623/000100062319000067/swm-20190331.xml.gz]...
DEBUG:root:save_to_xml(): saved [/home/oonisim/home/repository/git/oonisim/python_programs/finance/SEC/EDGAR/data/xml/xbrl/1000623/000100062319000067/swm-20190331.xml.gz]


In [10]:
# True when at least one row still has no saved file path recorded.
bool(listing['Filepath'].isna().any())

False

In [16]:
# Switch to 2010 Q1 and rebind `listing` to that quarter's pipe-delimited XBRL listing.
YEAR, QTR = 2010, 1
listing = pd.read_csv(f"{DIR_CSV_XBRL}/{YEAR}QTR{QTR}_XBRL.gz", sep="|")

In [17]:
# Display the listing that was just loaded as this cell's output.
listing

Unnamed: 0,CIK,Company Name,Form Type,Year,Quarter,Date Filed,Filename,Filepath
0,1002638,OPEN TEXT CORP,10-Q,2010,1,2010-02-04,https://sec.gov/Archives/edgar/data/1002638/000119312510021715/otex-20091231.xml,1002638/000119312510021715/otex-20091231.xml.gz
1,1001039,WALT DISNEY CO/,10-Q,2010,1,2010-02-09,https://sec.gov/Archives/edgar/data/1001039/000119312510025949/dis-20100102.xml,1001039/000119312510025949/dis-20100102.xml.gz
2,1001082,DISH Network CORP,10-K,2010,1,2010-03-01,https://sec.gov/Archives/edgar/data/1001082/000095012310018671/dish-20091231.xml,1001082/000095012310018671/dish-20091231.xml.gz
3,1000697,WATERS CORP /DE/,10-K,2010,1,2010-02-26,https://sec.gov/Archives/edgar/data/1000697/000095012310017583/wat-20091231.xml,1000697/000095012310017583/wat-20091231.xml.gz
4,1004155,AGL RESOURCES INC,10-K,2010,1,2010-02-04,https://sec.gov/Archives/edgar/data/1004155/000100415510000016/agl-20091231.xml,1004155/000100415510000016/agl-20091231.xml.gz
5,1001838,SOUTHERN COPPER CORP/,10-K,2010,1,2010-02-26,https://sec.gov/Archives/edgar/data/1001838/000110465910010334/scco-20091231.xml,1001838/000110465910010334/scco-20091231.xml.gz
6,1004440,CONSTELLATION ENERGY GROUP INC,10-K,2010,1,2010-02-26,https://sec.gov/Archives/edgar/data/1004440/000104746910001515/ceg-20091231.xml,1004440/000104746910001515/ceg-20091231.xml.gz
7,1002910,AMEREN CORP,10-K,2010,1,2010-02-26,https://sec.gov/Archives/edgar/data/1002910/000119312510043155/aee-20091231.xml,1002910/000119312510043155/aee-20091231.xml.gz




---