# EDGAR XBRL XML URL

Generate the URLs of the XBRL XML files in each filing directory, in parallel, using Ray.

In [1]:
# !pip install -q ray

In [2]:
from typing import (
    List,
    Dict,
    Iterable
)
import os
import sys
import random
import logging
import time
import re
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import ray
from IPython.core.display import (
    display, 
    HTML
)

In [3]:
from sec_edgar_constant import (
    NUM_CPUS,
    FS_TYPE_10K,
    FS_TYPE_10Q,
    EDGAR_HTTP_HEADERS,
)

from sec_edgar_filing_xbrl_listing import (
    list_files,
    index_xml_url,
    xbrl_url,
    director,
    load_edgar_xbrl_index_file,
    xbrl_file_path_to_save,
    save_to_csv,
)

In [4]:
# Render floats without trailing zeros (or a dangling decimal point) and centre column headers.
pd.set_option('display.float_format', lambda x: ('%f' % x).rstrip('0').rstrip('.'))
pd.set_option('display.colheader_justify', 'center')

# Configure the root logger at DEBUG level.
logging.basicConfig(level=logging.DEBUG)

Logger = logging.getLogger(__name__)

# Re-use the stdout handler attached by a previous run of this cell, if any.
# Creating a new StreamHandler on every run would attach one more handler each
# time, and every log record would then be emitted once per accumulated handler.
handler = next(
    (
        existing
        for existing in Logger.handlers
        if isinstance(existing, logging.StreamHandler)
        and getattr(existing, "stream", None) is sys.stdout
    ),
    None,
)
if handler is None:
    handler = logging.StreamHandler(sys.stdout)
    Logger.addHandler(handler)

# Data 

## EDGAR XBRL Directory Indices

In [5]:
# Load the pipe-delimited listing file, reading only the four columns used below
# and parsing 'Date Filed' as a date.
indices = pd.read_csv(
    "../data/listing/2010QTR1",
    sep="|",
    usecols=['CIK', 'Form Type', 'Date Filed', 'Filename'],
    parse_dates=['Date Filed'],
)
# Keep only the rows whose form type is FS_TYPE_10Q or FS_TYPE_10K, then convert
# 'Form Type' to a categorical dtype. Boolean-mask selection and astype() each
# return a new frame, so nothing is written into a slice of the raw frame.
# astype() is used rather than `.loc[:, col] = ...` because a whole-column .loc
# assignment may write into the existing column and keep its original dtype.
indices = (
    indices[indices['Form Type'].isin([FS_TYPE_10Q, FS_TYPE_10K])]
    .astype({'Form Type': 'category'})
)
print(len(indices))
indices

513


Unnamed: 0,CIK,Form Type,Date Filed,Filename
0,1000697,10-K,2010-02-26,edgar/data/1000697/0000950123-10-017583.txt
1,1001039,10-Q,2010-02-09,edgar/data/1001039/0001193125-10-025949.txt
3,1001082,10-K,2010-03-01,edgar/data/1001082/0000950123-10-018671.txt
4,1001838,10-K,2010-02-26,edgar/data/1001838/0001104659-10-010334.txt
5,1002638,10-Q,2010-02-04,edgar/data/1002638/0001193125-10-021715.txt
...,...,...,...,...
585,96223,10-K,2010-02-26,edgar/data/96223/0000096223-10-000004.txt
586,97216,10-K,2010-02-24,edgar/data/97216/0001104659-10-009153.txt
587,97476,10-K,2010-02-23,edgar/data/97476/0001140361-10-007923.txt
588,97745,10-K,2010-02-26,edgar/data/97745/0000097745-10-000008.txt


In [6]:
# Directory that holds the listing CSV files.
# NOTE(review): no other cell visible in this notebook references this constant;
# the read_csv() call above hard-codes a path inside the same directory.
DIR_CSV_INDEX = "../data/listing/"

## Convert XBRL TXT file paths to directory listing index.xml URLs

In [7]:
# Replace every 'Filename' value with the result of index_xml_url() for that value.
# NOTE(review): the column is overwritten in place, so re-running this cell passes
# already-converted values to index_xml_url() again — re-run the load cell first.
indices.loc[:, 'Filename'] = indices['Filename'].apply(index_xml_url)
indices

Unnamed: 0,CIK,Form Type,Date Filed,Filename
0,1000697,10-K,2010-02-26,https://sec.gov/Archives/edgar/data/1000697/000095012310017583/index.xml
1,1001039,10-Q,2010-02-09,https://sec.gov/Archives/edgar/data/1001039/000119312510025949/index.xml
3,1001082,10-K,2010-03-01,https://sec.gov/Archives/edgar/data/1001082/000095012310018671/index.xml
4,1001838,10-K,2010-02-26,https://sec.gov/Archives/edgar/data/1001838/000110465910010334/index.xml
5,1002638,10-Q,2010-02-04,https://sec.gov/Archives/edgar/data/1002638/000119312510021715/index.xml
...,...,...,...,...
585,96223,10-K,2010-02-26,https://sec.gov/Archives/edgar/data/96223/000009622310000004/index.xml
586,97216,10-K,2010-02-24,https://sec.gov/Archives/edgar/data/97216/000110465910009153/index.xml
587,97476,10-K,2010-02-23,https://sec.gov/Archives/edgar/data/97476/000114036110007923/index.xml
588,97745,10-K,2010-02-26,https://sec.gov/Archives/edgar/data/97745/000009774510000008/index.xml


---

# Utilities

In [8]:
def split(tasks: Iterable, num: int):
    """Split tasks into at most num contiguous assignments and yield them in order.

    Assignment sizes differ by at most one: the first (len(tasks) % num)
    assignments carry one extra task. When len(tasks) < num, only len(tasks)
    assignments (each of size one) are produced; an empty assignment is never
    yielded.

    Args:
        tasks: tasks to split into assignments. Must support len() and slicing
            (e.g. a list, a string, or a pandas DataFrame).
        num: maximum number of assignments to create. Must be positive.
    Yields: An assignment, which is a slice of the tasks
    Raises:
        AssertionError: if num is not positive or tasks is empty.
    """
    assert num > 0
    assert len(tasks) > 0
    Logger.debug(f"createing {num} assignments for {len(tasks)} tasks")

    # Total size of the tasks
    total = len(tasks)

    # quota: size every assignment gets (zero if total < num).
    # residual: tasks left over once every assignment has taken its quota.
    # divmod keeps this in exact integer arithmetic (no float round trip).
    quota, residual = divmod(total, num)

    start = 0
    while start < total:
        # While residual remains, an assignment takes (quota + 1) tasks.
        if residual > 0:
            size = quota + 1
            residual -= 1
        else:
            size = quota

        # The sizes sum to exactly `total`, so `end` never runs past the last task.
        end = start + size
        yield tasks[start:end]

        start = end

In [9]:
@ray.remote(num_returns=1)
def worker(df):
    """Resolve the XBRL XML URL for each row of an index dataframe.

    Args:
        df: Pandas dataframe of the XBRL indices in the format:
            |CIK|Company Name|Form Type|Date Filed|Filename|
            Only the "Filename" column is read and written here.
            Must contain at least one row.

    Returns: The given dataframe, with every "Filename" value replaced by the
        result of xbrl_url() for that value.
    """
    assert len(df) > 0

    # Compute all replacement values first, then write them back into the column.
    xbrl_urls = df['Filename'].apply(xbrl_url)
    df.loc[:, 'Filename'] = xbrl_urls
    return df

# Start Ray

In [10]:
# Start Ray with NUM_CPUS CPUs and no GPUs.
# ignore_reinit_error=True lets this cell be re-run on a live kernel: without it,
# calling ray.init() while Ray is already initialised raises an error.
ray.init(num_cpus=NUM_CPUS, num_gpus=0, ignore_reinit_error=True)

{'node_ip_address': '192.168.13.128',
 'raylet_ip_address': '192.168.13.128',
 'redis_address': '192.168.13.128:60386',
 'object_store_address': '/tmp/ray/session_2021-12-30_00-24-45_919192_24271/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-12-30_00-24-45_919192_24271/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-12-30_00-24-45_919192_24271',
 'metrics_export_port': 62687,
 'node_id': '00f00ba77beb61d0480cdbc7acc8f759e3da1da90e6be8aa09611f69'}

# Steps

In [11]:
# Restrict the run to the first 100 filings.
# NOTE(review): this rebinds `indices`, so the full listing is no longer available
# under that name; re-run the load and index.xml URL cells to get it back.
indices = indices.head(100)
indices

Unnamed: 0,CIK,Form Type,Date Filed,Filename
0,1000697,10-K,2010-02-26,https://sec.gov/Archives/edgar/data/1000697/000095012310017583/index.xml
1,1001039,10-Q,2010-02-09,https://sec.gov/Archives/edgar/data/1001039/000119312510025949/index.xml
3,1001082,10-K,2010-03-01,https://sec.gov/Archives/edgar/data/1001082/000095012310018671/index.xml
4,1001838,10-K,2010-02-26,https://sec.gov/Archives/edgar/data/1001838/000110465910010334/index.xml
5,1002638,10-Q,2010-02-04,https://sec.gov/Archives/edgar/data/1002638/000119312510021715/index.xml
...,...,...,...,...
105,1103982,10-K,2010-02-25,https://sec.gov/Archives/edgar/data/1103982/000119312510040106/index.xml
106,1105705,10-K,2010-02-19,https://sec.gov/Archives/edgar/data/1105705/000095012310014479/index.xml
107,1108524,10-K,2010-03-11,https://sec.gov/Archives/edgar/data/1108524/000119312510053838/index.xml
108,1109357,10-K,2010-02-05,https://sec.gov/Archives/edgar/data/1109357/000119312510023280/index.xml


In [12]:
# --------------------------------------------------------------------------------
# Split dataframe to handle in parallel
# split() is a generator function, so `assignment` can be iterated only once:
# re-run this cell before re-running the cell that consumes it.
# --------------------------------------------------------------------------------
assignment = split(tasks=indices, num=NUM_CPUS)

In [13]:
# --------------------------------------------------------------------------------
# Asynchronously invoke tasks (one remote worker call per assignment)
# --------------------------------------------------------------------------------
futures = [worker.remote(task) for task in assignment]

# split() never yields an empty assignment, so with fewer rows than CPUs it
# produces one assignment per row rather than NUM_CPUS assignments.
expected_tasks = min(NUM_CPUS, len(indices))
assert len(futures) == expected_tasks, f"Expected {expected_tasks} tasks but got {len(futures)}."

In [14]:
%%time
# Drain the pending futures: each pass moves the object references reported as
# completed by ray.wait() into `waits`, until no pending reference remains.
waits = []
while futures:
    # --------------------------------------------------------------------------------
    # Take the object references from completed jobs; the rest stay in `futures`
    # --------------------------------------------------------------------------------
    completed, futures = ray.wait(futures)
    
    # --------------------------------------------------------------------------------
    # Accumulate the completed object references; their results are fetched later
    # --------------------------------------------------------------------------------
    waits.extend(completed)

[2m[36m(pid=24373)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1000697/000095012310017583/index.xml]
[2m[36m(pid=24370)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1047122/000119312510038391/index.xml]
[2m[36m(pid=24376)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1067983/000119312510043450/index.xml]
[2m[36m(pid=24371)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1058290/000119312510040500/index.xml]
[2m[36m(pid=24372)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1013871/000095012310015824/index.xml]
[2m[36m(pid=24373)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL 

[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1031296/000103129610000011/fe-20091231.xml] identified
[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1032033/000095012310018176/index.xml]
[2m[36m(pid=24368)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1090727/000119312510042908/ups-20091231.xml] identified
[2m[36m(pid=24368)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1090872/000110465910013535/index.xml]
[2m[36m(pid=24374)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1039101/000095012310017377/lll-20091231.xml] identified
[2m[36m(pid=24374)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edg

[2m[36m(pid=24370)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1050915/000095012310019204/pwr-20091231.xml] identified
[2m[36m(pid=24370)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1051470/000119312510031419/index.xml]
[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1032208/000008652110000019/sre-20091231.xml] identified
[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/103379/000095012310020525/index.xml]
[2m[36m(pid=24371)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/106040/000095012310006821/wdc-20100101.xml] identified
[2m[36m(pid=24371)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edga

[2m[36m(pid=24372)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1018724/000119312510016098/amzn-20091231.xml] identified
[2m[36m(pid=24372)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1018963/000095012310017084/index.xml]
[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1035002/000095012310018097/vlo-20091231.xml] identified
[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1035267/000119312510016932/index.xml]
[2m[36m(pid=24376)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/10795/000095012310009988/bdx-20091231.xml] identified
[2m[36m(pid=24376)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edg

[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/103682/000119312510042883/d-20091231.xml] identified
[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1037016/000095012310017111/index.xml]
[2m[36m(pid=24376)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1086195/000095012310007004/artg-20091231.xml] identified
[2m[36m(pid=24376)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1086222/000119312510044561/index.xml]
[2m[36m(pid=24373)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/100826/000119312510043155/aee-20091231.xml] identified
[2m[36m(pid=24373)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar

[2m[36m(pid=24373)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1010775/000119312510041620/mir-20091231.xml] identified
[2m[36m(pid=24373)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1011006/000119312510043149/index.xml]
[2m[36m(pid=24376)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/108772/000119312510043079/xrx-20091231.xml] identified
[2m[36m(pid=24376)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edgar/data/1087835/000095012310015194/index.xml]
[2m[36m(pid=24368)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1109357/000119312510023280/exc-20091231.xml] identified
[2m[36m(pid=24368)[0m INFO:sec_edgar_filing_xbrl_listing:Identifying XBRL URL for the filing directory index [https://sec.gov/Archives/edg

CPU times: user 1.34 s, sys: 940 ms, total: 2.28 s
Wall time: 17.5 s


[2m[36m(pid=24369)[0m INFO:sec_edgar_filing_xbrl_listing:XBRL XML [https://sec.gov/Archives/edgar/data/1037949/000119312510032428/q-20091231.xml] identified


In [15]:
# --------------------------------------------------------------------------------
# Collect the results
# --------------------------------------------------------------------------------
# split() never yields an empty assignment, so with fewer rows than CPUs there is
# one task per row rather than NUM_CPUS tasks.
expected_results = min(NUM_CPUS, len(indices))
assert len(waits) == expected_results, f"Expected {expected_results} tasks but got {len(waits)}."
results = ray.get(waits)

In [16]:
%%time
df = pd.concat(results)
df.sort_index(inplace=True)
df

CPU times: user 28.3 ms, sys: 11.2 ms, total: 39.5 ms
Wall time: 40.6 ms


Unnamed: 0,CIK,Form Type,Date Filed,Filename
0,1000697,10-K,2010-02-26,https://sec.gov/Archives/edgar/data/1000697/000095012310017583/wat-20091231.xml
1,1001039,10-Q,2010-02-09,https://sec.gov/Archives/edgar/data/1001039/000119312510025949/dis-20100102.xml
3,1001082,10-K,2010-03-01,https://sec.gov/Archives/edgar/data/1001082/000095012310018671/dish-20091231.xml
4,1001838,10-K,2010-02-26,https://sec.gov/Archives/edgar/data/1001838/000110465910010334/scco-20091231.xml
5,1002638,10-Q,2010-02-04,https://sec.gov/Archives/edgar/data/1002638/000119312510021715/otex-20091231.xml
...,...,...,...,...
105,1103982,10-K,2010-02-25,https://sec.gov/Archives/edgar/data/1103982/000119312510040106/kft-20091231.xml
106,1105705,10-K,2010-02-19,https://sec.gov/Archives/edgar/data/1105705/000095012310014479/twx-20091231.xml
107,1108524,10-K,2010-03-11,https://sec.gov/Archives/edgar/data/1108524/000119312510053838/crm-20100131.xml
108,1109357,10-K,2010-02-05,https://sec.gov/Archives/edgar/data/1109357/000119312510023280/exc-20091231.xml


# Cleanup

In [17]:
# Shut down the Ray runtime that was started by ray.init() earlier in the notebook.
ray.shutdown()

In [18]:
# Validate an optional year given as a four-character string starting with 1 or 2.
# A falsy value (None, "") skips the check.
# fullmatch() is used because match() only anchors at the start of the string and
# would accept values with trailing characters such as "20101" or "2010abc".
year = None
assert (re.fullmatch(r"[1-2][0-9][0-9][0-9]", year) if year else True), f"Invalid year {year}"
