# Ray parallel execution 

Download the XBRL XML files from the SEC EDGAR site in parallel and save them to files in parallel.

In [1]:
# !pip install -q ray

In [2]:
from typing import (
    List,
    Dict
)
import os
import logging
import time
import re
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import ray
from IPython.core.display import (
    display, 
    HTML
)

def _format_float(value):
    """Format a float with six decimals, then drop trailing zeros and a dangling dot."""
    fixed = '%f' % value
    return fixed.rstrip('0').rstrip('.')


# Pandas display settings: compact float rendering and centred column headers.
pd.set_option('display.float_format', _format_float)
pd.set_option('display.colheader_justify', 'center')

In [3]:
# HTTP headers sent with every request; the User-Agent value is a placeholder
# contact string and should be replaced with real details before running.
EDGAR_HTTP_HEADERS = {"User-Agent": "Company Name myname@company.com"}

# Prefix shared by every filing URL in this notebook.
_EDGAR_DATA_ROOT = 'https://sec.gov/Archives/edgar/data'

# Path segments below the data root for each XML file to download.
_FILING_PATHS = (
    ('1000697', '000095012310017583', 'wat-20091231.xml'),
    ('1001039', '000119312510025949', 'dis-20100102.xml'),
    ('1001082', '000095012310018671', 'dish-20091231.xml'),
    ('1001838', '000110465910010334', 'scco-20091231.xml'),
    ('1002638', '000119312510021715', 'otex-20091231.xml'),
    ('1002910', '000119312510043155', 'aee-20091231.xml'),
    ('1004155', '000100415510000016', 'agl-20091231.xml'),
    ('1004440', '000104746910001515', 'ceg-20091231.xml'),
    ('1004980', '000100498010000015', 'pcg-20091231.xml'),
)

# Full URLs, in the same order as the path table above.
urls = ['/'.join((_EDGAR_DATA_ROOT,) + segments) for segments in _FILING_PATHS]

In [4]:
# Start a local Ray runtime with 4 CPUs and no GPUs.
# ignore_reinit_error=True keeps this cell re-runnable: without it, executing
# the cell a second time on a live kernel raises because Ray is already running.
ray.init(num_cpus=4, num_gpus=0, ignore_reinit_error=True)

{'node_ip_address': '192.168.13.128',
 'raylet_ip_address': '192.168.13.128',
 'redis_address': '192.168.13.128:6379',
 'object_store_address': '/tmp/ray/session_2021-12-28_14-28-43_135913_86405/sockets/plasma_store',
 'raylet_socket_name': '/tmp/ray/session_2021-12-28_14-28-43_135913_86405/sockets/raylet',
 'webui_url': None,
 'session_dir': '/tmp/ray/session_2021-12-28_14-28-43_135913_86405',
 'metrics_export_port': 64517,
 'node_id': 'd48d64df6a333833a206f608c471571a796e121b07121d5e6ad2175c'}

---

In [5]:
@ray.remote(num_returns=1)
def worker(urls, timeout=30):
    """GET the XML documents at ``urls`` one after another.

    Args:
        urls: Iterable of URL strings to download.
        timeout: Seconds handed to ``requests.get`` as its ``timeout`` argument,
            so a stalled connection fails instead of blocking the task forever.

    Returns:
        A ``(names, xmls)`` tuple of two parallel lists: ``names[i]`` is the
        base name of the i-th URL and ``xmls[i]`` is its response body decoded
        as UTF-8.

    Raises:
        RuntimeError: If a URL answers with an HTTP status other than 200.
            Exceptions raised by ``requests`` itself (connection errors,
            timeouts) are not caught here.
    """
    names = []
    xmls = []

    for url in urls:
        # --------------------------------------------------------------------------------
        # Retrieve the XML document from the URL
        # --------------------------------------------------------------------------------
        response = requests.get(url, headers=EDGAR_HTTP_HEADERS, timeout=timeout)
        if response.status_code != 200:
            # Raise explicitly rather than `assert False`: assert statements are
            # removed when Python runs with -O, which would hide the failure.
            raise RuntimeError(f"{url} failed with status {response.status_code}")

        print(f"Got XML from {url}")

        # --------------------------------------------------------------------------------
        # File name (last path component of the URL) and the decoded XML text
        # --------------------------------------------------------------------------------
        names.append(os.path.basename(url))
        xmls.append(response.content.decode("utf-8"))

    return names, xmls

In [6]:
def provision(items=None, num=3):
    """Yield consecutive batches of at most ``num`` elements.

    Args:
        items: Sliceable sequence to split into batches. When ``None`` (the
            default), the module-level ``urls`` list is used.
        num: Maximum number of elements per batch; must be at least 1.

    Yields:
        Slices ``items[0:num]``, ``items[num:2*num]``, ... in order. The last
        batch may be shorter; an empty sequence yields nothing.

    Raises:
        ValueError: If ``num`` is smaller than 1. Because this is a generator,
            the error surfaces on the first ``next()`` call.
    """
    if num < 1:
        raise ValueError(f"num must be >= 1, got {num}")
    if items is None:
        items = urls

    for start in range(0, len(items), num):
        yield items[start:start + num]


# Lazy generator of URL batches consumed by the job-submission cell.
provisioner = provision()

In [7]:
# Output directory for the downloaded files; created up front if missing.
os.makedirs("downloads", exist_ok=True)

@ray.remote
def save(references):
    """Write the XML documents produced by download jobs to ``./downloads``.

    Args:
        references: List of Ray object references. Each one is expected to
            resolve to a ``(names, xmls)`` pair of parallel lists, as returned
            by ``worker``.

    Returns:
        None. Each XML string is written to ``./downloads/<name>``.
    """
    # --------------------------------------------------------------------------------
    # ray.get() resolves the object references into the actual results
    # (one (names, xmls) pair per reference).
    # --------------------------------------------------------------------------------
    for names, xmls in ray.get(references):
        # --------------------------------------------------------------------------------
        # Save each XML into a file with its 'name'
        # --------------------------------------------------------------------------------
        for name, xml in zip(names, xmls):
            print(f"Saving XML into {name}")
            # The text was decoded as UTF-8, so write it back as UTF-8 instead of
            # relying on the platform's default encoding.
            with open(f"./downloads/{name}", "w", encoding="utf-8") as f:
                f.write(xml)

In [8]:
# --------------------------------------------------------------------------------
# Asynchronously invoke one download job per batch of URLs.
# Iterating the provisioner until it is exhausted (instead of calling next() a
# fixed number of times) submits every batch, and does not raise StopIteration
# when there are fewer batches than expected.
# --------------------------------------------------------------------------------
futures = [worker.remote(ray.put(batch)) for batch in provisioner]

In [9]:
# Handles of the submitted save jobs, kept so the driver can block on them below.
waits = []
while len(futures) > 0:
    # --------------------------------------------------------------------------------
    # Split the pending download jobs into those that have completed and those
    # that are still running; the latter stay in `futures` for the next round.
    # --------------------------------------------------------------------------------
    completed, futures = ray.wait(futures)

    # --------------------------------------------------------------------------------
    # Hand the completed object references to an asynchronous save job.
    # --------------------------------------------------------------------------------
    save_job = save.remote(completed)
    waits.append(save_job)

# --------------------------------------------------------------------------------
# Block until every save job has finished before moving on to shutting down Ray.
# --------------------------------------------------------------------------------
ray.get(waits)
# Pause so Jupyter can flush all stdout output.
time.sleep(3)

[2m[36m(worker pid=86506)[0m Got XML from https://sec.gov/Archives/edgar/data/1004155/000100415510000016/agl-20091231.xml
[2m[36m(worker pid=86507)[0m Got XML from https://sec.gov/Archives/edgar/data/1001838/000110465910010334/scco-20091231.xml
[2m[36m(worker pid=86505)[0m Got XML from https://sec.gov/Archives/edgar/data/1000697/000095012310017583/wat-20091231.xml
[2m[36m(worker pid=86506)[0m Got XML from https://sec.gov/Archives/edgar/data/1004440/000104746910001515/ceg-20091231.xml
[2m[36m(worker pid=86507)[0m Got XML from https://sec.gov/Archives/edgar/data/1002638/000119312510021715/otex-20091231.xml
[2m[36m(worker pid=86505)[0m Got XML from https://sec.gov/Archives/edgar/data/1001039/000119312510025949/dis-20100102.xml
[2m[36m(worker pid=86505)[0m Got XML from https://sec.gov/Archives/edgar/data/1001082/000095012310018671/dish-20091231.xml
[2m[36m(worker pid=86506)[0m Got XML from https://sec.gov/Archives/edgar/data/1004980/000100498010000015/pcg-20091231.x

# Cleanup

In [10]:
# Shut down the Ray runtime that ray.init() started earlier in this notebook.
ray.shutdown()