# Ray parallel execution 

Download the XBRL XML files from the SEC EDGAR site in parallel, then save them to files in parallel.

In [1]:
# !pip install -q ray

In [1]:
from typing import (
    List,
    Dict
)
import os
import logging
import time
import re
import requests
import bs4
from bs4 import BeautifulSoup
import numpy as np
import pandas as pd
import ray
from IPython.core.display import (
    display, 
    HTML
)

pd.set_option('display.float_format', lambda x: ('%f' % x).rstrip('0').rstrip('.'))
pd.set_option('display.colheader_justify', 'center')

  from IPython.core.display import (


In [2]:
# HTTP headers sent with every request that download() makes.
# NOTE(review): the User-Agent value looks like a placeholder -- presumably it
# should be replaced with real contact details before hitting sec.gov; confirm
# against the SEC EDGAR access guidelines.
EDGAR_HTTP_HEADERS = {"User-Agent": "Company Name myname@company.com"}
# URLs of the XML files to fetch; provision() slices this list into batches.
urls = [
    'https://sec.gov/Archives/edgar/data/1000697/000095012310017583/wat-20091231.xml',
    'https://sec.gov/Archives/edgar/data/1001039/000119312510025949/dis-20100102.xml',
    'https://sec.gov/Archives/edgar/data/1001082/000095012310018671/dish-20091231.xml',
    'https://sec.gov/Archives/edgar/data/1001838/000110465910010334/scco-20091231.xml',
    'https://sec.gov/Archives/edgar/data/1002638/000119312510021715/otex-20091231.xml',
    'https://sec.gov/Archives/edgar/data/1002910/000119312510043155/aee-20091231.xml',
    'https://sec.gov/Archives/edgar/data/1004155/000100415510000016/agl-20091231.xml',
    'https://sec.gov/Archives/edgar/data/1004440/000104746910001515/ceg-20091231.xml',
    'https://sec.gov/Archives/edgar/data/1004980/000100498010000015/pcg-20091231.xml'
]

In [3]:
# Start a local Ray instance with 4 CPU slots and no GPUs.
# ignore_reinit_error=True keeps this cell safe to re-run: without it, a second
# ray.init() call on a kernel where Ray is already running raises RuntimeError.
ray.init(num_cpus=4, num_gpus=0, ignore_reinit_error=True)

2023-10-25 16:35:55,274	INFO worker.py:1553 -- Started a local Ray instance.


0,1
Python version:,3.9.13
Ray version:,2.3.0


---

# Flow

1. URL generator (provisioner) generates one batch of URLs at a time.
2. Each Ray ```download``` worker downloads the XML files of its batch and returns a Ray object reference to (names, xmls).
3. Each Ray ```save``` worker saves XML files from the (names, xmls) reference.

A ```.remote()``` call on a ```@ray.remote``` function asynchronously returns a ```future```, and we can either:
* wait until all the results are available with the blocking ```ray.get([futures])``` call.
* process the results as they become available with the ```ray.wait([futures])``` call, which returns as soon as some of the results are ready instead of waiting for all of them.

## ray.remote

* [Ray Core Quickstart](https://docs.ray.io/en/latest/ray-overview/getting-started.html#ray-core-quickstart)
>  This remote call yields a ```future```, a **Ray object reference**, that you can then fetch with ray.get.

## ray.wait

* [ray.wait](https://docs.ray.io/en/latest/ray-core/api/doc/ray.wait.html) 

> This method returns two lists. The first list consists of object refs that correspond to objects that are available in the object store. The second list corresponds to the rest of the object refs (which may or may not be ready).

## ray.get

* [ray.get](https://docs.ray.io/en/latest/ray-core/api/doc/ray.get.html)

> This method **blocks** until the object corresponding to the object ref is available in the local object store. 

In [4]:
def provision(num: int = 3, sources=None):
    """Provision URLs in consecutive batches.

    Args:
        num: Maximum number of URLs per batch. Defaults to 3.
        sources: Sequence of URLs to split into batches. When omitted, the
            module-level ``urls`` list is used.

    Yields:
        Slices of ``sources`` holding at most ``num`` URLs each, in order.
        The last batch is shorter when the total is not a multiple of ``num``.

    Raises:
        ValueError: If ``num`` is smaller than 1. Because this is a generator,
            the error surfaces when the first batch is requested.
    """
    # A non-positive batch size could never advance through the sequence.
    if num < 1:
        raise ValueError(f"num must be a positive integer, got {num}")
    if sources is None:
        sources = urls

    for start in range(0, len(sources), num):
        yield sources[start:start + num]
                
provisioner = provision()

In [5]:
@ray.remote(num_returns=1)
def download(urls, timeout=(10, 60)):
    """worker to download XML files from URLs via HTTP GET

    Args:
        urls: Iterable of URLs to fetch. Every request is sent with
            EDGAR_HTTP_HEADERS.
        timeout: (connect, read) timeout in seconds handed to requests.get, so
            a stalled connection cannot block the worker indefinitely.

    Returns:
        (names, xmls): two parallel lists holding, for each URL, the file name
        taken from the URL and the response body decoded as UTF-8 text.

    Raises:
        AssertionError: If a response comes back with a status other than 200.
        requests.exceptions.RequestException: Propagated from requests.get,
            for example when the timeout expires.
    """
    names = []
    xmls = []

    for url in urls:
        # --------------------------------------------------------------------------------
        # Retrieve SEC Filing XBRL XML from the URL
        # --------------------------------------------------------------------------------
        response = requests.get(url, headers=EDGAR_HTTP_HEADERS, timeout=timeout)
        if response.status_code != 200:
            # Raised explicitly rather than via `assert` so the check is not
            # stripped when Python runs with -O; the exception type is unchanged.
            raise AssertionError(f"{url} failed with status {response.status_code}")

        print(f"Got XML from {url}")
        # --------------------------------------------------------------------------------
        # SEC Filing XML Filename
        # --------------------------------------------------------------------------------
        names.append(os.path.basename(url))

        # --------------------------------------------------------------------------------
        # Response body decoded as UTF-8 text
        # --------------------------------------------------------------------------------
        xmls.append(response.content.decode("utf-8"))

    return names, xmls

In [6]:
os.makedirs("downloads", exist_ok=True)

@ray.remote
def save(references):
    """worker to save the downloaded XML file(s) under ./downloads

    Args:
        references: List of Ray object references, each resolving to the
            (names, XMLs) pair returned by a download worker.
    """
    # --------------------------------------------------------------------------------
    # ray.get() blocks until every reference is resolved and returns the
    # (names, xmls) pairs, one per download worker.
    # --------------------------------------------------------------------------------
    for names, xmls in ray.get(references):
        # --------------------------------------------------------------------------------
        # Save each XML into a file with its 'name'
        # --------------------------------------------------------------------------------
        for name, xml in zip(names, xmls):
            print(f"Saving XML into {name}")
            # The text was decoded from UTF-8, so write it back as UTF-8 instead of
            # the platform default encoding. newline="" turns off newline
            # translation so the saved file matches what was downloaded.
            with open(f"./downloads/{name}", "w", encoding="utf-8", newline="") as f:
                f.write(xml)

# Distributed executions of downloads

Invoke remote workers and process the worker results as they become available.

In [7]:
# --------------------------------------------------------------------------------
# Asynchronously invoke one XML file download job (worker) per URL batch.
# Iterating a fresh provision() generator, rather than calling next() a fixed
# number of times, covers every batch whatever the length of `urls` and keeps
# this cell re-runnable because no shared generator gets exhausted.
# ray.put() to convert a python object to a Ray object.
# --------------------------------------------------------------------------------
futures_for_download = [download.remote(ray.put(batch)) for batch in provision()]

[2m[36m(download pid=93155)[0m Got XML from https://sec.gov/Archives/edgar/data/1001838/000110465910010334/scco-20091231.xml
[2m[36m(download pid=93155)[0m Got XML from https://sec.gov/Archives/edgar/data/1002638/000119312510021715/otex-20091231.xml
[2m[36m(download pid=93156)[0m Got XML from https://sec.gov/Archives/edgar/data/1004155/000100415510000016/agl-20091231.xml
[2m[36m(download pid=93158)[0m Got XML from https://sec.gov/Archives/edgar/data/1000697/000095012310017583/wat-20091231.xml
[2m[36m(download pid=93156)[0m Got XML from https://sec.gov/Archives/edgar/data/1004440/000104746910001515/ceg-20091231.xml
[2m[36m(download pid=93158)[0m Got XML from https://sec.gov/Archives/edgar/data/1001039/000119312510025949/dis-20100102.xml
[2m[36m(download pid=93155)[0m Got XML from https://sec.gov/Archives/edgar/data/1002910/000119312510043155/aee-20091231.xml
[2m[36m(download pid=93156)[0m Got XML from https://sec.gov/Archives/edgar/data/1004980/000100498010000015

In [8]:
references_to_names_xmls_pairs = []    # Ray object references to (names, xmls)

# Work on a copy so futures_for_download is not emptied by this loop; that keeps
# the cell idempotent when it is run again on the same kernel.
pending_downloads = list(futures_for_download)
while pending_downloads:
    # --------------------------------------------------------------------------------
    # Take the object references from completed jobs. ray.wait() returns the
    # ready references first and the still-pending ones second.
    # --------------------------------------------------------------------------------
    references, pending_downloads = ray.wait(pending_downloads)
    references_to_names_xmls_pairs.append(references)

# Distributed executions of saving XML files

Invoke remote workers and block until all the workers complete.

In [9]:
# One save job per group of completed download references; each call returns
# immediately with a future for that job.
futures_for_save = list(map(save.remote, references_to_names_xmls_pairs))

[2m[36m(save pid=93155)[0m Saving XML into agl-20091231.xml
[2m[36m(save pid=93155)[0m Saving XML into ceg-20091231.xml
[2m[36m(save pid=93155)[0m Saving XML into pcg-20091231.xml
[2m[36m(save pid=93156)[0m Saving XML into scco-20091231.xml
[2m[36m(save pid=93156)[0m Saving XML into otex-20091231.xml
[2m[36m(save pid=93156)[0m Saving XML into aee-20091231.xml
[2m[36m(save pid=93158)[0m Saving XML into wat-20091231.xml
[2m[36m(save pid=93158)[0m Saving XML into dis-20100102.xml
[2m[36m(save pid=93158)[0m Saving XML into dish-20091231.xml


In [10]:
# --------------------------------------------------------------------------------
# Block until every save job has completed, so that all files are written
# before the next steps (listing the directory and shutting down Ray).
# --------------------------------------------------------------------------------
ray.get(futures_for_save)
time.sleep(3)  # Give Jupyter time to flush all the workers' stdout output.

In [11]:
!ls downloads

aee-20091231.xml  dis-20100102.xml  pcg-20091231.xml
agl-20091231.xml  dish-20091231.xml scco-20091231.xml
ceg-20091231.xml  otex-20091231.xml wat-20091231.xml


# Cleanup

In [12]:
# Stop the Ray instance that ray.init() started earlier in this notebook.
ray.shutdown()

In [13]:
!rm -rf downloads