# CAISO Energy Storage Bid Data Scraping


<b>Author:</b> Neal Ma<br/>
<b>Creation Date:</b> September 11, 2024

<b>Description:</b> CAISO has released daily energy storage reports here (https://www.caiso.com/library/daily-energy-storage-reports) since August 4, 2022. These reports include extremely useful data but the data itself is not readily accessible. This notebook extracts that data and saves it to a local parquet file to read into a pandas dataframe for easy query and access.

In [1]:
import requests
from bs4 import BeautifulSoup
import pandas as pd
import os

In [5]:
# Building blocks for the report URLs. Each URL is <base> + <formatted date> + EXTENSION,
# and three naming schemes are tried per date.

# scheme 1: BASE_URL with dates such as "Aug04-2022" (month abbreviation is locale-dependent)
BASE_URL = "https://www.caiso.com/documents/dailyenergystoragereport"
DATE_FORMAT = "%b%d-%Y"
EXTENSION = ".html"

# scheme 2: ALT_BASE_URL with dates such as "Aug-04-2022"
ALT_BASE_URL = "https://www.caiso.com/documents/daily-energy-storage-report-"
ALT_DATE_FORMAT = "%b-%d-%Y"

# scheme 3: ALT_BASE_URL with dates such as "Aug-042022"
ALT_2_DATE_FORMAT = "%b-%d%Y"

# inclusive range of report dates to request
START_DATE = "2022-07-31"  # NOTE: This is the earliest date with available data
END_DATE = "2024-09-11"  # NOTE: This can be changed but just needs to be verified

# directory that receives one trimmed .html file per report date
HTML_STORAGE_PATH = "./data/CAISO_ES_HTML"

In [7]:
# Build the directory to store html pages. exist_ok=True makes the cell safe to
# re-run and avoids the check-then-create race of testing os.path.exists first.
os.makedirs(HTML_STORAGE_PATH, exist_ok=True)

In [8]:
# One entry per calendar day in [START_DATE, END_DATE], rendered once for each
# of the three date formats used by the report URLs.
date_range = pd.date_range(start=START_DATE, end=END_DATE)
date_strings, alt_date_strings, alt_2_date_strings = (
    date_range.strftime(fmt)
    for fmt in (DATE_FORMAT, ALT_DATE_FORMAT, ALT_2_DATE_FORMAT)
)

In [18]:
# Iterate through all dates, trying each known URL naming scheme in turn, and
# save the data section of every report that is found.
# Dates for which no URL returns HTTP 200, or whose page lacks the expected
# data markers, are collected in failed_requests and skipped instead of
# aborting the whole run.
failed_requests = []
for date_str, alt_date_str, alt_2_date_str in zip(date_strings, alt_date_strings, alt_2_date_strings):
    candidate_urls = (
        BASE_URL + date_str + EXTENSION,
        ALT_BASE_URL + alt_date_str + EXTENSION,
        ALT_BASE_URL + alt_2_date_str + EXTENSION,
    )

    # stop at the first URL that answers with 200; later URLs are not requested
    page = None
    for url in candidate_urls:
        response = requests.get(url)
        if response.status_code == 200:
            page = response
            break

    if page is None:
        # nothing to parse for this date: record it and move on
        failed_requests.append(date_str)
        continue

    html_str = str(BeautifulSoup(page.content, "html.parser"))

    # keep lines in html_str between and including the line starting with
    # 'var tot_energy_ifm = [' and the line starting with 'bid_rtpd_neg_hybrid_11 = ['
    lines = html_str.split('\n')
    start_index = next(
        (i for i, line in enumerate(lines) if line.strip().startswith('var tot_energy_ifm = [')),
        None,
    )
    end_index = next(
        (i for i, line in enumerate(lines) if line.strip().startswith('bid_rtpd_neg_hybrid_11 = [')),
        None,
    )
    if start_index is None or end_index is None:
        # page was served but does not contain the expected data section
        failed_requests.append(date_str)
        continue

    # the kept lines are concatenated without a separator, as the saved files have always been
    html_str = ''.join(lines[start_index:end_index+1])

    # the context manager closes the file even if the write fails
    with open(HTML_STORAGE_PATH + "/" + date_str + '.html', 'w') as f:
        f.write(html_str)

print(failed_requests)

[]


In [63]:
# Data series names mapped to their time granularity in minutes.
# Keys are grouped by shared granularity; the insertion order matches the
# order in which the series were originally listed.
key_dict = {
    # non-hybrid series
    **dict.fromkeys(('tot_energy_ifm', 'tot_energy_ruc', 'tot_energy_rtpd', 'tot_energy_rtd'), 5),
    **dict.fromkeys(('tot_charge_ifm', 'tot_charge_ruc', 'tot_charge_rtpd', 'tot_charge_rtd'), 5),
    **dict.fromkeys(('as_ru_ifm', 'as_rd_ifm', 'as_sr_ifm', 'as_nr_ifm'), 60),
    **dict.fromkeys(('as_ru_rtpd', 'as_rd_rtpd', 'as_sr_rtpd', 'as_nr_rtpd'), 15),
    # "hybrid" series
    **dict.fromkeys(('tot_energy_hybrid_ifm', 'tot_energy_hybrid_ruc', 'tot_energy_hybrid_rtpd', 'tot_energy_hybrid_rtd'), 5),
    **dict.fromkeys(('tot_charge_hybrid_ifm', 'tot_charge_hybrid_ruc', 'tot_charge_hybrid_rtpd', 'tot_charge_hybrid_rtd'), 5),
    **dict.fromkeys(('as_ru_hybrid_ifm', 'as_rd_hybrid_ifm', 'as_sr_hybrid_ifm', 'as_nr_hybrid_ifm'), 60),
    **dict.fromkeys(('as_ru_hybrid_rtpd', 'as_rd_hybrid_rtpd', 'as_sr_hybrid_rtpd', 'as_nr_hybrid_rtpd'), 15),
}

# some larger labels to add
prefixes = ['bid_ifm', 'bid_rtpd']

288
