# webscrape AEMO schemas

This playbook downloads [these web pages](https://nemweb.com.au/Reports/Current/MMSDataModelReport/Electricity/MMS%20Data%20Model%20Report_files/MMS_504_9.htm) to detect the list of columns, column types, and index columns for each table.

Each table is documented on a page between these URLs:

* https://nemweb.com.au/Reports/Current/MMSDataModelReport/Electricity/MMS%20Data%20Model%20Report_files/MMS_245.htm#1
* https://nemweb.com.au/Reports/Current/MMSDataModelReport/Electricity/MMS%20Data%20Model%20Report_files/MMS_503.htm#1

So we do a for loop to grab each one.

We want the list of "Primary Key Columns", the "Index Columns" (there may be multiple sets of these), and, from the "Content" table, the "Name" and "Data Type" columns.

This file is a work in progress. I want to add it to "01-download.ipynb".

In [55]:
import re

from tqdm import tqdm # progress bar animation
import requests
from bs4 import BeautifulSoup # webscraping

In [56]:
# Create one Session object and re-use it for every request made below.
# The hope is that this speeds up downloads by not re-doing the TLS handshake
# for each HTTP request (not measured, so unsure if this actually speeds things up).
session = requests.Session()

In [78]:
# Scrape every table page of the MMS Data Model Report into `schemas`:
#   schemas[table_name] = {
#       'indexes': [[column, ...], ...],   # one list per "Index Columns" section
#       'columns': {column_name: data_type},
#       'primary_keys': [column, ...],     # only present if the page has that section
#   }

BASE_URL = "https://nemweb.com.au/Reports/Current/MMSDataModelReport/Electricity/MMS%20Data%20Model%20Report_files"

# Seconds to wait on the server before giving up. Without a timeout a stalled
# connection would hang the whole loop forever.
REQUEST_TIMEOUT = 60

# hrefs of continuation pages look like MMS_128_1.htm
SUBPAGE_HREF = re.compile(r"MMS_\d+_\d+\.htm")


def _fetch_soup(page_url):
    """Download `page_url` with the shared session and return the parsed page.

    Raises requests.HTTPError (via raise_for_status) on a 4xx/5xx response.
    """
    response = session.get(page_url, timeout=REQUEST_TIMEOUT)
    response.raise_for_status()
    # it's called "soup" because the python webscraping library is called "beautiful soup"
    # NOTE(review): no parser is named here, so bs4 uses whichever one is installed.
    # Consider pinning one after checking it yields the same result on these pages.
    return BeautifulSoup(response.text)


def _key_columns(h3):
    """Return the column names listed in the table that follows `h3`.

    The first cell of that table is skipped; the remaining cells are the columns.
    """
    return [td.text.strip() for td in h3.find_next('table').find_all('td')[1:]]


def _content_columns(h3):
    """Return {column name: data type} from the table that follows a "Content" `h3`.

    The first row is skipped; the first two cells of every other row are used.
    """
    columns = {}
    for row in h3.find_next('table').find_all('tr')[1:]:
        cells = row.find_all('td')
        columns[cells[0].text.strip()] = cells[1].text.strip()
    return columns


schemas = {}
for page_number in tqdm(range(245, 503+1)):
    url = f"{BASE_URL}/MMS_{page_number}.htm#1"
    soup = _fetch_soup(url)

    h2 = soup.find('h2')
    if h2 is None:
        # report and move on instead of failing with an opaque AttributeError
        print(f"Ignoring page with no h2: {url}")
        continue

    h2_text = h2.text.strip()
    if not h2_text.startswith("Table: "):
        # "Package" pages are expected and skipped quietly; anything else is reported
        if not h2_text.strip().startswith('Package'):
            print(f"Ignoring page with h2: {h2_text}: {url}")
        continue

    # watch out, sometimes the documentation has an incorrect space inside the table name
    # e.g. SET_ APC_COMPENSATION instead of SET_APC_COMPENSATION
    table_name = re.match(r"Table: (.*)", h2_text).group(1).replace(' ','')

    schemas[table_name] = {
        'indexes': [],
        'columns': {}
    }
    for position, h3 in enumerate(soup.find_all('h3')):
        h3_text = h3.text.strip()
        if position == 0:
            # the first h3 repeats the table name; use it as a sanity check
            assert table_name == h3_text.replace(' ',''), f"Table name from top h2 was {table_name}, from first h3 was {h3_text} in {url}"
        elif h3_text == "Primary Key Columns":
            schemas[table_name]['primary_keys'] = _key_columns(h3)
        elif h3_text == "Index Columns":
            # there may be several of these sections, so collect each as its own list
            schemas[table_name]['indexes'].append(_key_columns(h3))
        elif h3_text == "Content":
            # the list of all column names
            schemas[table_name]['columns'].update(_content_columns(h3))
        elif h3_text == "Description":
            # ignore
            pass
        else:
            print(f"Unknown h3: {h3_text} on {url}")

    # sometimes there's so many columns that there is a second page, just of the "Content" table
    # example: https://nemweb.com.au/Reports/Current/MMSDataModelReport/Electricity/MMS%20Data%20Model%20Report_files/MMS_128_1.htm
    fetched_subpages = set()
    for a in soup.find_all('a'):
        href = a.get('href')
        if not href or not SUBPAGE_HREF.match(href):
            continue
        try:
            int(a.text.strip())
        except ValueError:
            # this is not a subsequent page, it's another link
            # e.g. to the list of tables
            # https://nemweb.com.au/Reports/Current/MMSDataModelReport/Electricity/MMS%20Data%20Model%20Report_files/MMS_358_1.htm
            continue

        # the same continuation page may be linked more than once; download it only once
        subpage = href.partition('#')[0]
        if subpage in fetched_subpages:
            continue
        fetched_subpages.add(subpage)

        new_url = f"{BASE_URL}/{href}"
        # parse into its own variable so the first page's `soup` is left intact
        subpage_soup = _fetch_soup(new_url)
        h3 = subpage_soup.find('h3')
        assert h3 is not None and h3.text.strip() == 'Content', f"Unexpected first h3 in second page: {new_url} after {url}"
        schemas[table_name]['columns'].update(_content_columns(h3))
schemas

 44%|███████████████████████████████████████████████████████████████████                                                                                    | 115/259 [00:38<00:46,  3.13it/s]

Ignoring page with h2: Diagram: Entities: Historical Tables: https://nemweb.com.au/Reports/Current/MMSDataModelReport/Electricity/MMS%20Data%20Model%20Report_files/MMS_359.htm#1


100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████████| 259/259 [01:26<00:00,  3.00it/s]


{'ADG_DETAIL': {'indexes': [],
  'columns': {'ADG_ID': 'VARCHAR2(20)',
   'EFFECTIVEDATE': 'DATE',
   'VERSION_DATETIME': 'DATE',
   'ADG_TYPE': 'VARCHAR2(20)',
   'AUTHORISEDDATE': 'DATE',
   'AUTHORISEDBY': 'VARCHAR2(15)',
   'LASTCHANGED': 'DATE'},
  'primary_keys': ['ADG_ID', 'EFFECTIVEDATE', 'VERSION_DATETIME']},
 'AGGREGATE_DISPATCH_GROUP': {'indexes': [],
  'columns': {'ADG_ID': 'VARCHAR2(20)',
   'COMMENTS': 'VARCHAR2(100)',
   'LASTCHANGED': 'DATE'},
  'primary_keys': ['ADG_ID']},
 'BIDDUIDDETAILS': {'indexes': [['LASTCHANGED']],
  'columns': {'DUID': 'VARCHAR2(10)',
   'EFFECTIVEDATE': 'DATE',
   'VERSIONNO': 'NUMBER(3,0)',
   'BIDTYPE': 'VARCHAR2(10)',
   'MAXCAPACITY': 'NUMBER(22,0)',
   'MINENABLEMENTLEVEL': 'NUMBER(22,0)',
   'MAXENABLEMENTLEVEL': 'NUMBER(22,0)',
   'MAXLOWERANGLE': 'NUMBER(3,0)',
   'MAXUPPERANGLE': 'NUMBER(3,0)',
   'LASTCHANGED': 'DATE'},
  'primary_keys': ['BIDTYPE', 'DUID', 'EFFECTIVEDATE', 'VERSIONNO']},
 'BIDDUIDDETAILSTRK': {'indexes': [['LASTCHAN

In [74]:
# show the table names collected in `schemas` (its keys are the table names)
schemas.keys()

dict_keys(['ADG_DETAIL', 'AGGREGATE_DISPATCH_GROUP', 'BIDDUIDDETAILS', 'BIDDUIDDETAILSTRK', 'DISPATCHABLEUNIT', 'DUALLOC', 'DUDETAIL', 'DUDETAILSUMMARY', 'GENMETER', 'GENUNITS', 'GENUNITS_UNIT', 'MNSP_INTERCONNECTOR', 'MNSP_PARTICIPANT', 'PARTICIPANT', 'PARTICIPANTACCOUNT', 'PARTICIPANTCATEGORY', 'PARTICIPANTCATEGORYALLOC', 'PARTICIPANTCLASS', 'PARTICIPANTCREDITDETAIL', 'PMS_GROUP', 'PMS_GROUPNMI', 'PMS_GROUPSERVICE', 'STADUALLOC', 'STATION', 'STATIONOPERATINGSTATUS', 'STATIONOWNER', 'STATIONOWNERTRK', 'PREDISPATCH_FCAS_REQ', 'PREDISPATCH_LOCAL_PRICE', 'PREDISPATCH_MNSPBIDTRK', 'PREDISPATCHBLOCKEDCONSTRAINT', 'PREDISPATCHCASESOLUTION', 'PREDISPATCHCONSTRAINT', 'PREDISPATCHINTERCONNECTORRES', 'PREDISPATCHINTERSENSITIVITIES', 'PREDISPATCHLOAD', 'PREDISPATCHOFFERTRK', 'PREDISPATCHPRICE', 'PREDISPATCHPRICESENSITIVITIES', 'PREDISPATCHREGIONSUM', 'PREDISPATCHSCENARIODEMAND', 'PREDISPATCHSCENARIODEMANDTRK', 'MTPASA_RESERVELIMIT', 'MTPASA_RESERVELIMIT_REGION', 'MTPASA_RESERVELIMIT_SET', 'RESER