# IOC catalog

In [None]:
import pandas as pd
import numpy as np
import requests

### Read html with Pandas

In [None]:
# read_html returns a list with one DataFrame per <table> on the page; keep the first
ioc = pd.read_html('http://www.ioc-sealevelmonitoring.org/ssc/')[0]

In [None]:
# Remove the last row in place (presumably a footer/summary row -- TODO confirm)
ioc.drop(ioc.tail(1).index,inplace=True)

In [None]:
ioc

In [None]:
# Save the station catalog to ioc.csv (UTF-8), without writing the index column
ioc.to_csv('ioc.csv',encoding='utf-8',index=False)

## Additional IOC table with real time info 

In [None]:
import pandas as pd
# Parse every <table> on the station list page; at this point ioc2 is a list of DataFrames
ioc2 = pd.read_html('http://www.ioc-sealevelmonitoring.org/list.php?operator=&showall=all&output=general#')

In [None]:
# Pick the table at list index 6 and discard its row labelled 0
ioc2 = ioc2[6].drop(0)
# The first remaining row supplies the column names
nh = ioc2.iloc[0]
ioc2 = ioc2[1:]  # keep only the rows below that header row
ioc2.columns = nh.values
ioc2 = ioc2.iloc[:,:10]  # keep the first 10 columns
ioc2.reset_index(inplace=True, drop=True)  # renumber rows from 0, discarding old labels

# Remove the column at position 6 (selected by position, dropped by label)
ioc2 = ioc2.drop(ioc2.columns[6:7],axis=1)

In [None]:
ioc2

In [None]:
# Save the cleaned real-time table to ioc2.csv (UTF-8), without writing the index column
ioc2.to_csv('ioc2.csv',encoding='utf-8',index=False)

### merge

In [None]:
# Merge the two ioc DataFrames, matching ioc['Station Name'] against ioc2['Location'].
# merge() defaults to an inner join, so rows without a match in the other table are dropped.
ioc3 = ioc.merge(ioc2, left_on=['Station Name'], right_on=['Location'])

In [None]:
# Remove these six columns from the merged table (drop raises KeyError if any is absent)
ioc3 = ioc3.drop(['Details','Edit','Connection','DCP ID','Delay','TransmitInterval'], axis=1)

In [None]:
# Save the merged table to ioc3.csv (UTF-8), without writing the index column
ioc3.to_csv('ioc3.csv',encoding='utf-8',index=False)

In [None]:
ioc3

### Find tide gauges (TGs) within a lat/lon window

In [None]:
# Bounds of the search window, compared against the Latitude/Longitude columns.
# These must be plain floats: a trailing comma after the value would make each
# one a 1-tuple, which cannot be compared element-wise against a column.
minlat = 17.0
maxlat = 25.0
minlon = 100.0
maxlon = 130.0

In [None]:
# Boolean masks for stations strictly inside the window (bounds themselves excluded)
lon_inside = (ioc['Longitude'] > minlon) & (ioc['Longitude'] < maxlon)
lat_inside = (ioc['Latitude'] > minlat) & (ioc['Latitude'] < maxlat)
w = ioc.loc[lon_inside & lat_inside]

In [None]:
# Renumber the selected rows from 0 in place, discarding their original index labels
w.reset_index(inplace=True, drop=True)

In [None]:
w

## get the data

In [None]:
# Build the tabular-output query for one station; station_code is the IOC code value
station_code = 'quar'
end_time = '2020-2-1'
url = (
    "http://www.ioc-sealevelmonitoring.org/bgraph.php"
    f"?code={station_code}&output=tab&period=0.5&endtime={end_time}"
)
url

In [None]:
# Download the page at `url` and parse its first table, using the first row as the header
data = pd.read_html(url, header=0)[0]
data

## Panos - Retrieve full IOC metadata

This is more or less an extension of IOC2. We retrieve all 3 tables, i.e.:

- "general"
- "contacts"
- "performance"

and we merge them

Furthermore, we make the requests in parallel using multithreading. The runtime is ~10 seconds.


In [None]:
import concurrent.futures
import functools

import bs4 
import html5lib   # We don't use it directly, but we use its parser in bs4
import pandas as pd
import requests


# For each "output" variant of the station list page: how many leading <tr>
# rows get_ioc_metadata() discards before parsing the remaining rows.
IOC_METADATA_SKIP_ROWS = dict(general=4, contacts=4, performance=8)

# For each "output" variant: the column names assigned, in order, to the
# parsed table. The trailing "view" column is dropped by get_ioc_metadata().
IOC_METADATA_COLUMN_NAMES = {
    "general": [
        "ioc_code",
        "gloss_id",
        "country",
        "location",
        "connection",
        "dcp_id",
        "last_observation_level",
        "last_observation_time",
        "delay",
        "interval",
        "view",
    ],
    "contacts": [
        "ioc_code",
        "gloss_id",
        "lat",
        "lon",
        "country",
        "location",
        "connection",
        "contacts",
        "view",
    ],
    "performance": [
        "ioc_code",
        "gloss_id",
        "country",
        "location",
        "connection",
        "added_to_system",
        "observations_arrived_per_week",
        "observations_expected_per_week",
        "observations_ratio_per_week",
        "observations_arrived_per_month",
        "observations_expected_per_month",
        "observations_ratio_per_month",
        "observations_ratio_per_day",
        "sample_interval",
        "average_delay_per_day",
        "transmit_interval",
        "view",
    ],
}

def get_ioc_metadata(output: str, skip_table_rows: int, timeout: float = 60.0) -> pd.DataFrame:
    """Download one variant of the IOC station list and return it as a DataFrame.

    Args:
        output: Value of the ``output`` query parameter; must be a key of
            ``IOC_METADATA_COLUMN_NAMES``.
        skip_table_rows: Number of leading ``<tr>`` rows of the table to
            discard before parsing.
        timeout: Seconds passed to ``requests.get`` as its timeout, so a
            stalled server cannot block the worker forever.

    Returns:
        The parsed table with the column names from
        ``IOC_METADATA_COLUMN_NAMES[output]`` and the "view" column removed.

    Raises:
        KeyError: If ``output`` has no entry in ``IOC_METADATA_COLUMN_NAMES``.
        requests.HTTPError: If the server answers with an error status.
        ValueError: If the page contains no ``<table class="nice">``.
    """
    import io  # local import: pandas wants literal HTML wrapped in a file-like object

    # Look the names up first so an unknown `output` fails before any network traffic.
    column_names = IOC_METADATA_COLUMN_NAMES[output]

    url = f"https://www.ioc-sealevelmonitoring.org/list.php?operator=&showall=all&output={output}#"
    print(url)
    response = requests.get(url, timeout=timeout)
    # Explicit check instead of `assert`, which is stripped when Python runs with -O.
    response.raise_for_status()

    soup = bs4.BeautifulSoup(response.content, "html5lib")
    table = soup.find("table", {"class": "nice"})
    if table is None:
        raise ValueError(f"no <table class=\"nice\"> found at {url}")

    # Rebuild a minimal table from the rows that remain after the skipped ones,
    # so that pandas only sees the data rows.
    trs = table.find_all("tr")
    table_contents = "\n".join(str(tr) for tr in trs[skip_table_rows:])
    html = f"<table>{table_contents}</table>"
    # Passing a literal HTML string to read_html is deprecated; wrap it in StringIO.
    df = pd.read_html(io.StringIO(html))[0]
    df.columns = column_names
    df = df.drop(columns="view")
    return df


def normalize_ioc(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with IOC metadata columns converted to proper dtypes.

    Args:
        df: Frame containing at least the columns ``gloss_id``, ``country``
            and ``observations_ratio_per_{day,week,month}``. The ratio
            columns hold strings such as ``"95%"``, or ``"-"`` for no value.

    Returns:
        A new DataFrame (``df`` itself is not modified) in which ``gloss_id``
        is nullable ``Int64``, ``country`` is categorical, and the three
        ratio columns are integers, with ``"-"`` mapped to 0.
    """

    def _percent_to_int(column: pd.Series) -> pd.Series:
        # "-" marks a missing ratio; treat it as 0%, then strip the trailing "%".
        return column.replace("-", "0%").str[:-1].astype(int)

    # All columns are read from the `df` argument, never from a module-level
    # frame, so the function works on whatever frame the caller passes in.
    return df.assign(
        gloss_id=df.gloss_id.astype(pd.Int64Dtype()),
        country=df.country.astype("category"),
        observations_ratio_per_day=_percent_to_int(df.observations_ratio_per_day),
        observations_ratio_per_week=_percent_to_int(df.observations_ratio_per_week),
        observations_ratio_per_month=_percent_to_int(df.observations_ratio_per_month),
    )


# Fetch the metadata tables concurrently, with up to three worker threads.
# A table whose download raises is reported on stdout and left out of `ioc_dfs`.
# (This boilerplate could be wrapped in a helper function; it is kept inline here.)
ioc_dfs = {}
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as executor:
    # Map every submitted future back to the name of the table it produces
    futures = {}
    for output, skip_table_rows in IOC_METADATA_SKIP_ROWS.items():
        futures[executor.submit(get_ioc_metadata, output, skip_table_rows)] = output

    # Collect the results in whatever order the downloads finish
    for future in concurrent.futures.as_completed(futures):
        output = futures[future]
        try:
            df = future.result()
        except Exception as exc:
            print(f"{output} generated an exception: {exc}")
            continue
        ioc_dfs[output] = df

# Sequential (non-threaded) way of fetching the same three tables:
#ioc_general = get_ioc_metadata("general", 4)
#ioc_contacts = get_ioc_metadata("contacts", 4)
#ioc_performance = get_ioc_metadata("performance", 8)

# Merge the three tables pairwise, left to right. pd.merge without `on` joins on
# every column name the two frames share, and defaults to an inner join.
# This rebinds `ioc`, replacing the station catalog loaded earlier in the notebook.
ioc = functools.reduce(pd.merge, (ioc_dfs["general"], ioc_dfs["contacts"], ioc_dfs["performance"]))  
ioc = normalize_ioc(ioc)

In [None]:
ioc