In [1]:
import re
import simplejson as json
from pathlib import Path

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup

In [2]:
def script_finder(script_id):
    """Build a predicate that recognises the ``<script>`` tag tied to one widget.

    The returned callable takes a single tag object and reports whether it is
    a ``script`` element whose ``data-for`` attribute equals ``script_id``.
    """
    def _matches(candidate):
        # Reject anything that is not a <script> element up front.
        if candidate.name != "script":
            return False
        # Only scripts carrying a data-for attribute can belong to a widget.
        if not candidate.has_attr("data-for"):
            return False
        return candidate.attrs["data-for"] == script_id

    return _matches

In [3]:
def parse_fnmr_evolution(data, algorithm):
    """Extract the FNMR-over-time points from an "evolution of accuracy" plot.

    Parameters
    ----------
    data : dict
        Decoded plot payload. The y-axis title is read from
        ``data["x"]["layout"]["yaxis"]["title"]["text"]`` and the hover
        strings from ``trace["text"]`` of every trace in ``data["x"]["data"]``.
    algorithm : str
        Algorithm name, stored verbatim in the ``algorithm`` column.

    Returns
    -------
    pandas.DataFrame
        One row per hover entry with the columns ``algorithm``, ``dataset``,
        ``date``, ``fmr`` and ``fnmr``. ``fmr`` is the float parsed from the
        axis title; ``date``, ``fnmr`` and ``dataset`` are kept as the strings
        matched in the hover text.

    Raises
    ------
    ValueError
        If the axis title does not end in a decimal number, or a hover entry
        does not follow the expected format.
    """
    columns = ["algorithm", "dataset", "date", "fmr", "fnmr"]

    # Find FMR value
    # Title should be of the form
    # False non-match rate (FNMR) at false match rate (FMR) = 0.000001
    ytitle = data["x"]["layout"]["yaxis"]["title"]["text"]
    fmr_match = re.search(r"\d+\.\d+$", ytitle)
    if fmr_match is None:
        raise ValueError(
            "cannot find FMR value in y-axis title: {!r}".format(ytitle)
        )
    fmr = float(fmr_match.group())

    fmt = re.compile(
        r"^Submitted: (\d\d\d\d-\d\d-\d\d)<br />fnmr: (\d\.\d+)<br />Dataset: ([\w-]*)$"
    )

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used row by row.
    records = []
    for line in data["x"]["data"]:
        entries = line["text"]
        # A trace with a single point may carry its hover text as a bare
        # string instead of a list (presumably because the exporter unboxes
        # length-one vectors -- TODO confirm). Iterating such a string would
        # yield single characters, none of which can match the pattern.
        if isinstance(entries, str):
            entries = [entries] if entries else []
        for entry in entries:
            match = fmt.search(entry)
            if match is None:
                raise ValueError(
                    "unexpected hover text format: {!r}".format(entry)
                )
            date, fnmr, dataset = match.groups()
            records.append(
                {
                    "algorithm": algorithm,
                    "dataset": dataset,
                    "date": date,
                    "fnmr": fnmr,
                    "fmr": fmr,
                }
            )

    return pd.DataFrame(records, columns=columns)

In [4]:
def parse_cross_region_fmr(data, algorithm):
    """Extract the per-region-pair FMR values from a "Cross region FMR" plot.

    Parameters
    ----------
    data : dict
        Decoded plot payload. The plot title is read from
        ``data["x"]["layout"]["title"]["text"]``; every trace in
        ``data["x"]["data"]`` that has a ``hovertext`` key is parsed, traces
        without one are skipped.
    algorithm : str
        Algorithm name, stored verbatim in the ``algorithm`` column.

    Returns
    -------
    pandas.DataFrame
        One row per hover entry with the columns ``algorithm``,
        ``overall_fmr``, ``pairing``, ``eregion``, ``vregion`` and ``fmr``.
        ``overall_fmr`` is the float parsed from the title; ``fmr`` is kept as
        the signed decimal string matched in the hover text.

    Raises
    ------
    ValueError
        If the title carries no global FMR value, a hover entry does not
        follow the expected format, or more hovertext traces are present than
        there are known impostor pairings.
    """
    columns = ["algorithm", "overall_fmr", "pairing", "eregion", "vregion", "fmr"]

    # Find FMR value
    # Title should be of the form
    # <b> Cross region FMR at threshold T = 2.740 for algorithm 3DIVI_003, giving FMR(T) = 0.0001 globally </b>
    title = data["x"]["layout"]["title"]["text"]
    title_match = re.search(r"FMR\(T\) = (\d+\.\d+) globally", title)
    if title_match is None:
        raise ValueError(
            "cannot find global FMR value in plot title: {!r}".format(title)
        )
    overall_fmr = float(title_match.group(1))

    # Order of the four plots:
    # - Any impostor pairing
    # - Same age impostor
    # - Same sex impostor
    # - Same sex, same age impostor
    pairings = ["any", "same_age", "same_sex", "same_sex_age"]

    # Each entry is of the form
    # shortfmr0001: -3.3<br />eregion: CARIB<br />vregion: CARIB<br />fmr0001: -3.336707
    # The pattern does not depend on the trace, so compile it once.
    fmt = re.compile(
        r"eregion: ([A-Z]+)<br />vregion: ([A-Z]+)<br />fmr[0-9]+: ([-+]\d\.\d+)$"
    )

    # Collect plain dicts and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and was quadratic when used row by row.
    records = []
    plot_idx = 0
    for plot in data["x"]["data"]:
        if "hovertext" not in plot:
            continue
        if plot_idx >= len(pairings):
            raise ValueError(
                "found more hovertext traces than the {} known pairings".format(
                    len(pairings)
                )
            )
        # Pairings are assigned by position among the hovertext traces only.
        pairing = pairings[plot_idx]
        plot_idx += 1

        entries = plot["hovertext"]
        # Treat a bare string as a one-element list; iterating it directly
        # would yield single characters that can never match the pattern.
        if isinstance(entries, str):
            entries = [entries] if entries else []
        for entry in entries:
            match = fmt.search(entry)
            if match is None:
                raise ValueError(
                    "unexpected hover text format: {!r}".format(entry)
                )
            eregion, vregion, fmr = match.groups()
            records.append(
                {
                    "algorithm": algorithm,
                    "overall_fmr": overall_fmr,
                    "pairing": pairing,
                    "eregion": eregion,
                    "vregion": vregion,
                    "fmr": fmr,
                }
            )

    return pd.DataFrame(records, columns=columns)

In [5]:
def parse_plot_title(data, algorithm):
    """Classify a plot payload by its title.

    Parameters
    ----------
    data : object
        Decoded plot payload; the title is looked up at
        ``data["x"]["layout"]["title"]["text"]``.
    algorithm : str
        Not used by the classification; kept so the signature stays the same
        for existing callers.

    Returns
    -------
    str or None
        ``"fnmr_evolution"`` or ``"cross_region_fmr"`` when the title matches
        one of the known plots, otherwise ``None`` (including when the payload
        has no title or the title is not a string).
    """
    try:
        title = data["x"]["layout"]["title"]["text"]
    except (LookupError, TypeError):
        # Missing key, or an intermediate value that cannot be subscripted
        # with a string. A bare ``except`` would also have swallowed
        # KeyboardInterrupt and SystemExit.
        return None

    # Guard against a non-string title, which has no endswith/startswith.
    if not isinstance(title, str):
        return None

    if title.endswith("Evolution of accuracy on three datasets 2017 - present<br /> </b>"):
        return "fnmr_evolution"
    if title.startswith("<b> Cross region FMR"):
        return "cross_region_fmr"
    return None

In [9]:
# Directory that is scanned for the per-algorithm report-card pages.
reports_dir = Path("..") / "frvt_pages" / "reportcards" / "11"

In [10]:
# Empty accumulator frames, one per supported plot type, each created with the
# column layout used for that plot type's rows.
res_df = {
    plot_kind: pd.DataFrame(columns=column_names)
    for plot_kind, column_names in (
        ("fnmr_evolution", ["algorithm", "dataset", "date", "fmr", "fnmr"]),
        (
            "cross_region_fmr",
            ["algorithm", "overall_fmr", "pairing", "eregion", "vregion", "fmr"],
        ),
    )
}

# Dispatch table: plot type -> function that parses that plot's payload.
parsers = dict(
    fnmr_evolution=parse_fnmr_evolution,
    cross_region_fmr=parse_cross_region_fmr,
)

In [11]:
# Frames parsed during this run, grouped by plot type. They are merged into
# res_df once after the loop: DataFrame.append was removed in pandas 2.0, and
# growing a frame inside the loop copies it on every iteration.
parsed_frames = {plot_type: [] for plot_type in parsers}

for filepath in sorted(reports_dir.iterdir()):
    if filepath.suffix != ".html":
        # Print anything that is not an HTML page so stray files are visible.
        print(filepath)
        continue
    algorithm = filepath.stem.replace("_", "-")

    soup = BeautifulSoup(filepath.read_text(), 'lxml')
    # Match divs whose class attribute is exactly "plotly html-widget".
    plots = soup.find_all("div", class_="plotly html-widget")

    for plot in plots:
        # The widget's payload lives in a <script data-for="<div id>"> tag.
        script = soup.find(script_finder(plot.get("id")))
        if script is None or not script.contents:
            # Without a payload there is nothing to parse for this widget.
            print(algorithm, "no data script for widget", plot.get("id"))
            continue
        data = json.loads(script.contents[0])

        plot_type = parse_plot_title(data, algorithm)
        if plot_type is None:
            continue

        try:
            tmp_df = parsers[plot_type](data, algorithm)
        except Exception as exc:
            # Best effort: keep going with an empty frame, but say why the
            # plot was dropped instead of hiding the failure.
            print(algorithm, plot_type, "parse failed:", repr(exc))
            tmp_df = pd.DataFrame(columns=res_df[plot_type].columns)
        parsed_frames[plot_type].append(tmp_df)
        print(algorithm, plot_type, len(tmp_df))

for frame_kind, new_frames in parsed_frames.items():
    # Rows already held in res_df come first so re-running the cell still
    # extends the existing results. Empty frames are left out of the concat.
    non_empty = [
        frame for frame in [res_df[frame_kind]] + new_frames if not frame.empty
    ]
    if non_empty:
        res_df[frame_kind] = pd.concat(non_empty, ignore_index=True)

3divi-003 cross_region_fmr 400
3divi-004 fnmr_evolution 12
3divi-004 cross_region_fmr 400
3divi-005 fnmr_evolution 12
3divi-005 cross_region_fmr 400
acer-000 fnmr_evolution 6
acer-000 cross_region_fmr 400
acer-001 fnmr_evolution 6
acer-001 cross_region_fmr 400
acisw-003 fnmr_evolution 0
acisw-003 cross_region_fmr 400
adera-001 fnmr_evolution 0
adera-001 cross_region_fmr 400
advance-002 fnmr_evolution 0
advance-002 cross_region_fmr 400
aifirst-001 fnmr_evolution 0
aifirst-001 cross_region_fmr 400
aigen-001 fnmr_evolution 0
aigen-001 cross_region_fmr 400
ailabs-001 fnmr_evolution 0
ailabs-001 cross_region_fmr 400
aimall-001 cross_region_fmr 400
aimall-002 fnmr_evolution 8
aimall-002 cross_region_fmr 400
aimall-003 fnmr_evolution 8
aimall-003 cross_region_fmr 400
aiunionface-000 fnmr_evolution 0
aiunionface-000 cross_region_fmr 400
alchera-000 fnmr_evolution 6
alchera-000 cross_region_fmr 400
alchera-001 fnmr_evolution 6
alchera-001 cross_region_fmr 400
alleyes-000 fnmr_evolution 0
alleye

itmo-006 cross_region_fmr 400
itmo-007 fnmr_evolution 13
itmo-007 cross_region_fmr 400
iws-000 fnmr_evolution 0
iws-000 cross_region_fmr 400
kakao-002 fnmr_evolution 9
kakao-002 cross_region_fmr 400
kakao-003 fnmr_evolution 9
kakao-003 cross_region_fmr 400
kedacom-000 fnmr_evolution 0
kedacom-000 cross_region_fmr 400
kneron-003 fnmr_evolution 6
kneron-003 cross_region_fmr 400
kneron-005 fnmr_evolution 6
kneron-005 cross_region_fmr 400
kookmin-001 fnmr_evolution 0
lookman-002 fnmr_evolution 6
lookman-002 cross_region_fmr 400
lookman-004 fnmr_evolution 6
lookman-004 cross_region_fmr 400
luxand-000 fnmr_evolution 0
luxand-000 cross_region_fmr 400
megvii-001 fnmr_evolution 7
megvii-001 cross_region_fmr 400
megvii-002 fnmr_evolution 7
megvii-002 cross_region_fmr 400
meiya-001 fnmr_evolution 0
meiya-001 cross_region_fmr 400
microfocus-001 fnmr_evolution 7
microfocus-001 cross_region_fmr 400
microfocus-002 fnmr_evolution 7
microfocus-002 cross_region_fmr 400
mobai-000 fnmr_evolution 0
mobai-0

In [12]:
# Write the collected FNMR-evolution rows to CSV, leaving out the row index.
res_df["fnmr_evolution"].to_csv(
    path_or_buf="../data/frvt_11_fnmr_evolution.csv",
    index=False,
)

In [13]:
# Write the collected cross-region FMR rows to CSV, leaving out the row index.
res_df["cross_region_fmr"].to_csv(
    path_or_buf="../data/frvt_11_cross_region_fmr.csv",
    index=False,
)