# PHT EB Catalog - PHT participants

- This notebook focuses on reporting the behavior of PHT participants.
- Thus it primarily works at the PHT subject level (rather than the TIC level).

In [1]:
from importlib import reload

import numpy as np
import pandas as pd
from IPython.display import display, HTML

import catalog
import catalog_stats
import dashboard_utils

# Stretch elements with the `.container` CSS class to 99% width so wide tables fit.
# NOTE(review): `.container` is presumably the notebook page wrapper of the classic
# Notebook UI -- confirm it still applies to the front end in use.
container_css = "<style>.container { width:99% !important; }</style>"
display(HTML(container_css))

In [2]:
def sector_group_func1(sector):
    """Return the label of the 13-sector (one-year) span that contains `sector`.

    Any sector up to 13 maps to "01-13", up to 26 to "14-26", up to 39 to "27-39";
    everything else maps to "others".
    """
    # (inclusive upper bound, label), in ascending order; the first match wins.
    spans = (
        (13, "01-13"),  # year 1
        (26, "14-26"),  # year 2
        (39, "27-39"),  # year 3
    )
    for upper_bound, label in spans:
        if sector <= upper_bound:
            return label
    return "others"

def sector_group_func2(sector):
    """Return the label of the roughly-10-sector span that contains `sector`.

    Any sector up to 9 maps to "01-09", up to 19 to "10-19", up to 29 to "20-29",
    up to 39 to "30-39"; everything else maps to "others".
    """
    if sector <= 9:
        # Sectors shown with the zoom-in UI1. The original note adds that sector 9 was
        # shown "with UI2 and UI2" -- presumably "UI1 and UI2"; TODO confirm.
        return "01-09"
    if sector <= 19:
        return "10-19"
    if sector <= 29:
        return "20-29"
    if sector <= 39:
        return "30-39"
    # should not happen
    return "others"

# Build the subject-level catalog. sector_group_func2 is handed over as the sector
# grouping function (presumably it populates the `sector_group` column -- TODO confirm).
df = catalog.create_pht_eb_subj_catalog(sector_group_func=sector_group_func2)

# Labels returned by sector_group_func2, in display order; "others" is left out.
sector_groups = [
    "01-09",
    "10-19",
    "20-29",
    "30-39",
]

print("Num. of subjects in the catalog:", len(df))

Num. of subjects in the catalog: 28092


## Tagging accuracy changes over time

- The proxy accuracy among subjects tagged by more than 3 users (`eb_score >=3`) does not change much over time.
- There might in fact be some *decrease* in sectors 30-39, down to <span style="background: rgba(255,255,0, 0.8);">91.8%</span>
- The proportion of subjects with `eb_score >=3` has increased, from <span style="background: rgba(255,168,0, 0.5);">19.5%</span> in sectors 1-9 up to <span style="background: rgba(255,168,0, 0.5);">39.2%</span> in sectors 30-39

In [3]:
# Upper bound passed to the eb_score grouping.
# NOTE(review): presumably scores >= max_eb_score share one open-ended top group;
# inferred from the `group_max` argument name -- confirm in catalog_stats.
max_eb_score = 3
catalog_stats.add_eb_score_group(df, group_min=0, group_max=max_eb_score, recalc_if_exists=True)

# The highlighted cells are the same for every sector group, so the selectors are built
# once, outside the loop. Each selector is (row label, column key).
# NOTE(review): the "0N+" row label presumably mirrors what add_eb_score_group emits for
# the top group, and the zero padding assumes a single-digit max_eb_score -- TODO confirm.
top_score_group = f"0{max_eb_score}+"
selector_proxy_accuracy = (top_score_group, ("count", "tic_id", "T/(T+F)"))
selector_totals_pct = (top_score_group, ("count", "tic_id", "Totals %"))

# One pivot report and one styled view per sector group, in `sector_groups` order.
sector_group_reports, sector_group_stylers = [], []
for sector_group in sector_groups:
    report, styler = catalog_stats.pivot_by_eb_score_group(
        df[df["sector_group"] == sector_group],
        calc_totals_pct_col=True,
        also_return_styler=True,
    )
    # abbreviate is_eb_catalog, eb_score_group to fit the screen space
    styler.columns.names = [None, None, "is_eb"]
    styler.index.name = "score"

    # Highlight the top score group's proxy accuracy (yellow) and its share of the
    # totals (orange). set_properties applies a constant CSS rule to every cell in
    # `subset`; it replaces Styler.applymap, which is deprecated since pandas 2.1.
    styler = styler.set_properties(subset=selector_proxy_accuracy, background="rgba(255, 255, 0, 0.8)")
    styler = styler.set_properties(subset=selector_totals_pct, background="rgba(255, 168, 0, 0.5)")
    styler = styler.set_caption(f"Sectors {sector_group}")

    sector_group_reports.append(report)
    sector_group_stylers.append(styler)

# Kept as the cell's last expression so the returned widget is rendered
# (presumably one column per styler -- see dashboard_utils.n_columns).
dashboard_utils.n_columns(sector_group_stylers)

GridBox(children=(Output(layout=Layout(border='1px dotted gray')), Output(layout=Layout(border='1px dotted gra…