In [None]:
from IPython.display import display, Markdown, HTML
import plotly.express as px
import itertools

from azureml.core import Run, Model
from azureml.core import Datastore, Experiment, ScriptRunConfig, Workspace

from model_drift import settings
from model_drift.helpers import column_xs, correlate_performance, mutual_info_performance, w_avg
import pandas as pd
import os
import datetime

In [None]:
ws = Workspace.from_config(settings.AZUREML_CONFIG)


def _completed_run_records(experiment, name_contains=None, verbose=False, **extra_fields):
    """Return one dict per completed run of `experiment`.

    Each record holds the run's tags plus 'id', 'display_name', 'url' and the
    run object itself under 'run'.

    name_contains: when given, only runs whose display name contains it are kept.
    verbose: print the display name of every run that is kept.
    extra_fields: added to (or overriding) every record after the run fields.
    """
    records = []
    for run in experiment.get_runs():
        if name_contains is not None and name_contains not in run.display_name:
            continue
        if run.status != "Completed":
            continue
        if verbose:
            print(run.display_name)
        record = dict(**run.tags)
        record['id'] = run.id
        record['display_name'] = run.display_name
        record['url'] = run.get_portal_url()
        record['run'] = run
        record.update(extra_fields)
        records.append(record)
    return records


# Label-modification runs.
experiment_name = 'generate-drift-csv-label-mod-dbg'
exp = Experiment(workspace=ws, name=experiment_name)
label_mod_records = _completed_run_records(exp)

# Baseline run(s): picked by display name from another experiment and tagged
# with label_modifiers="baseline".
experiment_name = 'generate-drift-csv-3'
exp = Experiment(workspace=ws, name=experiment_name)
baseline_records = _completed_run_records(
    exp, name_contains="tender_pear_lfbd6wwg", verbose=True, label_modifiers="baseline")

df = pd.DataFrame(label_mod_records).set_index(['display_name'])
# df = df.sort_values(["window", "nonfrontal_add_date", "frontal_remove_date", "peds_weight"])
# Drop label-modification runs that carry no 'mod_end_date' tag.
df = df[~df['mod_end_date'].isnull()]
# Baseline rows go first.
df = pd.concat([pd.DataFrame(baseline_records).set_index(['display_name']), df])

In [None]:
def is_arg_col(c):
    """Return True when column `c` is a run argument.

    Columns containing "mlflow", "_aml", "run_" or "url" are rejected, as are
    the bookkeeping columns output_dir, input_dir, run, display_name and id.
    """
    excluded_fragments = ("mlflow", "_aml", 'run_', 'url')
    if any(fragment in c for fragment in excluded_fragments):
        return False
    return c not in ("output_dir", "input_dir", 'run', 'display_name', 'id')
arg_cols = list(filter(is_arg_col, df.columns))

# Argument table for the frontal-only fine-tuned classifier runs.
arg_df = (
    df[arg_cols]
    .query("classifier_dataset == 'padchest-finetuned-chx-frontalonly'")
    .copy()
)
# When several runs share identical arguments, keep only the last one.
is_repeat = arg_df.duplicated(keep='last')
arg_df = arg_df.loc[~is_repeat]
arg_df

In [None]:
# JavaScript snippet printed at the end of the generated index page: it rewrites
# every <a> href on the page to carry the current query string
# (window.location.search).
# NOTE: this is a plain string that is printed as-is (never .format()-ed), so
# the doubled braces end up verbatim in the page.
fix_links_script = """
              <script>
                var x = document.getElementsByTagName('a');
                var i;
                for (i = 0; i < x.length; i++) {{
                    let url = x[i].getAttribute("href");
                    x[i].href = url + window.location.search;
                }}
                </script>
              """

In [None]:
# Root directory for the generated HTML reports (one sub-directory per run).
html_top_dir = settings.TOP_DIR.joinpath("html", "graphs_label-simple")
# parents=True: the intermediate "html" directory may not exist yet; without it
# mkdir raises FileNotFoundError.
html_top_dir.mkdir(parents=True, exist_ok=True)

In [None]:
# Keep only the argument columns whose value differs between runs (missing
# values count as the single value 'NA').
varying_cols = [c for c in arg_df if arg_df[c].fillna('NA').nunique() > 1]
# .copy(): the Link column below must be written to an independent frame, not
# to a column-subset of arg_df (which triggers SettingWithCopyWarning).
arg_df2 = arg_df[varying_cols].copy()
# A run gets a link only once its report directory exists.
arg_df2['Link'] = [f"""<a href="{name}/index.html" disabled=>Graphs</a>""" if html_top_dir.joinpath(name).exists() else "N/A" for name in arg_df2.index]

# Overwrite the top-level index: style sheet, timestamp, run table, link script.
with open(html_top_dir.joinpath("index.html"), 'w') as f:
        print("""
            <style>
            table {
            font-family: arial, sans-serif;
            border-collapse: collapse;
            width: 80%;
            }

            td, th {
            border: 1px solid #dddddd;
            text-align: left;
            padding: 8px;
            }

            tr:nth-child(even) {
            background-color: #dddddd;
            }
            </style>
            <br />
        """, file=f)
        print(f"pre Generated: {datetime.datetime.now()}", file=f)
        # escape=False so the <a> tags in the Link column are emitted as HTML.
        print(arg_df2.to_html(escape=False), file=f)
        print(fix_links_script, file=f)

In [None]:

# Restrict the run table to the runs kept in arg_df.
df = df.loc[arg_df.index]

# Show the argument rows of runs that have no report directory yet.
needs_html = [not html_top_dir.joinpath(run_name).exists() for run_name in df.index]
arg_df2.loc[needs_html]

In [None]:
# Select the first run (in `df` order) that has no report directory yet.
# NOTE(review): `.iloc[0]` raises IndexError once every run already has one.
run_row = df.loc[[not html_top_dir.joinpath(name).exists() for name in df.index]].iloc[0]
# run_row = df.loc["affable_parang_mhjpv4b2"]

r = run_row['run']
# The row label is the run's display name; it names the CSV and report directory.
name = run_row.name


print(name)
# Display settings
span = 7  # EWM span read by smooth(); a value <= 0 disables smoothing
which = 'mean'  # last-level column picked out of the drift CSV
clip = 10  # standardized metrics are clipped to [-clip, clip]; None disables

performance_col = ("performance", "macro avg", "auroc")

congruency_measure_col = ('in_distro', 'stats', 'mean')
add_error_bars = True

# Date window whose mean/std are used to standardize the drift metrics.
standardize_dates = (settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1])
standardize_ix = pd.date_range(*standardize_dates)
# Names passed to column_xs(include=...) to select the drift statistic columns.
stat = []
# stat.append('pval')
stat.append('distance')

write_html = True  # write/append each section to the report file
show_corr = False  # display (and write) the correlation tables
# x-axis range applied to the figures.
graph_start = "2014-01-01"
graph_end = "2014-12-31"
drop_modality = False  # when True, "Modality_DICOM" is added to the excluded columns

font=dict(size=14)

In [None]:
import itertools
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px
from matplotlib import colors as mpl_colors
from collections import defaultdict

def to_rgba(rgb, alpha=None):
    """Format a matplotlib colour spec as a CSS-style colour string.

    Returns "rgb(r, g, b)" when `alpha` is None, otherwise "rgba(r, g, b, alpha)";
    the channel values are those returned by matplotlib's `to_rgb`.
    """
    red, green, blue = mpl_colors.to_rgb(rgb)
    if alpha is not None:
        return "rgba(%s, %s, %s, %s)" % (red, green, blue, alpha)
    return "rgb(%s, %s, %s)" % (red, green, blue)

def line_maker(color, **l):
    """Build a line-style dict: `color` plus any extra line properties."""
    style = {"color": color}
    style.update(l)
    return style

def marker_maker(color, **l):
    """Build a marker-style dict holding only `color`; extra properties are ignored."""
    return {"color": color}

def smooth(y: pd.DataFrame):
    """Exponentially-weighted mean of `y` using the notebook-level `span`.

    Positions that are missing in `y` stay missing in the result. When `span`
    is not positive, `y` is returned unchanged.
    """
    if not span > 0:
        return y
    smoothed = y.ewm(span=span, ignore_na=False).mean()
    # The EWM fills gaps; blank them again so gaps in the input stay gaps.
    smoothed[y.isna()] = None
    return smoothed

def add_date_line(fig, date, name, y=1.08):
    """Mark `date` on `fig` with a dotted black vertical line and a text label.

    The line spans the full paper height (0 to 1); the label `name` is placed
    at x=`date`, paper height `y`, without an arrow.
    """
    marker_line = dict(
        type='line',
        x0=date, x1=date,
        y0=0, y1=1,
        xref='x', yref='paper',
        line=dict(color='black', dash='dot'),
    )
    fig.add_shape(**marker_line)

    label = dict(
        text=name,
        x=date, y=y,
        xref="x", yref="paper",
        textangle=0,
        showarrow=False,
    )
    fig.add_annotation(**label)

def add_dates(fig, dates, line_y=1.05):
    """Draw one labelled date line per distinct date in `dates`.

    dates: mapping of label -> date. Labels sharing a date are merged into one
    annotation (joined with "<br>"); entries whose date is NA are skipped.
    line_y: paper-space height of the annotation text.
    """
    labels_by_date = defaultdict(list)
    for label, when in dates.items():
        labels_by_date[when].append(label)

    for when, labels in labels_by_date.items():
        caption = "<br>".join(labels)
        if pd.isna(when):
            continue
        add_date_line(fig, when, f"{caption}<br />({when})", y=line_y)
            
def collect_corr(y, yp, name, when, weights_name, start_date=None, end_date=None):
    """Correlate `yp` with `y` over the label slice [start_date, end_date].

    Returns a record with the raw correlation, the correlation of the two
    smoothed series, and the `name` / `weights_name` / `when` labels passed in.
    """
    window = slice(start_date, end_date)
    yp_win = yp.loc[window]
    y_win = y.loc[window]
    return {
        "name": name,
        "weights_name": weights_name,
        "corr (raw)": yp_win.corr(y_win),
        "corr (smoothed)": smooth(yp_win).corr(smooth(y_win)),
        "when": when,
    }

class FigureHelper(object):
    """Accumulates plotly traces (and optional error bands) per subplot cell and
    assembles them into a single subplot figure with `make_fig`.

    Line styles are handed out per colour key: a key seen for the first time
    takes the next (dash, colour) pair from the cycle and keeps it afterwards.
    """

    def __init__(self, x=None, color_list=px.colors.qualitative.Plotly, dashes=('solid',), smooth_func=smooth, merge_hover=True):
        """
        x: default x values; every trace's `y` is reindexed onto them.
        color_list, dashes: cycled to style new colour keys (all colours with
            the first dash, then all colours with the next dash, ...).
        smooth_func: stored as `self.smooth`; not called by this class.
        merge_hover: when True, each trace's hover text lists the values of
            every trace at the hovered x position.
        """
        self.traces = []
        self.error_traces = []
        self.color_list = color_list
        self.line_picker = itertools.cycle(itertools.product(dashes, self.color_list))
        self.lines = defaultdict(lambda: dict(zip(['dash', 'color'], next(self.line_picker))))
        self.names = set()
        self.smooth = smooth_func
        self.x = x
        self.merge_hover = merge_hover

    def set_line(self, key, line=None):
        """Return the line style for `key`, merging in the overrides in `line`.

        The returned dict is the stored one, so overrides persist for `key`.
        """
        style = self.lines[key]  # first access assigns the next dash/colour
        if line:
            style.update(line)
        return style

    def make_error_traces(self, x, yu, yl, name, color, alpha):
        """Build the two traces that shade the band between `yu` and `yl`.

        Returns [upper, lower]: both are zero-width lines hidden from hover and
        legend; the lower one fills up to the upper one ('tonexty') with
        `color` at opacity `alpha`.
        """
        # need to remove nans from error traces
        k = ~(yu.isnull()|yl.isnull())
        xe = x[k]
        yl = yl[k]
        yu = yu[k]

        return [go.Scatter(x=xe,
                            y=yu,
                            hoverinfo="skip",
                            showlegend=False,
                            legendgroup=name,
                            name=name,
                            connectgaps=False,
                            line=dict(width=0),
                ),
                go.Scatter(x=xe,
                            y=yl,
                            fillcolor=to_rgba(color, alpha),
                            fill='tonexty',
                            hoverinfo="skip",
                            showlegend=False,
                            legendgroup=name,
                            name=name,
                            connectgaps=False,
                            line=dict(width=0),
                )]

    def add_trace(self, y, name, x=None, kind=go.Scatter, color_key=None, row=1, col=1, line=None,
                  std=None, yu=None, yl=None, **trace_kwargs):
        """Queue a trace of `y` (reindexed onto `x`) for subplot (`row`, `col`).

        name: trace and legend-group name; only the first trace with a given
            name shows a legend entry unless `showlegend` is passed.
        x: x values; defaults to the helper's `x`.
        kind: plotly trace class to instantiate.
        color_key: key used to look up the line style; defaults to `name`.
        line: style overrides merged into the key's line style.
        std: when given, an error band of y +/- std is queued.
        yu, yl: explicit upper/lower band (used when `std` is None).
        trace_kwargs: forwarded to `kind`.
        """
        color_key = color_key or name
        trace_kwargs.setdefault('showlegend', name not in self.names)
        self.names.add(name)
        trace_kwargs.setdefault('legendgroup', name)

        line = self.set_line(color_key, line)
        # `x or self.x` would ask for the truth value of a pandas index/array,
        # which raises ValueError; test against None explicitly.
        x = x if x is not None else self.x
        y = y.reindex(x)
        t = kind(x=x, y=y, name=name, **trace_kwargs)
        if not isinstance(t, go.Bar):
            t.line = line_maker(**line)
        else:
            # bars have no line; only the colour is carried over
            t.marker = marker_maker(**line)

        self.traces.append((row, col, t))

        if std is not None:
            yu = y+std
            yl = y-std

        if yu is not None and yl is not None:
            for t_ in self.make_error_traces(x, yu, yl, name=name, color=line["color"], alpha=0.2):
                self.error_traces.append((row, col, t_))

    def add_bar(self, y, name, x=None, color_key=None, row=1, col=1, line=None, include_line=True,
                **trace_kwargs):
        """Queue `y` as a bar trace, plus a default-kind trace when `include_line`.

        `x` is forwarded to `add_trace` (it was previously accepted but ignored).
        """
        if include_line:
            self.add_trace(y=y, name=name, x=x, color_key=color_key, line=line, row=row, col=col, **trace_kwargs)
        self.add_trace(y=y, name=name, x=x, color_key=color_key, kind=go.Bar, line=line, row=row, col=col, **trace_kwargs)

    def make_fig(self, **fig_kwargs):
        """Create the subplot figure and add every queued trace to it.

        The grid is sized from the largest row/col seen. `fig_kwargs` are
        forwarded to `make_subplots`. Error-band traces are added after the
        regular traces.
        """
        data = {}
        max_row = 1
        max_col = 1
        for r, c, t in self.traces:
            max_row = max(r, max_row)
            max_col = max(c, max_col)
            # traces sharing a name overwrite each other here (last one wins)
            data[t.name] = pd.Series(t.y, index=t.x)

        customdata = pd.DataFrame(data)
        fig = make_subplots(rows=max_row, cols=max_col, **fig_kwargs)

        if self.merge_hover:
            # Identical for every trace, so built once: one "name=value" line
            # per customdata column.
            cus_cols = sorted(customdata)
            ho = "<br />".join(["{name}=%{{customdata[{i}]:.3f}}".format(i=i, name=name) for i,name in enumerate(cus_cols)])

        for r, c, t in self.traces:
            if self.merge_hover:
                t.customdata = customdata[cus_cols]
                t.hovertemplate = "%{x}<br>" + f"{t.name}: " +"%{y}<br><br>"+f"{ho}<extra></extra>"
            # t.hoverlabel = {'bgcolor': 'white'}
            fig.add_trace(t, row=r, col=c)

        for r, c, t in self.error_traces:
            fig.add_trace(t, row=r, col=c)
        return fig
        
        
        
        

In [None]:
def is_arg_col(c):
    """Return True when column `c` is a run argument.

    Columns containing "mlflow", "_aml", "run_" or "url" are rejected, as are
    the bookkeeping columns output_dir, input_dir, run, display_name and id.
    """
    excluded_fragments = ("mlflow", "_aml", 'run_', 'url')
    if any(fragment in c for fragment in excluded_fragments):
        return False
    return c not in ("output_dir", "input_dir", 'run', 'display_name', 'id')
# Argument values of the selected run.
arg_cols = [c for c in df.columns if is_arg_col(c)]
arg_row = run_row[arg_cols].copy()

arg_row
# Download the run's drift CSV to <TOP_DIR>/results/drift/<run name>.csv.
output_file_path = settings.TOP_DIR.joinpath('results', 'drift', name+".csv")
fname = str(output_file_path)
r.download_file("outputs/output.csv", output_file_path=output_file_path)
# Snapshot the display settings for the report; names that are not currently
# defined are silently skipped.

display_args = ["span", "which", "clip", "standardize_perf", "shift_drift_to_perf", "performance_col", "this_center", "this_range", "standardize_dates", "stat", "add_error_bars", "drop_modality"]
# At notebook top level locals() is the global namespace.
d = locals()
display_args = {k: d[k] for k in display_args if k in d}

print(display_args)


if not os.path.exists(fname):
    raise ValueError("no fn")

# The CSV has a four-row column header (4-level column index) and dates in the
# first column.
combined_df_o = pd.read_csv(str(fname), index_col=0, header=[0, 1, 2, 3])
combined_df_o.index = pd.to_datetime(combined_df_o.index)

# Replace the columns selected for "pval" by 1 - value.
# presumably column_xs returns the column keys matching `include` — confirm in model_drift.helpers
flip = column_xs(combined_df_o, include=["pval"])
combined_df_o[flip] = 1-combined_df_o[flip]
combined_df = combined_df_o.copy()


smooth_name = f"ewm{span}"

# Keep the columns whose last level is "std" (error_df) / `which` (combined_df),
# then drop that level so both frames have three-level columns.
error_df = combined_df.swaplevel(0, -1, axis=1)[["std"]].swaplevel(0, -1, axis=1).droplevel(-1, axis=1).copy()
combined_df = combined_df.swaplevel(0, -1, axis=1)[[which]].swaplevel(0, -1, axis=1).droplevel(-1, axis=1).copy()

# Per-run report directory and report file name (encodes the display settings).
html_dir = html_top_dir.joinpath(name)
perf_col_name = '-'.join(performance_col)

if not os.path.exists(html_dir):
    os.makedirs(html_dir)

stat_str = '+'.join(stat)
fn = f"{html_dir}/{which}_{stat_str}_stdclip{clip}_smooth-{smooth_name}-dropModality{drop_modality}_{perf_col_name}.html"

print("output:", fn)
def is_arg_col(c):
    """Return True when column `c` is a run argument.

    Columns containing "mlflow", "_aml", "run_" or "url" are rejected, as are
    the bookkeeping columns output_dir, input_dir, run, display_name and id.
    """
    excluded_fragments = ("mlflow", "_aml", 'run_', 'url')
    if any(fragment in c for fragment in excluded_fragments):
        return False
    return c not in ("output_dir", "input_dir", 'run', 'display_name', 'id')

# Parameter table shown at the top of the report: the run's arguments under
# "Drift", the notebook display settings under "Display".
arg_row = run_row[arg_cols].copy()
display_row = pd.Series(display_args)
params = pd.concat({'Drift': arg_row, "Display": display_row}, axis=0).rename("Value").to_frame()

if write_html:
    # Mode 'w' starts the report file afresh; the plotting cells append to `fn`.
    with open(fn, 'w') as f:
        print("""
            <style>
            table {
            font-family: arial, sans-serif;
            border-collapse: collapse;
            width: 80%;
            }

            td, th {
            border: 1px solid #dddddd;
            text-align: left;
            padding: 8px;
            }

            tr:nth-child(even) {
            background-color: #dddddd;
            }
            </style>
        """, file=f)
        print(f"""
            <h1>Drift report</h1> created: {datetime.datetime.now()}
            <br /><br />
            <h2>Arguments </h2>
            {params.to_html()}
            """, file=f)

def shift_to_other(this, other, this_range=None, this_center=None):
    """Rescale `this` so its centre and spread match the mean and std of `other`.

    this_range: spread of `this` to divide by; defaults to `this.std()`.
    this_center: centre of `this` to subtract; defaults to `this.mean()`.
    """
    target_center = other.mean()
    target_spread = other.std()
    source_spread = this.std() if this_range is None else this_range
    source_center = this.mean() if this_center is None else this_center
    return (this - source_center) / source_spread * target_spread + target_center

# Split the frame into the performance series and the drift-metric columns.
perf_col = performance_col
perf_df = combined_df[perf_col]
exclude = ['performance', 'count']
if drop_modality:
    exclude.append("Modality_DICOM")
other_cols = column_xs(combined_df, exclude=exclude)

other_df = combined_df[other_cols]
# Empty placeholder frames; extra_valids contributes no rows to `stats` below.
extra_valids = pd.DataFrame(columns=other_df.columns)
extra_perf = pd.DataFrame(columns=perf_df.name)

extra_perf
# Columns of the selected statistic(s); columns containing any NaN are dropped
# before the reference mean/std are computed.
cxs = column_xs(other_df, include=stat)
stats = pd.concat([other_df[cxs].dropna(axis=1), extra_valids[cxs].dropna(axis=1)], axis=0).sort_index()


# Restrict to the standardization window.
# NOTE(review): .loc with a full date_range looks like it needs every day of
# the window to be present in the index — confirm.
stats = stats.loc[standardize_ix]

print(len(stats))
stats = stats.agg(["mean", "std"])


stats.T

otherstd = other_df[cxs].copy()

# cannot divide by zero
# NOTE(review): std0 is not used afterwards; the mask is recomputed on the next line.
std0 = stats.loc['std'] == 0
stats.loc["std", stats.loc['std'] == 0] = 1
# z-score every metric against the window's mean/std.
otherstd = (otherstd-stats.loc['mean'])/(stats.loc["std"])
# NOTE(review): the std columns are shifted by the mean as well as scaled —
# confirm that this is intended for an error term.
errorstd = (error_df[cxs]-stats.loc['mean'])/(stats.loc["std"])
# Columns with at least one NaN after standardization (printed for inspection only).
bad_cols = otherstd.columns[otherstd.isnull().max(axis=0)].tolist()


print(bad_cols)

# Group the columns by the prefix found in their first level.
vae_cols = [c for c in list(otherstd) if "mu." in c[0]]
score_cols = [c for c in list(otherstd) if "activation." in c[0]]
metadata_cols = sorted(set(otherstd).difference(vae_cols).difference(score_cols))

if clip is not None:
  otherstd = otherstd.clip(-1*clip, clip)

In [None]:
    
# Daily index spanning the whole CSV; series are reindexed onto it so missing
# days show up as gaps.
x = pd.date_range(combined_df.index.min(), combined_df.index.max())
yp = perf_df.reindex(x)
perf_error_df = error_df[perf_col].reindex(x)

# Per-metric relation to performance, used below as weighting schemes.
# presumably one value per column of otherstd — confirm in model_drift.helpers
all_corr_df = correlate_performance(yp.rename('auroc'), otherstd)
all_ig_df = mutual_info_performance(yp.rename('auroc'), otherstd, bins=25)

# One column per weighting scheme: the mutual-info values, |corr|, their
# row-wise mean, and a constant 1 ("no_weights"); missing weights become 0.
m_ = all_ig_df.to_frame().join(all_corr_df.abs().rename('abs(corr)'))
m_ = m_.join(m_.mean(axis=1).rename('mean[abs(corr),info_gain]'))
m_ = m_.assign(no_weights=1)
m_ = m_.fillna(0)
m_.sort_values(by='mean[abs(corr),info_gain]', ascending=False).head(20)

# 'obs' counts come from the unfiltered frame (levels 0 and 1 of the 'count'
# columns dropped); count_df holds the `which` counts on the daily index.
true_counts = combined_df_o['count'].droplevel([0, 1], axis=1)['obs']
count_df = combined_df['count'].reindex(x)
# Dates drawn as vertical marker lines on every figure.
dates = {
    # "Lat. Added": run_row['nonfrontal_add_date'],
        #  "Frontal Removed": run_row['frontal_remove_date'],
        #  "Peds Added": run_row['peds_start_date'],
        #  "Peds Stop": run_row['peds_end_date'],
         "Val Start": settings.PADCHEST_SPLIT_DATES[0],
         "Test Start": settings.PADCHEST_SPLIT_DATES[1],
         }

import json

# The 'label_modifiers' tag is read as a JSON object mapping
# label -> (pct, start_date, end_date). Baseline rows carry the plain string
# "baseline", which is not JSON, and the tag may be missing altogether.
try:
    mods = json.loads(run_row.get('label_modifiers'))
except (TypeError, ValueError):
    # TypeError: the tag is missing or not a string.
    # ValueError: not valid JSON (json.JSONDecodeError is a ValueError).
    # Anything else (e.g. KeyboardInterrupt) is no longer swallowed.
    mods = {}
if not isinstance(mods, dict):
    # Valid JSON that is not an object (null, a number, ...) has no modifiers.
    mods = {}

# Add a marker line at the start date of every label modification.
for label, (pct, start_date, end_date) in mods.items():
    if start_date:
        dates[f"{label}={pct:.0%} Start"] = start_date

        # if end_date:
        #     dates[f"{label}={pct:.0%} End"] = end_date 


In [None]:
m = m_.copy()

yp = yp.reindex(x)
otherstd = otherstd.reindex(x)
counts = combined_df['count'].iloc[:, 0].reindex(x)
counts2 = true_counts.reindex(x)

# Date windows over which each correlation is reported: (label, start, end);
# None leaves that side of the slice open.
val_start, test_start = settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1]
corr_windows = [
    ("Everything", None, None),
    ("Validation", val_start, test_start),
    ("Test", test_start, None),
    ("First Year of Test", test_start, "2014-12-31"),
]

fh = FigureHelper(x)
# Row 4: sample counts, each drawn as a line and as an overlaid bar.
fh.add_trace(y=combined_df['count'].iloc[:, 0], name="Num Samples (used)", line={"color": "green"}, row=4, col=1)
fh.add_trace(y=combined_df['count'].iloc[:, 0], name="Num Samples (used)", kind=go.Bar, row=4, col=1)
fh.add_trace(y=true_counts, name="Num Samples (obs)", line={"color": "orange"}, row=4, col=1)
fh.add_trace(y=true_counts, name="Num Samples (obs)", kind=go.Bar, row=4, col=1)
# Row 1: smoothed performance with its error band.
fh.add_trace(y=smooth(yp), name="AUROC", connectgaps=False, line={"color": "blue"},
             yu=smooth(yp+perf_error_df),
             yl=smooth(yp-perf_error_df),
             row=1, col=1)

# fh.add_trace(y=smooth(combined_df[congruency_measure_col]), row=2, name='Data Congruency (True)')

# Correlation of the sample counts with performance.
corrs = []
for count_series, count_name in [(counts, "num sampled (used)"), (counts2, "num sampled (obs)")]:
    for when, win_start, win_end in corr_windows:
        corrs.append(collect_corr(count_series, yp, count_name, when, "None",
                                  start_date=win_start, end_date=win_end))

errorstd = errorstd[otherstd.columns]

# Row 2: one unified drift metric per weighting scheme (column of m_).
# The loop variable must not be called `name`: that would overwrite the run
# name that the CSV-loading cell reads when it is re-run.
# `y` stays bound after the loop; the later plotting cells read it.
for weights_name in m_:
    weights = m_[weights_name].sort_values(ascending=False)
    # weights = weights.iloc[:5]
    # negated weighted average of the standardized metrics
    y = -w_avg(otherstd.reindex(x), weights=weights.to_dict())
    fh.add_trace(y=smooth(y),
                 # customdata=smooth(yo),
                 showlegend=True, legendgroup=weights_name,
                 name=weights_name, line={"width": 1},
                 connectgaps=False, row=2, col=1)

    for when, win_start, win_end in corr_windows:
        corrs.append(collect_corr(y, yp, "All Unified", when, weights_name,
                                  start_date=win_start, end_date=win_end))


corr_df = pd.DataFrame(corrs).sort_values('when')
if show_corr:
    display(corr_df)


fig = fh.make_fig(shared_xaxes=True, vertical_spacing=0.01, row_heights=[.2, .2, .2, .1])
add_dates(fig, dates, line_y=1.05)
fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across", spikethickness=1)
fig.update_layout(spikedistance=1000)
fig.update_layout(height=800)
fig.update_xaxes(range=[graph_start, graph_end])
fig.update_layout(barmode='overlay')
fig.update_layout(font=font)
fig.show()

if write_html:
    with open(fn, 'a') as f:
        fig_html = fig.to_html()
        print("<h2>Full Unified</h2>", file=f)
        if show_corr:
            for w, grp in corr_df.groupby('when'):
                print(f"<strong>{w}</strong>{grp.to_html()}", file=f)
        print(fig_html, file=f)

In [None]:
# Per-label support counts (the aggregate "... avg" rows are left out),
# relabelled by the middle level of the column key.
count_cols = [col for col in column_xs(combined_df, ['support']) if 'avg' not in col[1]]
label_counts = combined_df[count_cols]
label_counts.columns = [col[1] for col in label_counts.columns]

# Label rate = label count divided by the number of samples on that date.
num_samples = combined_df['count'].iloc[:, 0]
label_rates = label_counts.div(num_samples, axis=0)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

auroc_cols = column_xs(combined_df, ['auroc'])
precision_cols = column_xs(combined_df, ['precision'])
recall_cols = column_xs(combined_df, ['recall'])
f1_cols = column_xs(combined_df, ['f1-score'])
support_cols = column_xs(combined_df, ['support'])

# One subplot row per metric.
metric_groups = [auroc_cols, recall_cols, precision_cols, f1_cols, support_cols]
fig = make_subplots(rows=len(metric_groups), cols=1, shared_xaxes=True, vertical_spacing=0.01)

# Class names first, aggregate ("... avg") names last. Sorting alphabetically
# inside each group keeps the colour assignment stable between kernel runs
# (iteration order of a set of strings is not).
class_names = sorted({key[1] for key in itertools.chain(*metric_groups)},
                     key=lambda class_name: ('avg' in class_name, class_name))
colors = px.colors.qualitative.Plotly
# 'dotted' is not a plotly dash name (the dotted style is called 'dot'); it
# would raise as soon as enough classes are plotted to reach the third dash.
dashes = ['solid', 'dash', 'dot']

# cycle() so that more classes than dash/colour combinations reuse styles
# instead of being left without one (KeyError further down).
line_styles = {
    class_name: {'color': color, 'dash': dash}
    for class_name, (dash, color) in zip(class_names, itertools.cycle(itertools.product(dashes, colors)))
}

# Loop variables deliberately avoid `r` (the AzureML run handle) and `name`
# (the run name), which other cells read.
legend_seen = set()
for subplot_row, group in enumerate(metric_groups, 1):
    metric_label = None
    for col_key in group:
        class_name = col_key[1]
        ypp = combined_df[col_key].reindex(x)
        fig.add_trace(go.Scatter(x=x, y=smooth(ypp), showlegend=class_name not in legend_seen, legendgroup=class_name,
                name=class_name, hovertemplate="%{y: .5f}", connectgaps=False, line=line_styles[class_name]), row=subplot_row, col=1)
        legend_seen.add(class_name)
        metric_label = col_key[-1]
    # Title the axis with the metric name (last level of the column key); a
    # group without columns gets no title rather than a stale one.
    if metric_label is not None:
        fig.update_yaxes(title_text=metric_label, row=subplot_row, col=1)
add_dates(fig, dates, 1.025)

fig.update_layout(title="Peformance")
fig.update_layout(hovermode="x unified")
fig.update_layout(height=300*len(metric_groups))
fig.update_xaxes(range=[graph_start, graph_end])
fig.update_layout(font=font)
fig.show()

if write_html:
    fig_html = fig.to_html()
    with open(fn, 'a') as f:
        print("<h2>Performance</h2>", file=f)
        print(fig_html, file=f)

In [None]:
other_w = "no_weights"

# Date windows over which each correlation is reported: (label, start, end).
val_start, test_start = settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1]
corr_windows = [
    ("Everything", None, None),
    ("Validation", val_start, test_start),
    ("Test", test_start, None),
    ("First Year of Test", test_start, "2014-12-31"),
]

fh = FigureHelper(x)
fh.add_trace(y=smooth(yp), name="AUROC", connectgaps=False, line={"color": "blue"}, yu=smooth(yp+perf_error_df), yl=smooth(yp-perf_error_df), row=1, col=1)
# fh.add_trace(y=smooth(combined_df[congruency_measure_col]), row=2, name='Data Congruency (True)')
corrs = []
# peds_weight is forced to 0 here, so the first branch below is always taken;
# the later plotting cells also read this value.
run_row['peds_weight'] = 0
if not float(run_row['peds_weight']):
    xcols = zip(["metadata", "vae", "score", "vae+score", "metadate+vae+score"],  [metadata_cols, vae_cols, score_cols, vae_cols+score_cols, vae_cols+score_cols+metadata_cols])
else:
    xcols = zip(["vae", "score", "vae+score"],  [vae_cols, score_cols, vae_cols+score_cols])

# One unified metric per feature family, all drawn in row 2. Loop variables
# avoid `name` (the run name) so re-running earlier cells keeps working.
weights = m[other_w].sort_values(ascending=False)
for group_name, group_cols in xcols:
    otherstd_ = otherstd[group_cols]
    yo = -w_avg(otherstd_.loc[x], weights=weights.to_dict())

    fh.add_trace(y=smooth(yo), name=group_name, line={"width": 1},  connectgaps=False, row=2, col=1)

    # NOTE(review): `y` is not defined in this cell; it is whatever the
    # "Full Unified" cell left behind (the unified metric of its last
    # weighting scheme). The table therefore correlates each family with that
    # metric, not with performance (`yp`) — confirm which is intended.
    for when, win_start, win_end in corr_windows:
        corrs.append(collect_corr(y, yo, group_name, when, other_w,
                                  start_date=win_start, end_date=win_end))


fig = fh.make_fig(shared_xaxes=True, vertical_spacing=0.01)
add_dates(fig, dates, 1.08)


fig.update_layout(title="Level 1 Metrics")
fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across", spikethickness=1)
fig.update_layout(spikedistance=1000)
fig.update_layout(height=600)
fig.update_xaxes(range=[graph_start, graph_end])
fig.update_layout(barmode='overlay')
corr_df = pd.DataFrame(corrs).sort_values('when')
display(corr_df)
fig.show()

fig_html = fig.to_html()

if write_html:
    with open(fn, 'a') as f:
        print("<h2>Level 1 Unified</h2>", file=f)
        for w, grp in corr_df.groupby('when'):
            print(f"<strong>{w}</strong>{grp.to_html()}", file=f)
        print(fig_html, file=f)


In [None]:
# The plotting cells below only display/write their correlation tables when
# this flag is True.
show_corr = False

In [None]:
fh = FigureHelper(x, dashes=['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot'])
# fh.add_trace(y=smooth(yp), name="AUROC", connectgaps=False, line={"color": "blue"}, yu=smooth(yp+perf_error_df), yl=smooth(yp-perf_error_df), row=1, col=1)
# fh.add_trace(y=smooth(combined_df[congruency_measure_col]), row=2, name='Data Congruency (True)')
corrs = []

# Date windows over which each correlation is reported: (label, start, end).
val_start, test_start = settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1]
corr_windows = [
    ("Everything", None, None),
    ("Validation", val_start, test_start),
    ("Test", test_start, None),
    ("First Year of Test", test_start, "2014-12-31"),
]

# One subplot row per statistic that is actually present among the score
# columns; the statistic name is kept so the axis can be titled correctly.
score_std = otherstd[score_cols]
stat_groups = [(stat_name, column_xs(score_std, include=stat_name)) for stat_name in ('distance', 'pval')]
stat_groups = [(stat_name, group) for stat_name, group in stat_groups if len(group)]

# Row 1: label rates.
for label in label_rates.columns:
    fh.add_trace(y=smooth(label_rates[label]), name=str(label), connectgaps=False, row=1, col=1)

print(len(stat_groups))

# Rows 2..: negated standardized score metrics, one row per statistic.
for subplot_row, (_, group) in enumerate(stat_groups, 2):
    for col_key in group:
        yo = -otherstd[col_key]
        fh.add_trace(y=smooth(yo), name=str(col_key), connectgaps=False, row=subplot_row, col=1)

        # NOTE(review): `y` is left over from the "Full Unified" cell, so these
        # correlations are against that unified metric, not against
        # performance (`yp`) — confirm which is intended.
        for when, win_start, win_end in corr_windows:
            corrs.append(collect_corr(y, yo, str(col_key), when, 'N/A',
                                      start_date=win_start, end_date=win_end))

if not float(run_row['peds_weight']):
    fig = fh.make_fig(shared_xaxes=True, vertical_spacing=0.05)
    add_dates(fig, dates, 1.05)

    fig.update_layout(title="Score")
    fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across", spikethickness=1)
    fig.update_layout(spikedistance=1000)
    fig.update_layout(height=200*(len(stat_groups)+1))
    # fig.update_layout(yaxis1=dict(range=[.6, 1]))
    fig.update_layout(yaxis1=dict(range=[0, 1], title="label rate"))
    # Axis k belongs to subplot row k. The old `len(cols_) > 2` test could
    # never be true with at most two statistics, so the third axis was never
    # configured; title each axis after the statistic it actually shows.
    for axis_ix, (stat_name, _) in enumerate(stat_groups, 2):
        fig.update_layout(**{f"yaxis{axis_ix}": dict(range=[-10, 1], title=stat_name)})
    fig.update_xaxes(range=[graph_start, graph_end])

    fig.update_layout(font=font)
    fig.update_layout(barmode='overlay')

    if show_corr:
        corr_df = pd.DataFrame(corrs)
        display(corr_df)

    fig.show()


    fig_html = fig.to_html()
    if write_html:
        with open(fn, 'a') as f:
            print("<h2>Score</h2>", file=f)
            if show_corr:
                for w, grp in corr_df.groupby('when'):
                    print(f"<strong>{w}</strong>{grp.to_html()}", file=f)
            print(fig_html, file=f)

In [None]:
fh = FigureHelper(x, dashes=['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot'])
fh.add_trace(y=smooth(yp), name="AUROC", connectgaps=False, line={"color": "blue"}, yu=smooth(yp+perf_error_df), yl=smooth(yp-perf_error_df), row=1, col=1)
# fh.add_trace(y=smooth(combined_df[congruency_measure_col]), row=2, name='Data Congruency (True)')
corrs = []

def partition(l, n):
    """Split `l` into consecutive chunks of at most `n` items."""
    chunks = []
    for first in range(0, len(l), n):
        chunks.append(l[first:first + n])
    return chunks

# chi2 columns of the metadata features, 12 per subplot row (rows 2..).
cols = metadata_cols
otherstd_ = otherstd[cols]
cols_chi2 = column_xs(otherstd_, include='chi2')
otherstd_ = otherstd[cols_chi2]
cols_ = partition(cols_chi2, 12)
print(len(cols_))

# (label, start, end) of each correlation window.
corr_windows = (
    ("Everything", None, None),
    ("Validation", settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1]),
    ("Test", settings.PADCHEST_SPLIT_DATES[1], None),
    ("First Year of Test", settings.PADCHEST_SPLIT_DATES[1], "2014-12-31"),
)

for subplot_row, chunk in enumerate(cols_, 2):
    for col_key in chunk:
        yo = -otherstd[col_key]
        fh.add_trace(y=smooth(yo), name=str(col_key), connectgaps=False, row=subplot_row, col=1)

        # `y` is not defined in this cell; it is the value left by an earlier cell.
        for when, win_start, win_end in corr_windows:
            corrs.append(collect_corr(y, yo, str(col_key), when, 'N/A',
                                      start_date=win_start, end_date=win_end))

if not float(run_row['peds_weight']):
    fig = fh.make_fig(shared_xaxes=True, vertical_spacing=0.01)
    add_dates(fig, dates, 1.05)

    fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across",
                     spikethickness=1, range=[graph_start, graph_end])
    fig.update_layout(
        title="Metadata Categorical",
        spikedistance=1000,
        height=200*(len(cols_)+2),
        font=font,
        barmode='overlay',
    )

    if show_corr:
        corr_df = pd.DataFrame(corrs)
        display(corr_df)

    fig.show()

    fig_html = fig.to_html()
    if write_html:
        with open(fn, 'a') as f:
            print("<h2>Metadata Categorical</h2>", file=f)
            if show_corr:
                for w, grp in corr_df.groupby('when'):
                    print(f"<strong>{w}</strong>{grp.to_html()}", file=f)
            print(fig_html, file=f)

In [None]:
fh = FigureHelper(x, dashes=['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot'])
fh.add_trace(y=smooth(yp), name="AUROC", connectgaps=False, line={"color": "blue"}, yu=smooth(yp+perf_error_df), yl=smooth(yp-perf_error_df), row=1, col=1)
fh.add_trace(y=smooth(combined_df[congruency_measure_col]), row=2, name='Data Congruency (True)')
corrs = []

def partition(l, n):
    """Split `l` into consecutive chunks of at most `n` items."""
    chunks = []
    for first in range(0, len(l), n):
        chunks.append(l[first:first + n])
    return chunks

# ks columns of the metadata features, 14 per subplot row. Rows start at 3
# because row 2 holds the congruency trace. (The variable keeps the name
# cols_chi2 although it holds the 'ks' columns.)
cols = metadata_cols
otherstd_ = otherstd[cols]
cols_chi2 = column_xs(otherstd_, include='ks')
otherstd_ = otherstd[cols_chi2]
cols_ = partition(cols_chi2, 14)
print(len(cols_chi2))

# (label, start, end) of each correlation window.
corr_windows = (
    ("Everything", None, None),
    ("Validation", settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1]),
    ("Test", settings.PADCHEST_SPLIT_DATES[1], None),
    ("First Year of Test", settings.PADCHEST_SPLIT_DATES[1], "2014-12-31"),
)

for subplot_row, chunk in enumerate(cols_, 3):
    for col_key in chunk:
        yo = -otherstd[col_key]
        fh.add_trace(y=smooth(yo), name=str(col_key), connectgaps=False, row=subplot_row, col=1)

        # `y` is not defined in this cell; it is the value left by an earlier cell.
        for when, win_start, win_end in corr_windows:
            corrs.append(collect_corr(y, yo, str(col_key), when, 'N/A',
                                      start_date=win_start, end_date=win_end))
if show_corr:
    corr_df = pd.DataFrame(corrs)
    display(corr_df)

if not float(run_row['peds_weight']):
    fig = fh.make_fig(shared_xaxes=True, vertical_spacing=0.01)
    add_dates(fig, dates, 1.08)

    fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across",
                     spikethickness=1, range=[graph_start, graph_end])
    fig.update_layout(
        title="Metadata Real Valued",
        spikedistance=1000,
        height=200*(len(cols_)+2),
        font=font,
        barmode='overlay',
    )
    fig.show()

    fig_html = fig.to_html()
    if write_html:
        with open(fn, 'a') as f:
            print("<h2>Metadata Real Valued</h2>", file=f)
            if show_corr:
                for w, grp in corr_df.groupby('when'):
                    print(f"<strong>{w}</strong>{grp.to_html()}", file=f)
            print(fig_html, file=f)

In [None]:
fh = FigureHelper(x, dashes=['solid', 'dot', 'dash', 'longdash', 'dashdot', 'longdashdot'])
fh.add_trace(y=smooth(yp), name="AUROC", connectgaps=False, line={"color": "blue"}, yu=smooth(yp+perf_error_df), yl=smooth(yp-perf_error_df), row=1, col=1)
# fh.add_trace(y=smooth(combined_df[congruency_measure_col]), row=2, name='Data Congruency (True)')
corrs = []

def partition(l, n):
    """Split ``l`` into consecutive slices of length ``n``.

    The slices are returned in order as a list; the last one is shorter
    when ``len(l)`` is not a multiple of ``n``. An empty ``l`` yields ``[]``.
    """
    chunks = []
    for start in range(0, len(l), n):
        chunks.append(l[start:start + n])
    return chunks


# Keep the 12 VAE columns whose 'distance' entries reach the largest maximum
# from the second split date onward, then group them into plot rows.
cols = vae_cols
o = (
    other_df[cols]
    .loc[settings.PADCHEST_SPLIT_DATES[1]:]
    .swaplevel(0, 2, axis=1)[['distance']]
    .swaplevel(0, 2, axis=1)
)
colss = o.max(axis=0).sort_values(ascending=False).head(12).index.tolist()
cols_ = partition(colss, 12)

# Label and date keyword arguments for each correlation window, in reporting order.
corr_windows = [
    ("Everything", {}),
    ("Validation", {"start_date": settings.PADCHEST_SPLIT_DATES[0],
                    "end_date": settings.PADCHEST_SPLIT_DATES[1]}),
    ("Test", {"start_date": settings.PADCHEST_SPLIT_DATES[1]}),
    ("First Year of Test", {"start_date": settings.PADCHEST_SPLIT_DATES[1],
                            "end_date": "2014-12-31"}),
]

# Row 1 already holds the AUROC trace, so the column groups start at row 2.
for row, cols in enumerate(cols_, 2):
    for c in cols:
        yo = -otherstd[c]
        fh.add_trace(y=smooth(yo), name=str(c), connectgaps=False, row=row, col=1)

        for corr_when, corr_window in corr_windows:
            corrs.append(collect_corr(y, yo, str(c), corr_when, 'N/A', **corr_window))

fig = fh.make_fig(shared_xaxes=True, vertical_spacing=0.01)
add_dates(fig, dates, 1.08)

fig.update_layout(title=f"VAE Mu (top {len(colss)})")
fig.update_xaxes(
    showspikes=True,
    spikecolor="black",
    spikesnap="cursor",
    spikemode="across",
    spikethickness=1,
)
fig.update_layout(spikedistance=1000)
fig.update_layout(height=200 * (len(cols_) + 2))
# The x-axis starts at the second split date, matching the slice used for ranking.
fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[1], graph_end])
fig.update_layout(barmode='overlay')
fig.update_layout(font=font)
corr_df = pd.DataFrame(corrs).sort_values('when')
if show_corr:
    display(corr_df)
fig.show()


fig_html = fig.to_html()
if write_html:
    # Append (mode 'a') so this section lands after whatever is already in `fn`.
    with open(fn, 'a') as f:
        print(f"<h2>VAE Mu (top {len(colss)})</h2>", file=f)
        if show_corr:
            # One correlation table per distinct value of the 'when' column.
            for w, grp in corr_df.groupby('when'):
                print(f"<strong>{w}</strong>{grp.to_html()}", file=f)
        print(fig_html, file=f)

In [None]:
# Link each row to its graphs page when a directory named after the row's index
# value exists under html_top_dir; otherwise show "N/A".
arg_df2['Link'] = [
    "N/A" if not html_top_dir.joinpath(name).exists()
    else f"""<a href="{name}/index.html" disabled=>Graphs</a>"""
    for name in arg_df2.index
]


# Overwrite the top-level index page: table styling, a generation timestamp,
# the table itself (escape=False keeps the anchor markup in 'Link' intact),
# and finally the contents of fix_links_script.
with open(html_top_dir.joinpath("index.html"), 'w') as f:
    print("""
            <style>
            table {
            font-family: arial, sans-serif;
            border-collapse: collapse;
            width: 80%;
            }

            td, th {
            border: 1px solid #dddddd;
            text-align: left;
            padding: 8px;
            }

            tr:nth-child(even) {
            background-color: #dddddd;
            }
            </style>
        """, file=f)
    print(f"Generated: {datetime.datetime.now()}", file=f)
    print(arg_df2.to_html(escape=False), file=f)
    print(fix_links_script, file=f)

In [None]:
def create_index_html(child):
    """Write an ``index.html`` listing page into the directory ``child``.

    The page has two bullet lists:

    * "folders": ``..`` when the parent directory already has an
      ``index.html``, followed by every immediate subdirectory that contains
      its own ``index.html``;
    * "files": every regular file directly inside ``child`` except the
      ``index.html`` page itself.

    Entries are emitted in sorted order so repeated runs over the same tree
    produce the same page. The module-level ``fix_links_script`` is appended
    after the lists.

    Args:
        child: ``pathlib.Path`` to process. If it is a regular file the
            function returns without doing anything.

    Returns:
        None. The result is the ``index.html`` file written inside ``child``.
    """
    if child.is_file():
        return

    html_files = []
    html_folders = []
    # Offer a link back up only when the parent has an index page to land on.
    if child.parent.joinpath('index.html').exists():
        html_folders.append("..")

    # iterdir() yields entries in arbitrary order; sort for a stable listing.
    for entry in sorted(child.iterdir()):
        n = entry.relative_to(child)
        if entry.is_file():
            # Compare the exact file name: a suffix test on the full path would
            # also hide unrelated files such as "reindex.html".
            if entry.name != 'index.html':
                html_files.append(n)
        elif entry.joinpath('index.html').exists():
            html_folders.append(n)

    html = "folders: <ul>"
    for n in html_folders:
        html += f"""
            <li><a href="{n}/index.html">{n}</a></li>
            """
    html += "</ul>"
    html += "files:<ul>"
    for n in html_files:
        html += f"""
            <li><a href="{n}">{n}</a></li>
            """
    html += "</ul>"
    with open(child.joinpath('index.html'), 'w') as f:
        print(html, file=f)
        print(fix_links_script, file=f)
        
        


In [None]:
# Build the listing page for the directory that contains html_top_dir.
# The commented-out call below targets one specific sibling folder instead.
# create_index_html(html_top_dir.parent.joinpath('vae[all-data]'))
create_index_html(html_top_dir.parent)
    

In [None]:
# Give every entry directly under html_top_dir its own listing page;
# create_index_html returns immediately for entries that are regular files.
for child in html_top_dir.iterdir():
    create_index_html(child)
    
            
    
    
            