In [None]:
from IPython.display import display, Markdown, HTML
import plotly.express as px
import itertools

from azureml.core import Run, Model
from azureml.core import Datastore, Experiment, ScriptRunConfig, Workspace

from model_drift import settings
from model_drift.helpers import column_xs, correlate_performance, mutual_info_performance, w_avg
import pandas as pd
import os
import datetime



In [None]:
# Connect to the Azure ML workspace described by the project's config file.
ws = Workspace.from_config(settings.AZUREML_CONFIG)

experiment_name = 'generate-drift-csv'
exp = Experiment(workspace=ws, name=experiment_name)

# One record per *completed* run: its tags plus a few identifying fields.
# The records are accumulated under their own name so that `df` only ever
# refers to the final DataFrame (pandas is already imported in the first cell).
run_records = []
for run in exp.get_runs():
    if run.status != "Completed":
        continue
    record = dict(**run.tags)
    record['id'] = run.id
    record['display_name'] = run.display_name
    record['url'] = run.get_portal_url()
    # Keep the Run handle itself so later cells can download its outputs.
    record['run'] = run
    run_records.append(record)

# Index by display name so a run can be looked up with df.loc[<display name>].
df = pd.DataFrame(run_records).set_index(['display_name'])

In [None]:
# Columns of arg_df whose values differ between runs (missing values count as 'NA').
# NOTE(review): arg_df is first assigned in a later cell of this notebook, so this
# cell raises NameError under Restart & Run All -- confirm whether it should move down.
[c for c in arg_df if arg_df[c].fillna('NA').nunique() > 1]

In [None]:
def is_arg_col(c):
    """Return True when column name ``c`` should be treated as a run argument.

    A name is rejected when it contains one of the bookkeeping fragments
    ("mlflow", "_aml", "run_", "url") or when it is exactly one of the
    non-argument columns listed below.
    """
    bookkeeping_fragments = ("mlflow", "_aml", 'run_', 'url')
    if any(fragment in c for fragment in bookkeeping_fragments):
        return False

    non_argument_columns = ("output_dir", "input_dir", 'run', 'display_name', 'id')
    return c not in non_argument_columns
# Argument columns only, one row per distinct argument combination.
arg_cols = list(filter(is_arg_col, df.columns))
arg_df = df[arg_cols].copy()

# When several runs share identical arguments, keep the last one listed.
arg_df = arg_df.drop_duplicates(keep='last')
arg_df

In [None]:
# Add, for every run, a relative link to its graph folder and a flag telling
# whether that folder exists on disk. `assign` builds a new frame instead of
# writing into the filtered `arg_df` (avoids pandas' SettingWithCopyWarning);
# `arg_df` ends up with the same two extra columns as before.
arg_df = arg_df.assign(
    Link=[f"""<a href="{name}">Graphs</a>""" for name in arg_df.index],
    Exists=[f"""{settings.TOP_DIR.joinpath("html", "new_graphs", name).exists()}""" for name in arg_df.index],
)
# Only columns that actually vary between runs are worth showing in the index.
arg_df2 = arg_df[[c for c in arg_df if arg_df[c].fillna('NA').nunique() > 1]]

# NOTE(review): the literal "pre" emitted right after </style> looks like a
# leftover; it is reproduced unchanged -- confirm before removing it.
with open(settings.TOP_DIR.joinpath("html", "new_graphs", "index.html"), 'w') as f:
    print("""
            <style>
            table {
            font-family: arial, sans-serif;
            border-collapse: collapse;
            width: 80%;
            }

            td, th {
            border: 1px solid #dddddd;
            text-align: left;
            padding: 8px;
            }

            tr:nth-child(even) {
            background-color: #dddddd;
            }
            </style>
            pre<br />
        """, file=f)
    print(f"Generated: {datetime.datetime.now()}", file=f)
    # escape=False so the <a> tags in the Link column render as links.
    print(arg_df2.to_html(escape=False), file=f)




In [None]:
from argparse import Namespace

def args2name(
              dataset,
              vae_dataset,
              vae_filter,
              classifier_dataset,
              classifier_filter,
              nonfrontal_add_date,
              frontal_remove_date,
              stride, window, min_periods,
              ref_frontal_only,
              sample_size, n_samples, replacement,
              **kwargs
              ):
    """Build a descriptive name for a drift run from its arguments.

    The name is a '__'-separated sequence of sections (dataset, VAE source,
    classifier source, lateral-injection dates, windowing, reference filter,
    sampling). ``ref_frontal_only`` and ``replacement`` are rendered through
    ``bool``; any extra keyword arguments are accepted and ignored.
    """
    sections = (
        'drift',
        dataset,
        f"vae_{vae_dataset}-{vae_filter}",
        f"scores_{classifier_dataset}-{classifier_filter}",
        f"lateral-inject_addnfrnt{nonfrontal_add_date}-rmfrnt{frontal_remove_date}",
        f"s{stride}-w{window}-min{min_periods}",
        f"ref_frontalonly{bool(ref_frontal_only)}",
        f"Samp_ss{sample_size}-n{n_samples}-repl{bool(replacement)}",
    )
    return '__'.join(sections)



# Select the run to report on by its display name (the index of `df`).
run_row = df.loc["dynamic_seed_rqrdmz21"]

# The 'run' column holds the Run object stored when `df` was built.
r = run_row['run']
# For a single row taken with .loc, the Series name is its index label,
# i.e. the run's display name.
name = run_row.name

name

In [None]:
def is_arg_col(c):
    """Tell whether column name ``c`` holds a run argument.

    Names containing "mlflow", "_aml", "run_" or "url" are bookkeeping, and a
    handful of exact names (paths, identifiers, the run handle) are excluded
    as well; everything else counts as an argument.
    """
    for fragment in ("mlflow", "_aml", 'run_', 'url'):
        if fragment in c:
            return False

    excluded = ("output_dir", "input_dir", 'run', 'display_name', 'id')
    return c not in excluded
# Names of the columns that hold run arguments (as decided by is_arg_col).
arg_cols = [c for c in df.columns if is_arg_col(c)]
# Argument values of the selected run only.
arg_row = run_row[arg_cols].copy()

arg_row

In [None]:
# Local destination for the selected run's CSV: <TOP_DIR>/results/drift/<name>.csv
# NOTE(review): `r` and `name` are short scratch names; make sure no other cell has
# rebound them before re-running this cell out of order.
output_file_path = settings.TOP_DIR.joinpath('results', 'drift', name+".csv")
# String form of the same path; the settings/loading cell reads the CSV from it.
fname = str(output_file_path)
# Fetch the run's "outputs/output.csv" file to that path.
r.download_file("outputs/output.csv", output_file_path=output_file_path)

In [None]:
# Settings for the HTML drift report
write_html = True  # the per-run report file is only written when True

# Display settings
span = 7        # EWM span used by smooth(); a value <= 0 disables smoothing
which = 'mean'  # entry of the CSV's last column level to plot
clip = 10       # standardized drift is clipped to [-clip, clip]; None disables
# Performance series compared against drift.
# Alternative: ("performance", "Pneumonia", "auroc")
performance_col = ("performance", "macro avg", "auroc")
add_error_bars = True

# Date window whose mean/std is used to standardize the drift statistics.
standardize_dates = (settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1])
standardize_ix = pd.date_range(*standardize_dates)

# Drift statistics included in the unified metric.
stat = ['pval', 'distance']

# Snapshot of the display settings for the report header. Names that are not
# defined at this point are skipped, so the list may mention optional settings.
display_arg_names = ["span", "which", "clip", "standardize_perf", "shift_drift_to_perf", "performance_col", "this_center", "this_range", "standardize_dates", "stat", "add_error_bars"]
cell_namespace = locals()
display_args = {k: cell_namespace[k] for k in display_arg_names if k in cell_namespace}

print(display_args)

# Fail early, and say which file is missing, if the CSV was never downloaded.
if not os.path.exists(fname):
    raise FileNotFoundError(f"Drift CSV not found: {fname}")

# The CSV carries a 4-level column header and dates in its first column.
combined_df_o = pd.read_csv(fname, index_col=0, header=[0, 1, 2, 3])
combined_df_o.index = pd.to_datetime(combined_df_o.index)

# Replace the columns selected by column_xs(include=["pval"]) with 1 - value.
# NOTE(review): presumably so that larger means "more drift", like the distance
# statistics -- confirm against model_drift.helpers.column_xs.
flip = column_xs(combined_df_o, include=["pval"])
combined_df_o[flip] = 1-combined_df_o[flip]
combined_df = combined_df_o.copy()

def smooth(y: pd.DataFrame):
    """Exponentially smooth ``y`` using the notebook-level ``span`` setting.

    Works on a Series or a DataFrame. Positions that are missing in ``y``
    stay missing in the result, so gaps are not filled by the moving average.
    When ``span`` is not positive, ``y`` itself is returned unchanged.
    """
    if not span > 0:
        return y
    averaged = y.ewm(span=span, ignore_na=False).mean()
    # Re-blank the positions that had no observation in the input.
    return averaged.mask(y.isna())


smooth_name = f"ewm{span}"


def _pick_last_level(frame, key):
    """Keep the columns whose last level equals ``key`` and drop that level."""
    last_first = frame.swaplevel(0, -1, axis=1)
    chosen = last_first[[key]]
    restored = chosen.swaplevel(0, -1, axis=1)
    return restored.droplevel(-1, axis=1).copy()


# Both selections read the 4-level frame; combined_df is replaced last.
error_df = _pick_last_level(combined_df, "std")
combined_df = _pick_last_level(combined_df, which)

# Every run gets its own folder of reports under html/new_graphs/<run name>.
html_dir = settings.TOP_DIR.joinpath("html", "new_graphs", name)
perf_col_name = '-'.join(performance_col)

if not os.path.exists(html_dir):
    os.makedirs(html_dir)

# The report file name encodes the display settings it was rendered with.
stat_str = '+'.join(stat)
fn = f"{html_dir}/{which}_{stat_str}_stdclip{clip}_smooth-{smooth_name}_{perf_col_name}.html"

print("output:", fn)
print("output:", fn)
def is_arg_col(c):
    """Return whether ``c`` names a run-argument column.

    Bookkeeping names (containing "mlflow", "_aml", "run_" or "url") and a few
    exact non-argument names are rejected; all other names are accepted.
    """
    is_bookkeeping = "mlflow" in c or "_aml" in c or 'run_' in c or 'url' in c
    if is_bookkeeping:
        return False
    return c not in ("output_dir", "input_dir", 'run', 'display_name', 'id')

# Parameter table for the top of the report: the selected run's arguments
# under "Drift" and this notebook's display settings under "Display".
arg_row = run_row[arg_cols].copy()
display_row = pd.Series(display_args)
params = (
    pd.concat({'Drift': arg_row, "Display": display_row}, axis=0)
    .rename("Value")
    .to_frame()
)

# Stylesheet emitted verbatim at the start of the report file.
report_css = """
            <style>
            table {
            font-family: arial, sans-serif;
            border-collapse: collapse;
            width: 80%;
            }

            td, th {
            border: 1px solid #dddddd;
            text-align: left;
            padding: 8px;
            }

            tr:nth-child(even) {
            background-color: #dddddd;
            }
            </style>
        """

if write_html:
    # Mode 'w' starts the report file afresh; the plotting cells append to it.
    with open(fn, 'w') as f:
        print(report_css, file=f)
        report_header = f"""
            <h1>Drift report</h1> created: {datetime.datetime.now()}
            <br /><br />
            <h2>Arguments </h2>
            {params.to_html()}
            """
        print(report_header, file=f)


In [None]:
def shift_to_other(this, other, this_range=None, this_center=None):
    """Rescale ``this`` so that it sits on the mean and spread of ``other``.

    ``this`` is centred on ``this_center`` (default: its own mean), divided by
    ``this_range`` (default: its own standard deviation), then multiplied by
    the standard deviation of ``other`` and shifted by the mean of ``other``.
    """
    target_center = other.mean()
    target_spread = other.std()

    source_spread = this.std() if this_range is None else this_range
    source_center = this.mean() if this_center is None else this_center

    normalized = (this - source_center) / source_spread
    return normalized * target_spread + target_center


In [None]:
# The performance series to compare drift against, selected by the
# performance_col key chosen in the settings cell.
perf_col = performance_col
perf_df = combined_df[perf_col]
perf_df

In [None]:
# Columns selected by column_xs with 'performance' and 'count' excluded --
# presumably the drift statistics (column_xs is defined in model_drift.helpers).
other_cols = column_xs(combined_df, exclude=['performance', 'count'])
other_df = combined_df[other_cols]
other_df

In [None]:
# Empty placeholder with the same columns as the drift frame; it is concatenated
# with the drift statistics when the standardization statistics are computed.
extra_valids = pd.DataFrame(columns=other_df.columns)
# NOTE(review): perf_df is selected with a 3-element tuple key, so perf_df.name is
# presumably that tuple and `columns=` would expand it into three separate columns
# -- confirm whether `columns=[perf_df.name]` was intended.
extra_perf = pd.DataFrame(columns=perf_df.name)


In [None]:
# Display the (empty) placeholder frame created in the previous cell.
extra_perf

In [None]:
# Drift columns that belong to the selected statistics.
cxs = column_xs(other_df, include=stat)

# Stack the observed statistics with the placeholder rows, dropping any column
# that contains a missing value, and keep only the standardization window.
_observed = other_df[cxs].dropna(axis=1)
_placeholder = extra_valids[cxs].dropna(axis=1)
_reference = pd.concat([_observed, _placeholder], axis=0).sort_index().loc[standardize_ix]

print(len(_reference))

# Per-column mean and standard deviation over the reference window.
stats = _reference.agg(["mean", "std"])

stats.T

In [None]:

# Standardize every drift statistic against the reference-window mean and std.
otherstd = other_df[cxs].copy()

# cannot divide by zero: columns that are constant over the reference window
# (std == 0) are divided by 1 instead.
std0 = stats.loc['std'] == 0
stats.loc["std", std0] = 1
otherstd = (otherstd - stats.loc['mean']) / stats.loc["std"]
# NOTE(review): the error frame is shifted by the mean as well as scaled --
# confirm this is intended for a standard-deviation quantity.
errorstd = (error_df[cxs] - stats.loc['mean']) / stats.loc["std"]

# Columns that still contain a missing value after standardization.
bad_cols = otherstd.columns[otherstd.isnull().max(axis=0)].tolist()

print(bad_cols)

# Group the columns by the prefix found in their first level.
vae_cols = [c for c in list(otherstd) if "mu." in c[0]]
score_cols = [c for c in list(otherstd) if "activation." in c[0]]
metadata_cols = sorted(set(otherstd).difference(vae_cols).difference(score_cols))

# Limit the influence of extreme standardized values.
if clip is not None:
    otherstd = otherstd.clip(-1*clip, clip)

otherstd


In [None]:
    
# Continuous date index from the first to the last date in the data; reindexing
# onto it leaves NaN on dates that have no row.
x = pd.date_range(combined_df.index.min(), combined_df.index.max())
# Performance series and its error series aligned to that index.
yp = perf_df.reindex(x)
perf_error_df = error_df[perf_col].reindex(x)


In [None]:
# Association between the performance series and the standardized drift columns.
# NOTE(review): the exact semantics of correlate_performance and
# mutual_info_performance live in model_drift.helpers and are not visible here.
all_corr_df = correlate_performance(yp.rename('auroc'), otherstd)
all_ig_df = mutual_info_performance(yp.rename('auroc'), otherstd, bins=25)

# Weight table with one column per weighting scheme (presumably one row per
# drift column): the mutual-information score and the absolute correlation ...
m_ = all_ig_df.to_frame().join(all_corr_df.abs().rename('abs(corr)'))
# ... their row-wise mean ...
m_ = m_.join(m_.mean(axis=1).rename('mean[abs(corr),info_gain]'))
# ... and a uniform baseline.
m_ = m_.assign(no_weights=1)
# Missing scores become zero weight.
m_ = m_.fillna(0)
m_.sort_values(by='mean[abs(corr),info_gain]', ascending=False).head(20)


In [None]:
# 'obs' entry of the 'count' group, read from the frame as loaded from the CSV
# (before a single statistic was selected); plotted as "Num Samples (obs)".
true_counts = combined_df_o['count'].droplevel([0, 1], axis=1)['obs']
true_counts

In [None]:
# 'count' group of the selected-statistic frame, aligned to the continuous date index.
count_df = combined_df['count'].reindex(x)
count_df

In [None]:
# Dates to mark on every figure: label -> date. The first four come from the
# selected run's arguments, the last two from the project settings. The
# plotting cells skip entries whose date is missing (pd.isna).
dates = {"Lat. Added": run_row['nonfrontal_add_date'],
         "Frontal Removed": run_row['frontal_remove_date'],
         "Peds Added": run_row['peds_start_date'],
         "Peds Stop": run_row['peds_end_date'],
         "Val Start": settings.PADCHEST_SPLIT_DATES[0],
         "Test Start":settings.PADCHEST_SPLIT_DATES[1],
         }
def add_date_line(date, name, y=1.08):
    """Mark ``date`` on the notebook-level ``fig`` with a dotted line and a label.

    The line spans the full height of the figure (paper coordinates 0..1) at
    x = ``date``; the text ``name`` is placed at paper height ``y``.
    """
    line_spec = dict(
        type='line',
        x0=date, x1=date,
        y0=0, y1=1,
        xref='x', yref='paper',
        line=dict(color='black', dash='dot'),
    )
    fig.add_shape(**line_spec)

    label_spec = dict(
        x=date, y=y,
        xref="x", yref="paper",
        text=name, textangle=0, showarrow=False,
    )
    fig.add_annotation(**label_spec)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


ignore_bad_cols = True

# Work on a copy so the weight table computed earlier stays intact.
m = m_.copy()
if ignore_bad_cols:
    # Columns that contain missing values after standardization get zero weight.
    m.loc[bad_cols] = 0

# Align everything that gets plotted to the continuous date index.
yp = yp.reindex(x)
otherstd = otherstd.reindex(x)
counts = combined_df['count'].iloc[:, 0].reindex(x)
counts2 = true_counts.reindex(x)

# Traces are queued here and turned into a figure by make_fig, which also
# attaches the shared hover data.
traces = []
def add_trace(trace, row, col, customdata=True):
    """Queue ``trace`` for subplot (``row``, ``col``).

    ``customdata=False`` keeps the trace out of the shared hover table that
    make_fig builds (used for the error-band helper traces).
    """
    traces.append((trace, row, col, customdata))

# No figure is created here: make_fig builds it from the queued traces.

# Row 3: number of samples, as overlaid bars plus thin outline traces.
add_trace(go.Bar(x=x, y=(counts), showlegend=True, legendgroup="Num Samples (used)",
                 marker={"color": "green"}, name="Num Samples (used)"), row=3, col=1)
add_trace(go.Scatter(x=x, y=(counts), showlegend=False, legendgroup="Num Samples (used)",
                     marker={"color": "green", "line": {"width": .01}}, name="Num Samples (used)"), row=3, col=1)
add_trace(go.Bar(x=x, y=(counts2), showlegend=True, legendgroup="Num Samples (obs)",
                 marker={"color": "orange"}, name="Num Samples (obs)"), row=3, col=1)
add_trace(go.Scatter(x=x, y=(counts2), showlegend=False, legendgroup="Num Samples (obs)",
                     marker={"color": "orange", "line": {"width": .01}}, name="Num Samples (obs)"), row=3, col=1)

ype = perf_error_df.reindex(x)
# Row 1: performance
add_trace(go.Scatter(x=x, y=smooth(yp), showlegend=True,
    name="AUROC", connectgaps=False, line={"color": "blue"}), row=1, col=1)

# Shaded band of +/- one error around the performance curve. Honors the
# add_error_bars setting, which is also listed in the report header.
if add_error_bars:
    yeu = smooth(yp+ype)
    yel = smooth(yp-ype)

    # Only dates where both bounds exist can be filled.
    k = ~(yeu.isnull()|yel.isnull())
    xe = x[k]
    yel = yel[k]
    yeu = yeu[k]

    # Invisible upper bound first, then the lower bound filled up to it.
    add_trace(go.Scatter(x=xe,
                         y=yeu,
                         hoverinfo="skip",
                         showlegend=False,
                         connectgaps=False,
                         line=dict(width=0),
            ), row=1, col=1, customdata=False)
    add_trace(go.Scatter(x=xe,
                         y=yel,
                         fillcolor='rgba(0,0,255,0.2)',
                         fill='tonexty',
                         hoverinfo="skip",
                         showlegend=False,
                         connectgaps=False,
                         line=dict(width=0),
            ), row=1, col=1, customdata=False)



single_disp = dict(line=dict(dash="dot", width=1))

corrs = []
colors = px.colors.qualitative.Set1

# Periods over which the correlation with performance is reported.
val_start, test_start = settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1]
periods = {
    "all": slice(None),
    "validation": slice(val_start, test_start),
    "test": slice(test_start, None),
}

yp_smooth = smooth(yp)

# One unified drift curve per weighting scheme (column of the weight table).
for name in m:
    weights = m[name].sort_values(ascending=False)
    # weights = weights.iloc[:5]
    # Negated so that more drift plots in the same direction as lower performance.
    y = -w_avg(otherstd.loc[x], weights=weights.to_dict())
    y_smooth = smooth(y)

    # Every entry is a scalar correlation (the validation and test rows used
    # to store a 1-tuple because of a trailing comma).
    for when, window in periods.items():
        corrs.append({
            "name": 'All Unified', "stat": str(stat), "wieghts": name,
            "corr (raw)": yp.loc[window].corr(y.loc[window]),
            "corr (smoothed)": yp_smooth.loc[window].corr(y_smooth.loc[window]),
            "when": when,
        })

    # make_fig replaces the hovertemplate of traces queued with customdata=True.
    add_trace(go.Scatter(x=x, y=y_smooth,
                        # customdata=smooth(yo),
                        showlegend=True, legendgroup=name,
                        name=name, hovertemplate="%{customdata:.3f} (on graph: %{y:.3f})", line={"width": 1},  connectgaps=False), row=2, col=1)

# Correlation of performance with the sample counts. These do not depend on
# the weighting scheme, so they are reported once rather than once per scheme.
for count_name, count_series in (("num sampled (used)", counts), ("num sampled (obs)", counts2)):
    corrs.append({
        "name": count_name, "stat": str(stat), "wieghts": "NA",
        "corr (raw)": yp.corr(count_series),
        "corr (smoothed)": yp_smooth.corr(smooth(count_series)),
        "when": "all",
    })


def make_fig(traces, index=None, **fig_kwargs):
    """Build a subplot figure from queued traces with a shared hover table.

    Parameters
    ----------
    traces : iterable of (trace, row, col, use_customdata)
        As queued by add_trace. The grid size is the largest row / col seen.
    index : optional
        Index for the shared hover table; must match the length of the traces'
        y data. Defaults to the notebook-level date index ``x``.
    **fig_kwargs
        Passed through to ``make_subplots``.

    Returns
    -------
    The assembled figure. Every trace queued with ``use_customdata`` true gets
    the y values of all such traces as customdata (columns sorted by trace
    name; traces sharing a name share one column) and a hovertemplate that
    lists them. Traces are modified in place.
    """
    if index is None:
        index = x

    hover_values = {}
    max_row = 1
    max_col = 1
    for t, r, c, cd in traces:
        max_row = max(r, max_row)
        max_col = max(c, max_col)
        if cd:
            hover_values[t.name] = t.y
    customdata = pd.DataFrame(hover_values, index=index)
    fig = make_subplots(rows=max_row, cols=max_col, **fig_kwargs)

    # The hover table and its text are the same for every trace: build them once.
    cus_cols = sorted(customdata)
    ho = "<br />".join(["{name}=%{{customdata[{i}]:.3f}}".format(i=i, name=name) for i, name in enumerate(cus_cols)])

    for t, r, c, cd in traces:
        if cd:
            t.customdata = customdata[cus_cols]
            t.hovertemplate = "%{x}<br>" + f"{t.name}: " + "%{y}<br><br>" + f"{ho}<extra></extra>"
        fig.add_trace(t, row=r, col=c)

    return fig

# Assemble the three-row figure (performance, unified drift, sample counts).
fig = make_fig(traces=traces, shared_xaxes=True, vertical_spacing=0.01, row_heights=[.6, .6, .2])

# Mark the known event dates; entries without a date are skipped.
date_line_y=1.05
for name, date in dates.items():
    if pd.isna(date):
        continue
    add_date_line(date, f"{name}<br />({date})", y=date_line_y)

fig.update_layout(
    title=f"(-1)*w_avg({stat}) and AUROC",
    spikedistance=1000,
    height=900,
    barmode='overlay',
)
# fig.update_layout(hovermode="x")
fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across", spikethickness=1)
# Initial view starts at the second split date.
fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[1], y.index.max()])

corr_df = pd.DataFrame(corrs).sort_values('when')
display(corr_df)
fig.show()


# Append this section (correlation table, then figure) to the report file.
fig_html = fig.to_html()
print(fn)
if write_html:
    with open(fn, 'a') as f:
        print("<h2>Full Unified</h2>", file=f)
        corr_df.to_html(f)
        print(fig_html, file=f)


In [None]:
# Display the 'performance' group of the selected-statistic frame.
combined_df['performance']

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px

# One subplot row per metric; one line per label within a row.
auroc_cols = column_xs(combined_df, ['auroc'])
precision_cols = column_xs(combined_df, ['precision'])
recall_cols = column_xs(combined_df, ['recall'])
f1_cols = column_xs(combined_df, ['f1-score'])
support_cols = column_xs(combined_df, ['support'])


cols_ = [auroc_cols, recall_cols, precision_cols, f1_cols, support_cols]
fig = make_subplots(rows=len(cols_), cols=1, shared_xaxes=True, vertical_spacing=0.01)

# Labels are the second element of each column key. Averages sort last; the
# secondary sort on the label itself keeps the colour assignment the same
# from run to run (set iteration order is not stable).
names = sorted(set([c[1] for c in itertools.chain(*cols_)]), key=lambda label: ('avg' in label, label))
colors = px.colors.qualitative.Plotly
# Valid plotly dash names ('dotted' is not one; the dotted style is 'dot').
dashes = ['solid', 'dash', 'dot']

# Give every label its own (dash, colour) pair; cycle if there are more labels
# than combinations so that no label is left without a style.
lines = {}
for label, (dash, color) in zip(names, itertools.cycle(itertools.product(dashes, colors))):
    lines[label] = {'color': color, 'dash': dash}



visited = set()
for row_no, metric_cols in enumerate(cols_, 1):
    last_col = None
    for c in metric_cols:
        ypp = combined_df[c].reindex(x)
        line = lines[c[1]]
        # Show each label once in the legend; its other rows join the same group.
        showlegend = not c[1] in visited
        visited.add(c[1])
        # go.Scatter replaces the deprecated go.Line wrapper.
        fig.add_trace(go.Scatter(x=x, y=smooth(ypp), showlegend=showlegend, legendgroup=c[1],
                name=c[1], hovertemplate="%{y: .5f}", connectgaps=False, line=line), row=row_no, col=1)
        last_col = c
    # Title the row with the metric name (last element of its column keys);
    # rows without columns are left untitled.
    if last_col is not None:
        fig.update_yaxes(title_text=last_col[-1], row=row_no, col=1)


date_line_y=1.025
for label, date in dates.items():
    if not pd.isna(date):
        add_date_line(date, f"{label}<br />({date})", y=date_line_y)

fig.update_layout(title="Performance")
fig.update_layout(hovermode="x unified")
fig.update_layout(height=400*len(cols_))
fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[1], yp.index.max()])
fig.show()

# Append this section to the report file.
fig_html = fig.to_html()
print(fn)
if write_html:
    with open(fn, 'a') as f:
        print("<h2>Performance</h2>", file=f)
        print(fig_html, file=f)

In [None]:
# Whether drift curves are rescaled onto the performance curve (via
# shift_to_other) before plotting.
shift_drift_to_perf = False
# Optional overrides handed to shift_to_other when the flag is on; None makes
# it fall back to the drift series' own std / mean. They are defined here so
# that turning the flag on does not raise NameError in the plotting cells.
this_range = None
this_center = None

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


# Row 1: performance. Row 2: one unified drift curve per column group.
traces = []
add_trace(go.Scatter(x=x, y=smooth(yp), showlegend=True,
              name="AUROC", hovertemplate="%{y: .5f}", connectgaps=False), row=1, col=1)

single_disp = dict(line=dict(dash="dot", width=1))
# print("correlation with auroc")

colors = px.colors.qualitative.Set1

# Periods over which the correlation with performance is reported.
val_start, test_start = settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1]
periods = {
    "all": slice(None),
    "validation": slice(val_start, test_start),
    "test": slice(test_start, None),
}

# Weighting scheme (column of the weight table) used for every group.
name = "abs(corr)"
group_names = ["metadata", "vae", "score", "vae+score", "metadate+vae+score"]
group_cols = [metadata_cols, vae_cols, score_cols, vae_cols+score_cols, vae_cols+score_cols+metadata_cols]

yp_smooth = smooth(yp)
corrs = []
for name_, cols in zip(group_names, group_cols):
    otherstd_ = otherstd[cols]
    weights = m[name].sort_values(ascending=False)
    # weights = weights.iloc[:5]
    # Negated so that more drift plots in the same direction as lower performance.
    yo = -w_avg(otherstd_.loc[x], weights=weights.to_dict())
    yo_smooth = smooth(yo)

    # Scalar correlations per period, always computed from this group's own
    # curve `yo` (the test row used to read a stale `y`, and the validation
    # and test rows stored a 1-tuple because of a trailing comma).
    for when, window in periods.items():
        corrs.append({
            "name": name_, "stat": str(stat), "wieghts": name,
            "corr (raw)": yp.loc[window].corr(yo.loc[window]),
            "corr (smoothed)": yp_smooth.loc[window].corr(yo_smooth.loc[window]),
            "when": when,
        })

    if shift_drift_to_perf:
        y = shift_to_other(yo, yp, this_range, this_center)
    else:
        y = yo
    add_trace(go.Scatter(x=x, y=smooth(y),
                        showlegend=True, legendgroup=name_,
                        name=name_, connectgaps=False), row=2, col=1)


fig = make_fig(traces, shared_xaxes=True, vertical_spacing=0.01)
date_line_y=1.05
# A separate loop variable keeps `name` (the weighting scheme) intact for the title.
for label, date in dates.items():
    if not pd.isna(date):
        add_date_line(date, f"{label}<br />({date})", y=date_line_y)


fig.update_layout(
    title=f"(-1)*w_avg({stat},w={name}) and AUROC")
fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across", spikethickness=1)
fig.update_layout(spikedistance=1000)
fig.update_layout(height=600)
fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[1], y.index.max()])
fig.update_layout(barmode='overlay')
corr_df = pd.DataFrame(corrs).sort_values('when')
display(corr_df)
fig.show()

fig_html = fig.to_html()

# Append this section (correlation table, then figure) to the report file.
if write_html:
    with open(fn, 'a') as f:
        print("<h2>Level 1 Unified</h2>", file=f)
        corr_df.to_html(f)
        print(fig_html, file=f)


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


def partition(l, n):
    """Split ``l`` into consecutive chunks of ``n`` items; the last may be shorter."""
    chunks = []
    for start in range(0, len(l), n):
        chunks.append(l[start:start + n])
    return chunks

# Row 1: performance. Row 2: combined metadata drift. Rows 3+: the individual
# 'chi2' metadata columns, ten per row.
traces = []
col_name = "metadata"
cols = metadata_cols
otherstd_ = otherstd[cols]
cols_ = column_xs(otherstd_, include='chi2')
cols_ = partition(cols_, 10)

# fig = make_subplots(rows=len(cols_), cols=1, shared_xaxes=True, vertical_spacing=0.01)
add_trace(go.Scatter(x=x, y=smooth(yp), showlegend=True,
              name="AUROC", connectgaps=False), row=1, col=1)

single_disp = dict(line=dict(dash="dot", width=1))
print("correlation with auroc")

colors = px.colors.qualitative.Set1

# Weighting scheme (column of the weight table) for the combined curve.
name = "abs(corr)"
name_ = "metadata combined"
weights = m[name].sort_values(ascending=False)
# weights = weights.iloc[:5]
# Negated so that more drift plots in the same direction as lower performance.
yo = -w_avg(otherstd_, weights=weights.to_dict())
c, cm = yp.corr(yo), smooth(yp).corr(smooth(yo))
if shift_drift_to_perf:
    y = shift_to_other(yo, yp, this_range, this_center)
else:
    y = yo
# make_fig replaces customdata on traces queued with the default customdata=True.
add_trace(go.Scatter(x=x, y=smooth(y),
                      customdata=smooth(yo),
                      showlegend=True, legendgroup=name_,
                      name=name_,  connectgaps=False), row=2, col=1)



line = {"width": 1}
for row, cols in enumerate(cols_, 3):
    for c in cols:
        yo = -otherstd[c]
        if shift_drift_to_perf:
            y = shift_to_other(yo, yp, this_range, this_center)
        else:
            y = yo
        add_trace(go.Scatter(x=x, y=smooth(y), showlegend=True,
                              customdata=smooth(yo),
                            #   legendgroup=str(c[0]),
                              name=str(c), connectgaps=False), row=row, col=1)

fig = make_fig(traces, shared_xaxes=True, vertical_spacing=0.01)

date_line_y = 1.04
# A separate loop variable keeps `name` (the weighting scheme) intact for the title.
for label, date in dates.items():
    if not pd.isna(date):
        add_date_line(date, f"{label}<br />({date})", y=date_line_y)

fig.update_layout(
    title=f"(-1)*w_avg({stat},w={name}) and AUROC")

fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across", spikethickness=1)
fig.update_layout(spikedistance=1000)
fig.update_layout(height=900)
fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[1], y.index.max()])
fig.update_layout(barmode='overlay')
fig.show()

# Append this section to the report file.
fig_html = fig.to_html()
if write_html:
    with open(fn, 'a') as f:
        print("<h2>Metadata Categorical</h2>", file=f)
        print(fig_html, file=f)

In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


def partition(l, n):
    """Return ``l`` cut into successive slices of length ``n`` (the last may be shorter)."""
    starts = range(0, len(l), n)
    return list(map(lambda start: l[start:start + n], starts))

# Row 1: performance. Row 2: combined metadata drift. Rows 3+: the individual
# 'ks' metadata columns, ten per row.
traces = []
col_name = "metadata"
cols = metadata_cols
otherstd_ = otherstd[cols]
cols_ = column_xs(otherstd_, include='ks')
cols_ = partition(cols_, 10)

# fig = make_subplots(rows=len(cols_), cols=1, shared_xaxes=True, vertical_spacing=0.01)
add_trace(go.Scatter(x=x, y=smooth(yp), showlegend=True,
              name="AUROC", connectgaps=False), row=1, col=1)

single_disp = dict(line=dict(dash="dot", width=1))
print("correlation with auroc")

colors = px.colors.qualitative.Set1

# Weighting scheme (column of the weight table) for the combined curve.
name = "abs(corr)"
name_ = "metadata combined"
weights = m[name].sort_values(ascending=False)
# weights = weights.iloc[:5]
# Negated so that more drift plots in the same direction as lower performance.
yo = -w_avg(otherstd_, weights=weights.to_dict())
c, cm = yp.corr(yo), smooth(yp).corr(smooth(yo))
if shift_drift_to_perf:
    y = shift_to_other(yo, yp, this_range, this_center)
else:
    y = yo
# make_fig replaces customdata on traces queued with the default customdata=True.
add_trace(go.Scatter(x=x, y=smooth(y),
                      customdata=smooth(yo),
                      showlegend=True, legendgroup=name_,
                      name=name_,  connectgaps=False), row=2, col=1)



line = {"width": 1}
for row, cols in enumerate(cols_, 3):
    for c in cols:
        yo = -otherstd[c]
        if shift_drift_to_perf:
            y = shift_to_other(yo, yp, this_range, this_center)
        else:
            y = yo
        add_trace(go.Scatter(x=x, y=smooth(y), showlegend=True,
                              customdata=smooth(yo),
                            #   legendgroup=str(c[0]),
                              name=str(c), connectgaps=False), row=row, col=1)

fig = make_fig(traces, shared_xaxes=True, vertical_spacing=0.01)

date_line_y=1.04
# A separate loop variable keeps `name` (the weighting scheme) intact for the title.
for label, date in dates.items():
    if not pd.isna(date):
        add_date_line(date, f"{label}<br />({date})", y=date_line_y)

fig.update_layout(
    title=f"(-1)*w_avg({stat},w={name}) and AUROC")

fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across", spikethickness=1)
fig.update_layout(spikedistance=1000)
fig.update_layout(height=900)
fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[1], y.index.max()])
fig.update_layout(barmode='overlay')
fig.show()

# Append this section to the report file.
fig_html = fig.to_html()
if write_html:
    with open(fn, 'a') as f:
        print("<h2>Metadata Real Valued</h2>", file=f)
        print(fig_html, file=f)


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go
import plotly.express as px


# Row 1: performance. Row 2: combined curve plus the individual 'ks' columns of
# the ten VAE columns with the largest 'distance' from the second split date on.
traces = []
col_name = "vae"
cols = vae_cols
o = other_df[cols].loc[settings.PADCHEST_SPLIT_DATES[1]:].swaplevel(0, 2, axis=1)[['distance']].swaplevel(0, 2, axis=1)
cols = o.max(axis=0).sort_values(ascending=False).head(10).index.tolist()

otherstd_ = otherstd[cols]


cols_ = [column_xs(otherstd_, include='ks')]


add_trace(go.Scatter(x=x, y=smooth(yp), showlegend=True,
              name="AUROC", hovertemplate="%{y: .5f}", line=dict(color='blue'), connectgaps=False), row=1, col=1)

single_disp = dict(line=dict(dash="dot", width=1))
colors = px.colors.qualitative.Set1
# Weighting scheme (column of the weight table) for the combined curve.
name = "abs(corr)"
name_ = "combined"
weights = m[name].sort_values(ascending=False)
# Negated so that more drift plots in the same direction as lower performance.
yo = -w_avg(otherstd_, weights=weights.to_dict())
c, cm = yp.corr(yo), smooth(yp).corr(smooth(yo))
if shift_drift_to_perf:
    y = shift_to_other(yo, yp, this_range, this_center)
else:
    y = yo
# make_fig replaces customdata / hovertemplate on traces queued with the
# default customdata=True.
add_trace(go.Scatter(x=x, y=smooth(y),
                      customdata=smooth(yo),
                      showlegend=True, legendgroup=name_,
                      name=name_, hovertemplate="%{customdata:.3f} (on graph: %{y:.3f})", line={"color": 'red', "width": 1},  connectgaps=False), row=2, col=1)

line = {"width": 1}
for row, cols in enumerate(cols_, 2):
    for c in cols:
        yo = -otherstd[c]
        if shift_drift_to_perf:
            y = shift_to_other(yo, yp, this_range, this_center)
        else:
            y = yo
        add_trace(go.Scatter(x=x, y=smooth(y), showlegend=True,
                              customdata=smooth(yo),
                              legendgroup=str(c[0]),
                              name=str(c), hovertemplate="%{customdata:.3f} (on graph: %{y:.3f})", line=line,  connectgaps=False), row=row, col=1)

fig = make_fig(traces, shared_xaxes=True, vertical_spacing=0.01)

date_line_y=1.08
# A separate loop variable keeps `name` (the weighting scheme) intact for the title.
for label, date in dates.items():
    if not pd.isna(date):
        add_date_line(date, f"{label}<br />({date})", y=date_line_y)

fig.update_layout(
    title=f"(-1)*w_avg({stat},w={name}) and AUROC")


fig.update_xaxes(showspikes=True, spikecolor="black", spikesnap="cursor", spikemode="across", spikethickness=1)
fig.update_layout(spikedistance=1000)
fig.update_layout(height=900)
fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[1], y.index.max()])
fig.update_layout(barmode='overlay')
fig.show()

fig_html = fig.to_html()

# Append this section to the report file.
if write_html:
    with open(fn, 'a') as f:
        print(f"<h2>{col_name}</h2>", file=f)
        print(fig_html, file=f)


In [None]:
# Directory that holds index.html and the per-run report folders.
html_top_dir = settings.TOP_DIR.joinpath("html", "new_graphs")

In [None]:
# Entries currently present in the graphs output directory. Stored under its
# own name: `x` is the date index that make_fig and the plotting cells read.
html_entries = os.listdir(str(html_top_dir))

html_entries

In [None]:
# Refresh the link and existence columns now that report folders may have
# been created. arg_df2 is a column selection of arg_df, so the columns are
# set with `assign` (a new frame) rather than by writing into the selection,
# which triggers pandas' SettingWithCopyWarning.
arg_df2 = arg_df2.assign(
    Link=[f"""<a href="{name}">Graphs</a>""" for name in arg_df2.index],
    Exists=[f"""{settings.TOP_DIR.joinpath("html", "new_graphs", name).exists()}""" for name in arg_df2.index],
)

with open(settings.TOP_DIR.joinpath("html", "new_graphs", "index.html"), 'w') as f:
    print("""
            <style>
            table {
            font-family: arial, sans-serif;
            border-collapse: collapse;
            width: 80%;
            }

            td, th {
            border: 1px solid #dddddd;
            text-align: left;
            padding: 8px;
            }

            tr:nth-child(even) {
            background-color: #dddddd;
            }
            </style>
        """, file=f)
    print(f"Generated: {datetime.datetime.now()}", file=f)
    # escape=False so the <a> tags in the Link column render as links.
    print(arg_df2.to_html(escape=False), file=f)




In [None]:
# from plotly.subplots import make_subplots
# import plotly.graph_objects as go
# import plotly.express as px

# html = ["<h3>metadata correlation during validation</h3>"]
# corrs = []
# for col in metadata_cols:
#     y = -otherstd[col].loc[settings.PADCHEST_SPLIT_DATES[0]:settings.PADCHEST_SPLIT_DATES[1]]
#     c, cm = yp.loc[settings.PADCHEST_SPLIT_DATES[0]:settings.PADCHEST_SPLIT_DATES[1]].corr(y), smooth(
#         yp).loc[settings.PADCHEST_SPLIT_DATES[0]:settings.PADCHEST_SPLIT_DATES[1]].corr(smooth(y))
#     corrs.append({"name": col, "corr (raw)": c, "corr (smoothed)": cm})

# df = pd.DataFrame(corrs).sort_values("corr (raw)",ascending=False, )


# col_name = "metadata"
# cols = df.head(10).name.tolist()
# otherstd_ = otherstd[cols]
# cols_ = [cols]

# fig = make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.01)
# fig.add_trace(go.Line(x=x, y=smooth(yp), showlegend=True,
#               name="AUROC", hovertemplate="%{y: .5f}", line=dict(color='blue'), connectgaps=False), row=1, col=1)

# u = yp.median()
# r = yp.max()-yp.min()

# single_disp = dict(line=dict(dash="dot", width=1))

# colors = px.colors.qualitative.Set1

# name = "info_gain"
# weights = m[name].sort_values(ascending=False)
# # weights = weights.iloc[:5]
# yo = -w_avg(otherstd_, weights=weights.to_dict())

# c, cm = yp.corr(yo), smooth(yp).corr(smooth(yo))
# if shift_drift_to_perf:
#     y = shift_to_other(yo, yp, this_range, this_center)
# else:
#     y = yo
# fig.add_trace(go.Line(x=x, y=smooth(y), showlegend=True, legendgroup=name_,
#                       name=name_, hovertemplate="%{y: .5f}", line={"color": 'red', "width": 1},  connectgaps=False), row=1, col=1)


# line = {"width": 1}
# for row, cols in enumerate(cols_, 1):
#     for c in cols:
#         yo = -otherstd[c]
#         if shift_drift_to_perf:
#             y = shift_to_other(yo, yp, this_range, this_center)
#         else:
#             y = yo
#         fig.add_trace(go.Line(x=x, y=smooth(y), showlegend=True,
#                               customdata=smooth(yo),
#                               legendgroup=str(c[0]),
#                               name=str(c), hovertemplate="%{customdata:.3f} (on graph: %{y:.3f})", line=line,  connectgaps=False), row=row, col=1)

# add_date_line(settings.PADCHEST_SPLIT_DATES[0], f"Val Start<br />({settings.PADCHEST_SPLIT_DATES[0]})")
# add_date_line(settings.PADCHEST_SPLIT_DATES[1], f"Test Start<br />({settings.PADCHEST_SPLIT_DATES[1].strip()})")

# if nonfrontal_add_date is not None:
#     add_date_line(nonfrontal_add_date, f"Lat. Added<br />({nonfrontal_add_date})")

# if frontal_remove_date is not None:
#     add_date_line(frontal_remove_date, f"Front Removed<br />({frontal_remove_date})")

# fig.update_layout(
#     title=f"(-1)*w_avg({stat},w={name}) and AUROC.")
# fig.update_layout(hovermode="x unified")
# fig.update_layout(height=600*len(cols_))
# fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[0], settings.PADCHEST_SPLIT_DATES[1]])
# fig.show()

# html.append(df.to_html())
# html.append(fig.to_html())
# if write_html:
#     with open(fn, 'a') as f:
#         print("<br />".join(html), file=f)


In [None]:
# html = ["<h3>metadata correlation during test</h3>"]

# corrs = []

# for col in metadata_cols:
#     y = -otherstd[col].loc[settings.PADCHEST_SPLIT_DATES[1]:]
#     c, cm = yp.loc[settings.PADCHEST_SPLIT_DATES[1]:].corr(y), smooth(
#         yp).loc[settings.PADCHEST_SPLIT_DATES[1]:].corr(smooth(y))
#     corrs.append({"name": col, "corr (raw)": c, "corr (smoothed)": cm})

# df = pd.DataFrame(corrs).sort_values("corr (raw)",ascending=False, )
# col_name = "metadata"
# cols = df.head(10).name.tolist()
# otherstd_ = otherstd[cols]
# cols_ = [cols]

# fig = make_subplots(rows=1, cols=1, shared_xaxes=True, vertical_spacing=0.01)
# fig.add_trace(go.Line(x=x, y=smooth(yp), showlegend=True,
#               name="AUROC", hovertemplate="%{y: .5f}", line=dict(color='blue'), connectgaps=False), row=1, col=1)

# u = yp.median()
# r = yp.max()-yp.min()

# single_disp = dict(line=dict(dash="dot", width=1))

# colors = px.colors.qualitative.Set1

# name = "info_gain"
# weights = m[name].sort_values(ascending=False)
# # weights = weights.iloc[:5]
# y = -w_avg(otherstd_, weights=weights.to_dict())

# c, cm = yp.corr(y), smooth(yp).corr(smooth(y))
# print(f"   {name_}: (-1)*w_avg({stat}, w={name}): {c:.4f}, {cm:.4f}", )
# y = (y-y.median())/(y.max()-y.min())*r+u
# fig.add_trace(go.Line(x=x, y=smooth(y), showlegend=True, legendgroup=name_,
#                       name=name_, hovertemplate="%{y: .5f}", line={"color": 'red', "width": 1},  connectgaps=False), row=1, col=1)


# line = {"width": 1}
# for row, cols in enumerate(cols_, 1):
#     for c in cols:
#         yo = -otherstd[c]
#         if shift_drift_to_perf:
#             y = shift_to_other(yo, yp, this_range, this_center)
#         else:
#             y = yo
#         fig.add_trace(go.Line(x=x, y=smooth(y), showlegend=True,
#                               customdata=smooth(yo),
#                               legendgroup=str(c[0]),
#                               name=str(c), hovertemplate="%{customdata:.3f} (on graph: %{y:.3f})", line=line,  connectgaps=False), row=row, col=1)

# add_date_line(settings.PADCHEST_SPLIT_DATES[0], f"Val Start<br />({settings.PADCHEST_SPLIT_DATES[0]})")
# add_date_line(settings.PADCHEST_SPLIT_DATES[1], f"Test Start<br />({settings.PADCHEST_SPLIT_DATES[1].strip()})")

# if nonfrontal_add_date is not None:
#     add_date_line(nonfrontal_add_date, f"Lat. Added<br />({nonfrontal_add_date})")

# if frontal_remove_date is not None:
#     add_date_line(frontal_remove_date, f"Front Removed<br />({frontal_remove_date})")

# fig.update_layout(
#     title=f"(-1)*w_avg({stat},w={name}) and AUROC")
# fig.update_layout(hovermode="x unified")
# fig.update_layout(height=600*len(cols_))
# fig.update_xaxes(range=[settings.PADCHEST_SPLIT_DATES[1], yp.index.max()])
# fig.show()

# html.append(df.to_html())
# html.append(fig.to_html())

# if write_html:
#     with open(fn, 'a') as f:
#         print("<br />".join(html), file=f)
