In [None]:
from IPython.display import display

import pandas as pd
import warnings
from model_drift import settings, helpers
from model_drift.data.utils import nested2series
import matplotlib.pylab as plt
import numpy as np
import seaborn as sns
from model_drift.drift.tabular import TabularDriftCalculator
from model_drift.drift.numeric import KSDriftCalculator, BasicDriftCalculator
from model_drift.drift.categorical import ChiSqDriftCalculator
from model_drift.drift.collection import DriftCollectionCalculator
from model_drift.drift.performance import AUROCCalculator

from model_drift.drift.sampler import Sampler
from model_drift.data.padchest import PadChest
import plotly.graph_objects as go

# Silence every warning for the rest of the session (no category or module filter is given).
warnings.filterwarnings("ignore")


In [None]:
# Load the VAE predictions and expand each row's `mu` vector into one `mu.<i>` column per dimension.
jsonl_file = str(settings.TOP_DIR.joinpath("results", 'vae', 'padchest-trained', "all-data", 'preds.jsonl'))
vae_df = helpers.jsonl_files2dataframe(jsonl_file)
vae_df = pd.concat(
    [
        vae_df,
        # Built on vae_df's own index so the column-wise concat pairs rows correctly even if the
        # loader does not return a default RangeIndex; add_prefix names the columns mu.0, mu.1, ...
        # for whatever width the vectors have instead of assuming exactly 128.
        pd.DataFrame(vae_df['mu'].values.tolist(), index=vae_df.index).add_prefix("mu."),
    ],
    axis=1,
)
vae_df.head()

In [None]:
from model_drift.data.padchest import LABEL_MAP

# Load the classifier predictions and expand each row's `activation` vector into one
# `activation.<label>` column per entry of LABEL_MAP.
label_cols = list(LABEL_MAP)
jsonl_file = str(settings.TOP_DIR.joinpath("results", 'classifier', 'padchest-trained', "frontal_only", "preds.jsonl"))
scores_df = helpers.jsonl_files2dataframe(jsonl_file)
scores_df = pd.concat(
    [
        scores_df,
        pd.DataFrame(
            scores_df['activation'].values.tolist(),
            # Reuse scores_df's index so the column-wise concat cannot misalign rows
            # when the loader returns something other than a default RangeIndex.
            index=scores_df.index,
            columns=[f"activation.{c}" for c in label_cols],
        ),
    ],
    axis=1,
)
scores_df.head()


In [None]:
from model_drift.data.utils import fix_strlst, remap_labels
# Load padchest CSV
pc = PadChest(settings.PADCHEST_FILENAME, label_map=LABEL_MAP)

# Copy the label column under a new name *before* prepare() runs, so the pre-prepare
# labels can be merged back in below next to whatever prepare() leaves in `Labels`.
df_o = pc.df[['ImageID','Labels']].copy().rename(columns={'Labels': 'OriginalLabels'})
df_o['OriginalLabels'] = fix_strlst(df_o['OriginalLabels'])
pc.prepare()

# Summary statistics of the length of each row's `Labels` entry, frontal rows only.
print(pc.df.query("Frontal")['Labels'].apply(len).describe())

# Inner merges on the image id. The return values are discarded and later cells read the
# merged columns from pc.df, so these presumably update pc.df in place -- TODO confirm.
pc.merge(vae_df, left_on="ImageID", right_on="index", how='inner')
pc.merge(scores_df, left_on="ImageID", right_on="index", how='inner')
pc.merge(df_o, left_on="ImageID", right_on="ImageID", how='inner')


# NOTE(review): the split is disabled here, but a later cell reads `val.df`; on a fresh
# kernel that cell has no `val` unless this line (or an equivalent) is run.
# train, val, test = pc.split(settings.PADCHEST_SPLIT_DATES, studydate_index=True)


In [None]:
# pd.concat(
#     {
#         "all": pc.df["StudyDate"].describe(datetime_is_numeric=True),
#         "train": train.df["StudyDate"].describe(datetime_is_numeric=True),
#         "val": val.df["StudyDate"].describe(datetime_is_numeric=True),
#         "test": test.df["StudyDate"].describe(datetime_is_numeric=True),
#     },
#     axis=1,
# )


In [None]:


# Frontal rows from two four-week December windows, one year apart:
# 2016 is the "bad" window, 2015 the "good" one.
sample_bad, sample_good = (
    pc.df.set_index("StudyDate").loc[start:stop].query("Frontal")
    for start, stop in (
        ("2016-12-01", "2016-12-28"),
        ("2015-12-01", "2015-12-28"),
    )
)

tuple(map(len, (sample_bad, sample_good)))


In [None]:
from model_drift.data.utils import fix_strlst, remap_labels, remap_label_list

# Show the "bad" window's OriginalLabels column after passing it through remap_labels with LABEL_MAP.
remap_labels(sample_bad['OriginalLabels'], label_map=LABEL_MAP)

In [None]:
# Rebind label_cols from the dataset object's label_map (it was first built from LABEL_MAP directly).
label_cols = list(pc.label_map)




In [None]:
# Mean over rows of each row's mean activation, as a (bad, good) pair.
tuple(
    frame['activation'].apply(np.mean).mean()
    for frame in (sample_bad, sample_good)
)


In [None]:
# Largest length of any row's `Labels` entry across the whole dataset.
pc.df['Labels'].apply(len).max()


In [None]:
# Display the label map for reference.
LABEL_MAP

In [None]:
import functools
from collections import Counter

import matplotlib.pylab as plt


def count_labels(label_lists):
    """Count how often each label occurs across an iterable of label lists.

    A single pass over all labels; returns an empty Counter for an empty input
    (summing per-row Counters instead yields the int 0 in that case, which has
    no ``most_common``).
    """
    return Counter(label for labels in label_lists for label in labels)


# The two functools.reduce(...) set unions that used to sit here were computed and
# immediately discarded (never assigned or displayed), so they have been dropped.
x1 = count_labels(sample_bad['Labels'])
x2 = count_labels(sample_good['Labels'])

# Ten most frequent labels in the "bad" and "good" windows.
x1.most_common()[:10], x2.most_common()[:10]

                 
                #  , sample_good['Labels'].apply(len).value_counts()


In [None]:
from sklearn import metrics
from collections import defaultdict




# Module-level leftovers: class_report below declares its own `s` and `th` parameters,
# which shadow these two names inside the function.
s = sample_bad
th = 0.5

def class_report(s, th=0.5, drop_zero=False):
    """Build a per-class classification report extended with AUROC and ROC-curve data.

    Args:
        s: frame with an ``activation`` column (one score vector per row) and a
            ``label`` column (one binary label vector per row). Both are stacked
            row-wise; columns are named after ``LABEL_MAP`` in its iteration order.
        th: scores ``>= th`` are treated as positive predictions.
        drop_zero: accepted for call-site compatibility; it currently has no effect.

    Returns:
        ``(report, graphs)``. ``report`` is a DataFrame built from sklearn's
        ``classification_report`` dict, with an extra ``'auroc'`` entry for every
        scorable class and for the ``'macro avg'`` / ``'micro avg'`` columns.
        ``graphs`` maps each scorable class name to a dict with ``'fpr'``,
        ``'tpr'`` and ``'thrs'`` arrays from ``roc_curve``.
    """
    scores = np.vstack(s['activation'].values)
    labels = np.vstack(s['label'].values).astype(int)
    target_names = np.array(list(LABEL_MAP))

    # AUROC is only defined for a class that has both positive and negative rows;
    # an all-positive column would make roc_auc_score raise just like an all-negative one.
    n_positive = labels.sum(axis=0)
    keeps = (n_positive > 0) & (n_positive < labels.shape[0])

    output = metrics.classification_report(labels, scores >= th, target_names=target_names, output_dict=True)
    graphs = defaultdict(dict)
    aurocs = []
    for i, k in enumerate(target_names):
        if not keeps[i]:
            continue
        auroc = metrics.roc_auc_score(labels[:, i], scores[:, i])
        output[k]['auroc'] = auroc
        aurocs.append(auroc)
        # Populate the curve data that make_plotly_graph reads ('fpr' / 'tpr');
        # without it the returned mapping is always empty.
        graphs[k]['fpr'], graphs[k]['tpr'], graphs[k]['thrs'] = metrics.roc_curve(labels[:, i], scores[:, i])

    # Macro average = unweighted mean of the per-class AUROCs over the scorable classes.
    output['macro avg']['auroc'] = float(np.mean(aurocs)) if aurocs else float('nan')

    # Micro average pools every (row, class) pair, so it only needs both classes to
    # appear somewhere in the whole label matrix.
    if labels.min() != labels.max():
        output['micro avg']['auroc'] = metrics.roc_auc_score(labels, scores, average='micro')
    else:
        output['micro avg']['auroc'] = float('nan')

    return pd.DataFrame(output), graphs


# Reports for the "bad" and "good" windows.
o1, graphs_bad = class_report(sample_bad, drop_zero=True)
o2, graphs_good = class_report(sample_good, drop_zero=True)

# AUROC side by side: stack both reports, transpose so classes become rows, then
# bring the metric name to the outer column level and select 'auroc'.
(
    pd.concat({"bad": o1, "good": o2})
    .T
    .swaplevel(0, 1, axis=1)
    .sort_index(axis=1)
    ['auroc']
)

In [None]:
# Full report for the "bad" window.
o1

In [None]:
def FLOAT(x):
    """Coerce ``x`` to float, turning entries that cannot be parsed into NaN."""
    numeric = pd.to_numeric(x, errors='coerce')
    return numeric.astype(float)


def CAT(x):
    """Return ``x`` converted to the pandas ``category`` dtype."""
    return x.astype('category')


# Converter tables keyed by column name. The VAE and score tables pick up every
# pc.df column with the given prefix, skipping names that contain 'all'.
vae_cols = dict.fromkeys(
    (c for c in pc.df.columns if c.startswith("mu.") and 'all' not in c),
    FLOAT,
)
score_cols = dict.fromkeys(
    (c for c in pc.df.columns if c.startswith("activation.") and 'all' not in c),
    FLOAT,
)
metadata_cols = dict(
    age=FLOAT,
    Projection=CAT,
    PatientSex_DICOM=CAT,
    ViewPosition_DICOM=CAT,
    Modality_DICOM=CAT,
    Manufacturer_DICOM=CAT,
    PhotometricInterpretation_DICOM=CAT,
    PixelRepresentation_DICOM=CAT,
    PixelAspectRatio_DICOM=CAT,
    SpatialResolution_DICOM=FLOAT,
    BitsStored_DICOM=CAT,
    WindowCenter_DICOM=FLOAT,
    WindowWidth_DICOM=FLOAT,
    Rows_DICOM=FLOAT,
    Columns_DICOM=FLOAT,
    XRayTubeCurrent_DICOM=CAT,
    Exposure_DICOM=FLOAT,
    ExposureInuAs_DICOM=FLOAT,
    RelativeXRayExposure_DICOM=FLOAT,
    Frontal=lambda x: x.astype(str),
)

# Apply the metadata converters to pc.df in place.
# NOTE(review): after this loop `Frontal` holds the strings produced by astype(str), so
# later filters have to compare against 'True' rather than use the column as a boolean mask.
for column, convert in metadata_cols.items():
    pc.df[column] = convert(pc.df[column])


In [None]:
from plotly.subplots import make_subplots
import plotly.graph_objects as go



def make_plotly_graph(graphs):
    """Draw one curve per entry of ``graphs`` on a single plotly figure.

    Args:
        graphs: mapping of trace name -> dict holding ``'fpr'`` (x values) and
            ``'tpr'`` (y values).

    Returns:
        The figure, 600x400 with tight margins and both axes fixed to [0, 1.01].
    """
    fig = make_subplots(1, 1, vertical_spacing=0.05, horizontal_spacing=0.05)
    for k, g in graphs.items():
        # go.Line is deprecated in plotly; a Scatter trace in "lines" mode is the
        # supported way to draw a curve.
        fig.add_trace(go.Scatter(x=g['fpr'], y=g['tpr'], mode="lines", showlegend=True, name=k))

    # fig.update_yaxes(scaleanchor="x", scaleratio=1)
    fig.update_yaxes(range=[0, 1.01], constrain="domain")
    fig.update_xaxes(range=[0, 1.01], constrain="domain")
    fig.update_layout(height=400, width=600, margin=go.layout.Margin(
        l=10,  # left margin
        r=10,  # right margin
        b=20,  # bottom margin
        t=20,  # top margin
    ))
    return fig

# Sample plotted at the end of this cell (the "bad" window).
sample = sample_bad
def make_hist_plot(sample):
    """Plot one bar chart per ``metadata_cols`` column of ``sample`` on a 5x4 grid.

    Columns that ``pd.cut`` can bin are shown as 25-bin histograms; any column it
    rejects falls back to plain value counts.

    Args:
        sample: frame containing every column named in the module-level ``metadata_cols``.

    Returns:
        The plotly figure.
    """
    n_rows, n_cols = 5, 4  # grid size; holds up to 20 columns
    fig = make_subplots(n_rows, n_cols, vertical_spacing=.15)
    for i, col in enumerate(metadata_cols):
        c = i % n_cols + 1
        r = i // n_cols + 1
        try:
            hist = pd.cut(sample[col], bins=25).value_counts().sort_index()
        except Exception:
            # Best-effort fallback for columns that cannot be binned. Catching Exception
            # rather than using a bare `except` lets KeyboardInterrupt/SystemExit through.
            hist = sample[col].value_counts().sort_index()
        fig.add_trace(go.Bar(x=hist.index.map(str), y=hist, name=col), row=r, col=c)

    # The layout does not depend on the column, so set it once after the loop.
    fig.update_layout(height=900,
                      margin=go.layout.Margin(
                          l=10,  # left margin
                          r=10,  # right margin
                          b=20,  # bottom margin
                          t=20,  # top margin
                      ))
    return fig


# Render the metadata histograms for the selected sample.
fig = make_hist_plot(sample)
fig.show()


In [None]:
from IPython.display import display_html, display_markdown, HTML, Markdown, display
import os
import json
# Directory the HTML reports are written to. exist_ok=True replaces the separate
# exists() check, which could race with another process creating the directory.
output_dir = settings.TOP_DIR.joinpath("html")
os.makedirs(output_dir, exist_ok=True)

# Re-select the two windows. `Frontal` is compared against the string 'True' here,
# matching the astype(str) conversion applied to pc.df in the metadata cell.
sample_bad = pc.df.set_index("StudyDate").loc["2016-12-01":"2016-12-28"].query("Frontal == 'True'")
sample_good = pc.df.set_index("StudyDate").loc["2015-12-01":"2015-12-28"].query("Frontal == 'True'")

# Report configuration. Both variants are assigned in sequence, so only the second one
# ("normal performance") is in effect; reorder or remove a group to render the other report.
sample = sample_bad
name = "dip in late 2016"
fname = "auroc_dip_images"

sample = sample_good
name = "normal performance"
fname = "auroc_normal_images"

# Maximum number of images shown per label gallery.
N = 8
o1, graphs = class_report(sample)
fig = make_plotly_graph(graphs)
fig2 = make_hist_plot(sample)
# Join the per-label sum/count/mean of the label columns onto the transposed report,
# round to 3 decimals and show missing cells as ' - '.
o1 = o1.T.join(sample[list(pc.label_map)].agg(["sum", "count", "mean"]).T).apply(lambda x: np.round(x, 3)).fillna(' - ')


# The SAS token is a credential and must not be committed with the notebook: read it from
# the environment instead. (The token previously embedded here had an expiry of
# 2021-12-24 in its `se=` field.) An unset variable yields an empty query string.
sas = os.environ.get("PADCHEST_SAS_TOKEN", "")
container_url = "https://mlopsday2datasets.blob.core.windows.net/padchest/png/"


# Optional Bootstrap stylesheet, currently disabled:
#<link rel="stylesheet" href="https://cdnjs.cloudflare.com/ajax/libs/twitter-bootstrap/4.0.0-alpha.4/css/bootstrap.min.css">
# Page skeleton: inline CSS for the centered container and the multi-column image
# gallery, followed by the opening tag of the container div (closed by the footer).
html = """
<style>
.container {
    width: 80%;
    margin: 0 auto;
  }

.gallery {
    display: block; 
    line-height:0;
   -webkit-column-count:5; /* split it into 5 columns */
   -webkit-column-gap:5px; /* give it a 5px gap between columns */
   -moz-column-count:5;
   -moz-column-gap:5px;
   column-count:4;
   column-gap:5px;
}

.gallery img {
   width: 100% !important;
   height: auto !important;
   margin-bottom:5px; /* to match column gap */
}
    </style>
    
    
<div class="container">
"""

# Title row: report name plus the first and last index value of the sample.
html += f"""<div class="row"><h1>{name}, Between {sample.index.min()} to {sample.index.max()} </h1></div>
"""

# One row each for the report table, the ROC figure and the metadata histograms.
html += """<div class="row">""" + o1.to_html() + """</div>"""
html += """<div class="row">""" + fig.to_html() + """</div>"""
html += """<div class="row">""" + fig2.to_html() + """</div>"""

# One gallery per label: up to N randomly drawn images among the rows positive for that label.
for c in label_cols:
  label_sample = sample[sample[c]>0]
  Ns = min(N, len(label_sample))
  # The heading is closed with </h2>; a second opening <h2> left the element unclosed.
  html += f"""<h2>{c} ({Ns} of {len(label_sample)})</h2><div class="row gallery">
"""

  for i, (_, row) in enumerate(label_sample.sample(Ns).sort_index().iterrows()):
    # Only ImageDir and ImageID are emitted. The per-row JSON "tooltip" that used to be
    # built here was never referenced by the template, and splatting the whole row into
    # str.format could collide with the `i` / `tooltip` keyword arguments.
    html += f"""
      <img data="{row['ImageDir']}/{row['ImageID']}" 
      class="padchest" id="{i}" >
    """
  html += """</div> <!-- row -->"""
  

# Close the container and append a script that sets each .padchest image's `src` to
# container URL + its `data` attribute + "?" + the SAS query string.
# Doubled braces are literal braces in this f-string.
html += f"""
</div> <!-- container -->
<script>
const url = "{container_url}"; 
const sas = "{sas}"; 
var x = document.getElementsByClassName("padchest");
var i;
for (i = 0; i < x.length; i++) {{
  let data = x[i].getAttribute("data");
  x[i].src = url + data + "?" + sas
}}
</script>
"""

# Write the assembled page to <output_dir>/<fname>.html, followed by a newline.
print(fname)
with open(f"{str(output_dir)}/{fname}.html", 'w') as html_file:
  html_file.write(html)
  html_file.write("\n")
  

In [None]:
# From here on FLOAT / CAT name drift-calculator classes (they were dtype converters earlier).
FLOAT = KSDriftCalculator
CAT = ChiSqDriftCalculator

# Drift statistic per column, in insertion order.
# NOTE(review): SpatialResolution_DICOM and Exposure_DICOM are CAT here but FLOAT in
# metadata_cols -- confirm that the difference is intended.
cols = dict([
    ('age', FLOAT),
    ('Projection', CAT),
    ("PatientSex_DICOM", CAT),
    ("ViewPosition_DICOM", CAT),
    ("Modality_DICOM", CAT),
    ("Manufacturer_DICOM", CAT),
    ("PhotometricInterpretation_DICOM", CAT),
    ("PixelRepresentation_DICOM", CAT),
    ("PixelAspectRatio_DICOM", CAT),
    ("SpatialResolution_DICOM", CAT),
    ("BitsStored_DICOM", CAT),
    ("WindowCenter_DICOM", FLOAT),
    ("WindowWidth_DICOM", FLOAT),
    ("Rows_DICOM", FLOAT),
    ("Columns_DICOM", FLOAT),
    ("XRayTubeCurrent_DICOM", CAT),
    ("Exposure_DICOM", CAT),
    ("ExposureInuAs_DICOM", FLOAT),
    ("RelativeXRayExposure_DICOM", FLOAT),
    ('Frontal', BasicDriftCalculator),
])

# Every VAE latent and classifier activation column (names containing 'all' are skipped).
cols.update(dict.fromkeys(
    (c for c in pc.df.columns if c.startswith("mu.") and 'all' not in c), FLOAT))
cols.update(dict.fromkeys(
    (c for c in pc.df.columns if c.startswith("activation.") and 'all' not in c), FLOAT))
# Performance statistic keyed by a (score column, label column) pair.
cols[("score", "label")] = AUROCCalculator


In [None]:
# Rolling-window settings, passed to rolling_window_predict further down.
window = "30D"
stride = "D"
ref_frontal_only = True  # restrict the reference frame to frontal rows
min_periods = 150

# Date bounds used when assembling the target frame: non-frontal rows are kept from
# nonfrontal_add_date onward, frontal rows up to frontal_remove_date (None leaves that slice open).
nonfrontal_add_date = "2007-05-01"
frontal_remove_date = None

# Sampler settings.
replacement = True
sample_size = 2000
n_samples = 20


sampler = Sampler(sample_size, replacement=replacement)


In [None]:
# `val` only exists once the PadChest split has been run, and no active cell runs it
# (the call is commented out in the loading cell), so a fresh kernel fails here with a
# NameError. Perform the split explicitly, using that same call.
train, val, test = pc.split(settings.PADCHEST_SPLIT_DATES, studydate_index=True)


def _is_frontal(df):
    """Boolean mask of frontal rows, valid whether `Frontal` is bool or its str() form.

    pc.df's `Frontal` column is converted with astype(str) in the metadata cell, after
    which `query("Frontal")`, `~Frontal` and `.mean()` no longer work on it.
    """
    return df['Frontal'].astype(str) == 'True'


# Reference frame: the validation split, optionally restricted to frontal rows.
refdf = val.df.copy()
if ref_frontal_only:
    refdf = refdf[_is_frontal(refdf)]

print(len(refdf), len(val.df))

dwc = TabularDriftCalculator(refdf)

for c, kls in cols.items():
    dwc.add_drift_stat(c, kls)
dwc.prepare()

# Target frame: all rows indexed by study date, with non-frontal rows only admitted from
# nonfrontal_add_date onward and frontal rows only kept up to frontal_remove_date.
target_df = pc.df.set_index('StudyDate')

frontal_mask = _is_frontal(target_df)
nonfrontals_target_df = target_df[~frontal_mask].copy()
frontals_target_df = target_df[frontal_mask].copy()

nonfrontals_target_df = nonfrontals_target_df.loc[nonfrontal_add_date:]
frontals_target_df = frontals_target_df.loc[:frontal_remove_date]

print("nonfrontals_target_df", nonfrontals_target_df.index.min(), nonfrontals_target_df.index.max())
print("frontals_target_df", frontals_target_df.index.min(), frontals_target_df.index.max())

# Fraction of frontal rows before and after the date filtering.
print(_is_frontal(target_df).mean())
target_df = pd.concat([nonfrontals_target_df, frontals_target_df ]).sort_index()
print(_is_frontal(target_df).mean())

print(len(target_df), len(pc.df.set_index('StudyDate')))


In [None]:
# Output CSV path; every run parameter is encoded in the file name.
# (`fname` previously held the HTML report's base name and is rebound here.)
fname = settings.TOP_DIR.joinpath(
    "results",
    "drift_csvs",
    (
        f"combined-od-inject"
        f"-addnfrnt{nonfrontal_add_date}"
        f"-rmfrnt{frontal_remove_date}"
        f"_s{stride}"
        f"-w{window}"
        f"-min{min_periods}"
        f"_frontalonly-ref{ref_frontal_only}"
        f"_Samp-ss{sample_size}"
        f"-n{n_samples}"
        f"-repl{replacement}.csv"
    ),
)
print(fname)


In [None]:
from model_drift.data.utils import rolling_window_dt_apply

# Fraction of frontal rows per rolling window. `Frontal` was converted with astype(str)
# in the metadata cell, so build the boolean mask explicitly; taking .mean() of the
# string column fails. The comparison is also correct if the column is still boolean.
frontal_over_time = rolling_window_dt_apply(
    pc.df.set_index("StudyDate"),
    lambda x: {'frontal': (x['Frontal'].astype(str) == 'True').mean()},
    n_jobs=5,
    backend='threading',
)

frontal_over_time.plot(y='frontal', figsize=(20, 8))


In [None]:
# Run the drift calculator over the target frame using the window, stride, sampling and
# min_periods settings from the configuration cell, on 8 threads.
output = dwc.rolling_window_predict(target_df,
                                    sampler=sampler, n_samples=n_samples,
                                    stride=stride, window=window, min_periods=min_periods,
                                    n_jobs=8, backend="threading"
                                    )


In [None]:
from pathlib import Path

print(fname)
# Create the target directory first, so the result of the rolling-window run is not
# lost to a missing folder when to_csv opens the file.
Path(fname).parent.mkdir(parents=True, exist_ok=True)
output.to_csv(fname)


In [None]:
# Preview the first rows of the drift output.
output.head()

In [None]:


# Exponentially weighted means (span=90) of the drift output's 'score' columns ...
output['score'].ewm(span=90).mean().plot(figsize=(20, 8))

# ... of the frontal fraction over time ...
frontal_over_time.ewm(span=90).mean().plot(y='frontal', figsize=(20, 8))

# ... and of 'window_count' (presumably added by rolling_window_dt_apply -- TODO confirm).
frontal_over_time.ewm(span=90).mean().plot(y='window_count', figsize=(20, 8))


In [None]:
# Plot the Frontal and score column groups together, leaving out the ("Frontal", "stats", "std") column.
output[["Frontal", 'score']].drop([("Frontal", 'stats', "std")], axis=1).plot(figsize=(20, 8))
