In [2]:
from IPython.display import display

import pandas as pd
import warnings
from model_drift import settings, helpers
from model_drift.data.utils import nested2series
import matplotlib.pylab as plt
import numpy as np
import seaborn as sns
from model_drift.drift.tabular import TabularDriftCalculator
from model_drift.drift.numeric import KSDriftCalculator, BasicDriftCalculator
from model_drift.drift.categorical import ChiSqDriftCalculator
from model_drift.drift.collection import DriftCollectionCalculator
from model_drift.drift.performance import AUROCCalculator

from model_drift.data.padchest import PadChest
import plotly.graph_objects as go

warnings.filterwarnings("ignore")


In [3]:
# Load the VAE predictions and expand the list stored in the "mu" column into
# one scalar column per element ("mu.0", "mu.1", ...).
jsonl_file = str(settings.TOP_DIR.joinpath("results", 'vae', 'padchest-trained', 'preds.jsonl'))
vae_df = helpers.jsonl_files2dataframe(jsonl_file)

mu_values = vae_df['mu'].tolist()
# Take the vector width from the data instead of hard-coding 128, so the cell
# keeps working if the VAE is retrained with a different latent size.
n_latent = len(mu_values[0]) if mu_values else 0
mu_df = pd.DataFrame(
    mu_values,
    columns=[f"mu.{c}" for c in range(n_latent)],
    # pd.concat(axis=1) aligns on index labels; reusing the parent index keeps
    # each expanded row next to the row it came from even when vae_df does not
    # carry a default 0..n-1 index.
    index=vae_df.index,
)
vae_df = pd.concat([vae_df, mu_df], axis=1)
vae_df.head()

 59%|█████▊    | 94477/160819 [00:10<00:07, 8866.69it/s]

In [None]:
# NOTE(review): this import belongs with the others in the first cell.
from model_drift.data.padchest import LABEL_MAP

# Load the fine-tuned classifier predictions and expand the per-image
# "activation" list into one named column per label.
# Assumes iterating LABEL_MAP yields labels in the same order as the entries of
# each activation vector -- TODO confirm against the classifier head.
label_cols = list(LABEL_MAP)
jsonl_file = str(settings.TOP_DIR.joinpath("results", 'classifier', 'finetuned', "preds.jsonl"))
scores_df = helpers.jsonl_files2dataframe(jsonl_file)

activation_df = pd.DataFrame(
    scores_df['activation'].tolist(),
    columns=[f"activation.{c}" for c in label_cols],
    # pd.concat(axis=1) aligns on index labels; reusing the parent index keeps
    # each expanded row next to the row it came from even when scores_df does
    # not carry a default 0..n-1 index.
    index=scores_df.index,
)
scores_df = pd.concat([scores_df, activation_df], axis=1)
scores_df.head()


100%|██████████| 160819/160819 [00:03<00:00, 53151.50it/s]


Unnamed: 0,index,score,activation,label,activation.Atelectasis,activation.Cardiomegaly,activation.Consolidation,activation.Edema,activation.Lesion,activation.No Finding,activation.Opacity,activation.Pleural Abnormalities,activation.Pleural Effusion,activation.Pneumonia
0,20536686640136348236148679891455886468_k6ga29.png,"[-2.9898717403411865, 0.021230269223451614, -5...","[0.04788553714752197, 0.5053073763847351, 0.00...","[0.0, 0.0, 0.0, 0.0, 0.0, 1.0, 0.0, 0.0, 0.0, ...",0.047886,0.505307,0.003428,0.000492,0.031599,0.040083,0.046906,0.223817,0.010098,0.01091
1,113855343774216031107737439268243531979_3k951n...,"[-4.271155834197998, -3.2341253757476807, -7.5...","[0.013773280195891857, 0.03790152817964554, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.013773,0.037902,0.000529,9.7e-05,0.022773,0.269458,0.025679,0.003461,0.000689,0.003102
2,313723174271082992847610802266403640553-4_hhi4...,"[-1.9546648263931274, -4.485781669616699, -1.7...","[0.12404559552669525, 0.011142521165311337, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.124046,0.011143,0.148217,1.2e-05,0.022932,0.003598,0.751884,0.655488,0.479314,0.211316
3,105529804483623054726294337265160703666_6zn76a...,"[-2.843719244003296, -2.7252206802368164, -5.9...","[0.055006884038448334, 0.06150144338607788, 0....","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.055007,0.061501,0.002591,0.000115,0.016477,0.37059,0.017562,0.013063,0.001621,0.005887
4,102185472428957491598043403159908631419_zb8kco...,"[-2.794471502304077, -4.529303073883057, -5.34...","[0.05762365832924843, 0.01067304890602827, 0.0...","[0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, ...",0.057624,0.010673,0.004754,0.000176,0.021283,0.0975,0.051552,0.037033,0.019453,0.026915


In [None]:
# Build the PadChest wrapper from the configured metadata CSV.
pc = PadChest(settings.PADCHEST_FILENAME)
pc.prepare()

# Attach both prediction tables, matching PadChest's "ImageID" to the
# predictions' "index" column. The return value is discarded, so pc.merge
# presumably updates pc in place -- confirm in model_drift.data.padchest.
for preds_df in (vae_df, scores_df):
    pc.merge(preds_df, left_on="ImageID", right_on="index", how='inner')

# Partition by the configured split dates into train / val / test.
train, val, test = pc.split(settings.PADCHEST_SPLIT_DATES, studydate_index=True)


In [None]:
def _describe_study_dates(frame):
    """Return `describe()` of the frame's "StudyDate" column with numeric-style datetime statistics.

    pandas 1.x needs ``datetime_is_numeric=True`` to get mean/min/quantiles/max
    for datetimes; pandas >= 2.0 removed the keyword (it raises ``TypeError``)
    and always summarises datetimes that way, so fall back to the bare call.
    """
    dates = frame["StudyDate"]
    try:
        return dates.describe(datetime_is_numeric=True)
    except TypeError:
        return dates.describe()


# Side-by-side StudyDate summary for the full dataset and each split.
pd.concat(
    {
        name: _describe_study_dates(dataset.df)
        for name, dataset in (("all", pc), ("train", train), ("val", val), ("test", test))
    },
    axis=1,
)


Unnamed: 0,all,train,val,test
count,160819,91726,22176,46917
mean,2012-09-14 20:54:45.910246912,2011-01-06 03:16:23.616423168,2013-06-16 00:14:59.999999744,2015-08-29 00:15:02.359485696
min,2007-05-03 00:00:00,2007-05-03 00:00:00,2013-01-01 00:00:00,2014-01-01 00:00:00
25%,2010-10-27 00:00:00,2010-01-19 00:00:00,2013-03-10 00:00:00,2014-08-08 00:00:00
50%,2012-06-18 00:00:00,2011-01-18 00:00:00,2013-06-04 00:00:00,2015-06-09 00:00:00
75%,2014-05-28 00:00:00,2012-01-11 00:00:00,2013-09-25 00:00:00,2016-09-13 00:00:00
max,2017-11-17 00:00:00,2012-12-28 00:00:00,2013-12-31 00:00:00,2017-11-17 00:00:00


In [None]:
# Short aliases for the two per-column drift calculators used below.
FLOAT, CAT = KSDriftCalculator, ChiSqDriftCalculator

# Column name -> drift calculator class, in registration order.
cols = dict(
    [
        ("age", FLOAT),
        ("Projection", CAT),
        ("PatientSex_DICOM", CAT),
        ("ViewPosition_DICOM", CAT),
        ("Modality_DICOM", CAT),
        ("Manufacturer_DICOM", CAT),
        ("PhotometricInterpretation_DICOM", CAT),
        ("PixelRepresentation_DICOM", CAT),
        ("PixelAspectRatio_DICOM", CAT),
        ("SpatialResolution_DICOM", CAT),
        ("BitsStored_DICOM", CAT),
        ("WindowCenter_DICOM", FLOAT),
        ("WindowWidth_DICOM", FLOAT),
        ("Rows_DICOM", FLOAT),
        ("Columns_DICOM", FLOAT),
        ("XRayTubeCurrent_DICOM", CAT),
        ("Exposure_DICOM", CAT),
        ("ExposureInuAs_DICOM", FLOAT),
        ("RelativeXRayExposure_DICOM", FLOAT),
    ]
)

# Add every expanded VAE ("mu.*") and classifier ("activation.*") column,
# skipping any whose name contains "all". All "mu." columns are added before
# any "activation." column.
cols.update(
    {
        name: FLOAT
        for prefix in ("mu.", "activation.")
        for name in pc.df.columns
        if name.startswith(prefix) and "all" not in name
    }
)

# The (score, label) column pair is tracked with the AUROC calculator.
cols[("score", "label")] = AUROCCalculator


In [None]:
# Drift-run settings consumed by the cells below.
# NOTE(review): the saved outputs further down mention "window: 90D" and a
# "...w90D...targetTrue.csv" file, which do not match these values; they look
# stale, so re-run before relying on them.
window, stride = "30D", "D"  # forwarded to dwc.rolling_window_predict
ref_frontal_only, target_frontal_only = True, False  # apply .query("Frontal") to the reference / target frame

In [None]:
# Reference population: the validation split, optionally restricted to rows
# where the "Frontal" column is truthy. `query` already returns a new frame,
# so an explicit copy is only needed on the unfiltered path.
refdf = val.df.query("Frontal") if ref_frontal_only else val.df.copy()

# Rows kept in the reference vs. rows in the full validation split.
print(len(refdf), len(val.df))

# Register one drift statistic per configured column, then let the calculator
# finish its setup on the reference data.
dwc = TabularDriftCalculator(refdf)
for c, kls in cols.items():
    dwc.add_drift_stat(c, kls)
dwc.prepare()

# Target population: the whole dataset, indexed by study date.
target_df = pc.df.set_index('StudyDate')

if target_frontal_only:
    target_df = target_df.query("Frontal")

# set_index never changes the row count, so len(pc.df) gives the unfiltered
# total without building a second re-indexed copy of the frame.
print(len(target_df), len(pc.df))


15267 22176


In [None]:
# Compute drift statistics over target_df with the configured window/stride.
# Expensive: the stored progress bar below shows ~38 minutes for 3852 steps.
# min_periods is presumably the smallest number of rows a window must hold to
# be scored -- confirm in TabularDriftCalculator.
output = dwc.rolling_window_predict(
    target_df,
    window=window,
    stride=stride,
    min_periods=20,
)

2007-05-03 - 2017-11-17 window: 90D, stride: D: 100%|██████████| 3852/3852 [37:55<00:00,  1.69it/s, 2017-11-17]


In [None]:
# The file name encodes the run configuration (stride, window, frontal filters).
fname = settings.TOP_DIR.joinpath(
    "results", "drift_csvs", f"combined_s{stride}-w{window}_frontalonly-ref{ref_frontal_only}-target{target_frontal_only}.csv")
# to_csv does not create missing folders; make sure the destination exists so
# the long-running drift computation is not followed by a failed write.
fname.parent.mkdir(parents=True, exist_ok=True)
print(fname)
output.to_csv(fname)


D:\Code\MLOpsDay2\MedImaging-ModelDriftMonitoring\results\drift_csvs\combined_sD-w90D_frontalonly-refTrue-targetTrue.csv


In [None]:
# Preview the first rows of the drift results; the saved output below shows
# three header levels (column, statistic family, metric) indexed by date.
output.head()

Unnamed: 0_level_0,BitsStored_DICOM,BitsStored_DICOM,BitsStored_DICOM,BitsStored_DICOM,BitsStored_DICOM,Columns_DICOM,Columns_DICOM,Columns_DICOM,Columns_DICOM,ExposureInuAs_DICOM,...,mu.97,mu.98,mu.98,mu.98,mu.98,mu.99,mu.99,mu.99,mu.99,score
Unnamed: 0_level_1,chi2,chi2,chi2,chi2,chi2,ks,ks,ks,ks,ks,...,ks,ks,ks,ks,ks,ks,ks,ks,ks,label
Unnamed: 0_level_2,critical_diff,critical_value,distance,dof,pval,critical_diff,critical_value,distance,pval,critical_diff,...,pval,critical_diff,critical_value,distance,pval,critical_diff,critical_value,distance,pval,auroc
2009-01-01,-0.975747,2.705543,1.729796,1.0,0.188437,0.018136,0.24497,0.263106,0.05131107,-0.151511,...,0.371777,0.023311,0.24497,0.268281,0.044412,0.011573,0.24497,0.256543,0.061365,0.929388
2009-01-02,0.103674,2.705543,2.809217,1.0,0.093724,0.112864,0.149845,0.262709,0.0001435198,-0.039072,...,0.265631,0.016347,0.149845,0.166191,0.043717,0.009491,0.149845,0.159336,0.059382,0.906793
2009-01-03,0.479889,2.705543,3.185432,1.0,0.074297,0.096947,0.145581,0.242528,0.0003672758,-0.050996,...,0.139654,-0.002935,0.145581,0.142647,0.10066,0.012704,0.145581,0.158285,0.050817,0.908797
2009-01-04,-2.705543,2.705543,0.0,1.0,1.0,0.076367,0.139822,0.216188,0.001223817,-0.083324,...,0.087914,-0.021819,0.139822,0.118003,0.216155,0.010598,0.139822,0.15042,0.055041,0.906851
2009-01-05,-0.666181,2.705543,2.039363,1.0,0.153274,0.159688,0.096663,0.256352,9.276202e-10,0.057043,...,0.12754,-0.005945,0.096663,0.090719,0.135004,0.015256,0.096663,0.111919,0.033521,0.91959
