### Dependencies

In [1]:
# Base Dependencies
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

# LinAlg / Stats / Plotting Dependencies
import numpy as np
import pandas as pd
pd.set_option("display.precision", 3)
from tqdm import tqdm

# Scikit-Learn Imports
import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

# Utils
from patch_evaluation_utils import kendalltau_bpq

### How To Use
1. Create the "embeddings_patch_library" directory using "patch_extraction.py"
2. Run this notebook!

### CRC-100K (Without Stain Normalization, SN)

In [2]:
# kNN evaluation on the 'crc100knonorm' embedding files.
# For every encoder: fit a default KNeighborsClassifier on the train embeddings,
# then record one-vs-rest AUC per class followed by the macro-averaged OvR AUC.
crc100k_nonorm_aucs_all = {}
models = ['resnet50_trunc',
          'resnet50_tcga_brca_simclr',
          'vits_tcga_brca_dino',
        ]
model_names = ['ImageNet',
               'SimCLR (BRCA)',
               'DINO (BRCA)',
              ]

for enc in models:
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that were produced locally / come from a trusted source.
    train_fname = os.path.join('./embeddings_patch_library/', 'crc100knonorm_train_%s.pkl' % enc)
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = os.path.join('./embeddings_patch_library/', 'crc100knonorm_val_%s.pkl' % enc)
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    # Relabel 'MUS' as 'STR' so both are scored as a single class, then map
    # the string labels to integer ids (encoder fitted on the train labels).
    train_labels[train_labels=='MUS'] = 'STR'
    val_labels[val_labels=='MUS'] = 'STR'
    le = LabelEncoder().fit(train_labels)
    train_labels = le.transform(train_labels)
    val_labels = le.transform(val_labels)

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)

    # predict_proba columns are ordered like clf.classes_, so pair each score
    # column with the class it actually belongs to.
    aucs = [
        sklearn.metrics.roc_auc_score(np.array(val_labels == label, int), y_score[:, col])
        for col, label in enumerate(clf.classes_)
    ]
    # Last entry: macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    crc100k_nonorm_aucs_all[enc] = aucs


# One row per encoder (in `models` order), one column per class plus 'All'.
aucs_df = pd.DataFrame(crc100k_nonorm_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = ['ADI', 'BACK', 'DEB', 'LYM', 'MUC', 'NORM', 'STR', 'TUM', 'All']
crc100kr = aucs_df.copy()
crc100kr

Unnamed: 0,ADI,BACK,DEB,LYM,MUC,NORM,STR,TUM,All
ImageNet,0.988,0.909,0.9,0.87,0.886,0.988,0.963,0.978,0.935
SimCLR (BRCA),0.981,0.765,0.955,0.951,0.926,0.976,0.979,0.973,0.938
DINO (BRCA),0.991,0.729,0.961,0.95,0.978,0.957,0.99,0.973,0.941


### CRC-100K (With Stain Normalization, SN)

In [3]:
# kNN evaluation on the 'crc100k' embedding files.
# For every encoder: fit a default KNeighborsClassifier on the train embeddings,
# then record one-vs-rest AUC per class followed by the macro-averaged OvR AUC.
crc100k_aucs_all = {}
models = ['resnet50_trunc',
          'resnet50_tcga_brca_simclr',
          'vits_tcga_brca_dino',
        ]
model_names = ['ImageNet',
               'SimCLR (BRCA)',
               'DINO (BRCA)',
              ]

for enc in models:
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that were produced locally / come from a trusted source.
    train_fname = os.path.join('./embeddings_patch_library/', 'crc100k_train_%s.pkl' % enc)
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = os.path.join('./embeddings_patch_library/', 'crc100k_val_%s.pkl' % enc)
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    # Relabel 'MUS' as 'STR' so both are scored as a single class, then map
    # the string labels to integer ids (encoder fitted on the train labels).
    train_labels[train_labels=='MUS'] = 'STR'
    val_labels[val_labels=='MUS'] = 'STR'
    le = LabelEncoder().fit(train_labels)
    train_labels = le.transform(train_labels)
    val_labels = le.transform(val_labels)

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)

    # predict_proba columns are ordered like clf.classes_, so pair each score
    # column with the class it actually belongs to.
    aucs = [
        sklearn.metrics.roc_auc_score(np.array(val_labels == label, int), y_score[:, col])
        for col, label in enumerate(clf.classes_)
    ]
    # Last entry: macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    crc100k_aucs_all[enc] = aucs

# One row per encoder (in `models` order), one column per class plus 'All'.
aucs_df = pd.DataFrame(crc100k_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = ['ADI', 'BACK', 'DEB', 'LYM', 'MUC', 'NORM', 'STR', 'TUM', 'All']
crc100kn = aucs_df.copy()
crc100kn

Unnamed: 0,ADI,BACK,DEB,LYM,MUC,NORM,STR,TUM,All
ImageNet,0.983,1.0,0.997,0.974,0.963,0.988,0.982,0.978,0.983
SimCLR (BRCA),0.988,1.0,0.994,0.98,0.969,0.973,0.979,0.969,0.981
DINO (BRCA),0.999,1.0,0.999,0.985,0.992,0.96,0.992,0.967,0.987


### BreastPathQ

In [4]:
# Linear-regression probe on the 'breastpathq' embedding files.
# Each encoder contributes one [MSE, Tau] row, where Tau is whatever
# kendalltau_bpq returns for (val_labels, predictions).
models = ['resnet50_trunc',
          'resnet50_tcga_brca_simclr',
          'vits_tcga_brca_dino',
        ]
model_names = ['ImageNet',
               'SimCLR (BRCA)',
               'DINO (BRCA)',
              ]
bpq_mse_all = []

for enc in models:
    # Read the train and val asset dictionaries for this encoder.
    assets = {}
    for split, template in (('train', 'breastpathq_train_%s.pkl'),
                            ('val', 'breastpathq_val_%s.pkl')):
        split_path = os.path.join('./embeddings_patch_library/', template % enc)
        with open(split_path, 'rb') as fh:
            assets[split] = pickle.load(fh)

    train_embeddings = assets['train']['embeddings']
    train_labels = assets['train']['labels']
    val_embeddings = assets['val']['embeddings']
    val_labels = assets['val']['labels']

    # Fit on train, predict on val, then score the predictions.
    clf = LinearRegression()
    clf.fit(train_embeddings, train_labels)
    y_score = clf.predict(val_embeddings)

    mse = sklearn.metrics.mean_squared_error(val_labels, y_score)
    tau = kendalltau_bpq(val_labels, y_score)
    bpq_mse_all.append([mse, tau])

# Rows follow `models` order and are labelled with the display names.
mse_df = pd.DataFrame(bpq_mse_all, index=model_names, columns=['MSE', 'Tau'])
bpq = mse_df.copy()
bpq

Unnamed: 0,MSE,Tau
ImageNet,0.058,0.828
SimCLR (BRCA),0.078,0.788
DINO (BRCA),0.029,0.854


### BCSS

In [5]:
# kNN evaluation on the 'bcss' embedding files.
# For every encoder: fit a default KNeighborsClassifier on the train embeddings,
# then record one-vs-rest AUC per class followed by the macro-averaged OvR AUC.
bcss_aucs_all = {}
models = ['resnet50_trunc',
          'resnet50_tcga_brca_simclr',
          'vits_tcga_brca_dino',
        ]
model_names = ['ImageNet',
               'SimCLR (BRCA)',
               'DINO (BRCA)',
              ]

for enc in models:
    # NOTE(security): pickle.load can execute arbitrary code; only load files
    # that were produced locally / come from a trusted source.
    train_fname = './embeddings_patch_library/bcss_train_%s.pkl' % enc
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = './embeddings_patch_library/bcss_val_%s.pkl' % enc
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)

    # predict_proba columns are ordered like clf.classes_, so pair each score
    # column with the class it actually belongs to.
    aucs = [
        sklearn.metrics.roc_auc_score(np.array(val_labels == label, int), y_score[:, col])
        for col, label in enumerate(clf.classes_)
    ]
    # Last entry: macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    bcss_aucs_all[enc] = aucs

# One row per encoder (in `models` order). Column headers come from the
# classifier's class order (the sorted unique train labels), which is the
# same order the per-class AUCs were computed in.
aucs_df = pd.DataFrame(bcss_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = list(clf.classes_) + ['All']
bcss = aucs_df.copy()
bcss

Unnamed: 0,0,1,2,3,All
ImageNet,0.503,0.704,0.552,0.636,0.599
SimCLR (BRCA),0.543,0.715,0.577,0.667,0.625
DINO (BRCA),0.52,0.678,0.54,0.632,0.593
