### Dependencies

In [18]:
import os
import pickle
import warnings
warnings.filterwarnings('ignore')

import numpy as np
import pandas
import pandas as pd
pandas.set_option("display.precision", 3)
from tqdm import tqdm

import sklearn
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import LabelEncoder
from sklearn.utils._testing import ignore_warnings
from sklearn.exceptions import ConvergenceWarning

### How To Use
1. Pre-extracted features for each patch dataset are available in the "embeddings_patch_library" folder on Google Drive.
2. Download "embeddings_patch_library" into the directory that contains this notebook.
3. Run all cells from top to bottom.

### CRC-100K (Without SN)

In [19]:
# Evaluate each encoder's pre-extracted embeddings (files named
# "kather100knonorm_*") with a default-parameter k-NN classifier and collect
# the per-class one-vs-rest AUCs plus the macro-averaged AUC.
crc100k_nonorm_aucs_all = {}
models = ['resnet50_trunc',
          'vits_tcga_brca_dino',
          'vits_tcga_pancancer_dino',
          'vits_tcga_pancancer_dino_s4'
        ]
model_names = ['ImageNet',
               'DINO (BRCA)',
               'DINO (PAN)', 
               'DINO (PAN S4)'
              ]

for enc in models:
    # NOTE(review): pickle.load can execute arbitrary code -- only load
    # embedding files that come from a trusted source.
    train_fname = os.path.join('./embeddings_patch_library/', 'kather100knonorm_train_%s.pkl' % enc)
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = os.path.join('./embeddings_patch_library/', 'kather100knonorm_val_%s.pkl' % enc)
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    # Fold the 'MUS' label into 'STR' before encoding so both splits share
    # one merged class.
    train_labels[train_labels=='MUS'] = 'STR'
    val_labels[val_labels=='MUS'] = 'STR'
    le = LabelEncoder().fit(train_labels)
    train_labels = le.transform(train_labels)
    val_labels = le.transform(val_labels)

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)

    # predict_proba columns are ordered like clf.classes_, so pair each column
    # with its class from that attribute rather than from the validation labels.
    aucs = [
        sklearn.metrics.roc_auc_score((val_labels == label).astype(int), y_score[:, i])
        for i, label in enumerate(clf.classes_)
    ]
    # Last entry: macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    crc100k_nonorm_aucs_all[enc] = aucs

# Rows: encoders (in `models` order, relabelled with display names).
# Columns: original class names from the label encoder, then the macro AUC.
aucs_df = pd.DataFrame(crc100k_nonorm_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = list(le.classes_) + ['All']
crc100kr = aucs_df['All']

### CRC-100K (With SN)

In [20]:
# Evaluate each encoder's pre-extracted embeddings (files named
# "kather100k_*") with a default-parameter k-NN classifier and collect the
# per-class one-vs-rest AUCs plus the macro-averaged AUC.
crc100k_aucs_all = {}
models = ['resnet50_trunc',
          'vits_tcga_brca_dino',
          'vits_tcga_pancancer_dino',
          'vits_tcga_pancancer_dino_s4'
        ]
model_names = ['ImageNet',
               'DINO (BRCA)',
               'DINO (PAN)', 
               'DINO (PAN S4)'
              ]

for enc in models:
    # NOTE(review): pickle.load can execute arbitrary code -- only load
    # embedding files that come from a trusted source.
    train_fname = os.path.join('./embeddings_patch_library/', 'kather100k_train_%s.pkl' % enc)
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = os.path.join('./embeddings_patch_library/', 'kather100k_val_%s.pkl' % enc)
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    # Fold the 'MUS' label into 'STR' before encoding so both splits share
    # one merged class.
    train_labels[train_labels=='MUS'] = 'STR'
    val_labels[val_labels=='MUS'] = 'STR'
    le = LabelEncoder().fit(train_labels)
    train_labels = le.transform(train_labels)
    val_labels = le.transform(val_labels)

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)

    # predict_proba columns are ordered like clf.classes_, so pair each column
    # with its class from that attribute rather than from the validation labels.
    aucs = [
        sklearn.metrics.roc_auc_score((val_labels == label).astype(int), y_score[:, i])
        for i, label in enumerate(clf.classes_)
    ]
    # Last entry: macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    crc100k_aucs_all[enc] = aucs

# Rows: encoders (in `models` order, relabelled with display names).
# Columns: original class names from the label encoder, then the macro AUC.
aucs_df = pd.DataFrame(crc100k_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = list(le.classes_) + ['All']
crc100kn = aucs_df['All']

### BreastPathQ

In [25]:
# Fit a linear regression on each encoder's "breastq_*" training embeddings
# and record the mean squared error on the matching validation split.
models = ['resnet50_trunc', 
          'vits_tcga_brca_dino', 
          'vits_tcga_pancancer_dino', 
          'vits_tcga_pancancer_dino_s4'
         ]
model_names = ['ImageNet',
               'DINO (BRCA)',
               'DINO (PAN)',
               'DINO (PAN S4)'
              ]
bpq_mse_all = []

embeddings_dir = './embeddings_patch_library/'
for enc in models:
    # Training split for this encoder.
    train_fname = os.path.join(embeddings_dir, 'breastq_train_%s.pkl' % enc)
    with open(train_fname, 'rb') as train_file:
        train_assets = pickle.load(train_file)
    train_embeddings = train_assets['embeddings']
    train_labels = train_assets['labels']

    # Validation split for this encoder.
    val_fname = os.path.join(embeddings_dir, 'breastq_val_%s.pkl' % enc)
    with open(val_fname, 'rb') as val_file:
        val_assets = pickle.load(val_file)
    val_embeddings = val_assets['embeddings']
    val_labels = val_assets['labels']

    clf = LinearRegression().fit(train_embeddings, train_labels)
    y_score = clf.predict(val_embeddings)
    mse = sklearn.metrics.mean_squared_error(val_labels, y_score)
    bpq_mse_all.append(mse)

# One row per encoder (display names), single 'MSE' column.
mse_df = pd.DataFrame({'MSE': bpq_mse_all}, index=model_names)
bpq = mse_df

### BCSS

In [26]:
# Evaluate each encoder's pre-extracted "bcss_*" embeddings with a
# default-parameter k-NN classifier and collect the per-class one-vs-rest
# AUCs plus the macro-averaged AUC.
bcss_aucs_all = {}
models = ['resnet50_trunc', 
          'vits_tcga_brca_dino', 
          'vits_tcga_pancancer_dino', 
          'vits_tcga_pancancer_dino_s4'
         ]
model_names = ['ImageNet',
               'DINO (BRCA)',
               'DINO (PAN)',
               'DINO (PAN S4)'
              ]

for enc in models:
    # NOTE(review): pickle.load can execute arbitrary code -- only load
    # embedding files that come from a trusted source.
    train_fname = './embeddings_patch_library/bcss_train_%s.pkl' % enc
    with open(train_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    train_embeddings, train_labels = asset_dict['embeddings'], asset_dict['labels']

    val_fname = './embeddings_patch_library/bcss_val_%s.pkl' % enc
    with open(val_fname, 'rb') as handle:
        asset_dict = pickle.load(handle)
    val_embeddings, val_labels = asset_dict['embeddings'], asset_dict['labels']

    clf = KNeighborsClassifier().fit(train_embeddings, train_labels)
    y_score = clf.predict_proba(val_embeddings)

    # predict_proba columns are ordered like clf.classes_, so pair each column
    # with its class from that attribute rather than from the validation labels.
    aucs = [
        sklearn.metrics.roc_auc_score(np.array(val_labels == label, int), y_score[:, i])
        for i, label in enumerate(clf.classes_)
    ]
    # Last entry: macro-averaged one-vs-rest AUC over all classes.
    aucs.append(sklearn.metrics.roc_auc_score(val_labels, y_score, average='macro', multi_class='ovr'))
    bcss_aucs_all[enc] = aucs

# Rows: encoders (in `models` order, relabelled with display names).
# Columns: the classifier's class labels, then the macro AUC.
aucs_df = pd.DataFrame(bcss_aucs_all).T.loc[models]
aucs_df.index = model_names
aucs_df.columns = list(clf.classes_) + ['All']
bcss = aucs_df['All']

### Table 5

In [32]:
# Combine the per-benchmark results column-wise into one summary table,
# one row per encoder. The first three inputs are the 'All' (macro AUC)
# columns of the classification cells; `bpq` is the regression MSE frame.
table_columns = ['CRC-100K-R', 'CRC-100K-N', 'BCSS', 'BreastPathQ']
benchmark_results = [crc100kr, crc100kn, bcss, bpq]

aucs_df = pd.concat(benchmark_results, axis=1)
aucs_df.columns = table_columns

# Emit the LaTeX source first, then the rich notebook rendering.
print(aucs_df.to_latex())
display(aucs_df)

\begin{tabular}{lrrrr}
\toprule
{} &  CRC-100K-R &  CRC-100K-N &   BCSS &  BreastPathQ \\
\midrule
ImageNet      &       0.935 &       0.983 &  0.599 &        0.058 \\
DINO (BRCA)   &       0.941 &       0.987 &  0.593 &        0.029 \\
DINO (PAN)    &       0.941 &       0.983 &  0.616 &        0.023 \\
DINO (PAN S4) &       0.927 &       0.985 &  0.612 &        0.052 \\
\bottomrule
\end{tabular}



Unnamed: 0,CRC-100K-R,CRC-100K-N,BCSS,BreastPathQ
ImageNet,0.935,0.983,0.599,0.058
DINO (BRCA),0.941,0.987,0.593,0.029
DINO (PAN),0.941,0.983,0.616,0.023
DINO (PAN S4),0.927,0.985,0.612,0.052
