This notebook explores the encoded features and attention values produced by different encoders (ImageNet, ImageNetSimCLR, InnerEyeSSL, HistoSSL).

In [None]:
import os 
import sys
from pathlib import Path
import pandas as pd
import numpy
import torch

from sklearn.metrics import silhouette_samples, davies_bouldin_score, calinski_harabasz_score

# The working directory is not correctly picked up in sys.path, so the repository root
# (three levels above the notebook's working directory) and its "innereye-deeplearning"
# folder are added by hand. This only happens when the root contains "InnerEyePrivate".
current_dir = Path.cwd()
radiomics_root = current_dir.parent.parent.parent
if (radiomics_root / "InnerEyePrivate").is_dir():
    radiomics_root_str = str(radiomics_root)
    sys_path_changed = False
    # Each entry is checked on its own: previously the "innereye-deeplearning" folder was
    # skipped whenever the root was already on sys.path. Inserting at position 0 in this
    # order leaves "innereye-deeplearning" ahead of the root, as before.
    for path_str in (radiomics_root_str, str(radiomics_root / "innereye-deeplearning")):
        if path_str not in sys.path:
            print(f"Adding to sys.path: {path_str}")
            sys.path.insert(0, path_str)
            sys_path_changed = True
    if sys_path_changed:
        print(f"Sys path {sys.path}")
from InnerEye.ML.Histopathology.utils.analysis_plot_utils import plot_box_whisker, get_tsne_projection, get_umap_projection, plot_projected_features_2d, plot_histogram, normalize_array_minmax, normalize_array_mean
from InnerEye.ML.Histopathology.utils.download_utils import download_file_if_necessary

### Download test outputs (CSV and encoded features) from AML runs

In [None]:
# Folder used both as the remote directory of each run and as the local download target.
output_dir = Path("outputs/")

# Files fetched from every run: the pickled encoded features and the per-tile output CSV.
features_filename = 'test_encoded_features.pickle'
csv_filename = 'test_output.csv'

# run_ids[i] is the AML run whose outputs belong to encoder_names[i].
encoder_names = ["ImageNet", "ImageNetSimCLR", "InnerEyeSSL","HistoSSL"]
run_ids = [
    "hsharma_features_viz:hsharma_features_viz_1636727694_8881c83d",
    "hsharma_features_viz:hsharma_features_viz_1636727732_ef0ab3fc",
    "hsharma_features_viz:hsharma_features_viz_1636727760_899692b2",
    "hsharma_features_viz:hsharma_features_viz_1636727790_73839184",
]

for run_id, encoder_name in zip(run_ids, encoder_names):
    print(f"Downloading files for run {run_id} and {encoder_name} encoder.")
    # Features first, then the CSV, for the same run.
    for filename in (features_filename, csv_filename):
        download_file_if_necessary(run_id=run_id, remote_dir=output_dir, download_dir=output_dir, filename=filename)

### Visualize t-SNE projection of encoded features at slide-level

Each slide-level feature is obtained as the mean of all tile-level features of that slide.

In [None]:
# Class names passed to the plotting helper.
classes=['MSS','MSI']

for run_id, encoder_name in zip(run_ids, encoder_names):
    # Downloaded files live under <output_dir>/<part of run_id after ":">/outputs/.
    run_outputs_dir = output_dir / run_id.split(":")[1] / "outputs"
    file_features = run_outputs_dir / features_filename
    file_outputs = run_outputs_dir / csv_filename
    print(f"Loading test encoded features for {encoder_name} encoder...")
    # NOTE(review): torch.load unpickles the file; only load feature files from trusted runs.
    lists_features = torch.load(file_features)

    print(f"Loading test output CSV for {encoder_name} encoder...")
    metadata = pd.read_csv(file_outputs)
    # One label per slide. sort=False keeps slides in the order they first appear in the CSV
    # instead of sorting them by slide_id, so that labels line up with the order of the slide
    # features below. This assumes the CSV rows follow the same order as the pickled features,
    # which the tile-level cells of this notebook already rely on.
    labels = metadata.groupby('slide_id', sort=False)['true_label'].mean()

    print(f"Collecting test slide-level features for {encoder_name} encoder...")
    # Slide-level feature = mean over the tile-level features of that slide.
    slides = [torch.mean(torch.stack(list(slide_features)), dim=0) for slide_features in lists_features]

    slides_list = torch.stack(slides).numpy()
    assert len(labels) == len(slides_list), \
        f"{len(labels)} slide labels in the CSV but {len(slides_list)} slide features"
    # Feature dimension; read from the shape so that a single-slide run does not fail.
    print(slides_list.shape[1])
    print(f"Running slide-level t-sne for {encoder_name} encoder...")
    tsne_slides = get_tsne_projection(features=slides_list, n_components=2, init='random', random_state=0, verbose=True, n_jobs=-1)
    plot_projected_features_2d(data=tsne_slides, labels=labels, classes=classes, title=encoder_name)

### Visualize t-SNE projection of encoded features at tile-level 

This will take a few hours to run for each encoder, as the number of tiles exceeds 66K.

In [None]:
# Class names passed to the plotting helper.
classes = ['MSS', 'MSI']

for run_id, encoder_name in zip(run_ids, encoder_names):
    # Downloaded files live under <output_dir>/<part of run_id after ":">/outputs/.
    run_outputs_dir = output_dir / run_id.split(":")[1] / "outputs"
    file_features = run_outputs_dir / features_filename
    file_outputs = run_outputs_dir / csv_filename

    print(f"Loading test encoded features for {encoder_name} encoder...")
    lists_features = torch.load(file_features)

    print(f"Loading test output CSV for {encoder_name} encoder...")
    metadata = pd.read_csv(file_outputs)
    # One label per CSV row; rows are paired positionally with the flattened tiles below.
    labels = metadata['true_label']

    print(f"Collecting test tile-level features for {encoder_name} encoder...")
    # Flatten the per-slide collections into a single list of tile features.
    tiles = [
        tile_features
        for slide_features in lists_features
        for tile_features in slide_features
    ]
    tiles_list = torch.stack(tiles).numpy()

    print(f"Running tile-level t-sne for {encoder_name} encoder...")
    tsne_tiles = get_tsne_projection(
        features=tiles_list,
        n_components=2,
        init='random',
        random_state=0,
        verbose=True,
        n_jobs=-1,
    )
    plot_projected_features_2d(data=tsne_tiles, labels=labels, classes=classes, title=encoder_name)

### Visualize UMAP projection of encoded features at slide-level

Each slide-level feature is obtained as the mean of all tile-level features of that slide.

In [None]:
# Class names passed to the plotting helper.
classes=['MSS','MSI']

for run_id, encoder_name in zip(run_ids, encoder_names):
    # Downloaded files live under <output_dir>/<part of run_id after ":">/outputs/.
    run_outputs_dir = output_dir / run_id.split(":")[1] / "outputs"
    file_features = run_outputs_dir / features_filename
    file_outputs = run_outputs_dir / csv_filename
    print(f"Loading test encoded features for {encoder_name} encoder...")
    # NOTE(review): torch.load unpickles the file; only load feature files from trusted runs.
    lists_features = torch.load(file_features)

    print(f"Loading test output CSV for {encoder_name} encoder...")
    metadata = pd.read_csv(file_outputs)
    # One label per slide. sort=False keeps slides in the order they first appear in the CSV
    # instead of sorting them by slide_id, so that labels line up with the order of the slide
    # features below. This assumes the CSV rows follow the same order as the pickled features,
    # which the tile-level cells of this notebook already rely on.
    labels = metadata.groupby('slide_id', sort=False)['true_label'].mean()

    print(f"Collecting test slide-level features for {encoder_name} encoder...")
    # Slide-level feature = mean over the tile-level features of that slide.
    slides = [torch.mean(torch.stack(list(slide_features)), dim=0) for slide_features in lists_features]

    slides_list = torch.stack(slides).numpy()
    assert len(labels) == len(slides_list), \
        f"{len(labels)} slide labels in the CSV but {len(slides_list)} slide features"
    print(f"Running slide-level umap for {encoder_name} encoder...")
    # Stored under its own name so the UMAP result does not overwrite the t-SNE projection.
    umap_slides = get_umap_projection(features=slides_list, n_components=2, init='random', random_state=0, verbose=True, n_jobs=-1)
    plot_projected_features_2d(data=umap_slides, labels=labels, classes=classes, title=encoder_name)

### Visualize UMAP projection of encoded features at tile-level 

In [None]:
# Class names passed to the plotting helper.
classes=['MSS','MSI']

for run_id, encoder_name in zip(run_ids, encoder_names):
    # Downloaded files live under <output_dir>/<part of run_id after ":">/outputs/.
    run_outputs_dir = output_dir / run_id.split(":")[1] / "outputs"
    file_features = run_outputs_dir / features_filename
    file_outputs = run_outputs_dir / csv_filename
    print(f"Loading test encoded features for {encoder_name} encoder...")
    # NOTE(review): torch.load unpickles the file; only load feature files from trusted runs.
    lists_features = torch.load(file_features)

    print(f"Loading test output CSV for {encoder_name} encoder...")
    metadata = pd.read_csv(file_outputs)
    # One label per CSV row; rows are paired positionally with the flattened tiles below.
    labels = metadata['true_label']

    print(f"Collecting test tile-level features for {encoder_name} encoder...")
    # Flatten the per-slide collections into a single list of tile features.
    tiles = [tile_features for slide_features in lists_features for tile_features in slide_features]

    tiles_list = torch.stack(tiles).numpy()
    # Fail before the projection if the CSV and the features cannot be paired row by row.
    assert len(labels) == len(tiles_list), \
        f"{len(labels)} rows in the CSV but {len(tiles_list)} tile features"
    print(f"Running tile-level umap for {encoder_name} encoder...")
    # Stored under its own name so the UMAP result does not overwrite the t-SNE projection.
    umap_tiles = get_umap_projection(features=tiles_list, n_components=2, init='random', random_state=0, verbose=True, n_jobs=-1)
    plot_projected_features_2d(data=umap_tiles, labels=labels, classes=classes, title=encoder_name)

### Attention histograms - unnormalized

Histogram of attentions without any normalization

In [None]:
# One histogram per encoder, built from the raw 'bag_attn' column of its test output CSV.
for run_id, encoder_name in zip(run_ids, encoder_names):
    run_outputs_dir = output_dir / run_id.split(":")[1] / "outputs"
    file_outputs = run_outputs_dir / csv_filename

    print(f"Loading test output CSV for {encoder_name} encoder...")
    metadata = pd.read_csv(file_outputs)

    # Attention values exactly as stored in the CSV, without any normalization.
    bag_attention = metadata['bag_attn']

    print("Plotting unnormalized attention histograms...")
    plot_histogram(data=bag_attention, title=encoder_name)

### Attention histograms - normalized (zero mean and unit variance)

Histogram of attentions after normalizing each slide to zero mean and unit variance: (x_i - mean(x)) / std(x)

In [None]:
# One histogram per encoder; attentions are normalized per slide with normalize_array_mean
# and the per-slide results are pooled into a single list.
for run_id, encoder_name in zip(run_ids, encoder_names):
    run_outputs_dir = output_dir / run_id.split(":")[1] / "outputs"
    file_outputs = run_outputs_dir / csv_filename
    print(f"Loading test output CSV for {encoder_name} encoder...")
    metadata = pd.read_csv(file_outputs)

    bag_attention_normalized_mean = []
    # Iterating the groupby yields the 'bag_attn' values of one slide at a time.
    for _, slide_attn in metadata.groupby('slide_id')['bag_attn']:
        bag_attention_normalized_mean.extend(normalize_array_mean(slide_attn))

    print("Plotting normalized attention histograms...")
    plot_histogram(data=bag_attention_normalized_mean, title=encoder_name)

### Attention histograms - normalized (range 0-1)

Histogram of attentions after normalizing to range 0-1 (x_i-min(x))/(max(x)-min(x))

In [None]:
# One histogram per encoder; attentions are normalized per slide with normalize_array_minmax
# and the per-slide results are pooled into a single list.
for run_id, encoder_name in zip(run_ids, encoder_names):
    run_outputs_dir = output_dir / run_id.split(":")[1] / "outputs"
    file_outputs = run_outputs_dir / csv_filename
    print(f"Loading test output CSV for {encoder_name} encoder...")
    metadata = pd.read_csv(file_outputs)

    attention_by_slide = metadata.groupby('slide_id')['bag_attn']
    bag_attention_normalized_minmax = []
    # Each iteration handles the 'bag_attn' values belonging to a single slide.
    for _, slide_attn in attention_by_slide:
        normalized_attn_minmax = normalize_array_minmax(slide_attn)
        bag_attention_normalized_minmax.extend(normalized_attn_minmax)

    print("Plotting normalized attention histograms...")
    plot_histogram(data=bag_attention_normalized_minmax, title=encoder_name)

### Box-whisker plot to compare attentions of different encoders

Compare attention distributions of different encoders in box-whisker plots

In [None]:
# One list of raw 'bag_attn' values per encoder, in the same order as encoder_names.
bag_attention_list = [
    pd.read_csv(output_dir / run_id.split(":")[1] / "outputs" / csv_filename)['bag_attn'].tolist()
    for run_id, _ in zip(run_ids, encoder_names)
]

# Same data plotted twice: first without outliers, then with them.
for show_outliers, title in (
    (False, "Boxplot of attentions"),
    (True, "Boxplot of attentions with outliers"),
):
    plot_box_whisker(data_list=bag_attention_list, column_names=encoder_names, show_outliers=show_outliers, title=title)

### How well are the encoded features separated between slides?


Clustering scores that measure how well the tile features separate into clusters, where each slide is treated as one cluster. A good encoder should not separate the features well by slide.

In [None]:
# For every encoder, score how strongly the tile features cluster by slide: each tile is
# labelled with its slide_id and three clustering indices are printed.
for run_id, encoder_name in zip(run_ids, encoder_names):
    # Downloaded files live under <output_dir>/<part of run_id after ":">/outputs/.
    run_outputs_dir = output_dir / run_id.split(":")[1] / "outputs"
    file_features = run_outputs_dir / features_filename
    file_outputs = run_outputs_dir / csv_filename
    print(f"Loading test encoded features for {encoder_name} encoder...")
    # NOTE(review): torch.load unpickles the file; only load feature files from trusted runs.
    lists_features = torch.load(file_features)

    print(f"Loading test output CSV for {encoder_name} encoder...")
    metadata = pd.read_csv(file_outputs)

    print(f"Collecting test tile-level features for {encoder_name} encoder...")
    # Flatten the per-slide collections into a single list of tile features.
    tiles = [tile_features for slide_features in lists_features for tile_features in slide_features]

    tiles_list = torch.stack(tiles).numpy()
    # Cluster label of each tile; rows are paired positionally with the flattened tiles.
    labels = metadata['slide_id']
    # Fail before the expensive scoring if the CSV and the features cannot be paired row by row.
    assert len(labels) == len(tiles_list), \
        f"{len(labels)} rows in the CSV but {len(tiles_list)} tile features"

    print("Finding clustering scores (a slide is a cluster)...")
    silhouette_coeffs = silhouette_samples(tiles_list, labels, n_jobs=-1)
    # Mean silhouette coefficient per slide, computed in one grouped pass instead of
    # re-masking the whole table for every slide.
    slide_coeffs = pd.Series(silhouette_coeffs, index=labels.index).groupby(labels).mean()

    # The reported silhouette index is the mean of the per-slide means.
    print(f"Silhouette clustering quality index for {encoder_name} encoder: ", numpy.mean(slide_coeffs))
    print(f"Davies-Bouldin index for {encoder_name} encoder: ", davies_bouldin_score(tiles_list, labels))
    print(f"Calinski-Harabasz index  for {encoder_name} encoder: ", calinski_harabasz_score(tiles_list, labels))