In [None]:
!pip -q install -e ./../../BatchDetect

In [None]:
%load_ext autoreload
%autoreload 2
from pathlib import Path

import numpy as np
import pandas as pd
import torch

# Use the GPU when torch reports CUDA as available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

# Patch-level methods evaluation

Evaluate the performance of patch-level methods, such as stain normalization or stain augmentation techniques.

## Reading metadata

In [None]:
dataset = 'CRC'  # TODO make work for other datasets

In [None]:
# Create the metadata dataframe from the clinical table and the folder structure.
# Both paths follow `dataset`, so switching datasets only requires changing that variable.
data_root = Path('/lustre/groups/shared/users/peng_marr/BatchDetect')
clini_table = pd.read_csv(data_root / f'BatchDetect{dataset}_clini.csv')

# metadata columns: file (patch path), dataset (submission site), plus one
# column per clinical-table column (except PATIENT) added below.
base_dir = data_root / f'BatchDetect{dataset}'
patch_list = list(base_dir.glob('**/*.jpeg'))
print('Number of patches:', len(patch_list))

# The submission site is taken from the grandparent directory of each patch.
submission_site = [patch.parent.parent.name for patch in patch_list]
metadata = pd.DataFrame(list(zip(patch_list, submission_site)), columns=['file', 'dataset'])

labels = list(clini_table.columns)  # or custom list
labels.remove('PATIENT')

# The patient ID is the part of the patch file name before the first underscore.
patient_ids = pd.Series([patch.name.split('_')[0] for patch in patch_list], dtype=object)

# Restrict the clinical table to patients that actually have patches, then
# validate the mapping once instead of failing deep inside a per-patch lookup.
clini_by_patient = clini_table[clini_table['PATIENT'].isin(patient_ids)].set_index('PATIENT')
duplicated_patients = clini_by_patient.index[clini_by_patient.index.duplicated()].unique().tolist()
if duplicated_patients:
    raise ValueError(
        f'Patients listed more than once in the clinical table: {duplicated_patients[:10]}'
    )
missing_patients = sorted(set(patient_ids) - set(clini_by_patient.index))
if missing_patients:
    raise ValueError(
        f'{len(missing_patients)} patient(s) have patches but no clinical entry, '
        f'e.g. {missing_patients[:10]}'
    )

# One index-based lookup per label instead of one table scan per patch and label.
for label_name in labels:
    # .to_numpy() assigns positionally, independent of index alignment.
    metadata[label_name] = patient_ids.map(clini_by_patient[label_name]).to_numpy()

In [None]:
clini_table.columns

In [None]:
np.unique(np.array(submission_site), return_counts=True)

In [None]:
metadata

In [None]:
# Persist the metadata next to the dataset folder. The file name follows
# `dataset` (for 'CRC' this is the same path as before) so other datasets do
# not overwrite the CRC metadata.
metadata.to_csv(base_dir.parent / f'BatchDetect{dataset}_metadata.csv', index=False)

## Features
Create or load features

In [None]:
method = "original"  # TODO extend this to list of methods
features = 'first_and_second_order'  # TODO extend this to list of feature extractors

In [None]:
from batchdetect.image import first_and_second_order

# The cache file is keyed by the normalization method and the feature-extractor name.
df_features_path = base_dir / f'{method}_{features}_features.csv'

if df_features_path.exists():
    df_features = pd.read_csv(df_features_path)
    # Caches written by the previous version of this cell stored the index as an
    # unnamed first column; drop it so a cache hit yields the same columns as a
    # fresh computation.
    df_features = df_features.loc[:, ~df_features.columns.str.startswith('Unnamed:')]
else:
    df_features = first_and_second_order(metadata)
    # index=False keeps the cached file round-trippable (same convention as the
    # metadata CSV). Assumes the returned index carries no information — TODO confirm.
    df_features.to_csv(df_features_path, index=False)

In [None]:
df_features.columns

In [None]:
# One of the datasets includes a fourth image channel; its features are dropped
# so that all datasets are compared on the same feature set.
# TODO check if this is needed
has_fourth_channel = df_features.columns.str.contains("Ch4")
df_features = df_features.loc[:, ~has_fourth_channel]

## Let's see if there is a batch effect in the data

In [None]:
from batchdetect.batchdetect import BatchDetect

# Only the "label" and "dataset" metadata columns are handed to BatchDetect,
# together with the feature table.
# NOTE(review): "label" must be a column of the clinical table for this
# selection to succeed — confirm against the clini CSV.
bd = BatchDetect(metadata[["label", "dataset"]], df_features)

In [None]:
df_features

### Visualizations

In [None]:
bd.low_dim_visualization("pca")

In [None]:
bd.low_dim_visualization("tsne")

In [None]:
bd.low_dim_visualization("umap")

### ANOVA test of principal components vs. labels

In [None]:
bd.prince_plot()

### Classification test of a random forest (RF) vs. a random classifier

In [None]:
bd.classification_test(scorer="f1_macro")

### Clustering metrics

In [None]:
from batchdetect.metrics import mean_local_diversity, silhouette_score

targets = ["label", "dataset"]

# Column headers are derived from the metric function names,
# e.g. mean_local_diversity -> "Mean Local Diversity".
metrics = [mean_local_diversity, silhouette_score]
metrics_labels = [metric.__name__.replace('_', ' ').title() for metric in metrics]

# One row per target, one column per metric.
result_df = pd.DataFrame(columns=["Target", *metrics_labels])

for metric_fn, metric_label in zip(metrics, metrics_labels):
    # Each metric is evaluated once for all targets; the result is indexed by target name.
    scores = metric_fn(metadata, targets, df_features)
    for row, target in enumerate(targets):
        result_df.loc[row, "Target"] = target
        result_df.loc[row, metric_label] = scores[target]
    


In [None]:
result_df