# Batch Effect detection class

In [None]:
# Install the BatchDetect package from the local checkout two directories up (quiet mode).
!pip -q install ./../../BatchDetect

## Reading metadata

Let's use a subset of the data for now.

In [None]:
# Enable IPython's autoreload extension so edited modules are re-imported
# automatically before each cell runs (mode 2 = reload all modules).
%load_ext autoreload
%autoreload 2
from pathlib import Path

import numpy as np
import pandas as pd
import torch

from histaugan.model import EfficientHistAuGAN

# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

In [None]:
# Build the per-patch metadata dataframe from the clinical table and the folder structure.
clini_table = pd.read_excel('/lustre/groups/peng/datasets/histology_data/clini_tables/TCGA-CRC-DX_CLINI.xlsx')

# metadata columns: file, label (MSI-H), dataset (submission site)
# Patches are matched by the pattern BatchDetectTCGA/<submission site>/<TCGA* folder>/<patch>.jpeg
base_dir = Path('/lustre/groups/shared/users/peng_marr/BatchDetect/')
patch_list = list(base_dir.glob('BatchDetectTCGA/*/TCGA*/*.jpeg'))
print('Number of patches:', len(patch_list))

# The grandparent folder of each patch names the submission site.
submission_site = [patch.parent.parent.name for patch in patch_list]

# The first 12 characters of the parent folder name are matched against the PATIENT column.
patient_ids = [patch.parent.name[:12] for patch in patch_list]

# Resolve each distinct patient exactly once instead of re-scanning the clinical
# table for every patch. `.item()` still raises ValueError when a patient has
# zero or multiple rows in the table, as the per-patch lookup did.
label_by_patient = {
    patient_id: clini_table.isMSIH[clini_table['PATIENT'] == patient_id].item()
    for patient_id in dict.fromkeys(patient_ids)  # de-duplicated, first-seen order
}
label = [label_by_patient[patient_id] for patient_id in patient_ids]

metadata = pd.DataFrame(list(zip(patch_list, label, submission_site)), columns=['file', 'label', 'dataset'])

In [None]:
# Count how many patches each submission site contributes.
site_names = np.array(submission_site)
np.unique(site_names, return_counts=True)

In [None]:
# Display the assembled metadata table (file, label, dataset) for a quick visual check.
metadata

## Features

In [None]:
from batchdetect.image import automatic_feature_extraction

# Extracted features are cached as CSV next to the data so the extraction runs only once.
df_features_path = base_dir / 'automatic_features_efficient_histaugan.csv'

# load efficient histaugan model
checkpoint_dir = Path('/lustre/groups/peng/workspace/sophia.wagner/logs/histaugan_lightning/checkpoints')
run = 'l1_a_cc+correct_adv_cls+attr_VAE+128'
model_name = 'Efficient-HistAuGAN-epoch=01-l1_cc_loss_val=0.72.ckpt'

model = EfficientHistAuGAN.load_from_checkpoint(checkpoint_dir / run / model_name)
model = model.to(device)
model.eval();  # trailing semicolon suppresses the notebook's echo of the return value
opts = model.opts

if df_features_path.exists():
    # The cache below is written with the dataframe index as its first column.
    # Read it back as the index; otherwise a cached run gains an extra
    # "Unnamed: 0" column that a freshly extracted dataframe does not have.
    df_features = pd.read_csv(df_features_path, index_col=0)
else:
    df_features = automatic_feature_extraction(metadata, model)
    df_features.to_csv(df_features_path)

In [None]:
# Display the feature table that was loaded from the cache or freshly extracted.
df_features

Considering that one of the datasets includes four channels, we need to remove the fourth-channel features from the dataframe for a fair comparison.

In [None]:
# Flag every feature column whose name mentions "Ch4" ...
ch4_columns = df_features.columns.str.contains("Ch4")

# ... and keep only the remaining columns.
col_index = ~ch4_columns
df_features = df_features.loc[:, col_index]

## Let's see if there is a batch effect in the data

In [None]:
from batchdetect.batchdetect import BatchDetect

# Hand BatchDetect only the label and dataset columns of the metadata,
# together with the extracted features.
batch_columns = ["label", "dataset"]
bd = BatchDetect(metadata.loc[:, batch_columns], df_features)

In [None]:
# Display the feature table after the "Ch4" columns have been removed.
df_features

# Visualizations

In [None]:
# Low-dimensional visualization of the features using the "pca" method
# (presumably principal component analysis; confirm in BatchDetect).
bd.low_dim_visualization("pca")

In [None]:
# Low-dimensional visualization of the features using the "tsne" method
# (presumably t-SNE; confirm in BatchDetect).
bd.low_dim_visualization("tsne")

In [None]:
# Low-dimensional visualization of the features using the "umap" method
# (presumably UMAP; confirm in BatchDetect).
bd.low_dim_visualization("umap")

# ANOVA test of principal components vs. labels

In [None]:
# Plot the association between principal components and the metadata columns
# (per the section heading, an ANOVA test; confirm details in BatchDetect).
bd.prince_plot()

## Classification test of RF vs. a random classifier

In [None]:
# Run the classification test, scored with "f1_macro"
# (presumably macro-averaged F1; confirm the accepted scorer names in BatchDetect).
bd.classification_test(scorer="f1_macro")