In [1]:
import anndata as ad
from os.path import join
import pandas as pd
import json
import os

In [2]:
import random
import numpy as np
import torch

In [3]:
def set_random_seed(seed: int, deterministic: bool = True) -> None:
    """
    Set random seed for reproducibility across random, numpy, and torch.

    Args:
        seed (int): The seed value to set.
        deterministic (bool): If True, sets PyTorch to deterministic mode.
    """
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)  # for multi-GPU setups

    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    print(f"Random seed set to: {seed}")

In [4]:
set_random_seed(42)

Random seed set to: 42


In [5]:
from analysis_utils import collect_cv_metrics, map_groups

In [6]:
def load_results(base_dir):
    experiments = [ join(base_dir, f.name) for f in os.scandir(base_dir) if f.is_dir() ]
    results = collect_cv_metrics(experiments)
    return results 

In [7]:
save_dir ='./brca_full_chemo'
task ='chemo'
base_dir = '/home/jupyter/__output_clean/brca_full/chemo'
os.makedirs(save_dir, exist_ok=True)
results= load_results(base_dir)
mil_df = results["mil"]
vote_df = results["vote"]
avg_df = results["avg"]

Missing files in /home/jupyter/__output_clean/brca_full/chemo/gf-6L-30M-i2048_finetune
Missing files in /home/jupyter/__output_clean/brca_full/chemo/Geneformer-V2-104M_finetune


In [26]:
mil_df = results['mil']
avg_df = results['avg']
vote_df = results['vote']
# mil_df[mil_df.experiment =='gf-6L-30M-i2048_test']

In [12]:
mil_df.experiment.unique()

array(['hvg', 'pca', 'scimilarity', 'scgpt', 'scgpt_cancer',
       'Geneformer-V2-104M_CLcancer', 'gf-6L-30M-i2048_test',
       'Geneformer-V2-104M', 'cellplm', 'scfoundation',
       'Geneformer-V2-316M', 'scvi', 'gf-6L-30M-i2048',
       'scfoundation_full'], dtype=object)

In [27]:
mil_df_auprc= mil_df[mil_df.Metrics=='AUPRC']
avg_df_auprc= avg_df[avg_df.Metrics=='AUPRC']
vote_df_auprc= vote_df[vote_df.Metrics=='AUPRC']


In [28]:
mil_df_auprc

Unnamed: 0,Metrics,model,fold,experiment
1,AUPRC,0.250000,fold_1,hvg
7,AUPRC,0.833333,fold_2,hvg
13,AUPRC,0.250000,fold_3,hvg
19,AUPRC,0.416667,fold_4,hvg
25,AUPRC,0.333333,fold_5,hvg
...,...,...,...,...
391,AUPRC,1.000000,fold_1,scfoundation_full
397,AUPRC,0.333333,fold_2,scfoundation_full
403,AUPRC,0.250000,fold_3,scfoundation_full
409,AUPRC,0.750000,fold_4,scfoundation_full


In [29]:
mil_df_auprc.groupby('experiment').mean(numeric_only=True)

Unnamed: 0_level_0,model
experiment,Unnamed: 1_level_1
Geneformer-V2-104M,0.581667
Geneformer-V2-104M_CLcancer,0.623333
Geneformer-V2-316M,0.673333
cellplm,0.633333
gf-6L-30M-i2048,0.540476
gf-6L-30M-i2048_test,0.506667
hvg,0.416667
pca,0.623333
scfoundation,0.511905
scfoundation_full,0.495238


In [30]:
avg_df_auprc.groupby('experiment').mean(numeric_only=True)

Unnamed: 0_level_0,model
experiment,Unnamed: 1_level_1
Geneformer-V2-104M,0.723333
Geneformer-V2-104M_CLcancer,0.716667
Geneformer-V2-316M,0.85
cellplm,0.44
gf-6L-30M-i2048,0.723333
gf-6L-30M-i2048_test,0.723333
hvg,0.64
pca,0.506667
scfoundation,0.546667
scfoundation_full,0.546667


In [31]:
vote_df_auprc.groupby('experiment').mean(numeric_only=True)

Unnamed: 0_level_0,model
experiment,Unnamed: 1_level_1
Geneformer-V2-104M,0.458571
Geneformer-V2-104M_CLcancer,0.42
Geneformer-V2-316M,0.523333
cellplm,0.525
gf-6L-30M-i2048,0.513333
gf-6L-30M-i2048_test,0.496667
hvg,0.491667
pca,0.511905
scfoundation,0.397143
scgpt,0.516667


In [15]:
from auprc_plots import (
    summarize_by_experiment,      # -> tidy DF (experiment, method, mean, std, n)
    plot_grouped_by_experiment,   # -> one chart: grouped bars per experiment (vote/avg/mil)
    plot_single_method            # -> one chart for a single method
) 