In [1]:
import anndata as ad
from os.path import join
import pandas as pd
import json
import os

In [2]:
import random
import numpy as np
import torch

In [3]:
def set_random_seed(seed: int, deterministic: bool = True) -> None:
    """
    Seed every random number generator used in this notebook.

    The same value is handed to Python's ``random`` module, NumPy's global
    generator and PyTorch (CPU, current CUDA device and all CUDA devices).
    The chosen seed is reported on stdout.

    Args:
        seed (int): Value passed to each seeding function.
        deterministic (bool): When True, set ``cudnn.deterministic = True``
            and ``cudnn.benchmark = False`` on ``torch.backends``.
    """
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # for multi-GPU setups
    )
    for apply_seed in seeders:
        apply_seed(seed)

    if deterministic:
        cudnn = torch.backends.cudnn
        cudnn.deterministic = True
        cudnn.benchmark = False

    print(f"Random seed set to: {seed}")

In [4]:
set_random_seed(42)

Random seed set to: 42


In [5]:
%run train_utils.py

  from pkg_resources import get_distribution, DistributionNotFound


## Chemo (cancer cells only)

In [6]:
def load_data(base_dir, model_name):
    """
    Read ``<base_dir>/<model_name>/data.h5ad`` and keep only the observations
    whose ``timepoint`` annotation equals ``'Pre'``.

    The result is returned as an independent copy of the filtered selection.
    """
    h5ad_path = join(base_dir, model_name, 'data.h5ad')
    full = ad.read_h5ad(h5ad_path)
    is_pre = full.obs['timepoint'] == 'Pre'
    return full[is_pre].copy()

def prepare_data(adata, id_column):
    """
    Add the ``sample_id`` and ``label`` columns to ``adata.obs`` in place.

    ``sample_id`` is a copy of ``adata.obs[id_column]``. ``label`` is the
    ``cohort`` column cast to ``str`` and translated through a fixed mapping
    (``treatment_naive`` -> 0, ``neoadjuvant_chemo`` -> 1); cohort values
    without an entry in the mapping become NaN.

    Returns the same ``adata`` object that was passed in.
    """
    cohort_to_label = {'treatment_naive': 0, 'neoadjuvant_chemo': 1}

    adata.obs['sample_id'] = adata.obs[id_column]
    adata.obs['label'] = adata.obs['cohort'].astype(str).map(cohort_to_label)

    return adata

def run_cv(adata, train_func, cv_split_dict, embedding_col='X_geneformer'):
    """
    Run cross-validation over the folds described by ``cv_split_dict``.

    For every fold the data is split with ``split_data``, ``train_func`` is
    called to produce test/train prediction tables, and both tables are scored
    with ``get_classification_metrics``.

    Args:
        adata: Annotated data object holding the embedding under
            ``adata.obsm[embedding_col]``.
        train_func: Callable invoked as
            ``train_func(adata_train, adata_test, embedding_col, model_name='random_forest')``
            and returning ``(pred_df_test, pred_df_train)``.
        cv_split_dict: Split description, unpacked by ``get_splits_cv``.
        embedding_col: Key of the embedding inside ``adata.obsm``.

    Returns:
        ``(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df)``.
        The prediction frames are the per-fold tables concatenated, with a
        leading ``fold`` column holding ``fold_1``, ``fold_2``, ...; the
        metric frames are built from the list of per-fold metric results.

    Raises:
        KeyError: if ``embedding_col`` is missing from ``adata.obsm``.
    """
    id_column, n_splits, train_ids_list, test_ids_list = get_splits_cv(cv_split_dict)

    # Fail once, up front, with a clear message. Previously the key was only
    # touched through otherwise-unused per-fold lookups of X/y arrays.
    if embedding_col not in adata.obsm:
        raise KeyError(f"embedding '{embedding_col}' not found in adata.obsm")

    metrics_test = []
    metrics_train = []
    predictions_test = {}
    predictions_train = {}
    for i in range(n_splits):
        fold = i + 1
        print(f'---------- fold {fold}----------')
        train_ids, test_ids = train_ids_list[i], test_ids_list[i]
        print('Split data')
        adata_train, adata_test = split_data(adata, id_column, train_ids, test_ids)

        print('Train classifier')
        # model_name is always 'random_forest' here; how each train_func uses
        # it is decided by that function, not by this loop.
        pred_df_test, pred_df_train = train_func(
            adata_train, adata_test, embedding_col, model_name='random_forest'
        )

        print('Evaluations')
        metrics_test.append(get_classification_metrics(pred_df_test))
        metrics_train.append(get_classification_metrics(pred_df_train))

        predictions_train[f'fold_{fold}'] = pred_df_train
        predictions_test[f'fold_{fold}'] = pred_df_test

    # Dict keys become an outer index level named 'fold', which reset_index
    # then turns into a regular column.
    pred_test_df = pd.concat(predictions_test, names=['fold']).reset_index(level=0)
    pred_train_df = pd.concat(predictions_train, names=['fold']).reset_index(level=0)
    metrics_test_df = pd.DataFrame(metrics_test)
    metrics_train_df = pd.DataFrame(metrics_train)

    return pred_train_df, pred_test_df, metrics_train_df, metrics_test_df

def save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, save_dir, model_name, prefix):
    """
    Write the prediction and metric tables of one model to CSV files.

    The files go to ``<save_dir>/<model_name>/`` (created when missing) and
    are named ``<prefix>_pred_train.csv``, ``<prefix>_pred_test.csv``,
    ``<prefix>_metrics_train.csv`` and ``<prefix>_metrics_test.csv``.
    ``save_dir`` and ``model_name`` are echoed to stdout.
    """
    out_dir = join(save_dir, model_name)
    os.makedirs(out_dir, exist_ok=True)
    print(save_dir)
    print(model_name)

    # (file-name suffix, table) pairs, written in this order.
    tables = (
        ('pred_train', pred_train_df),
        ('pred_test', pred_test_df),
        ('metrics_train', metrics_train_df),
        ('metrics_test', metrics_test_df),
    )
    for suffix, table in tables:
        table.to_csv(join(out_dir, f'{prefix}_{suffix}.csv'))
    
def get_embd_key(model_name):
    """
    Map a model name to the ``obsm`` key under which its embedding is stored.

    The name is matched by substring (case-sensitive) against a fixed table.
    When several substrings occur in ``model_name``, the entry that appears
    last in the table wins.

    Args:
        model_name (str): Name of the embedding model, e.g. ``'scgpt_cancer'``
            or ``'Geneformer-V2-104M'``.

    Returns:
        str: The embedding key, e.g. ``'X_scGPT'``.

    Raises:
        ValueError: if ``model_name`` contains none of the known substrings
            (previously this surfaced as an ``UnboundLocalError``).
    """
    # (substring, key) pairs; order matters because a later match overrides
    # an earlier one.
    patterns = (
        ('pca', 'X_pca'),
        ('hvg', 'X_hvg'),
        ('scvi', 'X_scVI'),
        ('scgpt', 'X_scGPT'),
        ('Geneformer', 'X_geneformer'),
        ('gf', 'X_geneformer'),
        ('scfoundation', 'X_scfoundation'),
        ('scimilarity', 'X_scimilarity'),
        ('cellplm', 'X_cellplm'),
    )

    key = None
    for pattern, candidate in patterns:
        if pattern in model_name:
            key = candidate

    if key is None:
        known = ', '.join(pattern for pattern, _ in patterns)
        raise ValueError(
            f"Unknown model name {model_name!r}; expected it to contain one of: {known}"
        )
    return key

In [7]:
# Root output directory for this section; save_results creates one
# sub-directory per model below it, and the summary CSVs are written here.
saving_dir ='./outcomes/chemo'
os.makedirs(saving_dir, exist_ok=True)

In [8]:
# Directory that load_data reads from: it expects <base_dir>/<model_name>/data.h5ad.
# NOTE(review): absolute, machine-specific path — adjust when running elsewhere.
base_dir ='/home/jupyter/__output_clean/brca_full/cancer_cells'
# base_dir ='/home/jupyter/__output/brca_full/chemo'

In [9]:
# Model names to evaluate. The commented-out lines list further model names
# that are currently not part of the run.
# gf_names = [ 'gf-6L-30M-i2048',  'Geneformer-V2-104M', 'Geneformer-V2-104M_CLcancer', 'Geneformer-V2-316M']
others= ['scfoundation', 'scimilarity', 'cellplm']
# model_names = [ 'hvg',  'pca', 'scvi',  'scgpt', 'scgpt_cancer'] + gf_names

In [10]:
# Final list iterated by the MIL / vote / avg loops below; also used as the
# row index of the per-model summary tables.
model_names = others + ['Geneformer-V2-104M']

In [11]:
# Alternative split file, kept for quick switching:
# cv_splits_file = '/home/jupyter/scFM_eval/data_splits/brca_full/brca_chemo/cv_splits.json'
cv_splits_file = '/home/jupyter/scFM_eval/data_splits/brca_full/brca_chemo/cv_splits_oversampled.json'
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(cv_splits_file) as splits_fh:
    cv_split_dict = json.load(splits_fh)

In [12]:
# Unpack the split description once at notebook level; id_column is passed to
# prepare_data in the loops below to fill obs['sample_id'].
id_column, n_splits, train_ids_list, test_ids_list = get_splits_cv(cv_split_dict)

In [13]:
# adata.obs['label'].dtype

In [75]:
# adata.obs['label'].unique()

In [14]:
# Cross-validate the `__train_mil` trainer on every model in model_names and
# save per-model predictions/metrics with the 'mil' prefix.
# `__train_mil` is not defined in this notebook — presumably it comes from
# train_utils.py via the %run cell above.
metrics_test_list= [] 
for m in model_names:
    print(m)
    embedding_col = get_embd_key(m)
    adata = load_data(base_dir,m)
    adata = prepare_data(adata, id_column)
    
    pred_train_df, pred_test_df, metrics_train_df, metrics_test_df = run_cv(adata, __train_mil,  cv_split_dict,embedding_col )
    save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, saving_dir, model_name=m, prefix='mil')
    # Average every numeric test metric over the folds of this model.
    dd = metrics_test_df.mean(numeric_only=True)
    metrics_test_list.append(dd)
    
# One row per model (fold-averaged test metrics), indexed by model name.
results = pd.DataFrame(metrics_test_list, index = model_names)
results.to_csv(join(saving_dir, 'mil_metrics_test.csv' ))

scfoundation
---------- fold 1----------
Split data
Train classifier
Training model (Multi instance Learning (MIL))
Epoch 1, Loss: 5.6431
Epoch 2, Loss: 0.6730
Epoch 3, Loss: 0.6854
Epoch 4, Loss: 0.6809
Epoch 5, Loss: 0.6766
Epoch 6, Loss: 0.6724
Epoch 7, Loss: 0.6684
Epoch 8, Loss: 0.6645
Epoch 9, Loss: 0.6607
Epoch 10, Loss: 0.6571
Epoch 11, Loss: 0.6535
Epoch 12, Loss: 0.6501
Epoch 13, Loss: 0.6467
Epoch 14, Loss: 0.6434
Epoch 15, Loss: 0.6403
Epoch 16, Loss: 0.6372
Epoch 17, Loss: 0.6342
Epoch 18, Loss: 0.6314
Epoch 19, Loss: 0.6285
Epoch 20, Loss: 0.6258
Epoch 21, Loss: 0.6232
Epoch 22, Loss: 0.6206
Epoch 23, Loss: 0.6181
Epoch 24, Loss: 0.6157
Epoch 25, Loss: 0.6134
Epoch 26, Loss: 0.6111
Epoch 27, Loss: 0.6089
Epoch 28, Loss: 0.6067
Epoch 29, Loss: 0.6047
Epoch 30, Loss: 0.6026
Epoch 31, Loss: 0.6007
Epoch 32, Loss: 0.5988
Epoch 33, Loss: 0.5969
Epoch 34, Loss: 0.5951
Epoch 35, Loss: 0.5934
Epoch 36, Loss: 0.5917
Epoch 37, Loss: 0.5901
Epoch 38, Loss: 0.5885
Epoch 39, Loss: 0.5

In [15]:
# Same driver loop as the MIL cell, but with the `__train_vote` trainer and
# the 'vote' file prefix. Note that metrics_test_list, adata and results are
# rebound here, replacing the values left by the previous loop cell.
# `__train_vote` is not defined in this notebook — presumably it comes from
# train_utils.py via the %run cell above.
metrics_test_list= [] 
for m in model_names:
    embedding_col = get_embd_key(m)
    adata = load_data(base_dir,m)
    adata = prepare_data(adata, id_column)
    
    pred_train_df, pred_test_df, metrics_train_df, metrics_test_df = run_cv(adata, __train_vote,  cv_split_dict, embedding_col)
    save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, saving_dir, model_name=m, prefix='vote')
    # Average every numeric test metric over the folds of this model.
    dd = metrics_test_df.mean(numeric_only=True)
    metrics_test_list.append(dd)
    
# One row per model (fold-averaged test metrics), indexed by model name.
results = pd.DataFrame(metrics_test_list, index = model_names)
results.to_csv(join(saving_dir, 'vote_metrics_test.csv' ))

---------- fold 1----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(3875, 2) (14787,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    11250
1     3537
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3443
1     432
Name: count, dtype: int64
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4200, 2) (14462,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10620
1     3842
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2646
1    1554
Name: count, dtype: int64
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4216, 2) (14446,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    12625
1     1821
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3556
1     660
Name: count, dtype: int64
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(2285, 2) (16377,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    13049
1     3328
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    1992
1     293
Name: count, dtype: int64
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4086, 2) (14576,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    11692
1     2884
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3403
1     683
Name: count, dtype: int64
Evaluations
./outcomes/chemo
scfoundation
---------- fold 1----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(3875, 2) (14787,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10172
1     4615
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2902
1     973
Name: count, dtype: int64
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4200, 2) (14462,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    9867
1    4595
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3122
1    1078
Name: count, dtype: int64
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4216, 2) (14446,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    11602
1     2844
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3158
1    1058
Name: count, dtype: int64
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(2285, 2) (16377,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    11632
1     4745
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    1877
1     408
Name: count, dtype: int64
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4086, 2) (14576,)
Cell-level prediction distribution (train):
pred
0    10581
1     3995
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2549
1    1537
Name: count, dtype: int64


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Evaluations
./outcomes/chemo
scimilarity
---------- fold 1----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(3875, 2) (14787,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10019
1     4768
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2716
1    1159
Name: count, dtype: int64
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4200, 2) (14462,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    9439
1    5023
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3258
1     942
Name: count, dtype: int64
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4216, 2) (14446,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    11005
1     3441
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2946
1    1270
Name: count, dtype: int64
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(2285, 2) (16377,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    11013
1     5364
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    1767
1     518
Name: count, dtype: int64
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4086, 2) (14576,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10176
1     4400
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2582
1    1504
Name: count, dtype: int64
Evaluations
./outcomes/chemo
cellplm
---------- fold 1----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(3875, 2) (14787,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    9308
1    5479
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2966
1     909
Name: count, dtype: int64
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4200, 2) (14462,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    8738
1    5724
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2214
1    1986
Name: count, dtype: int64
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4216, 2) (14446,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    11304
1     3142
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2938
1    1278
Name: count, dtype: int64
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(2285, 2) (16377,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10548
1     5829
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    1704
1     581
Name: count, dtype: int64
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4086, 2) (14576,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10735
1     3841
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3165
1     921
Name: count, dtype: int64
Evaluations
./outcomes/chemo
Geneformer-V2-104M


In [77]:
# Same driver loop again, with the `__train_avg_expression` trainer and the
# 'avg' file prefix. metrics_test_list, adata and results are rebound here.
# `__train_avg_expression` is not defined in this notebook — presumably it
# comes from train_utils.py via the %run cell above.
metrics_test_list= [] 
for m in model_names:
    embedding_col = get_embd_key(m)
    adata = load_data(base_dir,m)
    adata = prepare_data(adata, id_column)

    pred_train_df, pred_test_df, metrics_train_df, metrics_test_df = run_cv(adata, __train_avg_expression,  cv_split_dict, embedding_col)
    save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, saving_dir, model_name=m, prefix='avg')
    # Average every numeric test metric over the folds of this model.
    dd = metrics_test_df.mean(numeric_only=True)
    metrics_test_list.append(dd)

# One row per model (fold-averaged test metrics), indexed by model name.
results = pd.DataFrame(metrics_test_list, index = model_names)
results.to_csv(join(saving_dir, 'avg_metrics_test.csv' ))

---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(7, 2) (32,)
Evaluations
./outcomes/chemo
scfoundation
---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
(7, 2) (32,)
Evaluations


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


./outcomes/chemo
scimilarity
---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(7, 2) (32,)
Evaluations
./outcomes/chemo
cellplm
---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(7, 2) (32,)
Evaluations
./outcomes/chemo
Geneformer-V2-104M


In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
# NOTE(review): metrics_test_list is rebound by each of the MIL / vote / avg
# loop cells, so this writes the metrics of whichever loop ran last into
# 'vote_metrics_test.csv' (a file the vote loop already writes itself).
# Confirm the vote loop was the most recent run before executing this cell.
pd.DataFrame(metrics_test_list, index = model_names).to_csv(join(saving_dir, 'vote_metrics_test.csv' ))

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
# NOTE(review): metrics_test_list is rebound by each of the MIL / vote / avg
# loop cells, so this writes the metrics of whichever loop ran last into
# 'avg_metrics_test.csv' (a file the avg loop already writes itself).
# Confirm the avg loop was the most recent run before executing this cell.
pd.DataFrame(metrics_test_list, index = model_names).to_csv(join(saving_dir, 'avg_metrics_test.csv' ))

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
%%time


In [None]:
# NOTE(review): in the code visible in this notebook, `predictions_train` is
# only assigned as a local inside run_cv. Unless train_utils.py defines it at
# top level, this cell raises NameError on a fresh kernel — confirm whether
# the cell is still needed (run_cv + save_results already write these files
# per model).
pred_train_df  = pd.concat(predictions_train, names=['fold']).reset_index(level=0)
pred_train_df.to_csv(join(saving_dir, 'vote_pred_train.csv'))

In [None]:
pred_train_df  = pd.concat(predictions_train, names=['fold']).reset_index(level=0)
pred_train_df.to_csv(join(saving_dir, 'vote_pred_train.csv'))

In [None]:
pd.DataFrame(metrics_test).mean(numeric_only=True)

In [None]:
pd.DataFrame(metrics_test).to_csv(join(saving_dir, 'vote_metrics_test.csv'))

In [None]:
pd.DataFrame(metrics_train).to_csv(join(saving_dir, 'vote_metrics_train.csv'))

In [None]:
pd.DataFrame(metrics)['auprc'].mean()

In [None]:
pd.DataFrame(metrics)

In [None]:
pd.DataFrame(metrics)['auprc'].mean()

## Subtype

In [None]:
adata.obs['cohort'].value_counts()

In [None]:
adata_subtype = adata[adata.obs['cohort'] == 'treatment_naive'].copy()


In [None]:
adata_subtype.obs['timepoint'].value_counts()

In [None]:
adata_subtype.obs['timepoint'].value_counts()

In [None]:
adata_subtype = adata_subtype[adata_subtype.obs['timepoint'] == 'Pre']


In [None]:
adata_subtype.obs

In [None]:
adata_subtype.obs['Cancer_type'].value_counts()

In [None]:
adata_subtype = adata_subtype[adata_subtype.obs['Cancer_type'].isin( ['ER+', 'TNBC'])]


In [None]:
adata_subtype.obs['Cancer_type'].value_counts()

In [None]:
# NOTE(review): this path is under '/home/jupyter/sceval/' while the chemo
# splits earlier in the notebook are read from '/home/jupyter/scFM_eval/' —
# confirm the directory is correct.
cv_splits_file = '/home/jupyter/sceval/data_splits/brca_full/brca_subtype/cv_splits.json'
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(cv_splits_file) as splits_fh:
    cv_split_dict = json.load(splits_fh)

# Re-derive the fold definitions from the subtype splits. Without this the
# subtype CV loop below keeps using id_column / n_splits / train_ids_list /
# test_ids_list from the previously loaded (chemo) split file.
id_column, n_splits, train_ids_list, test_ids_list = get_splits_cv(cv_split_dict)

In [None]:
adata_subtype.obs['sample_id'] = adata_subtype.obs[id_column]
adata_subtype.obs['label'] = adata_subtype.obs['Cancer_type']


In [None]:
adata_subtype.obs['label']

In [None]:
adata_subtype.obs['label'] = adata_subtype.obs['label'].astype(str)

In [None]:
adata_subtype.obs['label'] 

In [None]:
label_map = {'ER+': 0, 'TNBC': 1}
adata_subtype.obs.label =adata_subtype.obs.label.map(label_map)

In [None]:
adata_subtype.obs.label

In [None]:
%%time
# Cell-level random-forest CV on adata_subtype, aggregated to patient level.
# id_column / n_splits / train_ids_list / test_ids_list are notebook globals
# produced by get_splits_cv.
# NOTE(review): make sure they were computed from the subtype split file
# before running this cell.
# train_classifier and get_patient_level are not defined in this notebook —
# presumably provided by train_utils.py.
embedding_col = 'X_geneformer'
metrics= []
for i in range(n_splits):
            print(f'---------- fold {i+1}----------')
            train_ids, test_ids = train_ids_list[i], test_ids_list[i]
            print('Split data')
            adata_train, adata_test = split_data(adata_subtype,  id_column, train_ids, test_ids)
            X_train, y_train = adata_train.obsm[embedding_col], adata_train.obs['label'].values
            X_test, y_test = adata_test.obsm[embedding_col], adata_test.obs['label'].values
        
            print('Train classifier')
            # The second and third return values are discarded here.
            model, _, _ , y_pred_train, y_pred_test, y_pred_score_train, y_pred_score_test = train_classifier(X_train, y_train, X_test, y_test, 
                                                                  model_name='random_forest')
                
            print('Evaluations')
            # Attach per-cell predictions to the test observations so they can
            # be aggregated by get_patient_level.
            adata_test.obs['pred'] = y_pred_test
            adata_test.obs['pred_score'] = y_pred_score_test[:, 1] #assume binary classification
            adata_test_df = adata_test.obs
            pred_df = get_patient_level( adata_test_df)
            fold_metrics = get_classification_metrics(pred_df)
            metrics.append(fold_metrics)

In [None]:
pd.DataFrame(metrics)

## Pre vs post (all cells)

In [None]:
adata.obs['pre_post'].value_counts()

In [None]:
# read cv splits
# split data
# run cv
# get patient-level predictions
# save predictions and metrics


In [None]:
import json

In [None]:
adata_tcell = adata[adata.obs.cell_types=='T_cell']

In [None]:
adata_tcell

In [None]:
split_data

In [None]:
cv_splits_file = '/home/jupyter/sceval/data_splits/brca_full/brca_pre_post/cv_splits.json'

In [None]:
# Context manager so the file handle is closed as soon as the JSON is parsed.
with open(cv_splits_file) as splits_fh:
    cv_split_dict = json.load(splits_fh)

In [None]:
cv_split_dict

In [None]:
id_column, n_splits, train_ids_list, test_ids_list = get_splits_cv(cv_split_dict)

In [None]:
id_column

In [None]:
adata_tcell.obs['sample_id'] = adata_tcell.obs[id_column]

label_map = {'Pre': 0, 'Post': 1}


In [None]:
# NOTE(review): no cell visible in this notebook assigns a 'Pre'/'Post'-valued
# `label` column to adata_tcell.obs. If `label` still holds the 0/1 cohort
# codes written by prepare_data, this map yields NaN for every row — confirm
# which column should feed the mapping.
adata_tcell.obs.label= adata_tcell.obs.label.map(label_map)

In [None]:
%%time
# Cell-level random-forest CV on adata_tcell, aggregated to patient level.
# Relies on the notebook globals id_column / n_splits / train_ids_list /
# test_ids_list from get_splits_cv.
# train_classifier and get_patient_level are not defined in this notebook —
# presumably provided by train_utils.py.
embedding_col = 'X_geneformer'
metrics= []
for i in range(n_splits):
            print(f'---------- fold {i+1}----------')
            train_ids, test_ids = train_ids_list[i], test_ids_list[i]
            print('Split data')
            adata_train, adata_test = split_data(adata_tcell,  id_column, train_ids, test_ids)
            # Labels are passed as pandas Series here (no .values), unlike the subtype loop.
            X_train, y_train = adata_train.obsm[embedding_col], adata_train.obs['label']
            X_test, y_test = adata_test.obsm[embedding_col], adata_test.obs['label']
        
            print('Train classifier')
            # y_train / y_test are rebound to whatever train_classifier returns.
            model, y_train, y_test, y_pred_train, y_pred_test, y_pred_score_train, y_pred_score_test = train_classifier(X_train, y_train, X_test, y_test, 
                                                                  model_name='random_forest')
                
            print('Evaluations')
            # Attach per-cell predictions to the test observations so they can
            # be aggregated by get_patient_level.
            adata_test.obs['pred'] = y_pred_test
            adata_test.obs['pred_score'] = y_pred_score_test[:, 1] #assume binary classification
            adata_test_df = adata_test.obs
            pred_df = get_patient_level( adata_test_df)
            fold_metrics = get_classification_metrics(pred_df)
            metrics.append(fold_metrics)
            
            
            

In [None]:
pd.DataFrame(metrics)

In [None]:
pd.DataFrame(metrics)['auprc'].mean()

In [None]:
adata.obs.cell_types.value_counts()

In [None]:
adata_cancer = adata[adata.obs.cell_types=='Cancer_cell']

In [None]:
adata_cancer

In [None]:
adata_cancer.obs['sample_id'] = adata_cancer.obs[id_column]

label_map = {'Pre': 0, 'Post': 1}

In [None]:
# NOTE(review): no cell visible in this notebook assigns a 'Pre'/'Post'-valued
# `label` column to adata_cancer.obs. If `label` still holds the 0/1 cohort
# codes written by prepare_data, this map yields NaN for every row — confirm
# which column should feed the mapping.
adata_cancer.obs.label= adata_cancer.obs.label.map(label_map)

In [None]:
%%time
# Cell-level random-forest CV on adata_cancer, aggregated to patient level.
# Relies on the notebook globals id_column / n_splits / train_ids_list /
# test_ids_list from get_splits_cv.
# train_classifier and get_patient_level are not defined in this notebook —
# presumably provided by train_utils.py.
embedding_col = 'X_geneformer'
metrics= []
for i in range(n_splits):
            print(f'---------- fold {i+1}----------')
            train_ids, test_ids = train_ids_list[i], test_ids_list[i]
            print('Split data')
            adata_train, adata_test = split_data(adata_cancer,  id_column, train_ids, test_ids)
            # Labels are passed as pandas Series here (no .values), unlike the subtype loop.
            X_train, y_train = adata_train.obsm[embedding_col], adata_train.obs['label']
            X_test, y_test = adata_test.obsm[embedding_col], adata_test.obs['label']
        
            print('Train classifier')
            # y_train / y_test are rebound to whatever train_classifier returns.
            model, y_train, y_test, y_pred_train, y_pred_test, y_pred_score_train, y_pred_score_test = train_classifier(X_train, y_train, X_test, y_test, 
                                                                  model_name='random_forest')
                
            print('Evaluations')
            # Attach per-cell predictions to the test observations so they can
            # be aggregated by get_patient_level.
            adata_test.obs['pred'] = y_pred_test
            adata_test.obs['pred_score'] = y_pred_score_test[:, 1] #assume binary classification
            adata_test_df = adata_test.obs
            pred_df = get_patient_level( adata_test_df)
            fold_metrics = get_classification_metrics(pred_df)
            metrics.append(fold_metrics)

In [None]:
pd.DataFrame(metrics)