In [1]:
import anndata as ad
from os.path import join
import pandas as pd
import json
import os

In [2]:
import random
import numpy as np
import torch

In [3]:
def set_random_seed(seed: int, deterministic: bool = True) -> None:
    """
    Seed every random number generator used in this notebook.

    The same seed is applied, in order, to Python's `random`, NumPy's legacy
    global generator, and PyTorch (CPU, current CUDA device, all CUDA devices).

    Args:
        seed (int): Seed value handed to each generator.
        deterministic (bool): When True, also switch cuDNN to deterministic
            mode and turn off its benchmark auto-tuner.
    """
    # Order matters only for reproducing the original call sequence.
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed,
        torch.cuda.manual_seed_all,  # covers multi-GPU setups
    )
    for apply_seed in seeders:
        apply_seed(seed)

    if deterministic:
        torch.backends.cudnn.deterministic = True
        torch.backends.cudnn.benchmark = False

    print(f"Random seed set to: {seed}")

In [4]:
set_random_seed(42)

Random seed set to: 42


In [5]:
%run train_utils.py

## Chemo (cancer cells only)

In [6]:
def load_data(base_dir, model_name):
    """
    Read `<base_dir>/<model_name>/data.h5ad` and keep only pre-treatment cells.

    Args:
        base_dir: Directory that holds one sub-directory per model.
        model_name: Name of the model sub-directory to read from.

    Returns:
        An independent copy of the loaded AnnData restricted to the rows
        whose `obs['timepoint']` equals 'Pre'.
    """
    h5ad_path = join(base_dir, model_name, 'data.h5ad')
    full_adata = ad.read_h5ad(h5ad_path)
    is_pre = full_adata.obs['timepoint'] == 'Pre'
    # .copy() turns the view produced by the boolean slice into a standalone object.
    return full_adata[is_pre].copy()

def prepare_data(adata, id_column):
    """
    Add the `sample_id` and `label` columns that the training helpers read.

    The object is modified in place and also returned.

    Args:
        adata: Object exposing a pandas-like `.obs` table with a `cohort`
            column and the column named by `id_column`.
        id_column: Name of the `.obs` column copied into `sample_id`.

    Returns:
        The same `adata` object, with `obs['sample_id']` set and
        `obs['label']` holding 0 for 'treatment_naive' and 1 for
        'neoadjuvant_chemo' (any other cohort value maps to NaN).
    """
    cohort_to_label = {'treatment_naive': 0, 'neoadjuvant_chemo': 1}

    adata.obs['sample_id'] = adata.obs[id_column]
    # Cast to str first so the dictionary lookup works on categorical columns too.
    adata.obs['label'] = adata.obs['cohort'].astype(str).map(cohort_to_label)

    return adata

def run_cv(adata, train_func, cv_split_dict, embedding_col='X_geneformer', model_name='random_forest'):
    """
    Run cross-validation over predefined splits and gather predictions and metrics.

    For every fold the data is split with `split_data`, `train_func` is called
    on the two partitions, and both prediction tables are scored with
    `get_classification_metrics`.

    Args:
        adata: Annotated data object handed to `split_data` for each fold.
        train_func: Callable invoked as
            `train_func(adata_train, adata_test, embedding_col, model_name=...)`
            and returning `(pred_df_test, pred_df_train)`.
        cv_split_dict: Split description understood by `get_splits_cv`.
        embedding_col (str): Key in `.obsm` holding the features to train on.
        model_name (str): Classifier name forwarded to `train_func`. Defaults to
            'random_forest', the value that used to be hard-coded here.

    Returns:
        tuple: `(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df)`.
        The prediction frames are the per-fold tables stacked with an extra
        `fold` column ('fold_1', 'fold_2', ...); the metric frames have one
        row per fold.

    Raises:
        ValueError: If the split description contains no folds.
        KeyError: If `embedding_col` is missing from a fold's `.obsm`.
    """
    id_column, n_splits, train_ids_list, test_ids_list = get_splits_cv(cv_split_dict)

    # Without folds pd.concat below would fail with an opaque message.
    if n_splits < 1:
        raise ValueError('cv_split_dict does not define any cross-validation folds')

    metrics_test = []
    metrics_train = []
    predictions_test = {}
    predictions_train = {}
    for i in range(n_splits):
        fold = i + 1
        print(f'---------- fold {fold}----------')
        train_ids, test_ids = train_ids_list[i], test_ids_list[i]
        print('Split data')
        adata_train, adata_test = split_data(adata, id_column, train_ids, test_ids)

        # Fail before training when the requested embedding is absent
        # (the original code raised KeyError at this point as well).
        for part in (adata_train, adata_test):
            if embedding_col not in part.obsm:
                raise KeyError(embedding_col)

        print('Train classifier')
        pred_df_test, pred_df_train = train_func(
            adata_train, adata_test, embedding_col, model_name=model_name
        )

        print('Evaluations')
        metrics_test.append(get_classification_metrics(pred_df_test))
        metrics_train.append(get_classification_metrics(pred_df_train))

        predictions_train[f'fold_{fold}'] = pred_df_train
        predictions_test[f'fold_{fold}'] = pred_df_test

    # Dict keys become the outer index level; move that level into a 'fold' column.
    pred_test_df = pd.concat(predictions_test, names=['fold']).reset_index(level=0)
    pred_train_df = pd.concat(predictions_train, names=['fold']).reset_index(level=0)
    metrics_test_df = pd.DataFrame(metrics_test)
    metrics_train_df = pd.DataFrame(metrics_train)

    return pred_train_df, pred_test_df, metrics_train_df, metrics_test_df

def save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, save_dir, model_name, prefix):
    """
    Write the four cross-validation result tables as CSV files.

    Files are placed in `<save_dir>/<model_name>/` (created if needed) and
    named `<prefix>_pred_train.csv`, `<prefix>_pred_test.csv`,
    `<prefix>_metrics_train.csv` and `<prefix>_metrics_test.csv`.

    Args:
        pred_train_df: Training-set predictions.
        pred_test_df: Test-set predictions.
        metrics_train_df: Per-fold training metrics.
        metrics_test_df: Per-fold test metrics.
        save_dir: Root output directory.
        model_name: Name of the sub-directory for this model.
        prefix: File-name prefix identifying the training strategy.
    """
    model_saving_dir = join(save_dir, model_name)
    os.makedirs(model_saving_dir, exist_ok=True)
    for echoed in (save_dir, model_name):
        print(echoed)

    # (table, file name) pairs, written in this order.
    targets = [
        (pred_train_df, f'{prefix}_pred_train.csv'),
        (pred_test_df, f'{prefix}_pred_test.csv'),
        (metrics_train_df, f'{prefix}_metrics_train.csv'),
        (metrics_test_df, f'{prefix}_metrics_test.csv'),
    ]
    for frame, file_name in targets:
        frame.to_csv(join(model_saving_dir, file_name))
    
def get_embd_key(model_name):
    """
    Map a model name to the `.obsm` key under which its embedding is stored.

    Matching is by case-sensitive substring. When several patterns occur in
    the same name, the one listed last in the table below wins.

    Args:
        model_name (str): Model identifier, e.g. 'scgpt_cancer' or
            'Geneformer-V2-104M'.

    Returns:
        str: The embedding key, e.g. 'X_scGPT' or 'X_geneformer'.

    Raises:
        ValueError: If no known pattern occurs in `model_name`. The original
            code failed here with an UnboundLocalError on `key`.
    """
    # (substring, key) pairs in ascending priority.
    patterns = (
        ('pca', 'X_pca'),
        ('hvg', 'X_hvg'),
        ('scvi', 'X_scVI'),
        ('scgpt', 'X_scGPT'),
        ('Geneformer', 'X_geneformer'),
        ('gf', 'X_geneformer'),
        ('scfoundation', 'X_scfoundation'),
        ('scimilarity', 'X_scimilarity'),
        ('cellplm', 'X_cellplm'),
    )
    # Scan from highest priority down so the first hit is the winning entry.
    for token, key in reversed(patterns):
        if token in model_name:
            return key

    known = ', '.join(token for token, _ in patterns)
    raise ValueError(
        f'Cannot determine embedding key for model {model_name!r}; '
        f'expected the name to contain one of: {known}'
    )

In [7]:
saving_dir ='./outcomes/chemo'
os.makedirs(saving_dir, exist_ok=True)

In [51]:
# base_dir ='/home/jupyter/__output_clean/brca_full/cancer_cells'
base_dir ='/home/jupyter/__output_clean/brca_full/chemo'
# base_dir ='/home/jupyter/__output_clean/brca_full/all_cells'


In [52]:
# gf_names = [ 'gf-6L-30M-i2048',  'Geneformer-V2-104M', 'Geneformer-V2-104M_CLcancer', 'Geneformer-V2-316M']
# others= ['scfoundation', 'scimilarity', 'cellplm']
# model_names = [ 'hvg',  'pca', 'scvi',  'scgpt', 'scgpt_cancer'] + gf_names

In [53]:
# model_names = others + ['Geneformer-V2-104M']

In [54]:
# model_names= ['gf-6L-30M-i2048', 'gf-6L-30M-i2048_test']

In [55]:
model_names= ['scimilarity']

In [56]:
# cv_splits_file = '/home/jupyter/scFM_eval/data_splits/brca_full/brca_chemo/cv_splits.json'
cv_splits_file = '/home/jupyter/scFM_eval/data_splits/brca_full/brca_chemo/cv_splits_oversampled.json'
# Read the CV split definition; the context manager closes the file handle
# instead of leaving it open for the lifetime of the kernel.
with open(cv_splits_file) as splits_fh:
    cv_split_dict = json.load(splits_fh)

In [57]:
id_column, n_splits, train_ids_list, test_ids_list = get_splits_cv(cv_split_dict)

In [58]:
# adata.obs['label'].dtype

In [59]:
# adata.obs['label'].unique()

In [60]:
# adata

In [61]:
# test_ids_fold_0 = test_ids_list[0]

In [62]:
# id_column

In [63]:
# adata_test = adata[adata.obs[id_column].isin(test_ids_fold_0)]

In [64]:
# adata.obs.groupby('donor_id')['cell_id'].count()

In [65]:
# adata_test.obs.groupby('donor_id')['cell_id'].count()

In [66]:
'X_scimilarity' in adata.obsm

True

In [67]:
# Cross-validate the `__train_mil` trainer for every model in `model_names` and
# collect each model's fold-averaged test metrics.
# `__train_mil` is not defined in this notebook; presumably it comes from
# train_utils.py loaded via %run. Also reads the notebook globals
# base_dir, id_column, cv_split_dict and saving_dir.
metrics_test_list= [] 
for m in model_names:
    print(m)
    embedding_col = get_embd_key(m)
    adata = load_data(base_dir,m)
    adata = prepare_data(adata, id_column)
    # No stored embedding under this key: fall back to the matrix in adata.X.
    if not embedding_col in adata.obsm:
        adata.obsm[embedding_col] = adata.X
    
    pred_train_df, pred_test_df, metrics_train_df, metrics_test_df = run_cv(adata, __train_mil,  cv_split_dict,embedding_col )
    # Per-model outputs go to saving_dir/<m>/ with the 'mil' file prefix.
    save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, saving_dir, model_name=m, prefix='mil')
    # Average the numeric metric columns over the folds -> one Series per model.
    dd = metrics_test_df.mean(numeric_only=True)
    metrics_test_list.append(dd)
    
# One row per model, one column per metric.
results = pd.DataFrame(metrics_test_list, index = model_names)
# results.to_csv(join(saving_dir, 'mil_metrics_test.csv' ))
results

scimilarity
---------- fold 1----------
Split data
Train classifier
Training model (Multi instance Learning (MIL))
Epoch 001 | train_loss=0.6581 | train_loss=0.6581 | no_improve=0/20
Epoch 002 | train_loss=0.6044 | train_loss=0.6044 | no_improve=0/20
Epoch 003 | train_loss=0.5593 | train_loss=0.5593 | no_improve=0/20
Epoch 004 | train_loss=0.5429 | train_loss=0.5429 | no_improve=0/20
Epoch 005 | train_loss=0.5309 | train_loss=0.5309 | no_improve=0/20
Epoch 006 | train_loss=0.5256 | train_loss=0.5256 | no_improve=0/20
Epoch 007 | train_loss=0.5317 | train_loss=0.5317 | no_improve=1/20
Epoch 008 | train_loss=0.5208 | train_loss=0.5208 | no_improve=0/20
Epoch 009 | train_loss=0.5198 | train_loss=0.5198 | no_improve=0/20
Epoch 010 | train_loss=0.5174 | train_loss=0.5174 | no_improve=0/20
Epoch 011 | train_loss=0.5135 | train_loss=0.5135 | no_improve=0/20
Epoch 012 | train_loss=0.5031 | train_loss=0.5031 | no_improve=0/20
Epoch 013 | train_loss=0.4995 | train_loss=0.4995 | no_improve=0/20
E

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
scimilarity,0.821429,0.4,0.4,0.366667,0.666667,0.641667


In [68]:
results# chemo

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
scimilarity,0.821429,0.4,0.4,0.366667,0.666667,0.641667


In [47]:
results# all cells

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
scimilarity,0.742857,0.3,0.4,0.333333,0.688095,0.44


In [24]:
results# cancer

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
scimilarity,0.796429,0.366667,0.4,0.333333,0.683333,0.648333


In [23]:
results

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
gf-6L-30M-i2048,0.771429,0.2,0.2,0.2,0.633333,0.506667
gf-6L-30M-i2048_test,0.771429,0.2,0.2,0.2,0.633333,0.506667


In [21]:
results

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
gf-6L-30M-i2048,0.796429,0.0,0.0,0.0,0.833333,0.683333
gf-6L-30M-i2048_test,0.796429,0.0,0.0,0.0,0.733333,0.623333


In [69]:
# Cross-validate the `__train_avg_expression` trainer for every model in
# `model_names` and collect each model's fold-averaged test metrics.
# `__train_avg_expression` is not defined in this notebook; presumably it comes
# from train_utils.py loaded via %run. Also reads the notebook globals
# base_dir, id_column, cv_split_dict and saving_dir.
metrics_test_list= [] 
for m in model_names:
    embedding_col = get_embd_key(m)
    adata = load_data(base_dir,m)
    adata = prepare_data(adata, id_column)
    # No stored embedding under this key: fall back to the matrix in adata.X.
    if not embedding_col in adata.obsm:
        adata.obsm[embedding_col] = adata.X
        
    pred_train_df, pred_test_df, metrics_train_df, metrics_test_df = run_cv(adata, __train_avg_expression,  cv_split_dict, embedding_col)
    # Per-model outputs go to saving_dir/<m>/ with the 'avg' file prefix.
    save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, saving_dir, model_name=m, prefix='avg')
    # Average the numeric metric columns over the folds -> one Series per model.
    dd = metrics_test_df.mean(numeric_only=True)
    metrics_test_list.append(dd)

# One row per model, one column per metric.
results = pd.DataFrame(metrics_test_list, index = model_names)
# results.to_csv(join(saving_dir, 'avg_metrics_test.csv' ))
results

---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(7, 2) (32,)
Evaluations
./outcomes/chemo
scimilarity


Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
scimilarity,0.821429,0.2,0.1,0.133333,0.671429,0.538333


In [70]:
results# chemo

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
scimilarity,0.821429,0.2,0.1,0.133333,0.671429,0.538333


In [50]:
results #all cells

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
scimilarity,0.771429,0.0,0.0,0.0,0.671429,0.496667


In [26]:
results #cancer

Unnamed: 0,accuracy,precision,recall,f1_score,roc_auc,auprc
scimilarity,0.821429,0.2,0.1,0.133333,0.688095,0.538333


In [21]:
results.to_csv(join(saving_dir, 'avg_metrics_test.csv' ))


In [24]:
# Cross-validate the `__train_vote` trainer for every model in `model_names`
# and write the fold-averaged test metrics to 'vote_metrics_test.csv'.
# `__train_vote` is not defined in this notebook; presumably it comes from
# train_utils.py loaded via %run. Also reads the notebook globals
# base_dir, id_column, cv_split_dict and saving_dir.
# Unlike the MIL cell, there is no fallback to adata.X here: the embedding key
# must already exist in adata.obsm.
metrics_test_list= [] 
for m in model_names:
    embedding_col = get_embd_key(m)
    adata = load_data(base_dir,m)
    adata = prepare_data(adata, id_column)
    
    pred_train_df, pred_test_df, metrics_train_df, metrics_test_df = run_cv(adata, __train_vote,  cv_split_dict, embedding_col)
    # Per-model outputs go to saving_dir/<m>/ with the 'vote' file prefix.
    save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, saving_dir, model_name=m, prefix='vote')
    # Average the numeric metric columns over the folds -> one Series per model.
    dd = metrics_test_df.mean(numeric_only=True)
    metrics_test_list.append(dd)
    
# One row per model, one column per metric; saved directly under saving_dir.
results = pd.DataFrame(metrics_test_list, index = model_names)
results.to_csv(join(saving_dir, 'vote_metrics_test.csv' ))

---------- fold 1----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(3875, 2) (14787,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    8718
1    6069
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2565
1    1310
Name: count, dtype: int64
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4200, 2) (14462,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    8076
1    6386
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
1    2448
0    1752
Name: count, dtype: int64
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4216, 2) (14446,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10340
1     4106
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2746
1    1470
Name: count, dtype: int64
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(2285, 2) (16377,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    9680
1    6697
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    1568
1     717
Name: count, dtype: int64
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4086, 2) (14576,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10138
1     4438
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3056
1    1030
Name: count, dtype: int64
Evaluations
./outcomes/chemo
gf-6L-30M-i2048
---------- fold 1----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(3875, 2) (14787,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    8755
1    6032
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2565
1    1310
Name: count, dtype: int64
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4200, 2) (14462,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    8059
1    6403
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
1    2443
0    1757
Name: count, dtype: int64
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4216, 2) (14446,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10326
1     4120
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    2692
1    1524
Name: count, dtype: int64
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(2285, 2) (16377,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    9663
1    6714
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    1578
1     707
Name: count, dtype: int64
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Majority Vote)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
Training model
(4086, 2) (14576,)


  adata_test.obs['pred'] = model.predict(X_test)
  adata_train.obs['pred'] = model.predict(X)


Cell-level prediction distribution (train):
pred
0    10151
1     4425
Name: count, dtype: int64
Cell-level prediction distribution (test):
pred
0    3073
1    1013
Name: count, dtype: int64
Evaluations
./outcomes/chemo
gf-6L-30M-i2048_test


In [77]:
# Cross-validate the `__train_avg_expression` trainer for every model in
# `model_names` and write the fold-averaged test metrics to
# 'avg_metrics_test.csv'.
# `__train_avg_expression` is not defined in this notebook; presumably it comes
# from train_utils.py loaded via %run. Also reads the notebook globals
# base_dir, id_column, cv_split_dict and saving_dir.
# There is no fallback to adata.X in this cell: the embedding key must already
# exist in adata.obsm.
metrics_test_list= [] 
for m in model_names:
    embedding_col = get_embd_key(m)
    adata = load_data(base_dir,m)
    adata = prepare_data(adata, id_column)

    pred_train_df, pred_test_df, metrics_train_df, metrics_test_df = run_cv(adata, __train_avg_expression,  cv_split_dict, embedding_col)
    # Per-model outputs go to saving_dir/<m>/ with the 'avg' file prefix.
    save_results(pred_train_df, pred_test_df, metrics_train_df, metrics_test_df, saving_dir, model_name=m, prefix='avg')
    # Average the numeric metric columns over the folds -> one Series per model.
    dd = metrics_test_df.mean(numeric_only=True)
    metrics_test_list.append(dd)

# One row per model, one column per metric; saved directly under saving_dir.
results = pd.DataFrame(metrics_test_list, index = model_names)
results.to_csv(join(saving_dir, 'avg_metrics_test.csv' ))

---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(7, 2) (32,)
Evaluations
./outcomes/chemo
scfoundation
---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]
(7, 2) (32,)
Evaluations


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


./outcomes/chemo
scimilarity
---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(7, 2) (32,)
Evaluations
./outcomes/chemo
cellplm
---------- fold 1----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 2----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 3----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 4----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(8, 2) (31,)
Evaluations
---------- fold 5----------
Split data
Train classifier
Training model (Average Embedding Per Sample)
Unique values in train label: [0 1]
Unique values in test label: [0 1]


  mean_emb = df_emb.groupby(df_emb.index).mean()
  mean_emb = df_emb.groupby(df_emb.index).mean()


(7, 2) (32,)
Evaluations
./outcomes/chemo
Geneformer-V2-104M


In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names).to_csv(join(saving_dir, 'vote_metrics_test.csv' ))

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
pd.DataFrame(metrics_test_list, index = model_names).to_csv(join(saving_dir, 'avg_metrics_test.csv' ))

In [None]:
pd.DataFrame(metrics_test_list, index = model_names)

In [None]:
%%time


In [None]:
pred_train_df  = pd.concat(predictions_train, names=['fold']).reset_index(level=0)
pred_train_df.to_csv(join(saving_dir, 'vote_pred_train.csv'))

In [None]:
pred_train_df  = pd.concat(predictions_train, names=['fold']).reset_index(level=0)
pred_train_df.to_csv(join(saving_dir, 'vote_pred_train.csv'))

In [None]:
pd.DataFrame(metrics_test).mean(numeric_only=True)

In [None]:
pd.DataFrame(metrics_test).to_csv(join(saving_dir, 'vote_metrics_test.csv'))

In [None]:
pd.DataFrame(metrics_train).to_csv(join(saving_dir, 'vote_metrics_train.csv'))

In [None]:
pd.DataFrame(metrics)['auprc'].mean()

In [None]:
pd.DataFrame(metrics)

In [None]:
pd.DataFrame(metrics)['auprc'].mean()

## Subtype

In [None]:
adata.obs['cohort'].value_counts()

In [None]:
adata_subtype = adata[adata.obs['cohort'] == 'treatment_naive'].copy()


In [None]:
adata_subtype.obs['timepoint'].value_counts()

In [None]:
adata_subtype.obs['timepoint'].value_counts()

In [None]:
adata_subtype = adata_subtype[adata_subtype.obs['timepoint'] == 'Pre']


In [None]:
adata_subtype.obs

In [None]:
adata_subtype.obs['Cancer_type'].value_counts()

In [None]:
adata_subtype = adata_subtype[adata_subtype.obs['Cancer_type'].isin( ['ER+', 'TNBC'])]


In [None]:
adata_subtype.obs['Cancer_type'].value_counts()

In [None]:
cv_splits_file = '/home/jupyter/sceval/data_splits/brca_full/brca_subtype/cv_splits.json'
# Read the subtype split definition; the context manager closes the file handle.
with open(cv_splits_file) as splits_fh:
    cv_split_dict = json.load(splits_fh)

# Re-derive the split variables from the file just loaded. Without this, the
# subtype cells below would keep using id_column / n_splits / train_ids_list /
# test_ids_list computed earlier from the chemo split file.
id_column, n_splits, train_ids_list, test_ids_list = get_splits_cv(cv_split_dict)

In [None]:
adata_subtype.obs['sample_id'] = adata_subtype.obs[id_column]
adata_subtype.obs['label'] = adata_subtype.obs['Cancer_type']


In [None]:
adata_subtype.obs['label']

In [None]:
adata_subtype.obs['label'] = adata_subtype.obs['label'].astype(str)

In [None]:
adata_subtype.obs['label'] 

In [None]:
label_map = {'ER+': 0, 'TNBC': 1}
adata_subtype.obs.label =adata_subtype.obs.label.map(label_map)

In [None]:
adata_subtype.obs.label

In [None]:
%%time
# Cross-validated random-forest classification on `adata_subtype`, using the
# features stored in obsm['X_geneformer'] and the 0/1 values in obs['label'].
# n_splits, train_ids_list, test_ids_list and id_column are notebook globals
# returned by get_splits_cv; NOTE(review): verify they were derived from the
# subtype split file before running this cell.
embedding_col = 'X_geneformer'
# One entry per fold, as returned by get_classification_metrics.
metrics= []
for i in range(n_splits):
            print(f'---------- fold {i+1}----------')
            train_ids, test_ids = train_ids_list[i], test_ids_list[i]
            print('Split data')
            adata_train, adata_test = split_data(adata_subtype,  id_column, train_ids, test_ids)
            X_train, y_train = adata_train.obsm[embedding_col], adata_train.obs['label'].values
            X_test, y_test = adata_test.obsm[embedding_col], adata_test.obs['label'].values
        
            print('Train classifier')
            # Second and third return values are discarded here.
            model, _, _ , y_pred_train, y_pred_test, y_pred_score_train, y_pred_score_test = train_classifier(X_train, y_train, X_test, y_test, 
                                                                  model_name='random_forest')
                
            print('Evaluations')
            # Attach the test predictions to the test partition's obs table.
            adata_test.obs['pred'] = y_pred_test
            # Column 1 of the score array is used as the score of label 1.
            adata_test.obs['pred_score'] = y_pred_score_test[:, 1] #assume binary classification
            adata_test_df = adata_test.obs
            # get_patient_level is defined elsewhere; presumably it aggregates
            # the per-row predictions per sample_id - TODO confirm.
            pred_df = get_patient_level( adata_test_df)
            fold_metrics = get_classification_metrics(pred_df)
            metrics.append(fold_metrics)

In [None]:
pd.DataFrame(metrics)

## Pre vs post (all cells)

In [None]:
adata.obs['pre_post'].value_counts()

In [None]:
# read cv splits
# split data
# run cv
# get patient-level predictions
# save predictions and metrics


In [None]:
import json

In [None]:
adata_tcell = adata[adata.obs.cell_types=='T_cell']

In [None]:
adata_tcell

In [None]:
split_data

In [None]:
cv_splits_file = '/home/jupyter/sceval/data_splits/brca_full/brca_pre_post/cv_splits.json'

In [None]:
# Read the pre/post split definition; the context manager closes the file handle.
with open(cv_splits_file) as splits_fh:
    cv_split_dict = json.load(splits_fh)

In [None]:
cv_split_dict

In [None]:
id_column, n_splits, train_ids_list, test_ids_list = get_splits_cv(cv_split_dict)

In [None]:
id_column

In [None]:
adata_tcell.obs['sample_id'] = adata_tcell.obs[id_column]

label_map = {'Pre': 0, 'Post': 1}


In [None]:
# Replace obs['label'] with its Pre/Post encoding (label_map: 'Pre' -> 0, 'Post' -> 1).
# NOTE(review): adata_tcell is sliced from `adata`, whose `label` column is set
# by prepare_data to 0/1 cohort codes; mapping those through a dict keyed by
# 'Pre'/'Post' would yield only NaN. Confirm which column should be mapped
# (possibly obs['pre_post']).
adata_tcell.obs.label= adata_tcell.obs.label.map(label_map)

In [None]:
%%time
# Cross-validated random-forest classification on `adata_tcell`, using the
# features stored in obsm['X_geneformer'] and the values in obs['label'].
# n_splits, train_ids_list, test_ids_list and id_column are notebook globals
# returned by get_splits_cv.
embedding_col = 'X_geneformer'
# One entry per fold, as returned by get_classification_metrics.
metrics= []
for i in range(n_splits):
            print(f'---------- fold {i+1}----------')
            train_ids, test_ids = train_ids_list[i], test_ids_list[i]
            print('Split data')
            adata_train, adata_test = split_data(adata_tcell,  id_column, train_ids, test_ids)
            X_train, y_train = adata_train.obsm[embedding_col], adata_train.obs['label']
            X_test, y_test = adata_test.obsm[embedding_col], adata_test.obs['label']
        
            print('Train classifier')
            # y_train / y_test are rebound to the values returned by train_classifier.
            model, y_train, y_test, y_pred_train, y_pred_test, y_pred_score_train, y_pred_score_test = train_classifier(X_train, y_train, X_test, y_test, 
                                                                  model_name='random_forest')
                
            print('Evaluations')
            # Attach the test predictions to the test partition's obs table.
            adata_test.obs['pred'] = y_pred_test
            # Column 1 of the score array is used as the score of label 1.
            adata_test.obs['pred_score'] = y_pred_score_test[:, 1] #assume binary classification
            adata_test_df = adata_test.obs
            # get_patient_level is defined elsewhere; presumably it aggregates
            # the per-row predictions per sample_id - TODO confirm.
            pred_df = get_patient_level( adata_test_df)
            fold_metrics = get_classification_metrics(pred_df)
            metrics.append(fold_metrics)
            
            
            

In [None]:
pd.DataFrame(metrics)

In [None]:
pd.DataFrame(metrics)['auprc'].mean()

In [None]:
adata.obs.cell_types.value_counts()

In [None]:
adata_cancer = adata[adata.obs.cell_types=='Cancer_cell']

In [None]:
adata_cancer

In [None]:
adata_cancer.obs['sample_id'] = adata_cancer.obs[id_column]

label_map = {'Pre': 0, 'Post': 1}

In [None]:
# Replace obs['label'] with its Pre/Post encoding (label_map: 'Pre' -> 0, 'Post' -> 1).
# NOTE(review): adata_cancer is sliced from `adata`, whose `label` column is set
# by prepare_data to 0/1 cohort codes; mapping those through a dict keyed by
# 'Pre'/'Post' would yield only NaN. Confirm which column should be mapped
# (possibly obs['pre_post']).
adata_cancer.obs.label= adata_cancer.obs.label.map(label_map)

In [None]:
%%time
# Cross-validated random-forest classification on `adata_cancer`, using the
# features stored in obsm['X_geneformer'] and the values in obs['label'].
# n_splits, train_ids_list, test_ids_list and id_column are notebook globals
# returned by get_splits_cv.
embedding_col = 'X_geneformer'
# One entry per fold, as returned by get_classification_metrics.
metrics= []
for i in range(n_splits):
            print(f'---------- fold {i+1}----------')
            train_ids, test_ids = train_ids_list[i], test_ids_list[i]
            print('Split data')
            adata_train, adata_test = split_data(adata_cancer,  id_column, train_ids, test_ids)
            X_train, y_train = adata_train.obsm[embedding_col], adata_train.obs['label']
            X_test, y_test = adata_test.obsm[embedding_col], adata_test.obs['label']
        
            print('Train classifier')
            # y_train / y_test are rebound to the values returned by train_classifier.
            model, y_train, y_test, y_pred_train, y_pred_test, y_pred_score_train, y_pred_score_test = train_classifier(X_train, y_train, X_test, y_test, 
                                                                  model_name='random_forest')
                
            print('Evaluations')
            # Attach the test predictions to the test partition's obs table.
            adata_test.obs['pred'] = y_pred_test
            # Column 1 of the score array is used as the score of label 1.
            adata_test.obs['pred_score'] = y_pred_score_test[:, 1] #assume binary classification
            adata_test_df = adata_test.obs
            # get_patient_level is defined elsewhere; presumably it aggregates
            # the per-row predictions per sample_id - TODO confirm.
            pred_df = get_patient_level( adata_test_df)
            fold_metrics = get_classification_metrics(pred_df)
            metrics.append(fold_metrics)

In [None]:
pd.DataFrame(metrics)