In [1]:
import pandas as pd
import scanpy as sc
from CRISP.utils import rank_genes_groups_by_cov
import numpy as np
import CRISP.scFM as scFM

### Create anndata

In [None]:
# Raw data can be downloaded from the NeurIPS competition page on Kaggle: https://www.kaggle.com/competitions/open-problems-single-cell-perturbations/data?select=sample_submission.csv
# All paths are relative to the notebook's working directory.
# Per-observation metadata; later cells read its 'obs_id' column and attach the table as adata.obs.
obs_meta = pd.read_csv('raw/adata_obs_meta.csv')
# Expression table; later cells read its 'obs_id', 'gene' and 'normalized_count' columns
# (one row per stored matrix entry, judging by how it is consumed below).
adata_train = pd.read_parquet('raw/adata_train.parquet')
# Only the column names are used below: columns from position 5 onward define the gene list.
de_train = pd.read_parquet('raw/de_train.parquet')

In [None]:
# obs_id -> index label of that observation in obs_meta (used as the matrix row index below).
obs_idx_dict = {obs_id: row for obs_id, row in zip(obs_meta['obs_id'].values, obs_meta.index)}
# gene name -> 0-based column position; genes are the de_train columns from position 5 onward.
gene_idx_dict = {gene: col for col, gene in enumerate(list(de_train.columns)[5:])}

In [None]:
# Translate every obs_id into its matrix row index; an id missing from obs_idx_dict raises KeyError.
adata_train['obs_idx'] = adata_train['obs_id'].apply(obs_idx_dict.__getitem__)
def map_gene_idx(i):
    """Return the column index of gene ``i`` in ``gene_idx_dict``, or ``None`` if absent.

    Uses ``dict.get`` instead of a bare ``except:`` so that only a missing key
    yields ``None``; unrelated errors (e.g. an unhashable argument, or a
    KeyboardInterrupt during a long ``apply``) are no longer silently swallowed.
    """
    return gene_idx_dict.get(i)

In [None]:
# Attach the matrix column index of each row's gene; genes outside the gene list come back as missing.
adata_train['gene_idx'] = adata_train['gene'].apply(map_gene_idx)
# Keep only rows whose gene is in the gene list. The explicit .copy() makes adata_train_sub an
# independent frame, so the column assignment below does not trigger pandas' SettingWithCopyWarning.
adata_train_sub = adata_train[adata_train['gene_idx'].notna()].copy()
# With the missing values gone the indices can be stored as integers.
adata_train_sub['gene_idx'] = adata_train_sub['gene_idx'].astype(int)

In [None]:
from scipy.sparse import coo_matrix
from anndata import AnnData
# Assemble the observations x genes matrix from (value, (row, col)) triplets.
# The shape is given explicitly: when it is inferred, coo_matrix stops at the largest
# row/column index actually present, so an observation or gene at the end of the
# numbering with no stored entries would shrink the matrix and make it inconsistent
# with obs_meta / gene_idx_dict when those are attached to the AnnData object.
sparse_matrix = coo_matrix(
    (adata_train_sub['normalized_count'], (adata_train_sub['obs_idx'], adata_train_sub['gene_idx'])),
    shape=(len(obs_meta), len(gene_idx_dict)),
)
adata = AnnData(X=sparse_matrix)

In [None]:
# Store the expression matrix in CSC layout.
adata.X = adata.X.tocsc()
# Observation annotations come from obs_meta; observations are named by their obs_id.
adata.obs = obs_meta
adata.obs_names = obs_meta['obs_id'].values
# Genes are listed in gene_idx_dict insertion order, which is the column order used to build X.
adata.var_names = list(gene_idx_dict)
adata.var['gene_id'] = list(gene_idx_dict)

In [None]:
# Save the assembled AnnData object to 'adata_pp.h5ad' (relative to the working directory).
sc.write('adata_pp.h5ad',adata)

### Preprocess

In [None]:
# Load the AnnData object from 'adata_pp.h5ad'; this rebinds `adata`.
adata = sc.read('adata_pp.h5ad')

In [None]:
import re

def remove_non_alphanumeric(input_string):
    """Return ``input_string`` with every character other than a-z, A-Z and 0-9 removed."""
    disallowed = re.compile(r'[^a-zA-Z0-9]')
    return disallowed.sub('', input_string)

# Condition = compound name reduced to alphanumerics, with the DMSO vehicle given its short name.
# The three assignments are kept separate so each step reads back the stored column.
adata.obs['condition'] = adata.obs['sm_name']
adata.obs['condition'] = adata.obs['condition'].apply(remove_non_alphanumeric)
adata.obs['condition'] = adata.obs['condition'].replace('DimethylSulfoxide', 'DMSO')

# Dose rescaled by the largest dose in the dataset.
adata.obs['dose_val'] = adata.obs['dose_uM'].astype(float).pipe(lambda dose: dose / np.max(dose))

# Composite string labels: <cell_type>_<condition>[_<dose_val>].
adata.obs['cov_drug_dose_name'] = (
    adata.obs['cell_type'].astype(str) + '_'
    + adata.obs['condition'].astype(str) + '_'
    + adata.obs['dose_val'].astype(str)
)
adata.obs['cov_drug_name'] = (
    adata.obs['cell_type'].astype(str) + '_'
    + adata.obs['condition'].astype(str)
)
adata.obs['eval_category'] = adata.obs['cov_drug_name']

# Store the existing 'control' flag as integers.
adata.obs['control'] = adata.obs['control'].astype(int)

# <condition>_<dose_val> label, independent of cell type.
adata.obs['drug_dose_name'] = (
    adata.obs['condition'].astype(str) + '_'
    + adata.obs['dose_val'].astype(str)
)

# 1 for DMSO-treated observations, 0 otherwise.
adata.obs['neg_control'] = adata.obs['condition'].eq('DMSO').astype(int)

In [None]:
# Drop every (cell type, drug) group that has fewer than MIN_CELLS_PER_GROUP observations.
MIN_CELLS_PER_GROUP = 5
# value_counts() returns a Series indexed by group name. Filtering that Series directly
# avoids depending on the name of its value column once wrapped in a DataFrame: that
# column is 'cov_drug_name' in pandas < 2.0 but 'count' in pandas >= 2.0, where the
# former a['cov_drug_name'] lookup raises KeyError.
cov_drug_counts = adata.obs['cov_drug_name'].value_counts()
type_drug_less_index = cov_drug_counts[cov_drug_counts < MIN_CELLS_PER_GROUP].index
# Boolean-mask subsetting of the AnnData object; groups listed above are excluded.
adata_filtered = adata[~adata.obs['cov_drug_name'].isin(type_drug_less_index)]

In [None]:
# NOTE(review): rank_genes_groups_by_cov is a CRISP.utils helper; presumably it ranks genes for
# each 'cov_drug_name' group against the 'DMSO' control within the same 'cell_type' — confirm
# there, including where the results are stored on adata_filtered.
rank_genes_groups_by_cov(adata_filtered, groupby='cov_drug_name', covariate='cell_type', control_group='DMSO')

In [None]:
from rdkit import Chem
# Canonicalise each observation's SMILES string with RDKit.
smiles_list = adata_filtered.obs['SMILES'].apply(Chem.CanonSmiles)

In [None]:
# Placeholder path: point it at a local scGPT model directory before running this cell.
model_path = '/path/to/scGPT/model' # use blood (presumably the blood-pretrained scGPT checkpoint — confirm)
# NOTE(review): gene_name='gene_name' is passed here, but the AnnData construction earlier in this
# notebook only sets var_names and a 'gene_id' column on adata.var — confirm scFM.calc_gpt can resolve it.
# The output is presumably stored under the 'X_scGPT' key (return_key) — verify against CRISP.scFM.
adata_filtered = scFM.calc_gpt(adata_filtered,model_path,gene_name='gene_name',return_key='X_scGPT')

### Train-test-ood split

In [None]:
def split_dataset(adata, cell_types_inood, split_key, random_state=None,
                  test_frac=0.2, ood_control_train_frac=0.75):
    """Label every observation in ``adata.obs[split_key]`` as 'train', 'test' or 'ood'.

    Steps:
      1. Every observation starts as 'train'; those whose ``cell_type`` is in
         ``cell_types_inood`` become 'ood'.
      2. Among the remaining observations, ``test_frac`` of each
         (cell_type, donor_id, condition) group is relabelled 'test'.
      3. Among the 'ood' observations with ``neg_control == 1``,
         ``ood_control_train_frac`` of each group is moved back to 'train'.

    Parameters
    ----------
    adata : object exposing an ``obs`` DataFrame (e.g. AnnData) with the columns
        'cell_type', 'donor_id', 'condition' and 'neg_control'. Labels are written
        by index label, so the obs index is expected to be unique.
    cell_types_inood : collection of cell-type names held out as out-of-distribution.
    split_key : name of the obs column that receives the labels (overwritten if present).
    random_state : optional seed / random state forwarded to pandas sampling; the
        default ``None`` keeps the original unseeded behaviour.
    test_frac : fraction of each in-distribution group labelled 'test'.
    ood_control_train_frac : fraction of each held-out control group labelled 'train'.

    Returns
    -------
    The same ``adata`` object (``adata.obs`` is modified in place). Returning it
    keeps call sites of the form ``adata = split_dataset(adata, ...)`` working;
    without a return value they would rebind ``adata`` to ``None``.
    """
    strata = ['cell_type', 'donor_id', 'condition']

    def _sample_index(frame, frac):
        # Index labels of a random `frac` of the rows of every stratum in `frame`.
        if frame.empty:
            # Nothing to sample; also sidesteps group-wise sampling on an empty frame.
            return frame.index
        grouped = frame.groupby(strata, group_keys=False, observed=True)
        return grouped.sample(frac=frac, random_state=random_state).index

    # Step 1: default label, then hold out the requested cell types.
    adata.obs[split_key] = 'train'
    held_out = adata.obs['cell_type'].isin(cell_types_inood)
    # Single .loc[rows, column] assignment: the chained form obs[col].loc[rows] = ...
    # may write to a temporary copy and is flagged by pandas.
    adata.obs.loc[held_out, split_key] = 'ood'

    # Step 2: stratified test subset of everything that is not held out.
    in_distribution = adata.obs[adata.obs[split_key] != 'ood']
    adata.obs.loc[_sample_index(in_distribution, test_frac), split_key] = 'test'

    # Step 3: return part of the held-out negative controls to the training set.
    ood_controls = adata.obs[(adata.obs[split_key] == 'ood') & (adata.obs['neg_control'] == 1)]
    adata.obs.loc[_sample_index(ood_controls, ood_control_train_frac), split_key] = 'train'

    return adata


In [None]:
# Seed NumPy's global random state so the random assignment is reproducible
# (pandas sampling falls back to it when no random_state is supplied).
np.random.seed(0)

# obs column name -> cell types held out as out-of-distribution for that split.
OOD_SPLITS = {
    'split': ['Myeloid cells', 'T regulatory cells'],
    'split2': ['T cells CD4+', 'B cells'],
    'split3': ['T cells CD8+', 'NK cells'],
}
# split_dataset writes the labels into adata.obs in place, so it is called for its side
# effect only. Its result is deliberately not assigned back to `adata`: doing so would
# replace the dataset with None whenever the function has no return value, and the next
# call would then fail.
for split_key, ood_cell_types in OOD_SPLITS.items():
    split_dataset(adata, ood_cell_types, split_key)

In [6]:
# Observations with neg_control == 0, counted per 'split' label and cell type.
perturbed_obs = adata[adata.obs['neg_control'] == 0].obs
pd.crosstab(perturbed_obs['split'], perturbed_obs['cell_type'])

cell_type,B cells,Myeloid cells,NK cells,T cells CD4+,T cells CD8+,T regulatory cells
split,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ood,0,11264,0,0,0,7418
test,2217,0,10675,22849,2849,0
train,8863,0,42703,91344,11415,0


In [7]:
# Observations with neg_control == 0, counted per 'split2' label and cell type.
perturbed_obs = adata[adata.obs['neg_control'] == 0].obs
pd.crosstab(perturbed_obs['split2'], perturbed_obs['cell_type'])

cell_type,B cells,Myeloid cells,NK cells,T cells CD4+,T cells CD8+,T regulatory cells
split2,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ood,11080,0,0,114193,0,0
test,0,2253,10684,0,2854,1491
train,0,9011,42694,0,11410,5927


In [8]:
# Observations with neg_control == 0, counted per 'split3' label and cell type.
perturbed_obs = adata[adata.obs['neg_control'] == 0].obs
pd.crosstab(perturbed_obs['split3'], perturbed_obs['cell_type'])

cell_type,B cells,Myeloid cells,NK cells,T cells CD4+,T cells CD8+,T regulatory cells
split3,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ood,0,0,53378,0,14264,0
test,2214,2253,0,22846,0,1491
train,8866,9011,0,91347,0,5927
