In [2]:
# Enable IPython's autoreload extension in mode 2 so that imported modules are
# re-imported before each cell executes (edits to local code are picked up
# without restarting the kernel).
%load_ext autoreload
%autoreload 2

In [3]:
from gears import PertData, GEARS
import numpy as np
import pandas as pd
from tqdm import tqdm
import scanpy as sc
import anndata
from scipy.sparse import csr_matrix

In [4]:
# Directory the raw .h5ad file is read from below; this is an absolute,
# machine-specific path, so adjust it to your own environment.
data_dir = '/mlbio_scratch/vinas/sc_perturbation_benchmark/data/replogle_k562_v3_2022'

#### Download data

* Download data from https://plus.figshare.com/articles/dataset/_Mapping_information-rich_genotype-phenotype_landscapes_with_genome-scale_Perturb-seq_Replogle_et_al_2022_processed_Perturb-seq_datasets/20029387 
    * Files:
        * K562_essential_raw_singlecell.h5ad

In [5]:
# Load the downloaded K562 "essential" raw single-cell file into an AnnData object.
adata = anndata.read_h5ad(f'{data_dir}/K562_essential_raw_singlecell.h5ad')

#### scGPT processing
Select highly variable genes, then subsample to at most 100 cells per perturbation and 2500 control cells

In [6]:
# Normalise and log-transform the counts, then flag the 5000 most variable genes.
# NOTE: both preprocessing calls modify `adata` in place, so re-running this
# cell without reloading `adata` applies them a second time.
sc.pp.normalize_total(adata)
sc.pp.log1p(adata)
sc.pp.highly_variable_genes(adata, n_top_genes=5000, subset=False)

# Rows to keep: non-targeting controls plus cells whose perturbed gene is
# listed in `adata.var['gene_name']`.
measured_genes = adata.var['gene_name'].values.tolist()
condition_flag = adata.obs['gene'].isin(measured_genes + ['non-targeting']).values

# Columns to keep: highly variable genes plus every gene that appears as a
# perturbation label in `adata.obs['gene']`.
hvg_flag = adata.var['highly_variable'].values
gene_flag = adata.var['gene_name'].isin(adata.obs['gene'].values).values
select_flag = hvg_flag | gene_flag

adata_subset = adata[condition_flag, select_flag]

In [None]:
# Cap every perturbation at `target_cells` cells and the non-targeting controls
# at `control_cells` cells. Groups already at or below their cap are kept whole.
target_cells = 100
control_cells = 2500
cluster_key = 'gene'
control_label = 'non-targeting'

# One AnnData slice per condition, in category order.
cluster_labels = adata_subset.obs[cluster_key].cat.categories
adatas = [adata_subset[adata_subset.obs[cluster_key] == clust] for clust in cluster_labels]

for clust, dat in tqdm(zip(cluster_labels, adatas), total=len(adatas)):
    # Decide control vs. perturbation from the label the slice was built with,
    # rather than from the first category left on the sliced view.
    cap = control_cells if clust == control_label else target_cells
    # Compare against the cap that actually applies: asking for more cells than
    # the group holds would make the subsampling call fail.
    if dat.n_obs > cap:
        # Subsamples `dat` in place (no copy is requested).
        sc.pp.subsample(dat, n_obs=cap)

# Stack the per-condition objects back into a single AnnData.
adata_downsampled = adatas[0].concatenate(*adatas[1:])

100%|█████████████████████████████████████████████| 1869/1869 [00:21<00:00, 88.35it/s]


In [None]:
# Build the `condition` label for every cell: '<gene>+ctrl' for perturbed
# cells, with the non-targeting controls collapsed to plain 'ctrl'.
control_relabel = {'non-targeting+ctrl': 'ctrl'}
condition_labels = pd.Series(
    [gene + '+ctrl' for gene in adata_downsampled.obs['gene'].values],
    index=adata_downsampled.obs.index,
)
adata_downsampled.obs['condition'] = condition_labels.replace(control_relabel)

# Every cell in this dataset gets the same cell-type label.
adata_downsampled.obs['cell_type'] = 'K562'

# Display the number of distinct conditions (controls included).
adata_downsampled.obs['condition'].nunique()

In [None]:
# Write the processed dataset to the same directory the raw file was read from.
# Reusing `data_dir` avoids repeating the hard-coded absolute path, so the two
# locations cannot drift apart; point `out_dir` elsewhere to change the target.
out_dir = data_dir
pert_data = PertData(out_dir)
# Store the expression matrix as SciPy CSR before handing it to GEARS
# (presumably GEARS expects a sparse matrix here; confirm against its docs).
adata_downsampled.X = csr_matrix(adata_downsampled.X)
pert_data.new_data_process(dataset_name = 'K562', adata = adata_downsampled)