In [2]:
import warnings
warnings.filterwarnings(action='ignore')

import os
import scanpy as sc
import squidpy as sq
import pandas as pd

In [3]:
# Root directory holding the per-method result tables. The export loop reads
# '{input_dir}/{method}/{dataset}.csv' from here.
# NOTE(review): this is an absolute, machine-specific path. When a file under it
# is missing the export loop only prints the path and skips it, so running this
# notebook on another machine exports nothing without raising an error.
input_dir = '/data/pinello/PROJECTS/2023_03_SVGBenchmarking/workflow/results/spatial_atac'
# Destination root (relative to the notebook's working directory). The export
# loop writes '{out_dir}/{method}/{dataset}.csv' below it.
out_dir = '../../results/09_spatial_atac/01_get_peaks'

In [4]:
# Create the output root (and any missing parents) up front; exist_ok=True
# makes re-running this cell a no-op when the directory already exists.
os.makedirs(out_dir, exist_ok=True)

In [6]:
# Methods whose result tables are exported; each name is also the name of the
# sub-directory read from input_dir and written to out_dir.
# NOTE(review): get_peaks also handles 'SPARK', 'GPcounts' and 'BOOST-GP', which
# are not listed here - confirm the omission is intentional.
method_list = [
    'nnSVG',
    'Spanve',
    'SPARK-X',
    'SpatialDE2',
    'SpatialDE',
    'SOMDE',
    'MoranI',
    'scGCO',
    'SpaGCN',
    'SpaGFT',
    'Sepal',
]

# Dataset names; each one maps to '<dataset>.csv' inside a method's directory.
# NOTE(review): 'E13_5_rep1' is absent while the other two stages list both
# replicates - confirm this is intentional.
dataset_list = [
    'E12_5_rep1',
    'E12_5_rep2',
    'E13_5_rep2',
    'E15_5_rep1',
    'E15_5_rep2',
]

In [7]:
def get_peaks(df, method, n_svgs=20000):
    """Return the identifiers of the top-ranked rows of one method's result table.

    The table is sorted by the method-specific ranking column (descending for
    scores, ascending for p-value / FDR style columns), truncated to the first
    ``n_svgs`` rows, and reduced to the single column holding the identifiers.

    Parameters
    ----------
    df : pandas.DataFrame
        Result table of ``method``. It must contain the ranking column and the
        identifier column listed for that method in ``ranking_rules`` below.
        The frame passed in is never modified.
    method : str
        Name of the method that produced ``df``; selects the ranking rule.
    n_svgs : int, default 20000
        Maximum number of rows to keep after sorting.

    Returns
    -------
    pandas.DataFrame
        A one-column frame (``'gene_id'`` for nnSVG, ``'Unnamed: 0'`` for every
        other method) with a fresh 0..k-1 index, ordered best-ranked first.
        For an unrecognised ``method`` a message is printed and ``df`` itself
        is returned unchanged.

    Raises
    ------
    KeyError
        If the ranking or identifier column required by ``method`` is missing.
    """
    # method -> (ranking column, sort ascending?, identifier column)
    ranking_rules = {
        'nnSVG': ('prop_sv', False, 'gene_id'),
        'Spanve': ('ent', False, 'Unnamed: 0'),
        'SPARK-X': ('adjustedPval', True, 'Unnamed: 0'),
        'SPARK': ('adjusted_pvalue', True, 'Unnamed: 0'),
        'SpatialDE2': ('FSV', False, 'Unnamed: 0'),
        'SpatialDE': ('FSV', False, 'Unnamed: 0'),
        'SOMDE': ('FSV', False, 'Unnamed: 0'),
        'MoranI': ('I', False, 'Unnamed: 0'),
        'scGCO': ('fdr', True, 'Unnamed: 0'),
        'SpaGCN': ('pvals_adj', True, 'Unnamed: 0'),
        'SpaGFT': ('gft_score', False, 'Unnamed: 0'),
        'GPcounts': ('log_likelihood_ratio', False, 'Unnamed: 0'),
        'Sepal': ('sepal_score', False, 'Unnamed: 0'),
        'BOOST-GP': ('PPI', False, 'Unnamed: 0'),
    }

    if method not in ranking_rules:
        # Kept from the original contract: report and hand the input back as-is.
        print(f'unknown method: {method}')
        return df

    score_col, ascending, id_col = ranking_rules[method]

    if method == 'GPcounts':
        # Missing likelihood ratios are ranked as 0. assign() builds a new frame,
        # so the caller's table keeps its NaNs instead of being overwritten.
        df = df.assign(**{score_col: df[score_col].fillna(0)})

    ranked = df.sort_values([score_col], ascending=ascending)
    # reset_index() gives the result a clean 0..k-1 index; the old index it moves
    # into a column is discarded by the column selection that follows.
    top = ranked.head(n_svgs).reset_index()
    return top[[id_col]]

In [9]:
# Export the top-ranked peaks of every (method, dataset) pair as a one-column CSV.
for method in method_list:
    # One output sub-directory per method, created even if no dataset is found.
    os.makedirs(f'{out_dir}/{method}', exist_ok=True)

    for dataset in dataset_list:
        input_filename = f'{input_dir}/{method}/{dataset}.csv'

        if os.path.exists(input_filename):
            raw_table = pd.read_csv(input_filename)
            df = get_peaks(raw_table, method)
            # get_peaks returns a single identifier column; give it a uniform name.
            df.columns = ['peak']
            df.to_csv(f'{out_dir}/{method}/{dataset}.csv', header=True, index=False)
        else:
            # No result table for this pair: print the missing path and move on.
            print(input_filename)