# Simulation Tests

This notebook is used to benchmark scanpro methods using simulated cell counts. Cell counts are simulated using the hierarchical model described in the propeller paper (Phipson et al., 2022). The function `simulate_cell_counts` is used to simulate clusters that do not differ in cell proportions across conditions. The function `simulate_cell_counts_2` is used to simulate cell counts with differences in proportions in three cell types: cardiomyocytes, fibroblasts and smooth muscle cells. The function `convert_counts_to_df` is used to convert the simulated cell count matrix into a dataframe and to divide the cells into two conditions.

1 & 2: Testing performance of scanpro on 100 random datasets with no differences (1) and with differences (2). The hit rate is the proportion of tests where all clusters were identified as non-significant (1) or where only cardiomyocytes, fibroblasts and smooth muscle cells were identified as significant (2). We also visualize the proportion of significant tests for each cell type/cluster individually.

In [1]:
import pandas as pd
import numpy as np
import multiprocessing as mp
import time

from scanpro import scanpro
from scanpro.utils import simulate_cell_counts, simulate_cell_counts_2, convert_counts_to_df, estimate_params_from_counts

In [2]:
# Base directory for all result tables written by this notebook; the raw
# per-run results are written to its `raw` subdirectory.
OUT_PATH = './results/benchmark'

In [3]:
def simulate_data(p, a, b, n_reps, n_conds=2, n=20, mu=5000, n_sims=100, null=True):
    """Simulate `n_sims` cell count datasets and return them as dataframes.

    :param p: cluster proportions forwarded to the simulation function
    :param a: alpha parameter forwarded to the simulation function
    :param b: beta parameter; if None it is derived as ``a * (1 - p) / p``
    :param n_reps: number of replicates forwarded to the simulation function
    :param int n_conds: number of conditions
    :param int n: forwarded to the simulation function as ``n``
    :param int mu: forwarded to the simulation function as ``mu``
    :param int n_sims: number of datasets to simulate
    :param bool null: True -> `simulate_cell_counts` (no differences),
        False -> `simulate_cell_counts_2` (true differences)
    :return list: one dataframe per simulated dataset, in simulation order
    """
    # fall back to the beta implied by alpha and the proportions
    b = a * (1 - p) / p if b is None else b

    def _one_dataset():
        # draw one count matrix and convert it to the dataframe layout scanpro expects
        if null:
            raw_counts = simulate_cell_counts(p, n_reps, a, b, n_conds, n=n, mu=mu)
        else:
            raw_counts = simulate_cell_counts_2(p, n_reps, a, b, n_conds, n=n, mu=mu)
        return convert_counts_to_df(raw_counts, column_name="cluster")

    return [_one_dataset() for _ in range(n_sims)]

# Benchmarking

In [4]:
def monitor_jobs(jobs):
    """
    Block until all jobs submitted to a pool are finished, showing a progress bar.

    Parameters
    ----------
    jobs : list or dict of job objects
        Objects exposing a ``ready()`` method, e.g. the ``AsyncResult`` objects
        returned by ``pool.apply_async()``. If a dict is given, its values are
        monitored.
    """

    if isinstance(jobs, dict):
        jobs = list(jobs.values())

    # `tqdm.tqdm_notebook` is deprecated; `tqdm.notebook.tqdm` is its replacement
    from tqdm.notebook import tqdm
    pbar = tqdm(total=len(jobs))

    try:
        # Poll once per second until every job reports ready
        n_ready = sum(job.ready() for job in jobs)
        while n_ready != len(jobs):
            if n_ready != pbar.n:
                pbar.n = n_ready
                pbar.refresh()
            time.sleep(1)
            n_ready = sum(job.ready() for job in jobs)

        pbar.n = n_ready  # update progress bar to 100%
        pbar.refresh()
    finally:
        # close the bar even if the wait is interrupted (e.g. KeyboardInterrupt)
        pbar.close()

In [5]:
def test_performance(datasets,
                     n_reps,  # number of samples per condition
                     transform,
                     null=True,
                     repd_data=False
                     ):
    """Run scanpro on every simulated dataset in parallel and collect the results.

    :param list datasets: List of datasets as pandas dataframes to run scanpro on
    :param int n_reps: Forwarded to scanpro as ``n_reps``
    :param str transform: method of transformation (logit or arcsin)
    :param bool null: Not used by this function; kept so existing calls keep working
    :param bool repd_data: If True, the "samples" column is passed to scanpro as
        ``samples_col``; if False, ``samples_col`` is None
    :return pandas.DataFrame all_run_results: The ``results`` tables of all runs,
        concatenated in the order of `datasets`.
    """

    # identical for every dataset, so build the keyword arguments once
    scanpro_kwargs = dict(clusters_col="cluster", conds_col="group",
                          samples_col="samples" if repd_data else None,
                          n_reps=n_reps, transform=transform, verbosity=0)

    # the context manager terminates the workers if monitoring or a job fails,
    # so no worker processes are left behind on error
    with mp.Pool() as pool:
        jobs = [pool.apply_async(scanpro.scanpro, (dataset,), scanpro_kwargs)
                for dataset in datasets]

        pool.close()  # no further jobs will be submitted
        monitor_jobs(jobs)

        results = [job.get() for job in jobs]
        pool.join()

    # Collect result
    all_run_results = pd.concat([result.results for result in results])

    return all_run_results

In [6]:
#### HELPER STATISTICS FUNCTIONS ####

def compare_p_values(p_values, per_cluster=False):
    """Compare p-values of the simulation with differences with the expected output.

    The clusters at positions 0, 3 and 6 are expected to be significant
    (p < 0.05); the clusters at positions 1, 2, 4 and 5 are expected to be
    non-significant (p >= 0.05).

    :param p_values: iterable of runs; each run holds the 7 p-values of one
        simulation ordered by cluster position (list, array or pandas Series)
    :param bool per_cluster: if True return one list of 7 booleans per run,
        otherwise one boolean per run that is True only if all 7 clusters match
    :return list: hits per run (see `per_cluster`)
    """
    expected_significant = (True, False, False, True, False, False, True)

    out = []
    for x in p_values:
        # index by position: `series[0]` on a label-indexed Series relies on a
        # deprecated positional fallback
        values = np.asarray(x)
        hits = [bool(values[i] < 0.05) if significant else bool(values[i] >= 0.05)
                for i, significant in enumerate(expected_significant)]
        # per cluster: keep the individual hits; per run: all clusters must match
        out.append(hits if per_cluster else all(hits))

    return out


def calc_hitrates(results, null=True, compare=None):
    """Calculate the hit rate (in percent) per tool and method.

    Each result table is split into consecutive runs of one row per cluster;
    the number of clusters is taken from the number of unique index labels.

    :param dict results: ``{tool: {method: results dataframe}}``; the dataframe
        index holds the cluster labels and the last column holds the p-values
    :param bool null: if True a run is a hit when all its p-values are > 0.05;
        otherwise `compare` decides
    :param compare: function taking the list of per-run p-value Series and
        returning one boolean per run; required when `null` is False
    :return pandas.DataFrame: hit rates with methods as rows and tools as columns
    :raises ValueError: if `null` is False and no compare function is given
    """
    out = {}
    for tool, methods in results.items():
        out[tool] = {}
        for method, res in methods.items():
            # chunk by the actual number of clusters (5 in the null simulation,
            # 7 with true differences) so no chunk straddles two runs
            n_clusters = res.index.nunique()
            starts = range(0, len(res), n_clusters) if n_clusters else []
            p_values = [res.iloc[i:i + n_clusters, -1] for i in starts]
            if null:
                # if null simulation, check if all p-values > 0.05
                out[tool][method] = np.mean([(run > 0.05).all() for run in p_values]) * 100
            else:
                if compare is None:
                    raise ValueError("Please provide a compare function!")

                out[tool][method] = np.mean(compare(p_values)) * 100

    return pd.DataFrame(out)


def calc_hitrate_cluster(results, rep=True, null=True, compare=None):
    """Calculate the hit rate (in percent) per cluster for one tool.

    :param dict results: ``{tool: {method: results dataframe}}``; each dataframe
        has the cluster labels as index (named "clusters"), a "baseline_props"
        column and the p-values in the last column
    :param bool rep: use the results stored under "rep" if True, else "norep"
    :param bool null: if True a cluster is a hit when its p-value is > 0.05;
        otherwise the compare function decides
    :param compare: function called as ``compare(p_values, per_cluster=True)``
        when `null` is False; defaults to `compare_p_values`
    :return pandas.DataFrame: long table with columns "cluster", "method",
        "hit_rate" and "props" (mean baseline proportion of the cluster)
    """
    tool = 'rep' if rep else 'norep'

    hit_rate_cluster = {}
    props = []  # mean cluster proportions
    clusters = None  # cluster labels in order of appearance (taken from the first method)
    for method, res in results[tool].items():
        method_clusters = list(res.index.unique())
        n_clusters = len(method_clusters)
        if clusters is None:
            clusters = method_clusters

        # save mean cluster proportions; sort=False keeps the order of appearance
        # so the proportions line up with the cluster labels assigned below
        mean_props = res.reset_index().groupby('clusters', sort=False)['baseline_props'].mean()
        props.append(mean_props.to_list())

        p_vals = res.iloc[:, -1]
        if null:
            # rows are runs, columns are clusters; a hit is a non-significant p-value
            hits = (p_vals > 0.05).to_numpy().reshape(-1, n_clusters)
            hit_rate_cluster[method] = hits.mean(axis=0) * 100
        else:
            # get p-values for each run
            runs = [p_vals.iloc[i:i + n_clusters] for i in range(0, len(res), n_clusters)]
            # check if p-values significance is as expected
            compare_fn = compare_p_values if compare is None else compare
            hits = compare_fn(runs, per_cluster=True)
            hit_rate_cluster[method] = np.mean(hits, axis=0) * 100

    hit_rate_cluster = pd.DataFrame(hit_rate_cluster)
    hit_rate_cluster['cluster'] = clusters
    hit_rate_cluster = hit_rate_cluster.melt(id_vars='cluster')
    hit_rate_cluster.rename(columns={'variable': 'method', 'value': 'hit_rate'}, inplace=True)
    # add mean cluster proportions (melt stacks method by method, same order as props)
    hit_rate_cluster['props'] = list(np.array(props).flat)

    return hit_rate_cluster


def calc_pct_significance(results, null=True, n_sims=100):
    """Calculate the proportion of significant tests (p < 0.05) per cluster.

    :param dict results: ``{tool: {method: results dataframe}}``; the dataframe
        index holds the cluster labels and the last column holds the p-values
    :param bool null: not used; the number of clusters is taken from the data.
        Kept so existing calls keep working
    :param int n_sims: number of simulations each result table must contain
    :return pandas.DataFrame: long table with columns "tool", "pct_sig",
        "cluster" and "method", ordered by tool, then method, then cluster
    :raises ValueError: if a result table does not hold `n_sims` rows per cluster
    """
    records = []
    for tool, methods in results.items():
        for method, res in methods.items():
            clusters = list(res.index.unique())
            p_values = res.iloc[:, -1]
            if len(p_values) != n_sims * len(clusters):
                raise ValueError(
                    f"Expected {n_sims} simulations x {len(clusters)} clusters "
                    f"for {tool}/{method}, but got {len(p_values)} rows."
                )

            # proportion of significant p-values for each cluster independently;
            # a cluster that is never significant simply gets 0
            significant = (p_values < 0.05).astype(float)
            pct_sig = significant.groupby(level=0, sort=False).mean()

            for cluster in clusters:
                records.append({'tool': tool,
                                'pct_sig': float(pct_sig.loc[cluster]),
                                'cluster': cluster,
                                'method': method})

    return pd.DataFrame(records, columns=['tool', 'pct_sig', 'cluster', 'method'])

## 1. Null simulations

### simulate datasets

In [7]:
np.random.seed(10)  # seed NumPy's global RNG before simulating

p = np.array([0.01, 0.05, 0.15, 0.34, 0.45])  # clusters proportions in all samples
a = 10
b = a*(1-p)/p  # beta chosen so that a / (a + b) equals p for every cluster
n_reps = [4, 8, 10, 20]  # replicate counts to benchmark (n_reps is rebound to a single int in later cells)

# one list of 100 null datasets per replicate count, keyed "<n>_reps"
datasets_null = {f"{n_rep}_reps": simulate_data(p, a, b, n_rep, n_conds=2, n=20, mu=5000, n_sims=100, null=True) for n_rep in n_reps}

### 4 replicates

In [8]:
n_reps = 4

#### logit

In [9]:
# replicated
scanpro_4_reps_logit = test_performance(datasets_null[f"{n_reps}_reps"], transform="logit",
                                        n_reps=n_reps, repd_data=True, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [10]:
# unreplicated
scanpro_norep_4_reps_logit = test_performance(datasets_null[f"{n_reps}_reps"], transform="logit",
                                              n_reps=n_reps, repd_data=False, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

#### arcsine

In [11]:
# replicated
scanpro_4_reps_arcsin = test_performance(datasets_null[f"{n_reps}_reps"], transform="arcsin",
                                         n_reps=n_reps, repd_data=True, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [12]:
# unreplicated
scanpro_norep_4_reps_arcsin = test_performance(datasets_null[f"{n_reps}_reps"], transform="arcsin",
                                               n_reps=n_reps, repd_data=False, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

### 8 replicates

In [13]:
n_reps = 8

#### logit

In [14]:
# replicated
scanpro_8_reps_logit = test_performance(datasets_null[f"{n_reps}_reps"], transform="logit",
                                        n_reps=n_reps, repd_data=True, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [15]:
# unreplicated
scanpro_norep_8_reps_logit = test_performance(datasets_null[f"{n_reps}_reps"], transform="logit",
                                              n_reps=n_reps, repd_data=False, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

#### arcsine

In [16]:
# replicated
scanpro_8_reps_arcsin = test_performance(datasets_null[f"{n_reps}_reps"], transform="arcsin",
                                        n_reps=n_reps, repd_data=True, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [17]:
# unreplicated
scanpro_norep_8_reps_arcsin = test_performance(datasets_null[f"{n_reps}_reps"], transform="arcsin",
                                               n_reps=n_reps, repd_data=False, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

### 10 reps

In [18]:
n_reps = 10

#### logit

In [19]:
# replicated
scanpro_10_reps_logit = test_performance(datasets_null[f"{n_reps}_reps"], transform="logit",
                                        n_reps=n_reps, repd_data=True, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [20]:
# unreplicated
scanpro_norep_10_reps_logit = test_performance(datasets_null[f"{n_reps}_reps"], transform="logit",
                                               n_reps=n_reps, repd_data=False, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

#### arcsine

In [21]:
# replicated
scanpro_10_reps_arcsin = test_performance(datasets_null[f"{n_reps}_reps"], transform="arcsin",
                                        n_reps=n_reps, repd_data=True, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [22]:
# unreplicated
scanpro_norep_10_reps_arcsin = test_performance(datasets_null[f"{n_reps}_reps"], transform="arcsin",
                                                n_reps=n_reps, repd_data=False, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

### 20 reps

In [23]:
n_reps = 20

#### logit

In [24]:
# replicated
scanpro_20_reps_logit = test_performance(datasets_null[f"{n_reps}_reps"], transform="logit",
                                         n_reps=n_reps, repd_data=True, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [25]:
# unreplicated
scanpro_norep_20_reps_logit = test_performance(datasets_null[f"{n_reps}_reps"], transform="logit",
                                               n_reps=n_reps, repd_data=False, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

#### arcsine

In [26]:
# replicated
scanpro_20_reps_arcsin = test_performance(datasets_null[f"{n_reps}_reps"], transform="arcsin",
                                         n_reps=n_reps, repd_data=True, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

In [27]:
# unreplicated
scanpro_norep_20_reps_arcsin = test_performance(datasets_null[f"{n_reps}_reps"], transform="arcsin",
                                                n_reps=n_reps, repd_data=False, null=True)

  0%|          | 0/100 [00:00<?, ?it/s]

### Results

In [28]:
# Null-simulation results grouped as {tool: {method: results dataframe}}:
# tool is "rep" (run with repd_data=True) or "norep" (repd_data=False),
# method is "<transform>_<n>_reps".
null_sim_results = {"rep": {"logit_4_reps": scanpro_4_reps_logit,
                               "arcsin_4_reps": scanpro_4_reps_arcsin,
                               "logit_8_reps": scanpro_8_reps_logit,
                               "arcsin_8_reps": scanpro_8_reps_arcsin,
                               "logit_10_reps": scanpro_10_reps_logit,
                               "arcsin_10_reps": scanpro_10_reps_arcsin,
                               "logit_20_reps": scanpro_20_reps_logit,
                               "arcsin_20_reps": scanpro_20_reps_arcsin},
                   "norep": {"logit_4_reps": scanpro_norep_4_reps_logit,
                                   "arcsin_4_reps": scanpro_norep_4_reps_arcsin,
                                   "logit_8_reps": scanpro_norep_8_reps_logit,
                                   "arcsin_8_reps": scanpro_norep_8_reps_arcsin,
                                   "logit_10_reps": scanpro_norep_10_reps_logit,
                                   "arcsin_10_reps": scanpro_norep_10_reps_arcsin,
                                   "logit_20_reps": scanpro_norep_20_reps_logit,
                                   "arcsin_20_reps": scanpro_norep_20_reps_arcsin}}

#### Hit Rate
Hit rate is the proportion of tests (out of `n_sims` tests) where the results matched the expected outcome exactly, that is, all clusters were identified as non-significant in the null simulation (hit).

In [29]:
null_sim_hitrates = calc_hitrates(null_sim_results, null=True)
null_sim_hitrates.to_csv(f'{OUT_PATH}/null_sim_hitrates.tsv', sep='\t')

In [30]:
null_sim_pct_sig_cluster = calc_pct_significance(null_sim_results, n_sims=100)
null_sim_pct_sig_cluster.to_csv(f'{OUT_PATH}/null_sim_pct_sig_cluster.tsv', sep='\t')

In [31]:
# get hit rate per cluster
null_sim_hitrate_cluster_rep = calc_hitrate_cluster(null_sim_results, rep=True, null=True)
null_sim_hitrate_cluster_rep.to_csv(f'{OUT_PATH}/null_sim_rep_hitrate_cluster.tsv', sep='\t')

null_sim_hitrate_cluster_norep = calc_hitrate_cluster(null_sim_results, rep=False, null=True)
null_sim_hitrate_cluster_norep.to_csv(f'{OUT_PATH}/null_sim_norep_hitrate_cluster.tsv', sep='\t')

## 2. Simulation with true differences

### Data preparation
see https://phipsonlab.github.io/propeller-paper-analysis/SimTrueDiff.html

In [32]:
# Load the heart cell counts and reshape them to clusters (rows) x samples (columns)
heart_counts = pd.read_csv('/home/yalayou/pypropeller/Alayoubi_et_al_2023/data/heart_counts.tsv', sep='\t')
heart_counts.drop(['Condition', 'Sex'], inplace=True, axis=1)
heart_counts = heart_counts.set_index('Sample').T
heart_counts.drop('Erythroid', inplace=True)  # remove erythroids

# proportions of each cluster in all samples
true_props = heart_counts.sum(axis=1) / heart_counts.sum(axis=1).sum()  # sum of cells in cluster / sum of all cells
true_props = true_props.to_frame(name="props")

# estimate beta parameters from counts
params = estimate_params_from_counts(heart_counts)  # rows are clusters
a = params[1]  # presumably the alpha estimates -- TODO confirm against estimate_params_from_counts
b = params[2]  # presumably the beta estimates -- TODO confirm against estimate_params_from_counts

# Set up true proportions for the two groups
# (flatten() returns a copy, so editing group 2 leaves group 1 untouched)
grp1_trueprops = true_props.values.flatten()
grp2_trueprops = true_props.values.flatten()

# group 2: halve cluster 0, double cluster 3 and triple cluster 6
grp2_trueprops[0] = grp1_trueprops[0]/2
grp2_trueprops[3] = grp2_trueprops[3]*2
grp2_trueprops[6] = grp1_trueprops[6]*3

# make group 2 sum to 1 again: half of the missing mass goes to cluster 0,
# the remaining half to cluster 3
grp2_trueprops[0] = grp2_trueprops[0] + (1-grp2_trueprops.sum())/2
grp2_trueprops[3] = grp2_trueprops[3] + (1-grp2_trueprops.sum())
 
# calculate beta for both groups
b1 = a*(1-grp1_trueprops)/grp1_trueprops
b2 = a*(1-grp2_trueprops)/grp2_trueprops
b_grps = [b1, b2]

b_grps

[Cardiomyocytes           1.769390
 Endothelial cells      107.760684
 Epicardial cells        47.297371
 Fibroblast              14.138098
 Immune cells            16.442437
 Neurons                143.134155
 Smooth muscle cells    458.574736
 dtype: float64,
 Cardiomyocytes           4.742325
 Endothelial cells      107.760684
 Epicardial cells        47.297371
 Fibroblast               4.666523
 Immune cells            16.442437
 Neurons                143.134155
 Smooth muscle cells    150.405074
 dtype: float64]

### Simulate datasets

In [33]:
np.random.seed(10)  # seed NumPy's global RNG before simulating

n_reps = [4, 8, 10, 20]  # replicate counts to benchmark (n_reps is rebound to a single int in later cells)

# one list of 100 datasets with true differences per replicate count, keyed "<n>_reps";
# b_grps holds one set of beta parameters per group
datasets_difs = {f"{n_rep}_reps": simulate_data(true_props, a, b_grps, n_rep, n_conds=2, n=20, mu=5000, n_sims=100, null=False) for n_rep in n_reps}

### 4 replicates

In [34]:
n_reps = 4

#### logit

In [35]:
# replicated
scanpro_4_reps_logit_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="logit",
                                             n_reps=n_reps, repd_data=True, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [36]:
# unreplicated
scanpro_norep_4_reps_logit_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="logit",
                                                   n_reps=n_reps, repd_data=False, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

#### arcsin

In [37]:
# replicated
scanpro_4_reps_arcsin_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="arcsin",
                                             n_reps=n_reps, repd_data=True, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [38]:
# unreplicated
scanpro_norep_4_reps_arcsin_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="arcsin",
                                                    n_reps=n_reps, repd_data=False, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

### 8 replicates

In [39]:
n_reps = 8

#### logit

In [40]:
# replicated
scanpro_8_reps_logit_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="logit",
                                             n_reps=n_reps, repd_data=True, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [41]:
# unreplicated
scanpro_norep_8_reps_logit_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="logit",
                                                    n_reps=n_reps, repd_data=False, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

#### arcsin

In [42]:
# replicated
scanpro_8_reps_arcsin_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="arcsin",
                                              n_reps=n_reps, repd_data=True, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [43]:
# unreplicated
scanpro_norep_8_reps_arcsin_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="arcsin",
                                                    n_reps=n_reps, repd_data=False, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

### 10 replicates

In [44]:
n_reps = 10

#### logit

In [45]:
# replicated
scanpro_10_reps_logit_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="logit",
                                              n_reps=n_reps, repd_data=True, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [46]:
# unreplicated
scanpro_norep_10_reps_logit_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="logit",
                                                    n_reps=n_reps, repd_data=False, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

#### arcsin

In [47]:
# replicated
scanpro_10_reps_arcsin_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="arcsin",
                                              n_reps=n_reps, repd_data=True, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [48]:
# unreplicated
scanpro_norep_10_reps_arcsin_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="arcsin",
                                                     n_reps=n_reps, repd_data=False, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

### 20 replicates

In [49]:
n_reps = 20

#### logit

In [50]:
# replicated
scanpro_20_reps_logit_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="logit",
                                              n_reps=n_reps, repd_data=True, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [51]:
# unreplicated
scanpro_norep_20_reps_logit_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="logit",
                                                    n_reps=n_reps, repd_data=False, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

#### arcsin

In [52]:
# replicated
scanpro_20_reps_arcsin_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="arcsin",
                                               n_reps=n_reps, repd_data=True, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

In [53]:
# unreplicated
scanpro_norep_20_reps_arcsin_difs = test_performance(datasets_difs[f"{n_reps}_reps"], transform="arcsin",
                                                     n_reps=n_reps, repd_data=False, null=False)

  0%|          | 0/100 [00:00<?, ?it/s]

### Results

In [54]:
# Results of the simulation with true differences, grouped as
# {tool: {method: results dataframe}}: tool is "rep" (run with repd_data=True)
# or "norep" (repd_data=False), method is "<transform>_<n>_reps".
difs_sim_results = {"rep": {"logit_4_reps": scanpro_4_reps_logit_difs,
                               "arcsin_4_reps": scanpro_4_reps_arcsin_difs,
                               "logit_8_reps": scanpro_8_reps_logit_difs,
                               "arcsin_8_reps": scanpro_8_reps_arcsin_difs,
                               "logit_10_reps": scanpro_10_reps_logit_difs,
                               "arcsin_10_reps": scanpro_10_reps_arcsin_difs,
                               "logit_20_reps": scanpro_20_reps_logit_difs,
                               "arcsin_20_reps": scanpro_20_reps_arcsin_difs},
                   "norep": {"logit_4_reps": scanpro_norep_4_reps_logit_difs,
                                   "arcsin_4_reps": scanpro_norep_4_reps_arcsin_difs,
                                   "logit_8_reps": scanpro_norep_8_reps_logit_difs,
                                   "arcsin_8_reps": scanpro_norep_8_reps_arcsin_difs,
                                   "logit_10_reps": scanpro_norep_10_reps_logit_difs,
                                   "arcsin_10_reps": scanpro_norep_10_reps_arcsin_difs,
                                   "logit_20_reps": scanpro_norep_20_reps_logit_difs,
                                   "arcsin_20_reps": scanpro_norep_20_reps_arcsin_difs}}

In [55]:
difs_sim_hitrates = calc_hitrates(difs_sim_results, null=False, compare=compare_p_values)
difs_sim_hitrates.to_csv(f'{OUT_PATH}/difs_sim_hitrates.tsv', sep='\t')

In [56]:
difs_sim_pct_sig_cluster = calc_pct_significance(difs_sim_results, n_sims=100, null=False)
difs_sim_pct_sig_cluster.to_csv(f'{OUT_PATH}/difs_sim_pct_sig_cluster.tsv', sep='\t')

In [57]:
difs_sim_hitrate_cluster_rep = calc_hitrate_cluster(difs_sim_results, rep=True, null=False, compare=compare_p_values)
difs_sim_hitrate_cluster_rep.to_csv(f'{OUT_PATH}/difs_sim_rep_hitrate_cluster.tsv', sep='\t')

difs_sim_hitrate_cluster_norep = calc_hitrate_cluster(difs_sim_results, rep=False, null=False, compare=compare_p_values)
difs_sim_hitrate_cluster_norep.to_csv(f'{OUT_PATH}/difs_sim_norep_hitrate_cluster.tsv', sep='\t')

### save simulation results

In [58]:
# null simulation: write the raw scanpro results of every tool/method combination
import os

raw_dir = f'{OUT_PATH}/raw'
os.makedirs(raw_dir, exist_ok=True)  # to_csv does not create missing directories
for tool, methods in null_sim_results.items():
    for method, results in methods.items():
        results.to_csv(f'{raw_dir}/null_sim_{tool}_{method}.tsv', sep='\t')

In [59]:
# simulation with differences: write the raw scanpro results of every tool/method combination
import os

raw_dir = f'{OUT_PATH}/raw'
os.makedirs(raw_dir, exist_ok=True)  # to_csv does not create missing directories
for tool, methods in difs_sim_results.items():
    for method, results in methods.items():
        results.to_csv(f'{raw_dir}/difs_sim_{tool}_{method}.tsv', sep='\t')