In [None]:
import pandas as pd
import numpy as np
import os
import csv
from itertools import product
from pathlib import Path

In [None]:
# Load the Sanity-inferred outputs for the CITE-seq dataset.
input_folder = '/scicore/home/nimwegen/GROUP/Projects/bonsai_runs/paper_figures_datasets/hao_satija_2021_paper_figure/Sanity'

# Build the full path of every Sanity output file that is read in this notebook.
deltas_path, d_deltas_path, cell_id_path, gene_id_path, variance_path = (
    os.path.join(input_folder, file_name)
    for file_name in ('delta_vmax.txt', 'd_delta_vmax.txt', 'cellID.txt', 'geneID.txt', 'variance_vmax.txt')
)


def _read_float_table(table_path):
    """Read a headerless, tab-separated text file and return its contents as a float array."""
    table = pd.read_csv(table_path, sep='\t', header=None)
    return table.values.astype(dtype=float)


deltas = _read_float_table(deltas_path)
d_deltas = _read_float_table(d_deltas_path)
# The variances are flattened to a 1-D array after loading.
variances = _read_float_table(variance_path).flatten()

In [None]:
def read_first_column(path):
    """Return the first tab-separated field of every non-empty line in ``path``.

    Parameters
    ----------
    path : str or os.PathLike
        Tab-delimited text file with one identifier per line.

    Returns
    -------
    list of str
        The first field of each row, in file order.
    """
    # newline='' lets the csv module do its own newline handling, as its documentation recommends.
    with open(path, 'r', newline='') as f:
        # csv.reader yields [] for a blank line; skipping those avoids an IndexError on row[0].
        return [row[0] for row in csv.reader(f, delimiter='\t') if row]


cell_ids = read_first_column(cell_id_path)
gene_ids = read_first_column(gene_id_path)

### Take subsets of the data with varying gene and cell numbers

In [None]:
# Dataset dimensions, taken from the number of cell and gene identifiers that were read.
n_cells, n_genes = len(cell_ids), len(gene_ids)

In [None]:
# Show the dataset dimensions as (number of cells, number of genes).
n_cells, n_genes

In [None]:
# Subset sizes to generate: every combination of a cell count and a gene count is written out.
cell_numbers = [1000]
# Gene counts grow by powers of ten: 10, 100, 1000, 10000.
gene_numbers = [10 ** exponent for exponent in range(1, 5)]

In [None]:
# Seed a dedicated generator so the ordering is reproducible, then draw a random ordering of the cells.
# Subsets are taken as the first N entries of this ordering, so the bigger models contain the smaller models.
rng = np.random.default_rng(seed=1231)
cell_order = np.arange(n_cells)
rng.shuffle(cell_order)  # shuffles the index array in place

In [None]:
# Squared error bars of the inferred deltas.
delta_vars = d_deltas ** 2


def _row_zscores(values, value_vars, n_columns):
    """Per-row root of the summed squared deviations from the row mean, scaled by value_vars and n_columns."""
    row_means = values.mean(axis=1, keepdims=True)
    scaled_sq_dev = (values - row_means) ** 2 / value_vars
    return np.sqrt(scaled_sq_dev.sum(axis=1) / n_columns)


# One score per row of deltas (rows are later indexed by gene_order, i.e. one row per gene).
zscores = _row_zscores(deltas, delta_vars, n_cells)
# Negating the scores makes argsort return the rows from highest to lowest zscore.
gene_order = np.argsort(-zscores)

In [None]:
# Alternative gene selection (disabled): sample random genes with zscore > .1 instead of ranking by zscore
# gene_order = np.where(zscores > .1)[0]
# rng.shuffle(gene_order)

In [None]:
# Seed NumPy's legacy global RNG; within this cell it only matters if the noisy draw below is re-enabled.
np.random.seed(2462)
# Disabled: draw each delta from a normal distribution centred on deltas with scale d_deltas.
# sampled_deltas = np.random.normal(deltas, d_deltas)
# With sampling disabled the inferred deltas are used as-is (same array object, not a copy).
sampled_deltas = deltas

In [None]:
# Reorder rows by gene_order and columns by cell_order in a single indexing step.
ordered_deltas = sampled_deltas[np.ix_(gene_order, cell_order)]
# Apply the same orderings to the identifier lists so they stay aligned with the matrix.
ordered_cell_ids = [cell_ids[cell_idx] for cell_idx in cell_order]
ordered_gene_ids = [gene_ids[gene_idx] for gene_idx in gene_order]
# NOTE: this rebinds d_deltas (previously the error bars loaded from file) to a constant 1e-6
# array with the shape of ordered_deltas; reload the file before re-running cells that read the original.
d_deltas = np.full_like(ordered_deltas, 1e-6)

In [None]:
# Write one Sanity-style folder per (cell_num, gene_num) combination. Each folder holds the
# leading block of the ordered deltas, constant error bars, placeholder means (0) and
# variances (1), and the matching cell and gene identifiers.
base_folder_name = '/scicore/home/nimwegen/degroo0000/Bonsai-data-representation/slurm_runs_pipeline/output/no_noise_simulated'
n_genes_available, n_cells_available = ordered_deltas.shape


def _write_one_per_line(path, values):
    """Write each value on its own line of ``path`` using ``%s`` formatting."""
    with open(path, 'w') as f:
        # The 1-tuple keeps the formatting correct even if a value is itself a tuple.
        f.writelines("%s\n" % (value,) for value in values)


for cell_num, gene_num in product(cell_numbers, gene_numbers):
    # Slicing past the end silently truncates, which would leave the per-gene files and the
    # dataset name out of sync with the matrix actually written, so fail loudly instead.
    if gene_num > n_genes_available or cell_num > n_cells_available:
        raise ValueError(
            "Requested subset of {} genes x {} cells exceeds the available {} genes x {} cells".format(
                gene_num, cell_num, n_genes_available, n_cells_available))

    deltas_subset = ordered_deltas[:gene_num, :cell_num]
    cell_ids_subset = ordered_cell_ids[:cell_num]
    gene_ids_subset = ordered_gene_ids[:gene_num]
    d_deltas_subset = d_deltas[:gene_num, :cell_num]
    # Placeholder per-gene statistics: unit variance and zero mean for every gene.
    variance_subset = np.ones(gene_num)
    means_subset = np.zeros_like(variance_subset)

    dataset_name = 'no_noise_nGene{}_nCell{}_zscoregenesnosampling'.format(gene_num, cell_num)
    print(os.path.join('no_noise_simulated', dataset_name))
    folder_name = os.path.join(base_folder_name, dataset_name, 'Sanity')
    Path(folder_name).mkdir(parents=True, exist_ok=True)

    np.savetxt(os.path.join(folder_name, "delta_vmax.txt"), deltas_subset, delimiter='\t')
    np.savetxt(os.path.join(folder_name, "d_delta_vmax.txt"), d_deltas_subset, delimiter='\t')

    _write_one_per_line(os.path.join(folder_name, 'mu_vmax.txt'), means_subset)
    _write_one_per_line(os.path.join(folder_name, 'variance_vmax.txt'), variance_subset)
    _write_one_per_line(os.path.join(folder_name, 'cellID.txt'), cell_ids_subset)
    _write_one_per_line(os.path.join(folder_name, 'geneID.txt'), gene_ids_subset)

In [None]:
# Number of gene IDs in the last subset written by the loop (gene_ids_subset persists after the loop ends).
len(gene_ids_subset)