# Downsample ATAC modality by dropping non-zero (binary) observations 

Aim: Investigate how a low quality/coverage modality impacts joint vs. concatenated models

This is a heuristic strategy mimicking lower-coverage data. If one had indeed measured a lower-coverage dataset, the selected features would differ due to differences in the signal-to-noise ratio. Here, we called features on the full "high quality" dataset and drop observations from it.

Randomly drop
- 90% 

of observations.

In [1]:
# Create multiple subsamples (one per random seed) that each retain 10% of the observations

In [2]:
# Imports
import liam_NeurIPS2021_challenge_reproducibility
import anndata as ad
import scanpy as sc
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import torch

Global seed set to 0
OMP: Info #276: omp_set_nested routine deprecated, please use omp_set_max_active_levels instead.


In [3]:
def which_observations_to_drop(indices, percent):
    """
    Helper function to randomly select a share of the non-zero observations
    of one row of a sparse array that should be dropped.

    The selection is drawn with the global NumPy random state
    (``np.random.shuffle``); seed it beforehand for reproducible results.

    Parameters
    ----------
    indices: list
        (sorted) list of column indices of non-zero elements (per cell).
        The input is copied and never modified.

    percent: float
        Share of the nonzero observations to drop, given as a fraction
        between 0 and 1 (e.g. 0.9 drops 90%). The resulting absolute count
        is rounded down.

    Returns
    -------
    np.array
        Sorted integer array of nonzero indices to drop.

    Raises
    ------
    ValueError
        If ``percent`` is not within [0, 1].
    """
    # Reject fractions outside [0, 1]: a negative value would turn the slice
    # below into "all but the last k" and silently drop almost everything.
    if not 0 <= percent <= 1:
        raise ValueError(
            "percent must be a fraction between 0 and 1, got {}".format(percent)
        )
    # Copy into an integer array; the explicit dtype keeps the result usable
    # as an index array even when `indices` is empty (NumPy would otherwise
    # default to float64 for an empty list).
    arr = np.array(indices, dtype=np.intp)
    # How many nonzero observations to drop - absolute
    drop = int(len(indices) * percent)
    # Randomly permute the observations (in place, on the copy)
    np.random.shuffle(arr)
    # Return sorted array of indices of non-zero elements to drop
    return np.sort(arr[:drop])
    

In [4]:
# Random seeds for the subsampling replicates; one subsampled file is written per seed.
seeds = [8831, 234, 11, 9631, 94]

In [5]:
# Load the full dataset from which observations will be dropped
full = ad.read_h5ad("./../../data/original/neurips_competition/openproblems_bmmc_multiome_phase2/openproblems_bmmc_multiome_phase2.censor_dataset.output_mod2.h5ad")

nonzero_index_list_per_cell = full.X.tolil().rows
# List of Lists Format (LIL) format
# each row is a Python list (sorted) of column indices of non-zero elements
# https://scipy-lectures.org/advanced/scipy_sparse/lil_matrix.html?highlight=lil

# Keep an untouched copy; every seed starts again from this original data
input_backup = full.copy()

# Per-cell sums of the original data are identical for every seed,
# so compute them once instead of once per loop iteration
sum_before = input_backup.X.sum(axis=1)


## Subsampling to 10% of the data
### In other words dropping 90% of the observations
for seed in seeds:
    print(seed)
    # Set random seed for reproducibility
    np.random.seed(seed)

    full = input_backup.copy()

    # Iterate over cells
    for cell, nonzero_columns in enumerate(nonzero_index_list_per_cell):
        drop = which_observations_to_drop(nonzero_columns, 0.9)
        # Which elements to set to zero (inplace)
        full.X[cell, drop.tolist()] = 0

    # Assigning 0 to an already stored entry keeps it as an explicitly stored
    # zero; remove those so the written matrix only stores the retained
    # observations.
    # NOTE(review): assumes full.X is a scipy CSR/CSC matrix - confirm.
    full.X.eliminate_zeros()

    full.write_h5ad("./../../data/derived/neurips_competition/openproblems_bmmc_multiome_phase2.censor_dataset.output_mod2_10_subsample_seed_{}.h5ad".format(seed))

    # Sanity check: per-cell sums before and after dropping observations
    sum_after = full.X.sum(axis=1)

    print(sum_before)
    print(sum_after)
    print()


8831
[[1951.]
 [3965.]
 [2279.]
 ...
 [4216.]
 [8949.]
 [3157.]]
[[196.]
 [397.]
 [228.]
 ...
 [422.]
 [895.]
 [316.]]

234
[[1951.]
 [3965.]
 [2279.]
 ...
 [4216.]
 [8949.]
 [3157.]]
[[196.]
 [397.]
 [228.]
 ...
 [422.]
 [895.]
 [316.]]

11
[[1951.]
 [3965.]
 [2279.]
 ...
 [4216.]
 [8949.]
 [3157.]]
[[196.]
 [397.]
 [228.]
 ...
 [422.]
 [895.]
 [316.]]

9631
[[1951.]
 [3965.]
 [2279.]
 ...
 [4216.]
 [8949.]
 [3157.]]
[[196.]
 [397.]
 [228.]
 ...
 [422.]
 [895.]
 [316.]]

94
[[1951.]
 [3965.]
 [2279.]
 ...
 [4216.]
 [8949.]
 [3157.]]
[[196.]
 [397.]
 [228.]
 ...
 [422.]
 [895.]
 [316.]]

