* Esophagus single-cell expression data obtained from:
https://data.humancellatlas.org/explore/projects/c4077b3c-5c98-4d26-a614-246d12c2e5d7/project-matrices

* Subsample single-cell gene expression data without replacement.

Desired output sizes:
* n (samples) = 40k, 60k, 87 947 (all samples)
* f (features/genes) = 10k, 15k, 20k, 24 245 (all genes)

Cross all these values.

In [1]:
import gc
import scanpy as sc
# Read the esophagus dataset and keep only its DataFrame form
# (index = samples, columns = genes).
adata = sc.read("oesophagus.cellxgene.h5ad")
scgexp_df = adata.to_df()

# The AnnData object is not used again; drop the reference and run a
# collection right away before the subsampling step.
del adata
gc.collect()

# Sanity checks: matrix dimensions, then the total number of missing values.
print(scgexp_df.shape)
n_missing = scgexp_df.isnull().sum().sum()
print(n_missing)

(87947, 24245)
0


In [2]:
import dask.dataframe as dd

# Export every (n samples x f features) subsample of the expression matrix
# in three layouts: CSV, single-file parquet, and a dask-partitioned parquet
# dataset. The last entry of each grid is the full axis length, so those
# cases keep every row/column (in sampled order).
sample_sizes = [40000, 60000, scgexp_df.shape[0]]
feature_sizes = [10000, 15000, 20000, scgexp_df.shape[1]]

for n in sample_sizes:
    for f in feature_sizes:
        # Sample rows first, then columns. DataFrame.sample draws without
        # replacement by default; the fixed seed keeps runs reproducible.
        ss_df = scgexp_df.sample(n=n, axis='index', random_state=42).sample(
            n=f, axis='columns', random_state=42
        )
        # Report the subsample's shape (this is what the recorded cell
        # output shows) and doubles as a progress indicator.
        print(ss_df.shape)

        stem = f'n{n}xf{f}'
        ss_df.to_csv(f'{stem}.csv')
        ss_df.to_parquet(f'{stem}_whole.parquet')

        # Wrap the frame in dask and split it into ~100MB partitions for the
        # partitioned parquet output.
        ss_df_dask = dd.from_pandas(ss_df, npartitions=1).repartition(partition_size='100MB')
        # Drop this name for the pandas frame before the partitioned write.
        del ss_df
        gc.collect()

        dd.to_parquet(ss_df_dask, f'{stem}.parquet', overwrite=True)
        # Release the dask frame before the next (possibly larger) subsample.
        del ss_df_dask
        gc.collect()

(40000, 10000)
(40000, 15000)
(40000, 20000)
(40000, 24245)
(60000, 10000)
(60000, 15000)
(60000, 20000)
(60000, 24245)
(87947, 10000)
(87947, 15000)
(87947, 20000)
(87947, 24245)
