In [1]:
import pandas as pd
import numpy as np
import h5py as h5
import re

In [2]:
# Dataset selection: the load cell builds the archive name from these two
# values as species + "_matrix_v" + version + ".h5".
species, version = "mouse", "10"

# Cut-off applied to each sample's singlecellprobability; samples strictly
# above this value are treated as single-cell when filtering.
single_cell_prob_thresh = 0.5

In [3]:
# Read four per-sample metadata fields from the archive and stack them into one
# array with a row per sample (the transpose turns the four fields into columns).
# Column order: 0 = series_id, 1 = geo_accession, 2 = singlecellprobability, 3 = title.
#
# The file is opened through a context manager so the HDF5 handle is released
# even if a dataset lookup or the array conversion raises. The conversion has
# to happen inside the block, while the datasets are still readable.
with h5.File(species+"_matrix_v"+version+".h5", "r") as f:
    sample_meta = f["meta"]["samples"]
    gse_scprob = np.array([
        sample_meta["series_id"],
        sample_meta["geo_accession"],
        sample_meta["singlecellprobability"],
        sample_meta["title"],
    ]).T

In [4]:
# Normalise the text columns of gse_scprob in place so that the later study
# matching and the CSV export work on str values.

# Columns 0-1 (series_id, geo_accession): converted with astype(str).
gse_scprob[:, 0:2] = gse_scprob[:, 0:2].astype(str)

# Column 3 (title): decoded as UTF-8. Only bytes values are decoded, which keeps
# this cell safe to re-run: already-decoded titles are left untouched, whereas
# Series.str.decode would turn every non-bytes value into NaN on a second run.
gse_scprob[:, 3] = np.array(
    [title.decode("utf-8") if isinstance(title, bytes) else title
     for title in gse_scprob[:, 3]],
    dtype=object,
)

# Quick look: overall shape plus the first ten rows.
print(gse_scprob.shape, "\n", gse_scprob[:10])

(405640, 4) 
 [['GSE54390' 'GSM1314260' 0.1859949380159378 'TRACK 43-75 kidney 1']
 ['GSE75330' 'GSM2164318' 0.9992627501487732 'C1-1772117-029-A02']
 ['GSE75330' 'GSM2161048' 0.9992627501487732 'C1-1772072-242-F03']
 ['GSE64027' 'GSM1563082' 0.18679091334342957 '20NNK_LPS_Sil_I3C9']
 ['GSE36025\tGSE49417\tGSE49847' 'GSM1000563' 0.16356061398983002
  'CSHL_RnaSeq_Cortex_adult-8wks (superseded by GSE90205)']
 ['GSE87069' 'GSM2320580' 0.9992627501487732 'F125_RNA-seq']
 ['GSE75330' 'GSM2164283' 0.9992627501487732 'C1-1772117-028-A05']
 ['GSE60361' 'GSM1474588' 0.9992627501487732 'SingleCellNo338']
 ['GSE76381' 'GSM2273393' 0.9992627501487732 '1772075301_G03']
 ['GSE75330' 'GSM2162146' 0.9992627501487732 'C1-1772096-087-D08']]


In [5]:
# Keep only the samples whose study contains no single-cell-like sample, and
# record each kept sample's row position in gse_scprob alongside its metadata.
#
# NOTE(review): series_id values are compared as whole strings, so a tab-joined
# multi-series entry (e.g. 'GSE36025\tGSE49417\tGSE49847') only matches an
# identical string, not its individual member series - confirm this is intended.

# Row positions, shape (n, 1), where singlecellprobability exceeds the threshold.
single_cell_samp = np.argwhere(gse_scprob[:, 2] > single_cell_prob_thresh)

# Distinct series_id values carried by those rows.
single_cell_study = np.unique(gse_scprob[single_cell_samp, 0])

# Mask over all rows: True = series_id not among the single-cell studies (bulk),
# False = series_id belongs to a single-cell study.
bulk_study_bool = np.isin(gse_scprob[:, 0], single_cell_study, invert=True)

# Row positions of the bulk rows within the full metadata array.
bulk_study_idx = np.arange(bulk_study_bool.size)[bulk_study_bool]

# Bulk rows only, with their original row position attached as a fifth column.
bulk_study_meta = np.concatenate(
    (gse_scprob[bulk_study_bool], bulk_study_idx.reshape(-1, 1)),
    axis=1,
)

In [6]:
# Write the bulk-only metadata table to "<species>_bulk_study_meta.csv".
# The five labels name the columns of bulk_study_meta in order; the DataFrame's
# default integer index is written as the first, unlabelled CSV column.
pd.DataFrame(
    bulk_study_meta,
    columns=["series_id", "geo_accession", "singlecellprobability", "sample_title", "h5_idx"],
).to_csv(species+"_bulk_study_meta.csv")

In [7]:
# Sanity check: (rows, columns) of the bulk table, then the number of distinct
# series_id strings it contains.
(bulk_study_meta.shape, np.unique(bulk_study_meta[:, 0]).size)

((130012, 5), 8971)