In [2]:
"""
This script is used to take the normalized data available in the Human Cell Atlas, and convert this back into raw counts
authors: Roy Oelen
"""

In [None]:
# import the libraries required
import scanpy as sc
import pandas as pd
import numpy as np
import scipy.sparse as sparse
import scipy.io as sio

In [3]:
# location of the h5ad file that holds the log-normalised HCA object
# NOTE(review): absolute, cluster-specific path; adjust when running elsewhere
hca_loc = '/groups/umcg-franke-scrna/tmp02/external_datasets/hca/Global_lognormalised.h5ad'

In [4]:
# load the AnnData object from the h5ad file at hca_loc
hca = sc.read_h5ad(filename=hca_loc)

In [27]:
# number of cells we have: the obs table has one row per cell
ncells = len(hca.obs)
# slice size: the number of cells that is read and written in one go
slice_size = 10000
# directory in which the results are stored
slices_location = "/groups/umcg-franke-scrna/tmp02/releases/blokland-2020/v1/epicardial_fat/ongoing/rtcd/references/hca/"

In [None]:
# Convert the log-normalised matrix back into counts, one slice of cells at a time,
# and write every slice to its own MatrixMarket (.mtx) file.

# convert to CSR once, up front: the conversion does not depend on the slice, so doing
# it inside the loop would repeat the same work for every slice
hca_lognorm_csr = hca.X.tocsr()
# positional index of the obs column that is used as the total counts per cell
# NOTE(review): a positional index silently selects another column if the obs columns are
# ever reordered; selecting the column by name would be safer -- confirm the column name
total_counts_col = 15
# scaling factor that is undone below
# NOTE(review): presumably the data was normalised to 10000 counts per cell -- confirm
scale_factor = 10000

# loop through slices
for i_left in range(0, ncells, slice_size):
    # right (exclusive) bound of the slice, capped at the number of cells so the last,
    # smaller slice does not run past the end
    i_right = min(i_left + slice_size, ncells)
    # message which slice we are doing (reported 1-based, right bound inclusive)
    print('doing slice {} to {}'.format(i_left + 1, i_right))

    # take the rows (cells) of this slice; a plain slice selects the same rows as an
    # explicit index array, without having to build that array
    hca_count_slice = hca_lognorm_csr[i_left:i_right, :]
    # expm1 computes exp(x) - 1 on the stored values, reversing the natural log to get
    # back to the (scaled) fractions
    hca_count_slice = hca_count_slice.expm1()

    # the total counts of the cells in this slice, as a numpy array
    total_counts = hca.obs.iloc[i_left:i_right, total_counts_col].to_numpy()
    # multiply the fractions by the total counts; the column vector lines up each cell's
    # total with its own row
    hca_count_slice = hca_count_slice.multiply(total_counts[:, np.newaxis])
    # undo the scaling factor
    hca_count_slice = hca_count_slice.multiply(1 / scale_factor)

    # round the stored values to the nearest integer
    hca_count_slice.data = np.round(hca_count_slice.data, 0)

    # location to store the slice, named after its 0-based left and exclusive right bound
    slice_loc = '{}expression_slice_{}_{}.mtx'.format(slices_location, i_left, i_right)
    # save the result
    sio.mmwrite(slice_loc, hca_count_slice)

doing slice 1 to 10000
doing slice 10001 to 20000
doing slice 20001 to 30000
doing slice 30001 to 40000
doing slice 40001 to 50000
doing slice 50001 to 60000
doing slice 60001 to 70000
doing slice 70001 to 80000
doing slice 80001 to 90000
doing slice 90001 to 100000
doing slice 100001 to 110000
doing slice 110001 to 120000
doing slice 120001 to 130000
doing slice 130001 to 140000
doing slice 140001 to 150000
doing slice 150001 to 160000
doing slice 160001 to 170000
doing slice 170001 to 180000
doing slice 180001 to 190000
doing slice 190001 to 200000
doing slice 200001 to 210000
doing slice 210001 to 220000
doing slice 220001 to 230000
doing slice 230001 to 240000
doing slice 240001 to 250000
doing slice 250001 to 260000
doing slice 260001 to 270000
doing slice 270001 to 280000
doing slice 280001 to 290000
doing slice 290001 to 300000
doing slice 300001 to 310000
doing slice 310001 to 320000
doing slice 320001 to 330000
doing slice 330001 to 340000
doing slice 340001 to 350000
doing sl

In [106]:
# check the total counts as reported in the metadata
# (`total_counts` is the per-cell array that was last assigned while slicing, so it only covers one slice)
total_counts

array([6094., 6254., 5864., 4922., 4713.], dtype=float32)

In [114]:
# compare against the per-cell (row) sums of the reconstructed counts;
# in the output shown these are close to, but not exactly the same as, the reported totals
hca_count_slice.sum(axis = 1)

matrix([[6092.],
        [6250.],
        [5863.],
        [4920.],
        [4712.]], dtype=float32)

In [33]:
# store the cell metadata (the obs table) as well, as a tab-separated file with header and index
metadata = hca.obs
metadata_loc = slices_location + 'hca_metadata.tsv'
metadata.to_csv(metadata_loc, sep='\t', index=True, header=True)

In [51]:
# list the gene identifiers stored in the 'gene_id' column of var (this prints the full list)
hca.var['gene_id'].tolist()

['ENSG00000243485',
 'ENSG00000237613',
 'ENSG00000186092',
 'ENSG00000238009',
 'ENSG00000239945',
 'ENSG00000239906',
 'ENSG00000241599',
 'ENSG00000236601',
 'ENSG00000284733',
 'ENSG00000235146',
 'ENSG00000284662',
 'ENSG00000229905',
 'ENSG00000237491',
 'ENSG00000177757',
 'ENSG00000225880',
 'ENSG00000230368',
 'ENSG00000272438',
 'ENSG00000230699',
 'ENSG00000241180',
 'ENSG00000223764',
 'ENSG00000187634',
 'ENSG00000188976',
 'ENSG00000187961',
 'ENSG00000187583',
 'ENSG00000187642',
 'ENSG00000272512',
 'ENSG00000188290',
 'ENSG00000187608',
 'ENSG00000224969',
 'ENSG00000188157',
 'ENSG00000273443',
 'ENSG00000237330',
 'ENSG00000131591',
 'ENSG00000223823',
 'ENSG00000272141',
 'ENSG00000205231',
 'ENSG00000162571',
 'ENSG00000186891',
 'ENSG00000186827',
 'ENSG00000078808',
 'ENSG00000176022',
 'ENSG00000184163',
 'ENSG00000260179',
 'ENSG00000160087',
 'ENSG00000230415',
 'ENSG00000162572',
 'ENSG00000131584',
 'ENSG00000169972',
 'ENSG00000127054',
 'ENSG00000224051',


In [48]:
def _write_lines(path, lines):
    """Write the strings in `lines` to the file at `path`, one per line.

    The lines are joined with newlines; no trailing newline is written.
    """
    # the with-statement closes the file when the block is left, so no explicit close() is needed
    with open(path, 'w') as fp:
        fp.write('\n'.join(lines))


# get the genes (they are stored in the 'gene_id' column of the var variable)
hca_genes = hca.var['gene_id'].tolist()
# set where to store
genes_loc = ''.join([slices_location, 'hca_genes.tsv'])
# by writing each gene to a line
_write_lines(genes_loc, hca_genes)
# same for barcodes, they are the index of the obs variable
hca_barcodes = hca.obs.index.tolist()
barcodes_loc = ''.join([slices_location, 'hca_barcodes.tsv'])
_write_lines(barcodes_loc, hca_barcodes)