In [1]:
%load_ext autotime

import tiledb
import numpy as np
import scarf
from tqdm.auto import tqdm
import sparse
from numba import jit

time: 1.36 s (started: 2021-08-18 02:43:10 +02:00)


In [2]:
# scarf.fetch_dataset('tenx_10K_pbmc_atacseq', save_path='scarf_datasets')

time: 0 ns (started: 2021-08-18 02:43:12 +02:00)


In [3]:
def make_tildb_schema(fn: str, n_cells: int, n_features: int,
                      cell_t_size: int = 1000, feature_t_size: int = 20000,
                      dtype: str = 'uint32', use_compression: bool = False):
    """
    Create an empty sparse 2D TileDB array (cells x features) at ``fn``.

    Anything already present at ``fn`` is removed with ``shutil.rmtree``
    before the new array is created, so existing data at that path is lost.

    Args:
        fn: Path of the array directory to (re)create.
        n_cells: Size of the ``cells`` dimension; its domain is ``0 .. n_cells-1``.
        n_features: Size of the ``features`` dimension; its domain is
            ``0 .. n_features-1``.
        cell_t_size: Tile extent along ``cells``. ``None`` means one tile
            spanning all cells. Clamped to ``[1, n_cells]``.
        feature_t_size: Tile extent along ``features``. ``None`` means one
            tile spanning all features. Clamped to ``[1, n_features]``.
        dtype: Data type of the single ``counts`` attribute.
        use_compression: If True, the attribute is stored with an LZ4 filter
            (level 10); otherwise no filter is applied.

    Raises:
        ValueError: If ``n_cells`` or ``n_features`` is smaller than 1.
    """
    import pathlib
    import shutil

    # Validate before touching the filesystem so a bad call cannot wipe an
    # existing array and then fail.
    if n_cells < 1 or n_features < 1:
        raise ValueError(
            f"n_cells and n_features must be >= 1 (got {n_cells}, {n_features})")

    fn = pathlib.Path(fn)
    if fn.exists():
        shutil.rmtree(str(fn.absolute()))

    if use_compression:
        filter_list = tiledb.FilterList([tiledb.LZ4Filter(level=10)])
    else:
        filter_list = []

    if cell_t_size is None:
        cell_t_size = n_cells
    if feature_t_size is None:
        feature_t_size = n_features

    # A tile can be neither empty nor larger than the dimension it tiles.
    cell_t_size = max(1, min(cell_t_size, n_cells))
    feature_t_size = max(1, min(feature_t_size, n_features))

    dim1 = tiledb.Dim(name="cells", domain=(0, n_cells-1), tile=cell_t_size, dtype=np.uint64)
    dim2 = tiledb.Dim(name="features", domain=(0, n_features-1), tile=feature_t_size, dtype=np.uint64)
    dom = tiledb.Domain(dim1, dim2)
    attr = tiledb.Attr(name="counts", dtype=dtype, filters=filter_list)

    schema = tiledb.ArraySchema(domain=dom, attrs=[attr], sparse=True,
                                tile_order='row-major', cell_order='row-major')
    schema.check()
    # Create the array at the path that was passed in (and cleared above),
    # not at whatever a global `array_name` happens to hold.
    tiledb.Array.create(str(fn), schema)

time: 0 ns (started: 2021-08-18 02:43:12 +02:00)


In [13]:
# reader = scarf.CrH5Reader("scarf_datasets/tenx_5K_pbmc_rnaseq/data.h5", 'rna')
# Open the ATAC dataset and (re)create an empty TileDB array sized to it.
reader = scarf.CrH5Reader("scarf_datasets/tenx_10K_pbmc_atacseq/data.h5", 'atac')
array_name = 'tildb_pbmc_10k'
make_tildb_schema(
    fn=array_name,
    n_cells=reader.nCells,
    n_features=reader.nFeatures,
    cell_t_size=500,
    feature_t_size=None,  # None -> a single tile spanning every feature
    use_compression=False,
)

time: 156 ms (started: 2021-08-18 12:15:51 +02:00)


In [14]:
@jit(nopython=True)
def update_vals(coords, coords_map, cell_coords):
    """
    Remap feature coordinates to first-seen order and sort them within each cell.

    Every index in ``coords`` whose entry in ``coords_map`` is still ``-1``
    is given the next unused index (``coords_map.max() + 1`` and upwards).
    The remapped coordinates are then sorted independently inside each run of
    equal consecutive values in ``cell_coords``.

    Note that because of the per-cell sort, element ``k`` of the result no
    longer corresponds to ``coords[k]``. Any value array that is positionally
    aligned with ``coords`` must be reordered by the caller before being
    paired with the returned coordinates.

    Args:
        coords: 1D integer array of feature indices.
        coords_map: 1D integer array mapping an original feature index to its
            new index, with ``-1`` for "not assigned yet". Updated in place.
        cell_coords: 1D array of the same length as ``coords`` giving the cell
            of each entry; entries of one cell are expected to be contiguous.

    Returns:
        Tuple ``(sorted_coords, coords_map)``; ``sorted_coords`` has the dtype
        of ``coords``.
    """
    m = coords.shape[0]
    cur = coords_map.max() + 1
    updated_coords = np.empty_like(coords)
    for i in range(m):
        feat = coords[i]
        if coords_map[feat] == -1:
            coords_map[feat] = cur
            cur += 1
        updated_coords[i] = coords_map[feat]

    sorted_coords = np.empty_like(coords)
    s = 0
    for e in range(1, m):
        # Any change of cell id starts a new run. Testing for a step of
        # exactly 1 would merge two cells whenever a cell id is skipped.
        if cell_coords[e] != cell_coords[e - 1]:
            sorted_coords[s:e] = np.sort(updated_coords[s:e])
            s = e
    sorted_coords[s:m] = np.sort(updated_coords[s:m])
    return sorted_coords, coords_map

time: 0 ns (started: 2021-08-18 12:15:53 +02:00)


In [15]:
# Stream the reader in batches and write each batch into the TileDB array,
# remapping feature indices to first-seen order via `cm`.
s = 0  # row offset of the current batch inside the full array
cm = np.full(reader.nFeatures, -1, dtype=int)  # original feature -> new index, -1 = unseen
t = {}  # keys: every original feature index seen so far
with tiledb.open(array_name, 'w') as A:
    for i in tqdm(reader.consume(batch_size=500, lines_in_mem=1000000)):
        rows, feats = i.coords[0], i.coords[1]
        uc, cm = update_vals(feats, cm, rows)
        # Record distinct features once per batch instead of once per nonzero.
        t.update(dict.fromkeys(np.unique(feats).tolist()))
        # Assigned indices must be unique and form 0..max without gaps. This
        # also holds once no -1 sentinel is left in the map.
        assigned = cm[cm != -1]
        assert np.unique(assigned).shape[0] == assigned.shape[0] == cm.max() + 1
        assert uc.shape == feats.shape
        assert (uc == -1).sum() == 0
        # `uc` is sorted within each cell and therefore no longer lines up
        # with `i.data`. Look the mapping up per entry so each count is
        # written at the coordinate of its own feature.
        mapped = cm[feats]
        A[rows + s, mapped] = i.data
        s += i.shape[0]

0it [00:00, ?it/s]

time: 1min 13s (started: 2021-08-18 12:15:53 +02:00)


In [None]:
# Read the array back in consecutive row windows of `batch_size` cells.
batch_size = 500
with tiledb.open(array_name) as A:
    for s in tqdm(range(0, reader.nCells, batch_size)):
        # The window start advances every iteration; the last window is
        # clipped to the number of cells.
        e = min(s + batch_size, reader.nCells)
        a = A[s:e, :]
#         a = sparse.COO([a['cells']-s, a['features']], data=a['counts'], shape=(e-s, n_features))

  0%|          | 0/20 [00:00<?, ?it/s]

In [153]:
# Number of distinct feature indices recorded in `t`, and how many of the reader's features were never recorded.
len(t), reader.nFeatures- len(t)

(90449, 237)

time: 15 ms (started: 2021-08-18 02:34:34 +02:00)


In [7]:
# Inspect `uc`, the coordinate array most recently returned by update_vals.
uc

array([    1,    13,    17, ..., 88572, 89438, 89727], dtype=int64)

time: 15 ms (started: 2021-08-18 12:08:16 +02:00)


In [148]:
# Second coordinate row of the current batch `i` (the array passed to update_vals as `coords`).
i.coords[1]

array([   10,    22,    23, ..., 90677, 90678, 90680], dtype=int64)

time: 0 ns (started: 2021-08-18 02:31:46 +02:00)


In [149]:
# Count the entries of the map `cm` that still hold -1.
(cm == -1).sum()

237

time: 0 ns (started: 2021-08-18 02:32:01 +02:00)


In [77]:
# Window of `uc` centred on position 112100.
# NOTE(review): `n` is not defined in any visible cell — define it before running this.
uc[112100-n:112100+n]

array([86169, 86197, 86237, 86239, 86271, 86277, 86277, 86295, 86328,
       86354], dtype=int64)

time: 0 ns (started: 2021-08-18 02:08:13 +02:00)


In [97]:
# Coordinates of the stored entries in row 12 of the batch `i`.
i[12].coords[0]

array([    2,     9,    19, ..., 90657, 90662, 90674], dtype=int64)

time: 16 ms (started: 2021-08-18 02:11:23 +02:00)


In [99]:
# Display row 12 of the batch `i`.
i[12]

0,1
Format,coo
Data Type,int32
Shape,"(90686,)"
nnz,10882
Density,0.11999647134067
Read-only,True
Size,127.5K
Storage ratio,0.4


time: 0 ns (started: 2021-08-18 02:12:01 +02:00)


In [113]:
# Boolean mask over the stored entries of `i` whose first coordinate is 12, and how many it selects.
idx = i.coords[0] == 12
idx.sum()

10882

time: 0 ns (started: 2021-08-18 02:13:57 +02:00)


In [107]:
# First coordinates of the entries selected by `idx`.
i.coords[0][idx]

array([12, 12, 12, ..., 12, 12, 12], dtype=int64)

time: 0 ns (started: 2021-08-18 02:12:49 +02:00)


In [121]:
# Number of distinct second coordinates among the entries selected by `idx`.
np.unique(i.coords[1][idx]).shape

(10882,)

time: 0 ns (started: 2021-08-18 02:15:56 +02:00)


In [120]:
# Number of distinct values of `test` among the entries selected by `idx`.
# NOTE(review): `test` is not defined in any visible cell — confirm where it comes from.
np.unique(test[idx]).shape

(10881,)

time: 0 ns (started: 2021-08-18 02:15:39 +02:00)


In [122]:
# Number of distinct values of `uc` among the entries selected by `idx`.
np.unique(uc[idx]).shape

(10881,)

time: 0 ns (started: 2021-08-18 02:16:11 +02:00)


In [117]:
# Check that looking the masked feature coordinates up in `cm` reproduces `test`.
# `np.alltrue` was removed in NumPy 2.0, so `np.all` is used; the lookup is one fancy-index.
# NOTE(review): `test` is not defined in any visible cell — confirm where it comes from.
np.all(cm[i.coords[1][idx]] == test[idx])

True

time: 0 ns (started: 2021-08-18 02:15:05 +02:00)


In [118]:
# Check that the sorted `cm` lookups of the masked feature coordinates equal `uc` on the same mask.
# `np.alltrue` was removed in NumPy 2.0, so `np.all` is used; the lookup is one fancy-index.
np.all(np.sort(cm[i.coords[1][idx]]) == uc[idx])

True

time: 0 ns (started: 2021-08-18 02:15:21 +02:00)


In [79]:
# Window of the batch's second coordinate row centred on position 112100.
# NOTE(review): `n` is not defined in any visible cell — define it before running this.
i.coords[1][112100-n:112100+n]

array([89922, 89982, 89990, 89996, 90000, 90004, 90009, 90028, 90054,
       90055], dtype=int64)

time: 15 ms (started: 2021-08-18 02:08:13 +02:00)


In [80]:
# Window of `test` centred on position 112100.
# NOTE(review): neither `test` nor `n` is defined in any visible cell.
test[112100-n:112100+n]

array([21708, 38601, 70483,  7186, 82005, 21717, 11295, 37645, 44677,
        7191], dtype=int64)

time: 0 ns (started: 2021-08-18 02:08:13 +02:00)


In [81]:
# Spot-check the map `cm` at four hand-picked indices.
cm[21717], cm[11295], cm[90000], cm[90009]

(13761, 31385, 82005, 11295)

time: 0 ns (started: 2021-08-18 02:08:14 +02:00)


In [82]:
# Full coordinate array of the current batch `i` (one row per axis).
i.coords

array([[    0,     0,     0, ...,    99,    99,    99],
       [   15,    19,    23, ..., 90629, 90644, 90662]], dtype=int64)

time: 0 ns (started: 2021-08-18 02:08:14 +02:00)


In [71]:
# Inspect `uc`, the coordinate array most recently returned by update_vals.
uc

array([    1,     4,     5, ..., 87956, 87957, 87958], dtype=int64)

time: 0 ns (started: 2021-08-18 02:07:43 +02:00)


In [72]:
# Count the entries of the map `cm` that still hold -1.
(cm == -1).sum()

2725

time: 0 ns (started: 2021-08-18 02:07:43 +02:00)


In [73]:
# Number of distinct values stored in `cm` (a remaining -1 counts as one value).
len(set(cm))

87960

time: 16 ms (started: 2021-08-18 02:07:44 +02:00)


In [74]:
# Inspect `uc`, the coordinate array most recently returned by update_vals.
uc

array([    1,     4,     5, ..., 87956, 87957, 87958], dtype=int64)

time: 0 ns (started: 2021-08-18 02:07:44 +02:00)


In [75]:
# True wherever the first coordinate changes between consecutive stored entries of `i`.
np.diff(i.coords[0]) != 0

array([False, False, False, ..., False, False, False])

time: 0 ns (started: 2021-08-18 02:07:44 +02:00)


In [156]:
# Toy example: `x` holds group labels, `y` holds values to be sorted per group.
x = np.array([1, 1, 1, 2, 2, 2, 2, 3, 3, 4, 4, 4])
y = np.array([11, 29, 23, 48, 44, 45, 99, 34, 39, 36, 0, 1])
# Step between consecutive labels; a step of 1 marks the last entry of a group.
diff = np.diff(x)
p = np.flatnonzero(diff == 1)
# p = np.hstack([p, [len(y)]])
p

array([2, 6, 8], dtype=int64)

time: 16 ms (started: 2021-08-18 02:36:12 +02:00)


In [157]:
# Sort `y` independently inside each segment delimited by the boundaries in `p`.
z = np.zeros(y.shape[0])
s = 0
for e in p + 1:  # p marks the last index of a segment; +1 gives the exclusive end
    z[s:e] = np.sort(y[s:e])
    s = e
# Tail segment after the final boundary.
z[s:] = np.sort(y[s:])
z

array([11., 23., 29., 44., 45., 48., 99., 34., 39.,  0.,  1., 36.])

time: 0 ns (started: 2021-08-18 02:36:13 +02:00)


In [88]:
@jit(nopython=True)
def update_vals(coords, coords_map):
    """
    Remap feature coordinates to first-seen order (no sorting).

    Every index in ``coords`` whose entry in ``coords_map`` is still ``-1``
    is given the next unused index. Element ``k`` of the result is the new
    index of ``coords[k]``.

    Args:
        coords: 1D integer array of feature indices.
        coords_map: 1D integer array mapping an original feature index to its
            new index, with ``-1`` for "not assigned yet". Updated in place.

    Returns:
        Tuple ``(updated_coords, coords_map)``; ``updated_coords`` has the
        dtype of ``coords``.
    """
    m = coords.shape[0]
    # Next free index: 0 for an all -1 map, otherwise one past the largest
    # index already handed out. Starting at the maximum itself would give a
    # new feature the same index as an existing one on later calls.
    cur = coords_map.max() + 1
    updated_coords = np.empty_like(coords)
    for i in range(m):
        feat = coords[i]
        if coords_map[feat] == -1:
            coords_map[feat] = cur
            cur += 1
        updated_coords[i] = coords_map[feat]
    return updated_coords, coords_map

time: 0 ns (started: 2021-08-18 01:11:50 +02:00)


time: 516 ms (started: 2021-08-18 01:11:50 +02:00)


In [84]:
uc

array([    0,     1,     2, ...,  7228, 21792, 40576], dtype=int64)

time: 0 ns (started: 2021-08-18 01:11:19 +02:00)


In [66]:
# Pure-Python reference: renumber the batch's feature coordinates in first-seen order.
c = i.coords[1]
nc, mc = {}, []  # nc: original index -> new index; mc: remapped coordinate per entry
v = 0            # next unused new index
for x in c:
    try:
        mc.append(nc[x])
    except KeyError:
        nc[x] = v
        mc.append(v)
        v += 1

time: 2.59 s (started: 2021-08-18 01:03:36 +02:00)


7355004

time: 0 ns (started: 2021-08-18 00:59:49 +02:00)


In [48]:
# Keep the second coordinate row of the current batch `i` for the checks below.
c = i.coords[1]

time: 0 ns (started: 2021-08-18 00:53:03 +02:00)


In [50]:
# Smallest and largest value in `c`.
c.min(), c.max()

(0, 90685)

time: 16 ms (started: 2021-08-18 00:53:11 +02:00)


In [52]:
# Number of distinct values in `c`.
len(set(c))

89666

time: 563 ms (started: 2021-08-18 00:53:24 +02:00)


In [29]:
# First-axis length of the current batch `i`.
i.shape[0]

169

time: 0 ns (started: 2021-08-18 00:38:55 +02:00)


In [19]:
# Stored values of the current batch `i`.
i.data

array([ 2,  3,  1, ..., 54, 28, 51])

time: 0 ns (started: 2021-08-18 00:17:27 +02:00)


In [14]:
# Dry run over the reader: add up the first-axis length of every batch it yields.
# The loop variable `i` is left bound to the last batch and is inspected by other cells.
s = 0
for i in tqdm(reader.consume(batch_size=500, lines_in_mem=10000)):
    s += i.shape[0]

0it [00:00, ?it/s]

time: 1 s (started: 2021-08-18 00:16:46 +02:00)


0it [00:00, ?it/s]

time: 9.47 s (started: 2021-08-18 00:16:47 +02:00)


In [5]:
# Report the extent of the stored array along its two named dimensions.
with tiledb.open(array_name) as A:
    shape = tuple(A.dim(dim_name).size for dim_name in ('cells', 'features'))
    print(shape)

(5025, 33538)


In [6]:
# Read the array back in consecutive row windows of `batch_size` cells.
batch_size = 1000
with tiledb.open(array_name) as A:
    # Take the row count from the array itself so the cell also runs on a
    # fresh kernel; plain ints keep the slice bounds free of NumPy scalars.
    n_cells = int(A.dim('cells').size)
    for s in tqdm(range(0, n_cells, batch_size)):
        e = min(s + batch_size, n_cells)
        a = A[s:e, :]
#         a = sparse.COO([a['cells']-s, a['features']], data=a['counts'], shape=(e-s, n_features))

  0%|          | 0/6 [00:00<?, ?it/s]

In [9]:
# Read the array back in row windows and turn each window into a sparse COO matrix.
batch_size = 1000
with tiledb.open(array_name) as A:
    # Take both extents from the array itself so the cell also runs on a
    # fresh kernel; plain ints keep offsets and shapes free of NumPy scalars.
    n_cells = int(A.dim('cells').size)
    n_features = int(A.dim('features').size)
    for s in tqdm(range(0, n_cells, batch_size)):
        e = min(s + batch_size, n_cells)
        a = A[s:e, :]
        # Shift the cell coordinates so each window starts at row 0.
        a = sparse.COO([a['cells']-s, a['features']], data=a['counts'], shape=(e-s, n_features))

  0%|          | 0/6 [00:00<?, ?it/s]

In [16]:
# Shape of the per-row sums of the last window `a`.
a.sum(axis=1).data.shape

(25,)

In [19]:
# Stored values of the per-column sums of the last window `a`.
a.sum(axis=0).data

array([   2,    3,    7, ..., 1588,    1,    2], dtype=uint32)

In [17]:
# Display the last window `a`. The previous `a.su(m)` was a half-typed edit
# that raises; the recorded output is the repr of `a` itself.
a

0,1
Format,coo
Data Type,uint32
Shape,"(25, 33538)"
nnz,55692
Density,0.06642256544814837
Read-only,True
Size,652.6K
Storage ratio,0.2
