In [None]:
#import functools
import numpy as np
import pandas as pd
from vpolo.alevin import parser
import scanpy as sc
import anndata


# Notebook-wide scanpy configuration: raise the log level to the most verbose
# setting listed below so every later scanpy call reports hints as well.
sc.settings.verbosity = 3             # verbosity: errors (0), warnings (1), info (2), hints (3)
# Print scanpy's header through its logging module.
# NOTE(review): presumably the versions of the key packages — confirm against the scanpy docs.
sc.logging.print_header()


In [None]:
# Load the sample metadata table from an absolute /ceph path. Its `New_Sample_ID`
# column is the join key used further down to annotate barcodes.
samples_df = pd.read_csv('/ceph/projects/organoid_sc_rnaseq/input/metadata/tables/_m/samples.csv')
# Print (n_rows, n_columns), then display the first three rows as the cell output.
print(samples_df.shape)
samples_df.head(3)

In [None]:
# Load the 10X assay table from an absolute /ceph path. Its `10X_Assay_ID` column
# drives the per-assay loading loop further down.
assays_df = pd.read_csv('/ceph/projects/organoid_sc_rnaseq/input/metadata/tables/_m/10X_assays.csv')
# Print (n_rows, n_columns), then display the whole table as the cell output.
print(assays_df.shape)
assays_df

In [None]:
#@functools.lru_cache
def read_alevin_counts(directory):
    """Load the alevin quantification found under ``directory``.

    Thin wrapper: ``directory`` is handed unchanged to
    ``parser.read_quants_bin`` and that call's result is returned as-is.
    """
    quants = parser.read_quants_bin(directory)
    return quants

In [None]:
def read_alevin_counts_and_barcodes(samples_df, assays_df, assay_id):
    """Build one AnnData object for ``assay_id`` from its counts and barcode table.

    Reads ``../../_m/<assay_id>.singlets.tsv`` (tab-separated, first column used
    as the index and expected to be headed ``barcode``), loads the counts with
    ``read_alevin_counts`` from ``../../../_m/<assay_id>/rna``, keeps only the
    count rows for the barcodes listed in the TSV, and attaches to every barcode
    the ``samples_df`` row whose ``New_Sample_ID`` equals the barcode's
    ``HTO_classification``.

    Parameters
    ----------
    samples_df : pandas.DataFrame
        Sample metadata containing a ``New_Sample_ID`` column.
    assays_df : pandas.DataFrame
        Not used by this function; kept so existing callers keep working.
    assay_id : str
        Identifier substituted into both input paths.

    Returns
    -------
    anndata.AnnData
        Built from the selected counts, an ``obs`` table holding the merged
        metadata (without ``HTO_classification``) indexed by
        ``"<New_Sample_ID>-<barcode>"``, and a ``var`` table whose
        ``gene_symbol`` column is a copy of the counts' column labels.

    Raises
    ------
    KeyError
        If a listed barcode is absent from the counts, or if some barcode's
        ``HTO_classification`` has no matching ``New_Sample_ID``.
    ValueError
        If a ``New_Sample_ID`` referenced by the barcodes occurs on more than
        one row of ``samples_df``.
    """
    barcodes = pd.read_csv('../../_m/%s.singlets.tsv' % assay_id, sep='\t', index_col=0)
    counts = read_alevin_counts("../../../_m/%s/rna" % assay_id).loc[barcodes.index].copy()
    assert all(counts.index == barcodes.index)

    # A sample ID that is used by these barcodes but listed twice would make the
    # merge emit several rows per barcode; fail early and say which ID it is.
    used_ids = samples_df.loc[
        samples_df['New_Sample_ID'].isin(barcodes['HTO_classification']),
        'New_Sample_ID',
    ]
    repeated_ids = used_ids[used_ids.duplicated()].unique()
    if len(repeated_ids) > 0:
        raise ValueError(
            'samples_df has more than one row for New_Sample_ID %s; cannot assign '
            'a single sample to each barcode of assay %s'
            % (sorted(str(v) for v in repeated_ids), assay_id)
        )

    annotated = barcodes\
        .reset_index()\
        .merge(samples_df, left_on=['HTO_classification'], right_on=['New_Sample_ID'])\
        .set_index('barcode')

    # The merge is an inner join, so barcodes without a matching sample vanish
    # silently; report the unmatched sample IDs instead of an opaque lookup error.
    dropped = barcodes.index.difference(annotated.index)
    if len(dropped) > 0:
        unmatched_ids = sorted({str(v) for v in barcodes.loc[dropped, 'HTO_classification']})
        raise KeyError(
            '%d barcode(s) of assay %s have an HTO_classification with no matching '
            'New_Sample_ID in samples_df: %s' % (len(dropped), assay_id, unmatched_ids)
        )

    # Restore the barcode order of the counts before dropping the join key.
    obs = annotated.loc[counts.index].drop(columns=['HTO_classification'])
    assert all(obs.index == barcodes.index)

    # Prefix every barcode with its sample ID and reuse those labels for the counts.
    obs.index = obs['New_Sample_ID'] + '-' + obs.index
    counts.index = obs.index

    var = pd.DataFrame({'gene_symbol': counts.columns}, index=counts.columns)

    return anndata.AnnData(counts, obs, var)


In [None]:
def adata_iter(samples_df, assays_df):
    """Lazily yield one per-assay object for every entry of ``assays_df['10X_Assay_ID']``.

    Each assay ID is passed, together with both tables, to
    ``read_alevin_counts_and_barcodes``; results are produced in column order
    and only as the consumer asks for them.
    """
    assay_ids = assays_df['10X_Assay_ID']
    yield from (
        read_alevin_counts_and_barcodes(samples_df, assays_df, current_id)
        for current_id in assay_ids
    )

In [None]:
# Load every assay and combine them into a single object. `concatenate` is
# called on the class with the generator unpacked, so the first yielded object
# acts as `self` and the remaining ones are the objects appended to it; an empty
# assay table would therefore fail with a missing-argument TypeError.
# NOTE(review): newer anndata releases appear to deprecate `AnnData.concatenate`
# in favour of `anndata.concat` — confirm against the installed version before
# switching, since the two may label batches and suffix obs names differently.
adata = anndata.AnnData.concatenate(* (adata_iter(samples_df, assays_df)) )

In [None]:
# Persist the combined object with `write_h5ad`, relative to the notebook's
# working directory. The file name ends in `.h5` although the writer is the
# h5ad one; the name is left unchanged because downstream steps may expect it.
adata.write_h5ad('scanpy_object.h5')