In [1]:
%pylab inline

import os,sys
import numpy as np
import scipy as sp
import pandas as pd
import matplotlib as mpl
import matplotlib.gridspec as gridspec

from collections import defaultdict, Counter, OrderedDict

import cytograph as cg
import loompy
import palettable

sys.path.append(os.path.realpath(os.path.join(os.getcwd(), '..', '..')))
from scbeta_scrnaseq import utils
from scbeta_scrnaseq import vis
import scbeta_scrnaseq.cytograph_inmem_utils as cgm
import scbeta_scrnaseq.cytograph_analyses as cga



Populating the interactive namespace from numpy and matplotlib


#### Load loompy objects

We open the processed Loom file for each sample (derived from the inDrops pipeline output) and load its counts into memory.

In [2]:
# Open the Loom file of every sample and immediately pull its counts into memory.
samples = [
    "x1_S6c", "x1_S5c", "x1_S4c", "x1_S3c",
    "x2_S6c", "x2_S5c", "x2_S4c", "x2_S3c",
]

# Keyed by sample name, in the same order as `samples`.
tds = OrderedDict()
for tp in samples:
    loom_fn = f'../data/complete_processing/{tp}.processed.loom'
    ds = loompy.connect(loom_fn)
    # Read the whole main layer ("") and keep it on the connection as a CSR matrix.
    ds.vals = sp.sparse.csr_matrix(ds.layers[""][:, :])
    tds[tp] = ds

In [15]:
# List the column attributes of one dataset. `tp` is not set in this cell: it is
# whatever sample name the most recently executed `for tp in ...` loop left behind.
tds[tp].ca.keys()


['CellBatch',
 'CellID',
 'CellProtocol',
 'CellStage',
 'DetailedLabels',
 'HighVarPCA',
 'HighVarTSNE',
 'Labels',
 'NormalizerTotals',
 'TSNE',
 '_TrainFilter',
 '_Valid']

In [None]:
# For every sample, compute pseudobulk TPMs and expressed fractions per label
# and save them, once for the coarse labels and once for the detailed labels.
for tp in samples:
    pds = tds[tp]
    full_labels = cgm.CellLabels(pds.ca.Labels, null_label="")
    # NOTE(review): assumes DetailedLabels uses "" as its null label, like Labels — confirm.
    detailed_labels = cgm.CellLabels(pds.ca.DetailedLabels, null_label="")

    for _labels, label_name in [(full_labels, 'labels'),
                                (detailed_labels, 'det_labels')]:
        # norm_total=10**6 requests per-million normalisation of the pseudobulk profiles.
        tp_pb = cga.pseudobulk_from_label(pds, _labels, norm_total=10**6)
        tp_expr = cga.expressed_fraction_from_label(pds, _labels)

        # The sample name is part of the file name so that samples do not
        # overwrite each other's output. The pseudobulk reader further down keys
        # tables by the file name without its suffix, i.e. '{tp}.{label_name}'.
        utils.save_df(tp_pb, f'../data/complete_processing/{tp}.{label_name}.pseudobulk_tpm.df.npz')
        utils.save_df(tp_expr, f'../data/complete_processing/{tp}.{label_name}.expr_frac.df.npz')

#### Across-datasets comparison

We need to build one large merged counts matrix so that we can find high-variance genes across all datasets and normalize them consistently.

In [5]:
# Merge the sparse count matrices of all datasets into a single genes x cells
# CSR matrix (`all_vals`). This is a hacky (but fast) approach: every dataset's
# COO triplets are re-indexed onto a shared gene axis and a running cell offset,
# and then concatenated.

# Shared gene axis: sorted union of the gene names of every dataset.
all_genes_across_datasets = []
for tp in tds.keys():
    _tds = tds[tp]
    all_genes_across_datasets += list(_tds.ra.Gene)
all_genes_across_datasets = sorted(set(all_genes_across_datasets))
# Maps gene name -> row position in the merged matrix.
gene_index = pd.Series(range(len(all_genes_across_datasets)), index=all_genes_across_datasets)

new_row = []
new_col = []
new_data = []
n_cells = 0  # number of cells (columns) already placed in the merged matrix
for tp in tds.keys():
    _tds = tds[tp]
    _coo = _tds.vals.tocoo()

    # Translate this dataset's local row numbers into merged gene rows.
    conv_row = gene_index[_tds.ra.Gene].values[_coo.row]
    # Shift this dataset's columns past all previously placed cells.
    conv_col = _coo.col + n_cells
    # Advance by the dataset's full width. Using the largest occupied column
    # index would make the next dataset start on this dataset's last cell
    # (their counts would be summed) and would miss trailing empty cells.
    n_cells += _coo.shape[1]

    new_row.append(conv_row)
    new_col.append(conv_col)
    new_data.append(_coo.data)

# An explicit shape keeps one row per gene in `all_genes_across_datasets` and
# one column per cell, even when the last genes or cells have no counts.
all_vals = sp.sparse.coo_matrix(
    (np.concatenate(new_data), (np.concatenate(new_row), np.concatenate(new_col))),
    shape=(len(all_genes_across_datasets), n_cells),
).tocsr()

In [8]:
# Fit the project's feature-selection model on the merged counts matrix.
# NOTE(review): the first argument to fit() is passed as None and the counts are
# supplied explicitly; presumably no dataset handle is needed in that case — confirm.
feature_selection = cgm.FeatureSelection()
feature_selection.fit(None, all_vals)

In [9]:
# Pick the `n_genes` genes with the highest feature-selection score.
# NOTE(review): `mu`, `score` and `ok` are presumably a per-gene mean, a score
# for the genes passing `ok`, and a boolean mask over genes — confirm.
n_genes = 2000

# Genes whose `mu` is not above 0.001 get their score multiplied by zero.
_valid = feature_selection.mu > 0.001
ok_mask = feature_selection.ok
_valid_score = feature_selection.score * _valid[ok_mask].astype(float)

# argsort is ascending, so the last `n_genes` entries are the best-scoring ones.
ok_positions = np.where(ok_mask)[0]
score_order = np.argsort(_valid_score)
high_var_genes = ok_positions[score_order][-n_genes:]

# Convert row positions into gene names.
high_var_genes = np.array(all_genes_across_datasets)[high_var_genes]

#### Read in the pseudo-bulk TPMs 

In [10]:
import glob

# Load every pseudobulk TPM table matched by the glob pattern, keyed by the
# file's base name with the '.pseudobulk_tpm.df.npz' suffix stripped off.
pb_suffix = '.pseudobulk_tpm.df.npz'

pb_tpm = {}
for pbf in glob.glob('../../*/data/complete_processing/*.pseudobulk_tpm.df.npz'):
    file_name = os.path.basename(pbf)
    key = file_name[:-len(pb_suffix)]
    pb_tpm[key] = utils.load_df(pbf)
    

In [11]:
# Enumerate every (sample, cluster) pair, sample by sample, and stack the
# matching pseudobulk rows into one table.
all_clusters = []
for tp in samples:
    all_clusters.extend((tp, cl) for cl in pb_tpm[tp].index)

samples_tpm = utils.combine_rows(pb_tpm, all_clusters)

In [12]:
# Z-score every column of the combined pseudobulk table: subtract the column
# mean, then divide by the population (ddof=0) standard deviation of the
# centered values. `samples_tpm` itself is left untouched.
_centered = samples_tpm - samples_tpm.mean()
pb_tpm_z = _centered / _centered.std(ddof=0)

In [13]:
# Save the z-scores restricted to the high-variance gene columns.
utils.save_df(pb_tpm_z[high_var_genes], '../data/complete_processing/x1x2.pb_tpm.z_scores.highvar.df.npz')

In [14]:
from scipy.spatial.distance import pdist
from fastcluster import linkage
from scipy.cluster.hierarchy import dendrogram, leaves_list, fcluster, to_tree, leaders

import polo
# Hierarchically cluster the pseudobulk profiles using the high-variance genes.
# Genes are the columns of pb_tpm_z (it is z-scored column-wise and its rows come
# from (sample, cluster) pairs), so they must be selected as columns; `.loc`
# would look the gene names up in the row index instead. Each row handed to
# pdist is then one pseudobulk profile.
pb_dist = pdist(pb_tpm_z[high_var_genes].values, 'correlation')
# Average linkage on the correlation distances, then reorder the leaves so that
# neighbouring leaves are as similar as possible.
pb_link = linkage(pb_dist, 'average')
pb_link = polo.polo.optimal_leaf_ordering(pb_link, pb_dist)