Alternative (interactive) script for demultiplexing.

In [None]:
import numpy as np
import matplotlib.pyplot as pl
import scvelo as scv
import pandas as pd
import scanpy as sc
from tqdm import tnrange, tqdm_notebook
from IPython.core.display import display, HTML
# Set the notebook container width to 90% via an injected CSS rule.
display(HTML('<style>.container {width:90% !important;}</style>'))

# scvelo plotting preset and verbosity level.
scv.settings.set_figure_params('scvelo')
scv.settings.verbosity = 1

# Module-level defaults: data directory prefix and sample letter.
data_path='../data/'
letter = 'C'

In [None]:
def apply_demux(adata, letter, data_path='../data/'):
    """Attach SNP-based and hashtag-based donor calls to ``adata.obs``.

    Two tables are read, ``<data_path><letter>_donor_ids_SNP.tsv`` and
    ``<data_path><letter>_donor_ids_seurat.csv``, and two columns are written
    into ``adata.obs`` (pandas aligns both on the obs index):

    * ``'SNPdemux'``: the ``donor_id`` column of the SNP table, keyed by its
      ``cell`` column with the last two characters of every entry removed.
    * ``'seuratdemux'``: the ``x`` column of the seurat table, with hashtag
      names translated to donor names, every remaining label containing
      ``'_'`` replaced by ``'doublet'`` and cells without a call set to
      ``'Negative'``.

    Parameters
    ----------
    adata
        Object exposing a pandas ``obs`` DataFrame (e.g. an AnnData object).
    letter : str
        Sample key; one of ``'C'``, ``'E'`` or ``'W'``.
    data_path : str
        Prefix that is string-concatenated with the file names, so it must
        end with a path separator.

    Returns
    -------
    The same ``adata`` object, modified in place.

    Raises
    ------
    KeyError
        If ``letter`` is not a known sample key, or an expected column
        (``cell``, ``donor_id``, ``x``) is missing.
    """
    annot = {'C': {
              'Hashtag10': 'p009ot',
              'Hashtag11': 'p013ot',
              'Hashtag12': 'NCO',
              },
        'E': {
              'Hashtag7': 'p009ot',
              'Hashtag8': 'p013ot',
              'Hashtag9': 'NCO',
              },
        'W': {
              'Hashtag4': 'p009ot',
              'Hashtag5': 'p013ot',
              'Hashtag6': 'NCO',
              }
        }

    snp_calls = pd.read_csv(data_path+letter+'_donor_ids_SNP.tsv', sep='\t')
    snp_calls = snp_calls.set_index('cell', drop=True)
    # Drop the last two characters of every cell name so the index can match
    # adata.obs (presumably a '-1' style suffix -- TODO confirm).
    snp_calls.index = [cell[:-2] for cell in snp_calls.index]
    adata.obs['SNPdemux'] = snp_calls['donor_id']

    # Work on the single column of calls instead of masking a whole DataFrame.
    seurat_calls = pd.read_csv(data_path+letter+'_donor_ids_seurat.csv', sep=',')['x']
    hashtag_to_donor = annot[letter]
    seurat_calls = seurat_calls.map(lambda call: hashtag_to_donor.get(call, call))
    # Any label that still contains '_' is treated as a doublet; missing
    # values are not strings and are left untouched here.
    is_doublet = seurat_calls.map(
        lambda call: isinstance(call, str) and '_' in call).astype(bool)
    seurat_calls = seurat_calls.mask(is_doublet, 'doublet')

    adata.obs['seuratdemux'] = seurat_calls
    # Plain column assignment instead of chained indexing: the chained form
    # may write to a temporary and is a no-op under pandas copy-on-write.
    adata.obs['seuratdemux'] = adata.obs['seuratdemux'].fillna('Negative')
    return adata

def identify(adata, letter):
    """Rename the anonymous SNP donors using their overlap with hashtag calls.

    For every donor name of sample ``letter`` the SNP label among
    ``'donor0'``, ``'donor1'`` and ``'donor2'`` that shares the most cells
    with it (``adata.obs['SNPdemux']`` vs. ``adata.obs['seuratdemux']``) is
    looked up, and that SNP label is replaced by the donor name in
    ``adata.obs['SNPdemux']``. All other SNP labels are left unchanged.

    Parameters
    ----------
    adata
        Object exposing a pandas ``obs`` DataFrame with the columns
        ``'SNPdemux'`` and ``'seuratdemux'`` (e.g. an AnnData object).
    letter : str
        Sample key; one of ``'C'``, ``'E'`` or ``'W'``.

    Returns
    -------
    The same ``adata`` object, modified in place.

    Raises
    ------
    KeyError
        If ``letter`` is not a known sample key or a column is missing.
    """
    annot = {'C': {
              'Hashtag10': 'p009ot',
              'Hashtag11': 'p013ot',
              'Hashtag12': 'NCO',
              },
        'E': {
              'Hashtag7': 'p009ot',
              'Hashtag8': 'p013ot',
              'Hashtag9': 'NCO',
              },
        'W': {
              'Hashtag4': 'p009ot',
              'Hashtag5': 'p013ot',
              'Hashtag6': 'NCO',
              }
        }
    donor_names = list(annot[letter].values())
    donor_ids = ['donor0', 'donor1', 'donor2']

    snp_calls = adata.obs['SNPdemux']
    seurat_calls = adata.obs['seuratdemux']

    # overlap[i, j] = number of cells labelled donor_ids[i] by the SNP demux
    # and donor_names[j] by the hashtag demux.
    overlap = np.zeros((len(donor_ids), len(donor_names)))
    for i, donor_id in enumerate(donor_ids):
        in_snp = (snp_calls == donor_id).to_numpy()
        for j, donor_name in enumerate(donor_names):
            in_seurat = (seurat_calls == donor_name).to_numpy()
            overlap[i, j] = np.count_nonzero(in_snp & in_seurat)
    # pl.imshow(overlap)

    # For each donor name (column) pick the SNP label (row) with the largest
    # overlap. If two names pick the same SNP label the later name wins.
    best_rows = np.argmax(overlap, axis=0)
    donors_dict = {donor_ids[row]: name
                   for row, name in zip(best_rows, donor_names)}

    # One explicit column assignment instead of chained indexing, which may
    # write to a temporary and is a no-op under pandas copy-on-write.
    adata.obs['SNPdemux'] = snp_calls.map(
        lambda call: donors_dict.get(call, call))
    return adata


# Check demux

In [None]:
# Load the raw loom file of sample C and attach both demultiplexing results.
data_path='../data/'
cdata = scv.read(data_path+'NB_AS_C.loom')
# Keep characters 25..-1 of every obs name (presumably strips a sample prefix
# and a trailing character so the names match the demux tables -- TODO confirm).
new_index = [index[25:-1] for index in cdata.obs.index]  # clean index names
cdata.obs_names=new_index
# Both helpers require the sample letter; calling them without it raises a
# TypeError.
apply_demux(cdata, 'C', data_path=data_path)
identify(cdata, 'C')

In [None]:
# C
# Quick check of the demuxed C sample: read the saved file, filter,
# normalise, embed and colour the UMAP by the SNP donor call.
adata = scv.read(data_path+'NB_AS_'+'C'+'_demuxed.h5')
sc.pp.filter_cells(adata, min_genes=10)  # keep cells with at least 10 genes
sc.pp.filter_genes(adata, min_cells=1000)  # keep genes seen in at least 1000 cells
sc.pp.normalize_total(adata)
scv.pp.neighbors(adata)
scv.tl.umap(adata)
scv.pl.scatter(adata, basis='umap', color='SNPdemux', show=False)

In [None]:
# E
# Same check as for C, on the demuxed E sample.
# Note: this rebinds the module-level `letter`.
letter='E'
adata = scv.read(data_path+'NB_AS_'+letter+'_demuxed.h5')
sc.pp.filter_cells(adata, min_genes=10)  # keep cells with at least 10 genes
sc.pp.filter_genes(adata, min_cells=1000)  # keep genes seen in at least 1000 cells
sc.pp.normalize_total(adata)
scv.pp.neighbors(adata)
scv.tl.umap(adata)
scv.pl.scatter(adata, basis='umap', color='SNPdemux', show=False)

In [None]:
# W
# Same check as for C, on the demuxed W sample.
adata = scv.read(data_path+'NB_AS_'+'W'+'_demuxed.h5')
sc.pp.filter_cells(adata, min_genes=10)  # keep cells with at least 10 genes
sc.pp.filter_genes(adata, min_cells=1000)  # keep genes seen in at least 1000 cells
sc.pp.normalize_total(adata)
scv.pp.neighbors(adata)
scv.tl.umap(adata)
scv.pl.scatter(adata, basis='umap', color='SNPdemux', show=False)

In [None]:
# Distinct labels of both demux columns, in order of first appearance.
seu = pd.unique(cdata.obs['seuratdemux'])
print(seu)
snp = pd.unique(cdata.obs['SNPdemux'])
print(snp)

#Align for plot
# NOTE(review): the permutations below are hard-coded; they assume exactly
# five labels per column in the order printed above -- re-check whenever the
# input data changes.
snp = snp[[3, 4, 0, 1, 2]]
print(snp)
seu = seu[[4, 3, 0, 2, 1]]
print(seu)

In [None]:
# Contingency table of the two demux results: rows follow `snp`, columns
# follow `seu`. The table is allocated as (rows, columns) so that it also
# works when the two label sets differ in length.
mat = np.zeros((len(snp), len(seu)))
for i, sn in enumerate(snp):
    in_snp = (cdata.obs['SNPdemux']==sn).to_numpy()
    for j, se in enumerate(seu):
        in_seu = (cdata.obs['seuratdemux']==se).to_numpy()
        mat[i,j] = np.count_nonzero(in_snp & in_seu)

# log(count + 1) keeps empty cells at zero while compressing large counts.
im = pl.imshow(np.log1p(mat))
pl.xticks(range(len(seu)), seu, rotation=45)
pl.yticks(range(len(snp)), snp, rotation=0)
pl.ylabel('SNP Demux')
pl.xlabel('seurat Demux')
# The title states what is actually plotted: log(counts + 1).
pl.title('Demux comparison\nshowing log(counts + 1)')
pl.colorbar(im)

In [None]:
# Same contingency table restricted to the first three labels of each
# ordering. Rows follow the SNP labels, columns the seurat labels.
snp_sub = snp[:3]
seu_sub = seu[:3]
mat = np.zeros((len(snp_sub), len(seu_sub)))
for i, sn in enumerate(snp_sub):
    in_snp = (cdata.obs['SNPdemux']==sn).to_numpy()
    for j, se in enumerate(seu_sub):
        in_seu = (cdata.obs['seuratdemux']==se).to_numpy()
        mat[i,j] = np.count_nonzero(in_snp & in_seu)

# log(count + 1) keeps empty cells at zero while compressing large counts.
im = pl.imshow(np.log1p(mat))
pl.xticks(range(len(seu_sub)), seu_sub, rotation=45)
pl.yticks(range(len(snp_sub)), snp_sub, rotation=0)
pl.ylabel('SNP Demux')
pl.xlabel('seurat Demux')
# The title states what is actually plotted: log(counts + 1).
pl.title('Demux comparison\nshowing log(counts + 1)')
pl.colorbar(im)

# Load, pp and save data

In [None]:
# Load the loom files of all three samples and attach the demux results.
data_path='../data/'
letters = ['C', 'E', 'W']
adatas = [scv.read(data_path+'NB_AS_'+letter+'.loom') for letter in letters]
# Both helpers modify each object in place; `cdata` and `letter` keep the
# values of the last iteration after the loop.
for cdata, letter in zip(adatas, letters): 
    new_index = [index[25:-1] for index in cdata.obs.index]  # clean index names
    cdata.obs_names=new_index
    apply_demux(cdata, letter)
    identify(cdata, letter)

In [None]:
# Display the object currently bound to `cdata`.
cdata

In [None]:
# Write every demuxed sample to ../data/NB_AS_<letter>_demuxed.h5.
for cdata, letter in zip(adatas, letters): 
    sc.write('../data/NB_AS_'+letter+'_demuxed.h5', cdata)

# Deprecated: velocity analysis (moved to the scvelo folder)

## load data

In [None]:
# Reload the demuxed C sample and preview its obs table.
cdata = scv.read('../data/NB_AS_C_demuxed.h5')
cdata.obs.head()

In [None]:
# subset
# Independent copy of the cells whose hashtag call is 'NCO'.
adata = cdata[cdata.obs['seuratdemux']=='NCO'].copy()

In [None]:
# Preprocess, embed and compute velocities for the NCO subset.
scv.pp.filter_and_normalize(adata, min_shared_counts=20, min_shared_cells=20)
sc.pp.pca(adata)
sc.pp.neighbors(adata)
sc.tl.umap(adata)
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)

# NOTE(review): `ct_annotate` is neither defined nor imported in the visible
# part of this notebook -- confirm where it comes from. It presumably writes
# the 'annot_test' column used by the plots below.
tab = pd.read_excel('regev-colon-mmc2.xlsx')
ct_annotate(adata, tab, show=False)

In [None]:
# UMAP coloured by the annotation column and by KRT20 and CD44.
scv.pl.scatter(adata, basis='umap', color=['annot_test', 'KRT20', 'CD44'])

In [None]:
# Velocity stream plots with the same three colourings.
scv.pl.velocity_embedding_stream(adata, color=['annot_test', 'KRT20', 'CD44'])

In [None]:
# Per-gene scatter plots for KRT20 and CD44, coloured by annotation.
scv.pl.scatter(adata, basis=['KRT20', 'CD44'], color='annot_test')

In [None]:
# Flag the 10 most highly variable genes.
sc.pp.highly_variable_genes(adata, n_top_genes=10)

In [None]:
# Preview the var rows flagged as highly variable.
adata.var[adata.var['highly_variable']].head()

In [None]:
# Preview the var rows flagged as velocity genes.
adata.var[adata.var['velocity_genes']].head()

In [None]:
# Plot the velocity genes in pages of 20; `k` is the start offset of the
# page (change the multiplier to move to another page).
velo_genes = adata.var_names[adata.var['velocity_genes']]
k=20*0
scv.pl.scatter(adata, basis=velo_genes[k:k+20], color='annot_test')

In [None]:
# Hand-picked gene panels, ordered from most to least interesting.
tier_0 = [  # most interesting IMO
    'RPS3', 'EPCAM', 'RPS21', 'TFF3', 'RPL30',
    'RPS4X', 'EMP1', 'ITM2B', 'LMO7', 'AGPAT2',
]
tier_1 = [
    'RPS8', 'RPL5', 'COL17A1', 'AKR1C1', 'FTH1', 'TSPAN8', 'PHGR1', 'B2M',
    'HSD17B2', 'KRT20', 'ITGB4', 'SELENOW', 'MYH14', 'RPS23', 'FABP6',
    'SLC39A4',
]
tier_2 = [
    'TMEM54', 'ANXA11', 'MYL6', 'KTN1', 'CA12', 'JUND', 'RABAC1',
    'FHL2', 'MYO7B', 'LIPH', 'FLNB', 'PLS1', 'EDF1',
]

## Runs on C

In [None]:
# Reload the demuxed C sample for the per-donor runs below.
cdata = scv.read('../data/NB_AS_C_demuxed.h5')
cdata.obs.head()

## p009ot

In [None]:
# pt009ot
# Cells whose hashtag call is 'p009ot'. The subset is copied so that the
# in-place preprocessing below operates on real data rather than on an
# AnnData view of `cdata`.
p009ot_adata = cdata[cdata.obs['seuratdemux']=='p009ot'].copy()
scv.pp.filter_and_normalize(p009ot_adata, min_shared_counts=20, min_shared_cells=20)
sc.pp.pca(p009ot_adata)
sc.pp.neighbors(p009ot_adata)
sc.tl.umap(p009ot_adata)

# NOTE(review): `ct_annotate` is neither defined nor imported in the visible
# part of this notebook -- confirm where it comes from.
tab = pd.read_excel('regev-colon-mmc2.xlsx')
ct_annotate(p009ot_adata, tab, show=False)

scv.tl.velocity(p009ot_adata)
scv.tl.velocity_graph(p009ot_adata)

scv.pl.velocity_embedding_stream(p009ot_adata, color='annot_test')

In [None]:
# Genes picked by visual inspection of the velocity genes, paging through
# them 20 at a time:
#   velo_genes = adata.var_names[adata.var['velocity_genes']]
#   k=20*2
#   scv.pl.scatter(adata, basis=velo_genes[k:k+20], color='annot_test')
p009ot_tier_0 = [
    'RCN1', 'RPS3', 'EMP1', 'PHGR1', 'HSD17B2',
    'EIF1', 'EPCAM', 'TFF3', 'LIPH', 'SNHG8',
    'RPS23', 'FABP6', 'CENPW', 'ABCB1', 'CA1',
    'LCN15', 'LCN2', 'RPS4X', 'FLNA', 'RPL10',
]

In [None]:
# Per-gene scatter plots of the selected p009ot genes, coloured by annotation.
scv.pl.scatter(p009ot_adata, basis=p009ot_tier_0, color='annot_test', use_raw=False)

## p013ot

In [None]:
# Cells whose hashtag call is 'p013ot'. The subset is copied so that the
# in-place preprocessing below operates on real data rather than on an
# AnnData view of `cdata`.
p013ot_adata = cdata[cdata.obs['seuratdemux']=='p013ot'].copy()
scv.pp.filter_and_normalize(p013ot_adata, min_shared_counts=20, min_shared_cells=20)
sc.pp.pca(p013ot_adata)
sc.pp.neighbors(p013ot_adata)
sc.tl.umap(p013ot_adata)

# NOTE(review): `ct_annotate` is neither defined nor imported in the visible
# part of this notebook -- confirm where it comes from.
tab = pd.read_excel('regev-colon-mmc2.xlsx')
ct_annotate(p013ot_adata, tab, show=False)

scv.tl.velocity(p013ot_adata)
scv.tl.velocity_graph(p013ot_adata)

scv.pl.velocity_embedding_stream(p013ot_adata, color='annot_test')

In [None]:
# Genes picked by visual inspection of the velocity genes, paging through
# them 20 at a time:
#   velo_genes = adata.var_names[adata.var['velocity_genes']]
#   k=20*1
#   scv.pl.scatter(adata, basis=velo_genes[k:k+20], color='annot_test')
pt013ot_tier_0 = [
    'MALAT1', 'PHGR1', 'EPCAM',
    'TNNC2', 'TFF3', 'AREG',
    'RPL30', 'FABP6', 'RPS4X',
]

In [None]:
# Per-gene scatter plots of the selected p013ot genes, coloured by annotation.
scv.pl.scatter(p013ot_adata, basis=pt013ot_tier_0, color='annot_test', use_raw=False)

## NCO

In [None]:
# NCO
# Cells whose hashtag call is 'NCO'. The subset is copied so that the
# in-place preprocessing below operates on real data rather than on an
# AnnData view of `cdata`.
NCO_adata = cdata[cdata.obs['seuratdemux']=='NCO'].copy()
scv.pp.filter_and_normalize(NCO_adata, min_shared_counts=20, min_shared_cells=20)
sc.pp.pca(NCO_adata)
sc.pp.neighbors(NCO_adata)
sc.tl.umap(NCO_adata)

# NOTE(review): `ct_annotate` is neither defined nor imported in the visible
# part of this notebook -- confirm where it comes from.
tab = pd.read_excel('regev-colon-mmc2.xlsx')
ct_annotate(NCO_adata, tab, show=False)

scv.tl.velocity(NCO_adata)
scv.tl.velocity_graph(NCO_adata)

scv.pl.velocity_embedding_stream(NCO_adata, color='annot_test')

In [None]:
# Genes picked by visual inspection of the velocity genes, paging through
# them 20 at a time:
#   velo_genes = adata.var_names[adata.var['velocity_genes']]
#   k=20*0
#   scv.pl.scatter(adata, basis=velo_genes[k:k+20], color='annot_test')

NCO_tier_0 = [  # most interesting IMO
    'RPS3', 'EPCAM', 'RPS21', 'TFF3', 'RPL30',
    'RPS4X', 'EMP1', 'ITM2B', 'LMO7', 'AGPAT2',
]
NCO_tier_1 = [
    'RPS8', 'RPL5', 'COL17A1', 'AKR1C1', 'FTH1', 'TSPAN8', 'PHGR1', 'B2M',
    'HSD17B2', 'KRT20', 'ITGB4', 'SELENOW', 'MYH14', 'RPS23', 'FABP6',
    'SLC39A4',
]
NCO_tier_2 = [
    'TMEM54', 'ANXA11', 'MYL6', 'KTN1', 'CA12', 'JUND', 'RABAC1',
    'FHL2', 'MYO7B', 'LIPH', 'FLNB', 'PLS1', 'EDF1',
]

In [None]:
# Per-gene scatter plots of the top-tier NCO genes, coloured by annotation.
scv.pl.scatter(NCO_adata, basis=NCO_tier_0, color='annot_test', use_raw=False)

## Comparison between donors

### Velocity genes

In [None]:
from matplotlib_venn import venn3
# Names of the genes flagged as velocity genes in each donor subset.
set1 = set(p009ot_adata.var_names[p009ot_adata.var['velocity_genes']])
set2 = set(p013ot_adata.var_names[p013ot_adata.var['velocity_genes']])
set3 = set(NCO_adata.var_names[NCO_adata.var['velocity_genes']])

# Three-way Venn diagram of the gene sets.
venn3([set1, set2, set3], ('p009ot', 'p013ot', 'NCO'))
pl.title('Velocity Genes')
pl.show()

In [None]:
# Velocity genes shared by all three donors. Sorting gives a reproducible
# column order (set iteration order is not stable across sessions).
com = sorted(set1 & set2 & set3)

if com:
    # squeeze=False keeps `axs` two-dimensional even when only one gene is
    # shared, so axs[i][j] is always valid.
    fig, axs = pl.subplots(3, len(com), figsize=[5*len(com), 15], squeeze=False)

    # One row per donor, one column per shared gene.
    for i, data in enumerate([p009ot_adata, p013ot_adata, NCO_adata]):
        for j, gene in enumerate(com):
            scv.pl.scatter(data, basis=gene, color='annot_test', show=False, ax=axs[i][j])
else:
    # pl.subplots cannot build a grid with zero columns.
    print('No velocity genes are shared by all three donors.')

## Split up lineages (NCO)

In [None]:
# NCO
# Cells whose hashtag call is 'NCO'. The subset is copied so that the
# in-place preprocessing below operates on real data rather than on an
# AnnData view of `cdata`.
NCO_adata = cdata[cdata.obs['seuratdemux']=='NCO'].copy()

scv.pp.filter_and_normalize(NCO_adata, min_shared_counts=20, min_shared_cells=20)

sc.pp.pca(NCO_adata)
sc.pp.neighbors(NCO_adata)
sc.tl.umap(NCO_adata)

# NOTE(review): `ct_annotate` is neither defined nor imported in the visible
# part of this notebook -- confirm where it comes from.
tab = pd.read_excel('regev-colon-mmc2.xlsx')
ct_annotate(NCO_adata, tab, show=False)

scv.pl.scatter(NCO_adata, color='annot_test')

In [None]:
# Velocities, velocity graph and stream plot for the full NCO subset.
scv.tl.velocity(NCO_adata)
scv.tl.velocity_graph(NCO_adata)
scv.pl.velocity_embedding_stream(NCO_adata, color='annot_test')

In [None]:
# Distinct annotation labels present in the NCO subset.
pd.unique(NCO_adata.obs['annot_test'])

In [None]:
# Split the NCO cells into two groups by annotation label. Both subsets are
# copied because the velocity computations run on them afterwards write
# results into the objects, which should not happen on AnnData views.
stem_lineage = ['Stem', 'Secretory TA']
NCO_adata_stem = NCO_adata[np.isin(NCO_adata.obs['annot_test'], stem_lineage)].copy()

scv.pl.scatter(NCO_adata_stem, color='annot_test')

diff_lineage = ['TA 2', 'Enterocytes', 'E.Epithelial']
NCO_adata_diff = NCO_adata[np.isin(NCO_adata.obs['annot_test'], diff_lineage)].copy()

scv.pl.scatter(NCO_adata_diff, color='annot_test')

In [None]:
# Velocities, velocity graph and stream plot for the stem-lineage subset.
scv.tl.velocity(NCO_adata_stem)
scv.tl.velocity_graph(NCO_adata_stem)
scv.pl.velocity_embedding_stream(NCO_adata_stem, color='annot_test')

In [None]:
# Velocities, velocity graph and stream plot for the differentiated subset.
scv.tl.velocity(NCO_adata_diff)
scv.tl.velocity_graph(NCO_adata_diff)
scv.pl.velocity_embedding_stream(NCO_adata_diff, color='annot_test')

## Dyn velo NCO full

In [None]:
# NCO
# Cells whose hashtag call is 'NCO'. The subset is copied so that the
# in-place preprocessing below operates on real data rather than on an
# AnnData view of `cdata`.
NCO_adata = cdata[cdata.obs['seuratdemux']=='NCO'].copy()
scv.pp.filter_and_normalize(NCO_adata, min_shared_counts=20, min_shared_cells=20)
sc.pp.pca(NCO_adata)
sc.pp.neighbors(NCO_adata)
sc.tl.umap(NCO_adata)

# NOTE(review): `ct_annotate` is neither defined nor imported in the visible
# part of this notebook -- confirm where it comes from.
tab = pd.read_excel('regev-colon-mmc2.xlsx')
ct_annotate(NCO_adata, tab, show=False)

In [None]:
# Stochastic-mode velocities, stored and plotted under the key 'stoch_velo'.
scv.tl.velocity(NCO_adata, vkey='stoch_velo', mode='stochastic')
scv.tl.velocity_graph(NCO_adata, vkey='stoch_velo')

scv.pl.velocity_embedding_stream(NCO_adata, color='annot_test', vkey='stoch_velo')

In [None]:
# Fit the dynamics; required before the dynamical-mode velocities below.
# NOTE(review): confirm that var_names='velocity_genes' selects the intended
# genes, given that the velocities above were stored under 'stoch_velo'.
scv.tl.recover_dynamics(NCO_adata, var_names='velocity_genes')

In [None]:
# Dynamical-mode velocities, stored and plotted under the key 'dyn_velo'.
scv.tl.velocity(NCO_adata, vkey='dyn_velo', mode='dynamical')
scv.tl.velocity_graph(NCO_adata, vkey='dyn_velo')

scv.pl.velocity_embedding_stream(NCO_adata, color='annot_test', vkey='dyn_velo')