In [None]:
import scvelo as scv
from IPython.display import clear_output
import matplotlib.backends.backend_pdf
from tqdm import tnrange, tqdm_notebook
import scanpy as sc
import matplotlib.pyplot as pl
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML
# Inject a CSS rule so the notebook container uses 90% of the page width.
display(HTML("<style>.container { width:90% !important; }</style>"))
%matplotlib inline


# Print the scvelo version information so the run is traceable.
scv.logging.print_version()
# Use the 'scvelo' figure preset: 80 dpi on screen, 100 dpi when saving, transparent background.
scv.settings.set_figure_params(
    'scvelo', dpi_save=100, dpi=80, transparent=True)
# scvelo logging verbosity level used throughout the notebook.
scv.settings.verbosity = 2

In [None]:
# Input locations (absolute paths on the analysis machine).
data_path = 'G:/data/scSLAMseq/revision/'
signatures_path = 'G:/data/scrnaseq_signature_collection/'

# Identifiers of the sequencing libraries.
libraries = [
    'AB',
    'CE',
    'DF',
]

# Identifiers of the organoid donors; files are named after these.
donors = [
    'B2-040',
    'C2-019',
    'OT227',
    'OT302',
    'P009T',
    'P013T',
]

# Manual

In [None]:
# Load preformatted and filtered data (not normalized and not log-scaled).
# The manual exploration below works on the last entry of `donors`.
donor = donors[-1]
adata=sc.read(data_path+'by_donors/SLAM_'+donor+'.h5')
# Disabled filter: would drop cells whose perturbation is DMSO or BRAF.
#adata = adata[~np.isin(adata.obs.perturbation, ['DMSO', 'BRAF'])].copy()

In [None]:
# Expose the 'new' and 'old' layers under the names 'unspliced' and 'spliced'
# (presumably because the scvelo calls below look for those two layer names — confirm).
adata.layers['unspliced']=adata.layers['new']
adata.layers['spliced']=adata.layers['old']
# Plot the proportions of the 'old' and 'new' layers.
scv.pl.proportions(adata, dpi=200, layers=['old', 'new'])

In [None]:
# Normalize and log-transform `adata` in place.
# NOTE(review): a scanpy and a scvelo normalization are applied back to back;
# presumably the scvelo call is there for the spliced/unspliced layers — confirm this is intended.
sc.pp.normalize_total(adata)
scv.pp.normalize_per_cell(adata)
# Disabled: restriction to the 2000 top-dispersion genes.
#scv.pp.filter_genes_dispersion(adata, n_top_genes=2000)
scv.pp.log1p(adata)

In [None]:
# Annotations: score cell cycle and gene signatures; results are written to adata.obs.

# single genes of interest from Markus Morkel
single_genes = ['LGR5', 'OLFM4', 'TFF3', 'FABP1', 'EPHB2', 'AXIN1', 'AXIN2', 'EGR1']

# Silence scanpy while scoring; the previous level is restored even if a step fails.
k = sc.settings.verbosity
sc.settings.verbosity = 0
try:
    # cc score
    # Read the gene list through a context manager so the file handle is closed.
    with open(signatures_path+'cell_cycle_genes/regev_lab_cell_cycle_genes.txt') as handle:
        cell_cycle_genes = [x.strip() for x in handle]
    # The first 43 lines are used as S-phase genes, the remainder as G2/M genes.
    s_genes = cell_cycle_genes[:43]
    g2m_genes = cell_cycle_genes[43:]
    # Genes from the list that are present in this dataset (not used for the scoring call below).
    cell_cycle_genes = [x for x in cell_cycle_genes if x in adata.var_names]
    adata.obs_names_make_unique()
    sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
    # is it a problem that I score signatures before regressing out?
    #sc.pp.regress_out(adata, ['S_score', 'G2M_score'])
    # adata = adata[adata.obs.phase=='G1'].copy()

    # Stem sig
    tab=pd.read_excel(signatures_path+'cell_type_markers/CRC-related_stem_cell_signatures.xlsx', header=0)
    tab = tab.drop(0)  # drop the row with index label 0
    # One gene list per column, with missing entries removed; keys get a 'Stem_' prefix.
    sigs = {'Stem_'+x: list(tab[x][~pd.isna(tab[x])].values) for x in tab.columns}
    for ct in ['Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos']:  #sigs.keys():
        sc.tl.score_genes(adata, sigs[ct], score_name=ct)

    # Flo sig
    tab=pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers.xlsx', header=1)
    # One gene list per value of the 'cell_type_epi' column.
    flo_sigs={x: list(tab[tab['cell_type_epi']==x].gene.values) for x in pd.unique(tab['cell_type_epi'])}
    for ct in ['Stem', 'Enterocytes 1', 'Enterocytes 2', 'TC1', 'TC4', 'Goblet']:  #flo_sigs.keys():
        sc.tl.score_genes(adata, flo_sigs[ct], score_name=ct)

    # YAP target from Markus Morkel, Liberali Lab
    yap_targets = ['CTGF', 'GGTA1', 'WWC2', 'ANXA8', 'CLU', 'CXCL16', 'IL33', 'LY6A', 'LY6C1', 'MSLN', 'TNFRSF12A', 'CTGF', 'GGTA1', 'WWC2', 'ANXA5', 'TACSTD2', 'ANXA10', 'EREG', 'IL33', 'ANXA1', 'ANXA3']
    sc.tl.score_genes(adata, yap_targets, score_name='YAP_targets')
finally:
    sc.settings.verbosity = k

In [None]:
# PCA and neighbor graph on all genes.
scv.pp.pca(adata)
scv.pp.neighbors(adata)
# scv.tl.umap(adata)

# umap on 2000 HVGs
# The UMAP is computed on a filtered copy (`bdata`) and only its coordinates are
# copied back, so `adata` keeps all genes.
bdata=scv.pp.filter_genes_dispersion(adata, n_top_genes=2000, copy=True)
scv.pp.pca(bdata)
scv.pp.neighbors(bdata)
scv.tl.umap(bdata)
adata.obsm['X_umap']=bdata.obsm['X_umap']
del bdata

# Moments, velocity and velocity graph on the full object.
scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
scv.tl.velocity(adata)
scv.tl.velocity_graph(adata)

In [None]:
# Scatter plot of the cells coloured by perturbation.
scv.pl.scatter(adata, color='perturbation')

In [None]:
# adata.obs keys to colour by: signature scores, cell-cycle phase, QC metrics and perturbation.
# NOTE(review): later cells reuse the name `color` as a loop variable, which rebinds this list.
color=[
    'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 'Enterocytes 1', 'Enterocytes 2', 'TC1', 'TC4', 'Goblet',
    'phase', 'percent_ribosomal', 'percent.mt', 'perturbation', 'YAP_targets'
]

# One velocity stream panel per key, four panels per row.
scv.pl.velocity_embedding_stream(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4)

In [None]:
# Draw the coloured scatter panels first (show=False keeps the axes open) ...
axs=scv.pl.scatter(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4, show=False, s=100)
# ... then overlay per-cell velocity arrows on the same axes; size=0 is passed so only arrows are added.
axs=scv.pl.velocity_embedding(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4, show=False, ax=axs, size=0, arrow_length=5, arrow_size=5)

In [None]:
# restrict KNN connectivities to within perturbations only
from scipy.sparse import csr_matrix
adata = adata[np.argsort(adata.obs.perturbation)].copy()  # sort by perturbation
# Dense copy of the connectivity matrix (cells x cells).
A=adata.obsp['connectivities'].toarray()
# Show the matrix before masking.
pl.imshow(A*5000, vmax=0.1)
pl.show()
# Zero every entry that connects two cells with different perturbation labels.
# A label-equality mask covers every cell of every group; the previous
# `np.min(a):np.max(a)` slice excluded the last cell of each group, which
# therefore kept its cross-perturbation edges.
labels = np.asarray(adata.obs.perturbation)
same_perturbation = labels[:, None] == labels[None, :]
A[~same_perturbation] = 0
adata.obsp['connectivities'] = csr_matrix(A)
# Show the matrix after masking.
pl.imshow(A*5000, vmax=0.1)

In [None]:
# Recompute the moments after the connectivities were replaced in the previous cell.
scv.pp.moments(adata)

In [None]:
# Compute velocity with groups restricted to a single perturbation, then rebuild the velocity graph.
pert = 'DMSO'
scv.tl.velocity(adata, groupby='perturbation', groups=[pert])
scv.tl.velocity_graph(adata)

In [None]:
# This convenient plot only works if color is 'perturbation':
# `groups=[pert]` selects the category to display.
scv.pl.velocity_embedding_stream(adata, basis='umap', color='perturbation', legend_loc='right_margin', perc=[1,99], ncols=4, groups=[pert])

In [None]:
# One panel per key in `colors`: all other cells in grey, cells of `pert`
# coloured by the key with velocity arrows on a grid.
colors=['phase', 'Stem'
    #'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 'Enterocytes 1', 'Enterocytes 2', 'TC1', 'TC4', 'Goblet',
    #'phase', 'percent_ribosomal', 'percent.mt', 'perturbation', 'YAP_targets'
]
n=len(colors)
# Invisible scatter (alpha=0) of all cells: creates one axis per title so every
# panel shares the extent of the full embedding.
axs=scv.pl.scatter(adata, basis='umap', title=list(np.array(np.arange(0,n), dtype=str)), ncols=4, show=False, size=250, alpha=0, dpi=200)
# The two subsets do not depend on the panel, so build them once; explicit
# copies (as in the EGFR cell) avoid plotting on AnnData views.
other_cells = adata[adata.obs.perturbation!=pert].copy()
pert_cells = adata[adata.obs.perturbation==pert].copy()
# The loop variable is deliberately not called `color`, so the notebook-level
# `color` list used by other cells is not overwritten.
for ax, color_key in zip(axs, colors):
    scv.pl.scatter(other_cells, basis='umap', color='grey', show=False, size=250, alpha=0.07, dpi=200, ax=ax)
    scv.pl.velocity_embedding_grid(pert_cells, basis='umap', size=250, color=color_key, legend_loc='right_margin', perc=[1,99],
                                   ncols=4, show=False, ax=ax, arrow_size=5, arrow_length=5, arrow_color='k', density=0.4, dpi=200, min_mass=20, alpha=.4)
# fig=pl.gcf()
# pl.savefig("test.png")

In [None]:
# Same panel layout as for DMSO, with velocity computed for the EGFR perturbation.
pert='EGFR'

scv.tl.velocity(adata, groupby='perturbation', groups=[pert])
scv.tl.velocity_graph(adata)

colors=['phase', 'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 
        # 'Enterocytes 1', 'Enterocytes 2', 'TC1', 'TC4', 'Goblet',
    #'phase', 'percent_ribosomal', 'percent.mt', 'perturbation', 'YAP_targets'
]
n=len(colors)
# Invisible scatter (alpha=0) of all cells: creates one axis per title with the full embedding extent.
axs=scv.pl.scatter(adata, basis='umap', title=list(np.array(np.arange(0,n), dtype=str)), ncols=4, show=False, size=250, alpha=0)
# Copy each subset once instead of twice per panel inside the loop.
other_cells = adata[adata.obs.perturbation!=pert].copy()
pert_cells = adata[adata.obs.perturbation==pert].copy()
# The loop variable is deliberately not called `color`, so the notebook-level
# `color` list used by other cells is not overwritten.
for ax, color_key in zip(axs, colors):
    scv.pl.scatter(other_cells, basis='umap', color='grey', show=False, size=250, alpha=0.07, ax=ax)
    scv.pl.velocity_embedding_grid(pert_cells, basis='umap', size=250, color=color_key, legend_loc='right_margin', perc=[1,99],
                                   ncols=4, show=False, ax=ax, arrow_size=5, arrow_length=5, arrow_color='k', density=0.4, min_mass=20, alpha=.4)
#fig=pl.gcf()
#pl.savefig("test.png")

In [None]:
# For the first two perturbations: recompute velocity for that perturbation,
# draw one panel per key in `colors`, and save the figure as '<donor>_<pert>.png'.
# Slice the unique labels, not the cells: `perturbation[:2]` only looked at the
# first two cells, which usually share one label.
for pert in pd.unique(adata.obs.perturbation)[:2]:
    scv.tl.velocity(adata, groupby='perturbation', groups=[pert])
    scv.tl.velocity_graph(adata)

    colors=['phase', 'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 
            # 'Enterocytes 1', 'Enterocytes 2', 'TC1', 'TC4', 'Goblet',
        #'phase', 'percent_ribosomal', 'percent.mt', 'perturbation', 'YAP_targets'
    ]
    n=len(colors)
    # Invisible scatter (alpha=0) of all cells: creates one axis per title with the full embedding extent.
    axs=scv.pl.scatter(adata, basis='umap', title=list(np.array(np.arange(0,n), dtype=str)), ncols=4, show=False, size=250, alpha=0)
    bdata = adata[adata.obs.perturbation==pert].copy()
    # Grey background = cells of all other perturbations, as in the DMSO and
    # EGFR cells; previously the selected cells were drawn underneath themselves.
    other_cells = adata[adata.obs.perturbation!=pert].copy()
    # The loop variable is deliberately not called `color`, so the notebook-level
    # `color` list used by other cells is not overwritten.
    for ax, color_key in zip(axs, colors):
        scv.pl.scatter(other_cells, basis='umap', color='grey', show=False, size=250, alpha=0.07, ax=ax)
        scv.pl.velocity_embedding_grid(bdata, basis='umap', size=250, color=color_key, legend_loc='right_margin', perc=[1,99],
                                       ncols=4, show=False, ax=ax, arrow_size=5, arrow_length=5, arrow_color='k', density=0.4, min_mass=20, alpha=.4, title=donor+' '+pert+' '+color_key)
    # Save before showing: once the figure has been shown, the current figure
    # is a new empty one and the PNG would be blank.
    fig=pl.gcf()
    fig.savefig(donor+'_'+pert+".png")
    pl.show()

In [None]:
# Per-cell velocity arrows, coloured by perturbation, with groups restricted to DMSO and BRAF.
scv.pl.velocity_embedding(adata, basis='umap', color='perturbation', legend_loc='right_margin', perc=[1,99], ncols=4, groups=['DMSO', 'BRAF'], size=200, arrow_size=200, arrow_length=7)

In [None]:
# Grid-averaged velocity arrows (black), coloured by perturbation, with groups restricted to DMSO and BRAF.
scv.pl.velocity_embedding_grid(adata, basis='umap', color='perturbation', legend_loc='right_margin', perc=[1,99], ncols=4, groups=['DMSO', 'BRAF'], size=200, arrow_size=5, arrow_length=5, density=0.2, min_mass=20, arrow_color='k')

### debug h5

In [None]:
# Load preformatted and filtered data (not normalized and not log-scaled).
# Debug: first donor, file from the local 'by_donors' folder.
donor = donors[0]
adata=sc.read(data_path+'by_donors/SLAM_'+donor+'.h5')

In [None]:
# Display the summary of the locally stored object.
adata

In [None]:
# Load preformatted and filtered data (not normalized and not log-scaled).
# Debug: same donor, file from the 'from_cluster' subfolder, kept in `cdata` for comparison.
donor = donors[0]
cdata=sc.read(data_path+'by_donors/from_cluster/SLAM_'+donor+'.h5')  # from cluster

In [None]:
# Display the summary of the object loaded from the 'from_cluster' folder.
cdata

In [None]:
# Debug: load the preprocessed file of the same donor into `xdata`.
# NOTE(review): the file name is built as 'SLAM_<donor>preprocessed.h5', with no
# separator between the donor id and 'preprocessed' — confirm it matches the file on disk.
donor = donors[0]
xdata=sc.read(data_path+'by_donors/SLAM_'+donor+'preprocessed.h5')

In [None]:
# Display the summary of the preprocessed object.
xdata

## Build plotting function

In [None]:
def plot(adata, donor, subset=None, vsubset=None, single_genes=None, singles_too=False):
    """Preprocess one donor, compute velocities, score signatures and draw stream plots.

    Parameters
    ----------
    adata
        AnnData with 'new' and 'old' layers and a `perturbation` column in `.obs`;
        the `.obs` keys listed in `color` below must be available for plotting.
        When `subset` is None the object is modified in place; otherwise a
        subsetted copy is processed and the caller's object is left untouched.
    donor
        Donor label, used as a prefix in the panel titles.
    subset
        Perturbation name or list of names to restrict the cells to (default: all cells).
    vsubset
        Perturbation name or list of names passed as `groups` to `scv.tl.velocity`
        (default: velocity is computed without grouping).
    single_genes
        Gene name or list of gene names for a second stream plot, one panel per
        gene. The plot is skipped when this is None or empty.
    singles_too
        If True, additionally run `scv.tl.velocity` once per perturbation.

    Returns
    -------
    None

    Notes
    -----
    Reads the signature files below the notebook-level `signatures_path`.
    """
    subset = [subset] if isinstance(subset, str) else subset
    vsubset = [vsubset] if isinstance(vsubset, str) else vsubset
    single_genes = [single_genes] if isinstance(single_genes, str) else single_genes

    if subset is not None:
        adata = adata[np.isin(adata.obs.perturbation, subset)].copy()
    adata.layers['unspliced']=adata.layers['new']
    adata.layers['spliced']=adata.layers['old']

    # prepare
    sc.pp.normalize_total(adata)
    scv.pp.normalize_per_cell(adata)
    scv.pp.log1p(adata)
    scv.pp.pca(adata)
    scv.pp.neighbors(adata)

    # umap on 2000 HVGs: computed on a filtered copy, only the coordinates are kept
    bdata=scv.pp.filter_genes_dispersion(adata, n_top_genes=2000, copy=True)
    scv.pp.pca(bdata)
    scv.pp.neighbors(bdata)
    scv.tl.umap(bdata)
    adata.obsm['X_umap']=bdata.obsm['X_umap']
    del bdata

    # velocity
    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    if vsubset is not None:
        scv.tl.velocity(adata, groupby='perturbation', groups=vsubset)
    else:
        scv.tl.velocity(adata)
    scv.tl.velocity_graph(adata)

    # Annotations
    # Silence scanpy while scoring; the previous level is restored even if a step fails.
    k = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        # Stem sig
        tab=pd.read_excel(signatures_path+'cell_type_markers/CRC-related_stem_cell_signatures.xlsx', header=0)
        tab = tab.drop(0)  # drop the row with index label 0
        sigs = {'Stem_'+x: list(tab[x][~pd.isna(tab[x])].values) for x in tab.columns}
        for ct in ['Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos']:  #sigs.keys():
            sc.tl.score_genes(adata, sigs[ct], score_name=ct)

        # Flo sig
        tab=pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers.xlsx', header=1)
        flo_sigs={x: list(tab[tab['cell_type_epi']==x].gene.values) for x in pd.unique(tab['cell_type_epi'])}
        for ct in ['Stem', 'Enterocytes 1', 'Enterocytes 2', 'TC1', 'TC4', 'Goblet']:  #flo_sigs.keys():
            sc.tl.score_genes(adata, flo_sigs[ct], score_name=ct)

        # cc score
        # Read through a context manager so the file handle is closed.
        with open(signatures_path+'cell_cycle_genes/regev_lab_cell_cycle_genes.txt') as handle:
            cell_cycle_genes = [x.strip() for x in handle]
        # The first 43 lines are used as S-phase genes, the remainder as G2/M genes.
        s_genes = cell_cycle_genes[:43]
        g2m_genes = cell_cycle_genes[43:]
        adata.obs_names_make_unique()
        sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)

        # YAP target from Markus Morkel, Liberali Lab
        yap_targets = ['CTGF', 'GGTA1', 'WWC2', 'ANXA8', 'CLU', 'CXCL16', 'IL33', 'LY6A', 'LY6C1', 'MSLN', 'TNFRSF12A', 'CTGF', 'GGTA1', 'WWC2', 'ANXA5', 'TACSTD2', 'ANXA10', 'EREG', 'IL33', 'ANXA1', 'ANXA3']
        sc.tl.score_genes(adata, yap_targets, score_name='YAP_targets')
    finally:
        sc.settings.verbosity = k

    color=[
        'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 'Enterocytes 1', 'Enterocytes 2', 'TC1', 'TC4', 'Goblet',
        'phase', 'percent_ribosomal', 'percent.mt', 'perturbation', 'YAP_targets'
    ]

    scv.pl.velocity_embedding_stream(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4, title=[donor+' '+c for c in color])
    # Single-gene panels: titles come from the genes themselves (they were taken
    # from `color` before), and the plot is skipped when no genes were requested.
    if single_genes is not None and len(single_genes) > 0:
        scv.pl.velocity_embedding_stream(adata, basis='umap', color=single_genes, legend_loc='right_margin', perc=[1,99], ncols=4, title=[donor+' '+g for g in single_genes])

    if singles_too:
        # NOTE(review): each iteration overwrites the previous velocity and nothing
        # is plotted or stored per perturbation — this branch looks unfinished.
        for perturbation in pd.unique(adata.obs.perturbation):
            # velocity
            scv.tl.velocity(adata, groupby='perturbation', groups=perturbation)

In [None]:
# Reload the raw object of the last donor as input for plot().
donor = donors[-1]
adata=sc.read(data_path+'by_donors/SLAM_'+donor+'.h5')

In [None]:
# Run the full pipeline on all cells. No `subset` is given, so plot() processes `adata` in place.
plot(adata, donor, single_genes=single_genes)

In [None]:
# Stream plots on the object that plot() processed in place, with groups restricted to DMSO.
# The keys are listed here explicitly: the notebook-level `color` is rebound to
# a single string by the `for ax, color in ...` loops of earlier cells, and the
# list inside plot() is local to that function.
color=[
    'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 'Enterocytes 1', 'Enterocytes 2', 'TC1', 'TC4', 'Goblet',
    'phase', 'percent_ribosomal', 'percent.mt', 'perturbation', 'YAP_targets'
]
scv.pl.velocity_embedding_stream(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4, title=[donor+' '+c for c in color], groups=['DMSO'])

# Systematic

## All perturbations

In [None]:
# Systematic run over all donors, using every perturbation.

# The signature tables and gene lists do not depend on the donor, so they are
# loaded once here instead of being re-read from disk on every iteration.
# Stem sig
tab=pd.read_excel(signatures_path+'cell_type_markers/CRC-related_stem_cell_signatures.xlsx', header=0)
tab = tab.drop(0)  # drop the row with index label 0
sigs = {'Stem_'+x: list(tab[x][~pd.isna(tab[x])].values) for x in tab.columns}
# Flo sig
tab=pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers.xlsx', header=1)
flo_sigs={x: list(tab[tab['cell_type_epi']==x].gene.values) for x in pd.unique(tab['cell_type_epi'])}
# cc genes: read through a context manager so the file handle is closed.
with open(signatures_path+'cell_cycle_genes/regev_lab_cell_cycle_genes.txt') as handle:
    all_cell_cycle_genes = [x.strip() for x in handle]
# The first 43 lines are used as S-phase genes, the remainder as G2/M genes.
s_genes = all_cell_cycle_genes[:43]
g2m_genes = all_cell_cycle_genes[43:]

# adata.obs keys shown as panels for every donor.
color=[
    'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 'Enterocytes 2',
    'phase', 'percent_ribosomal', 'percent.mt', 'perturbation'
]

for donor in tqdm_notebook(donors):
    # load preformatted and filtered data (not normalized and not log-scaled)
    adata=sc.read(data_path+'by_donors/SLAM_'+donor+'.h5')
    #adata = adata[~np.isin(adata.obs.perturbation, ['DMSO', 'BRAF'])].copy()
    adata.layers['unspliced']=adata.layers['new']
    adata.layers['spliced']=adata.layers['old']
    #scv.pl.proportions(adata, dpi=200, layers=['old', 'new'])
    sc.pp.normalize_total(adata)
    scv.pp.normalize_per_cell(adata)
    #scv.pp.filter_genes_dispersion(adata, n_top_genes=2000)
    scv.pp.log1p(adata)
    scv.pp.pca(adata)
    scv.pp.neighbors(adata)
    # scv.tl.umap(adata)

    # umap on 2000 HVGs: computed on a filtered copy, only the coordinates are kept
    bdata=scv.pp.filter_genes_dispersion(adata, n_top_genes=2000, copy=True)
    scv.pp.pca(bdata)
    scv.pp.neighbors(bdata)
    scv.tl.umap(bdata)
    adata.obsm['X_umap']=bdata.obsm['X_umap']
    del bdata

    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    scv.tl.velocity(adata)
    scv.tl.velocity_graph(adata)

    # Annotations
    # Silence scanpy while scoring; the previous level is restored even if a step fails.
    k = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        for ct, genes in sigs.items():
            sc.tl.score_genes(adata, genes, score_name=ct)
        for ct, genes in flo_sigs.items():
            sc.tl.score_genes(adata, genes, score_name=ct)
        # Genes from the list that are present in this donor (not used for the scoring call below).
        cell_cycle_genes = [x for x in all_cell_cycle_genes if x in adata.var_names]
        adata.obs_names_make_unique()
        sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
    finally:
        sc.settings.verbosity = k

    scv.pl.velocity_embedding_stream(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4, title=[donor+' '+c for c in color])

In [None]:
# Display the summary of the object left over from the last loop iteration.
adata

## Everything with MEK or EGFR

In [None]:
# Systematic run over all donors, excluding the DMSO and BRAF perturbations.

# The signature tables and gene lists do not depend on the donor, so they are
# loaded once here instead of being re-read from disk on every iteration.
# Stem sig
tab=pd.read_excel(signatures_path+'cell_type_markers/CRC-related_stem_cell_signatures.xlsx', header=0)
tab = tab.drop(0)  # drop the row with index label 0
sigs = {'Stem_'+x: list(tab[x][~pd.isna(tab[x])].values) for x in tab.columns}
# Flo sig
tab=pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers.xlsx', header=1)
flo_sigs={x: list(tab[tab['cell_type_epi']==x].gene.values) for x in pd.unique(tab['cell_type_epi'])}
# cc genes: read through a context manager so the file handle is closed.
with open(signatures_path+'cell_cycle_genes/regev_lab_cell_cycle_genes.txt') as handle:
    all_cell_cycle_genes = [x.strip() for x in handle]
# The first 43 lines are used as S-phase genes, the remainder as G2/M genes.
s_genes = all_cell_cycle_genes[:43]
g2m_genes = all_cell_cycle_genes[43:]

# adata.obs keys shown as panels for every donor.
color=[
    'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 'Enterocytes 2',
    'phase', 'percent_ribosomal', 'percent.mt', 'perturbation'
]

for donor in tqdm_notebook(donors):
    # load preformatted and filtered data (not normalized and not log-scaled)
    adata=sc.read(data_path+'by_donors/SLAM_'+donor+'.h5')
    # Drop the cells whose perturbation is DMSO or BRAF.
    adata = adata[~np.isin(adata.obs.perturbation, ['DMSO', 'BRAF'])].copy()
    adata.layers['unspliced']=adata.layers['new']
    adata.layers['spliced']=adata.layers['old']
    #scv.pl.proportions(adata, dpi=200, layers=['old', 'new'])
    sc.pp.normalize_total(adata)
    scv.pp.normalize_per_cell(adata)
    #scv.pp.filter_genes_dispersion(adata, n_top_genes=2000)
    scv.pp.log1p(adata)
    scv.pp.pca(adata)
    scv.pp.neighbors(adata)
    # scv.tl.umap(adata)

    # umap on 2000 HVGs: computed on a filtered copy, only the coordinates are kept
    bdata=scv.pp.filter_genes_dispersion(adata, n_top_genes=2000, copy=True)
    scv.pp.pca(bdata)
    scv.pp.neighbors(bdata)
    scv.tl.umap(bdata)
    adata.obsm['X_umap']=bdata.obsm['X_umap']
    del bdata

    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    scv.tl.velocity(adata)
    scv.tl.velocity_graph(adata)

    # Annotations
    # Silence scanpy while scoring; the previous level is restored even if a step fails.
    k = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        for ct, genes in sigs.items():
            sc.tl.score_genes(adata, genes, score_name=ct)
        for ct, genes in flo_sigs.items():
            sc.tl.score_genes(adata, genes, score_name=ct)
        # Genes from the list that are present in this donor (not used for the scoring call below).
        cell_cycle_genes = [x for x in all_cell_cycle_genes if x in adata.var_names]
        adata.obs_names_make_unique()
        sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
    finally:
        sc.settings.verbosity = k

    scv.pl.velocity_embedding_stream(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4, title=[donor+' '+c for c in color])

## Everything without MEK and without EGFR

In [None]:
# Systematic run over all donors, keeping only the DMSO and BRAF perturbations.

# The signature tables and gene lists do not depend on the donor, so they are
# loaded once here instead of being re-read from disk on every iteration.
# Stem sig
tab=pd.read_excel(signatures_path+'cell_type_markers/CRC-related_stem_cell_signatures.xlsx', header=0)
tab = tab.drop(0)  # drop the row with index label 0
sigs = {'Stem_'+x: list(tab[x][~pd.isna(tab[x])].values) for x in tab.columns}
# Flo sig
tab=pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers.xlsx', header=1)
flo_sigs={x: list(tab[tab['cell_type_epi']==x].gene.values) for x in pd.unique(tab['cell_type_epi'])}
# cc genes: read through a context manager so the file handle is closed.
with open(signatures_path+'cell_cycle_genes/regev_lab_cell_cycle_genes.txt') as handle:
    all_cell_cycle_genes = [x.strip() for x in handle]
# The first 43 lines are used as S-phase genes, the remainder as G2/M genes.
s_genes = all_cell_cycle_genes[:43]
g2m_genes = all_cell_cycle_genes[43:]

# adata.obs keys shown as panels for every donor.
color=[
    'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 'Enterocytes 2',
    'phase', 'percent_ribosomal', 'percent.mt', 'perturbation'
]

for donor in tqdm_notebook(donors):
    # load preformatted and filtered data (not normalized and not log-scaled)
    adata=sc.read(data_path+'by_donors/SLAM_'+donor+'.h5')
    # Keep only the cells whose perturbation is DMSO or BRAF.
    adata = adata[np.isin(adata.obs.perturbation, ['DMSO', 'BRAF'])].copy()
    adata.layers['unspliced']=adata.layers['new']
    adata.layers['spliced']=adata.layers['old']
    #scv.pl.proportions(adata, dpi=200, layers=['old', 'new'])
    sc.pp.normalize_total(adata)
    scv.pp.normalize_per_cell(adata)
    #scv.pp.filter_genes_dispersion(adata, n_top_genes=2000)
    scv.pp.log1p(adata)
    scv.pp.pca(adata)
    scv.pp.neighbors(adata)
    # scv.tl.umap(adata)

    # umap on 2000 HVGs: computed on a filtered copy, only the coordinates are kept
    bdata=scv.pp.filter_genes_dispersion(adata, n_top_genes=2000, copy=True)
    scv.pp.pca(bdata)
    scv.pp.neighbors(bdata)
    scv.tl.umap(bdata)
    adata.obsm['X_umap']=bdata.obsm['X_umap']
    del bdata

    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    scv.tl.velocity(adata)
    scv.tl.velocity_graph(adata)

    # Annotations
    # Silence scanpy while scoring; the previous level is restored even if a step fails.
    k = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        for ct, genes in sigs.items():
            sc.tl.score_genes(adata, genes, score_name=ct)
        for ct, genes in flo_sigs.items():
            sc.tl.score_genes(adata, genes, score_name=ct)
        # Genes from the list that are present in this donor (not used for the scoring call below).
        cell_cycle_genes = [x for x in all_cell_cycle_genes if x in adata.var_names]
        adata.obs_names_make_unique()
        sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
    finally:
        sc.settings.verbosity = k

    scv.pl.velocity_embedding_stream(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4, title=[donor+' '+c for c in color])

## DMSO only

In [None]:
# Systematic run over all donors, keeping only the DMSO cells.

# The signature tables and gene lists do not depend on the donor, so they are
# loaded once here instead of being re-read from disk on every iteration.
# Stem sig
tab=pd.read_excel(signatures_path+'cell_type_markers/CRC-related_stem_cell_signatures.xlsx', header=0)
tab = tab.drop(0)  # drop the row with index label 0
sigs = {'Stem_'+x: list(tab[x][~pd.isna(tab[x])].values) for x in tab.columns}
# Flo sig
tab=pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers.xlsx', header=1)
flo_sigs={x: list(tab[tab['cell_type_epi']==x].gene.values) for x in pd.unique(tab['cell_type_epi'])}
# cc genes: read through a context manager so the file handle is closed.
with open(signatures_path+'cell_cycle_genes/regev_lab_cell_cycle_genes.txt') as handle:
    all_cell_cycle_genes = [x.strip() for x in handle]
# The first 43 lines are used as S-phase genes, the remainder as G2/M genes.
s_genes = all_cell_cycle_genes[:43]
g2m_genes = all_cell_cycle_genes[43:]

# adata.obs keys shown as panels for every donor.
color=[
    'Stem_Lgr5_ISC-Munoz', 'Stem_Lgr5_ISC-Merlos', 'Stem', 'Enterocytes 2',
    'phase', 'percent_ribosomal', 'percent.mt', 'perturbation'
]

for donor in tqdm_notebook(donors):
    # load preformatted and filtered data (not normalized and not log-scaled)
    adata=sc.read(data_path+'by_donors/SLAM_'+donor+'.h5')
    # Keep only the cells whose perturbation is DMSO.
    adata = adata[np.isin(adata.obs.perturbation, ['DMSO'])].copy()
    adata.layers['unspliced']=adata.layers['new']
    adata.layers['spliced']=adata.layers['old']
    #scv.pl.proportions(adata, dpi=200, layers=['old', 'new'])
    sc.pp.normalize_total(adata)
    scv.pp.normalize_per_cell(adata)
    #scv.pp.filter_genes_dispersion(adata, n_top_genes=2000)
    scv.pp.log1p(adata)
    scv.pp.pca(adata)
    scv.pp.neighbors(adata)
    # scv.tl.umap(adata)

    # umap on 2000 HVGs: computed on a filtered copy, only the coordinates are kept
    bdata=scv.pp.filter_genes_dispersion(adata, n_top_genes=2000, copy=True)
    scv.pp.pca(bdata)
    scv.pp.neighbors(bdata)
    scv.tl.umap(bdata)
    adata.obsm['X_umap']=bdata.obsm['X_umap']
    del bdata

    scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    scv.tl.velocity(adata)
    scv.tl.velocity_graph(adata)

    # Annotations
    # Silence scanpy while scoring; the previous level is restored even if a step fails.
    k = sc.settings.verbosity
    sc.settings.verbosity = 0
    try:
        for ct, genes in sigs.items():
            sc.tl.score_genes(adata, genes, score_name=ct)
        for ct, genes in flo_sigs.items():
            sc.tl.score_genes(adata, genes, score_name=ct)
        # Genes from the list that are present in this donor (not used for the scoring call below).
        cell_cycle_genes = [x for x in all_cell_cycle_genes if x in adata.var_names]
        adata.obs_names_make_unique()
        sc.tl.score_genes_cell_cycle(adata, s_genes=s_genes, g2m_genes=g2m_genes)
    finally:
        sc.settings.verbosity = k

    scv.pl.velocity_embedding_stream(adata, basis='umap', color=color, legend_loc='right_margin', perc=[1,99], ncols=4, title=[donor+' '+c for c in color])

# Compare without SLAM

In [None]:
# Load the filtered 10x count matrix of one library and attach the cell annotation.
library=libraries[0]
adata = sc.read_10x_h5(data_path+'cellranger_output_MM_ML_revision_'+library+'/outs/filtered_feature_bc_matrix.h5')
adata.var_names_make_unique()
# Drop the last two characters of every barcode (presumably the '-1' suffix — confirm).
adata.obs_names = [x[:-2] for x in adata.obs_names]

# add annotation
tab = pd.read_csv(data_path+'annotation.tsv', sep='\t')
# Select the rows of this library and strip the '<library>_' prefix from the
# cell names. The explicit copy avoids assigning into a slice of `tab`
# (SettingWithCopyWarning), and slicing off the prefix removes it only at the
# start of the name instead of replacing the pattern wherever it occurs.
prefix = library+'_'
subtab = tab[tab.cell.str.startswith(prefix)].copy()
subtab['cell'] = subtab['cell'].str[len(prefix):]
subtab['library'] = library
subtab = subtab.set_index('cell')
subtab = subtab[np.isin(subtab.index, adata.obs_names)]
adata.obs=pd.concat([adata.obs, subtab], axis=1, join='outer')

# throw out cells that were filtered upstream and hence have no annotation
adata = adata[~pd.isna(adata.obs.library)]

# throw out Doublets and negatives from HTO demux
adata = adata[adata.obs['HTO_classification.global']=='Singlet'].copy()

# annotate ribosomal
ribo_genes = np.logical_or(adata.var_names.str.startswith('RPS'), adata.var_names.str.startswith('RPL'))
# Fraction of counts per cell that come from RPS*/RPL* genes. np.asarray(...).ravel()
# flattens the row sums for sparse and dense X alike (.A1 exists only on np.matrix).
ribo_counts = np.asarray(adata[:, ribo_genes].X.sum(axis=1)).ravel()
total_counts = np.asarray(adata.X.sum(axis=1)).ravel()
adata.obs['percent_ribosomal'] = ribo_counts / total_counts

In [None]:
# Keep only the cells whose `organoid` annotation equals `donor`.
# NOTE(review): `donor` is not assigned in this section; it holds whatever value
# an earlier cell left behind — confirm the intended donor before running.
adata = adata[adata.obs.organoid==donor].copy()

In [None]:
# Display the summary of the donor-restricted object.
adata