# Contents

Make sure to run `Prepare_data.ipynb` beforehand, as this produces the required intermediate .h5 objects `data_path+'by_donors/processed/SLAMv2_'+donor+'_processed'+ccreg+'.h5'`.

In this notebook, the full analysis pipeline is applied to selected perturbations and samples of interest. Ultimately, PDFs and Excel tables with data are produced for multiple runs of scvelo models, containing:
- dynamical / steady state models of SLAM or Classical RNA velocity, plotted on UMAPs
- UMAPs with informative colors (signatures, cell cycle phase, etc)
- phase plots of known signature genes and of the top likelihood genes
- time series plots and heatmaps of signatures against recovered latent time from the dynamical SLAM velocity model.

# Imports and Settings (run these first!)

In [14]:
import scvelo as scv
from IPython.display import clear_output
import matplotlib.backends.backend_pdf
from tqdm import tnrange, tqdm_notebook
import scanpy as sc
import matplotlib.pyplot as pl
import pandas as pd
import numpy as np
from IPython.core.display import display, HTML
display(HTML("<style>.container { width:97% !important; }</style>"))
%matplotlib inline


# Print the scvelo version banner (captured in the cell output below).
scv.logging.print_version()
# Figure defaults for all scvelo plots produced in this notebook.
scv.settings.set_figure_params(
    'scvelo', dpi_save=100, dpi=80, transparent=True)
scv.settings.verbosity = 1

# Directory scvelo uses when it saves figures itself.
# NOTE(review): `figure_path` is reassigned to 'G:/data/figures/' in the next
# cell, while scv.settings.figdir keeps the value set here -- confirm intended.
figure_path='G:/data/figures/split_dynamics/'
scv.settings.figdir=figure_path
scv.settings.plot_prefix=''

Running scvelo 0.1.16.dev32+c00a55e.dirty (python 3.6.6) on 2021-01-27 11:11.


In [15]:
# paths and names, change accordingly
# data_path: root of the preprocessed per-donor .h5 files (read from
#   data_path+'by_donors/processed/'); tables and saved models are written below it.
data_path='G:/data/scSLAMseq/revision/'
# signatures_path: root of the gene-signature collection (Excel / .gmt files).
signatures_path='G:/data/scrnaseq_signature_collection/'
# figure_path: root for the PDF exports (figure_path+'pdfs/').
# NOTE(review): this overrides the 'G:/data/figures/split_dynamics/' value set in
# the settings cell; scv.settings.figdir still points at the earlier value.
figure_path='G:/data/figures/'
# Donor identifiers iterated over by the "fit all samples" cells.
donors=['B2-040', 'C2-019', 'OT227', 'OT302', 'P009T', 'P013T']

# Fit Selected Samples

In [16]:
# Preload signatures, see preprocessing ipynb for details.

# Stem cell signatures: one column per signature; the first data row is dropped
# and empty cells are removed before the gene names are collected.
tab = pd.read_excel(signatures_path+'cell_type_markers/CRC-related_stem_cell_signatures.xlsx', header=0)
tab = tab.drop(0)
sigs = {}
for column in ['Lgr5_ISC-Munoz', 'Lgr5_ISC-Merlos']:
    sigs['Stem_'+column] = list(tab[column].dropna().values)

# Colonoid cell-type marker signatures (revised table, third sheet):
# one gene list per selected `cell_type_epi` label.
# tab=pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers.xlsx', header=1)
tab = pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers_revised.xlsx', header=1, sheet_name=2)
flo_sigs = {}
for cell_type in ['Stem', 'Enterocytes 1', 'TC1', 'TC4', 'Goblet', 'Stem/TA 1']:
    flo_sigs[cell_type] = list(tab.loc[tab['cell_type_epi']==cell_type, 'gene'].values)

# Individual genes of interest, handed to the analysis as their own "signature".
single_genes = ['LGR5', 'OLFM4', 'TFF3', 'FABP1', 'EPHB2', 'AXIN1', 'AXIN2', 'EGR1']

# YAP target from Markus Morkel, Liberali Lab
# The combined list names CTGF, GGTA1, WWC2 and IL33 twice. Duplicates are
# dropped (first occurrence kept) so that len(yap_targets) -- used as
# `total_in_sig` in the overlap statistics -- counts every gene exactly once.
yap_targets = list(dict.fromkeys([
    'CTGF', 'GGTA1', 'WWC2', 'ANXA8', 'CLU', 'CXCL16', 'IL33', 'LY6A', 'LY6C1',
    'MSLN', 'TNFRSF12A', 'CTGF', 'GGTA1', 'WWC2', 'ANXA5', 'TACSTD2', 'ANXA10', 'EREG', 'IL33', 'ANXA1', 'ANXA3']))

# Hallmark apoptosis gene set, used to rescore cells inside the pipeline.
# The .gmt file is read with the set name as index; after transposing, the
# HALLMARK_APOPTOSIS column holds that set's entries. Missing entries are
# removed and the first remaining value is skipped.
tab = pd.read_csv(
    signatures_path+'msigdb/hallmark/h.all.v6.2.symbols.gmt',
    sep='\t', header=None, index_col=0,
).T['HALLMARK_APOPTOSIS']
hallmark_apo = tab.dropna().values[1:]

In [23]:
# Define dynamic analysis pipeline
def _filter_top_dispersion_genes(cdata, candidates=(2000, 1000, 750, 300)):
    """Filter `cdata` in place to its most dispersed genes, retrying with smaller targets.

    `scv.pp.filter_genes_dispersion` is called with each `n_top_genes` value in
    `candidates`, in order. If a call raises, the next value is tried; the error
    of the last candidate is re-raised. Why the larger values fail for some
    subsets is not established in this notebook.

    Returns the `n_top_genes` value that succeeded.
    """
    for attempt, n_top in enumerate(candidates):
        try:
            scv.pp.filter_genes_dispersion(cdata, n_top_genes=n_top)
        except Exception:
            if attempt == len(candidates) - 1:
                raise
            continue
        if attempt > 0:
            print('Using '+str(n_top)+' HVgenes instead of '+str(candidates[0]))
        return n_top


def _save_current_figure(pdf):
    """Append the current matplotlib figure to `pdf` as a new page and close it."""
    pdf.savefig()  # saves the current figure into a pdf page
    pl.close()


def dynamic_analysis(donor, names, pert_combis, do_ccreg, mode, sigs, flo_sigs, 
                     hallmark_apo, single_genes, yap_targets, export_path, save_model=False):
    """Fit velocity models per perturbation combination and export PDFs and Excel tables.

    For every `(perts, name)` pair the preprocessed donor file is re-read from
    `data_path`, subset to the cells whose `obs.perturbation` is in `perts`,
    and steady-state as well as dynamical scvelo models are fitted. Results go to
    `data_path+'tables/'+<name>_<donor><ccreg>+'_tables.xlsx'` and
    `figure_path+'pdfs/'+<name>_<donor><ccreg>+'_dynamics.pdf'`
    (`data_path` and `figure_path` are module-level globals).

    Parameters
    ----------
    donor : str
        Donor id used in the input file name and in plot titles.
    names : list of str
        Output name prefix for each entry of `pert_combis`.
    pert_combis : list of list of str
        Perturbation labels to fit jointly, one list per run. The top genes of a
        run with exactly `['DMSO']` are re-plotted for all later runs, so list
        that run first.
    do_ccreg : bool
        If true, the '_ccreg' variant of the preprocessed file is used.
    mode : str
        With 'SLAM', the 'new'/'old' layers replace 'unspliced'/'spliced' before
        the SLAM steady-state and dynamical fits.
    sigs, flo_sigs : dict
        Signature name -> gene list; used for the overlap statistics.
    hallmark_apo : sequence of str
        Genes used to rescore `HALLMARK_APOPTOSIS`.
    single_genes : sequence of str
        Genes plotted individually on the UMAP; only those present in
        `adata.var_names` are used (also for the overlap statistics).
    yap_targets : sequence of str
        Gene list entering the overlap statistics as 'yap_targets'.
    export_path : str
        Not used by this function; kept so existing calls remain valid.
    save_model : bool
        If true, the fitted AnnData is written to
        `data_path+<name>_<donor><ccreg>+'_savemodel.h5'`.

    Notes
    -----
    The loaded file must already provide the obs columns plotted below and the
    velocity keys 'SLAM_velocity' and 'real_velocity', which are plotted but not
    computed here.
    """
    from matplotlib.backends.backend_pdf import PdfPages
    from scipy.sparse import csr_matrix
    from scipy.stats import zscore
    from scvelo.plotting.utils import default_size
    from seaborn import clustermap

    # reg cell cycle selection
    ccreg = '_ccreg' if do_ccreg else ''
    top_dmso_genes = None

    for perts, name in zip(pert_combis, names):
        # name format
        print(name)
        name = name+'_'+donor+ccreg

        # read preprocessed data
        adata = sc.read(data_path+'by_donors/processed/SLAMv2_'+donor+'_processed'+ccreg+'.h5')
        if 'X_diffmap' in adata.obsm.keys():
            del adata.obsm['X_diffmap']

        # rescore hallmark apoptosis
        sc.tl.score_genes(adata, gene_list=hallmark_apo, score_name='HALLMARK_APOPTOSIS')

        # save embedding of all perts for plotting canvas (this is a hack);
        # an empty sparse matrix avoids allocating a dense all-zero array first
        xdata = sc.AnnData(X=csr_matrix(adata.X.shape))
        xdata.obsm['X_umap'] = adata.obsm['X_umap']

        # subset to selected perts
        adata = adata[np.isin(adata.obs.perturbation, perts)].copy()
        scv.pp.neighbors(adata)
        scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

        # real ss velocity
        scv.tl.velocity(adata, vkey='recomputed_real_velocity')
        scv.tl.velocity_graph(adata, vkey='recomputed_real_velocity')
        scv.tl.velocity_embedding(adata, basis='umap', vkey='recomputed_real_velocity')

        # select SLAM
        if mode == 'SLAM':
            adata.layers['unspliced'] = adata.layers['new']
            adata.layers['spliced'] = adata.layers['old']
            scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
        for x in ['new', 'old', 'real_ambiguous', 'real_spliced', 'real_unspliced']:
            del adata.layers[x]

        # SLAM ss velocity
        scv.tl.velocity(adata, vkey='recomputed_SLAM_velocity')
        scv.tl.velocity_graph(adata, vkey='recomputed_SLAM_velocity')
        scv.tl.velocity_embedding(adata, basis='umap', vkey='recomputed_SLAM_velocity')

        # filtering and select genes to fit (done on a throw-away copy so that
        # adata itself keeps all genes)
        cdata = sc.AnnData(adata.X, var=adata.var[[]], layers={'unspliced': adata.layers['unspliced'], 'spliced': adata.layers['spliced']})
        scv.pp.filter_genes(cdata, min_shared_counts=20)
        _filter_top_dispersion_genes(cdata)
        high_shared_vars = cdata.var_names.values
        del cdata
        print('High shared vars: ', len(high_shared_vars))

        # dynamical model and latent time computation
        scv.tl.recover_dynamics(adata, var_names=high_shared_vars)
        scv.tl.velocity(adata, mode='dynamical')
        scv.tl.velocity_graph(adata)
        scv.tl.velocity_embedding(adata, basis='umap')
        scv.tl.recover_latent_time(adata)

        # Produce pdfs and excels. Both writers are context managers, so the
        # files are finalised and closed even if a plot raises.
        with pd.ExcelWriter(data_path+'tables/'+name+'_tables.xlsx', engine='xlsxwriter') as writer, \
                PdfPages(figure_path+'pdfs/'+name+'_dynamics.pdf') as pdf:
            # latent time series of signatures
            obs = ['Stem', 'Goblet', 'MAPK_progeny', 'YAP_targets', 'Stem_Lgr5_ISC-Merlos', 'TC1', 'TC4', 'percent_ribosomal']
            colors = ['red', 'blue', 'green', 'black', 'grey', 'orange', 'yellow', 'cyan']
            ax = None
            for o, c in zip(obs, colors):
                ax = scv.pl.scatter(adata, x=adata.obs.latent_time, y=zscore(adata.obs[o]), alpha=0.5, figsize=[10,5], size=50,
                                    color=c, n_convolve=50, show=False, xlabel='latent time', ylabel='zscore', ax=ax)
            ax.legend(obs)
            _save_current_figure(pdf)

            # heatmap time series: cells ordered by latent time, z-scored per
            # signature and smoothed with a moving average over 50 cells
            time = adata.obs.latent_time
            Y = zscore(adata.obs[obs].iloc[np.argsort(time)])
            df = pd.DataFrame(Y, columns=obs)
            n_convolve = 50
            weights = np.ones(n_convolve) / n_convolve
            for o in obs:
                try:
                    df[o] = np.convolve(df[o].values, weights, mode="same")
                except Exception:
                    pass  # e.g. all-zero counts or nans cannot be convolved
            cm = clustermap(df.T, row_cluster=True, col_cluster=False, vmin=-2, vmax=2, cmap='bwr')
            cm.ax_heatmap.set_xlabel('latent time')
            cm.ax_heatmap.set_title(donor+' heatmap')
            cm.ax_heatmap.set_xticks([])
            _save_current_figure(pdf)

            # export Sigs vs time
            dat = adata.obs[['latent_time']+obs].sort_values('latent_time')
            dat.to_excel(writer, sheet_name='signatures_vs_latenttime')

            ### UMAPs Signatures etc ###
            umap_kwargs = {'size': 4 * default_size(adata), 'alpha': 0.4, 'ncols': 4, 'show': False}

            # umaps signatures
            color = ['Stem', 'Goblet', 'MAPK_progeny', 'YAP_targets', 'Stem_Lgr5_ISC-Merlos', 'TC1', 'TC2', 'TC3', 'TC4', 
                     'percent_ribosomal', 'latent_time', 'WNT_progeny', 'HALLMARK_APOPTOSIS', 'HALLMARK_DNA_REPAIR', 'phase']
            scv.pl.scatter(adata, color=color, **umap_kwargs)
            _save_current_figure(pdf)

            # UMAP genes: use the caller-supplied genes (previously overwritten
            # by a hard-coded copy), restricted to genes present in the data
            plot_genes = np.asarray(single_genes)
            plot_genes = plot_genes[np.isin(plot_genes, adata.var_names)]
            scv.pl.scatter(adata, color=plot_genes, **umap_kwargs)
            _save_current_figure(pdf)

            scv.pl.scatter(adata, color=plot_genes, layer='Ms', **umap_kwargs)
            _save_current_figure(pdf)

            scv.pl.scatter(adata, color=plot_genes, layer='Mu', **umap_kwargs)
            _save_current_figure(pdf)

            ### Dynamics / Velocity plots
            grid_kwargs = {'color': 'latent_time', 'arrow_length': 5, 'arrow_size': 5, 'density': 0.4,
                           'arrow_color': 'black', 'show': False}

            # dynamical velocity plot, drawn on an invisible canvas of all cells
            ax = scv.pl.scatter(xdata, alpha=0, show=False)
            scv.pl.velocity_embedding_grid(adata, title=donor+' dynamical velocity', ax=ax, **grid_kwargs)
            _save_current_figure(pdf)

            # steady state plots: SLAM / classical, as stored in the file and recomputed on the subset
            steady_state_plots = [
                ('SLAM_velocity', ' steady state velocity'),
                ('recomputed_SLAM_velocity', ' steady state velocity recomputed'),
                ('real_velocity', ' classical steady state velocity'),
                ('recomputed_real_velocity', ' classical steady state velocity recomputed'),
            ]
            for vkey, title_suffix in steady_state_plots:
                scv.pl.velocity_embedding_grid(adata, vkey=vkey, title=donor+title_suffix, **grid_kwargs)
                _save_current_figure(pdf)

            ### PHASE PLOTS ###

            # overall top likelihood genes; NaN likelihoods are dropped first
            # because sort_values() would otherwise place them last, i.e. on top
            fitted = adata.var.fit_likelihood[high_shared_vars].dropna().sort_values()
            topgenes = list(fitted[-20:].index)
            scv.pl.scatter(adata, basis=topgenes, color='latent_time', ncols=4, show=False,
                           title=[x+' top'+str(len(topgenes)-i) for i, x in enumerate(topgenes)])
            _save_current_figure(pdf)

            # save top 20 DMSO likelihood genes. Plot these for the other conditions.
            if len(perts)==1 and perts[0]=='DMSO':
                top_dmso_genes = topgenes
            elif top_dmso_genes is not None:
                scv.pl.scatter(adata, basis=top_dmso_genes, color='latent_time', ncols=4, show=False,
                               title=[x+' top'+str(len(top_dmso_genes)-i)+' DMSO gene' for i, x in enumerate(top_dmso_genes)])
                _save_current_figure(pdf)
            else:
                print('No DMSO-only run precedes '+name+'; skipping DMSO top gene phase plots')

            # export gene fit likelihoods of highest quantile
            genes_likelihoods = adata.var[~pd.isna(adata.var['fit_likelihood'])]['fit_likelihood'].sort_values()
            top_genes_likelihoods = genes_likelihoods[genes_likelihoods>np.percentile(genes_likelihoods,75)]
            top_genes_likelihoods.to_excel(writer, sheet_name='top_quantile_genes_likelihoods')

            # calculate gene overlaps with sigs
            genes = top_genes_likelihoods #genes_likelihoods
            dat = []
            index = []
            intersets = []
            signatures = {**sigs, **flo_sigs, 'yap_targets': yap_targets, 'single_genes': plot_genes}
            for k, members in signatures.items():
                hits = np.isin(genes.index, members)
                dat.append([np.sum(hits), len(members)])
                index.append(k)
                intersets.append(genes.index.values[hits])
            tab = pd.DataFrame(data=dat, index=index, columns=['overlap', 'total_in_sig'])
            tab['percent_coverage'] = np.round(tab.overlap/tab.total_in_sig*100,2)
            tab['intersecting_genes'] = intersets
            tab.to_excel(writer, sheet_name='sigantures_overlap_statistics')

            # phase plots of the overlapping genes, 20 genes per page
            subtab = tab.loc[['Stem_Lgr5_ISC-Merlos', 'Stem_Lgr5_ISC-Munoz', 'Stem', 'Stem/TA 1', 'Enterocytes 1', 'Goblet', 'TC1', 'TC4', 'yap_targets', 'single_genes']]
            for k in subtab.index:
                if tab.loc[k]['overlap']>0:
                    genes_plot = subtab.loc[k]['intersecting_genes']
                    for mi in range(0, len(genes_plot), 20):
                        chunk = genes_plot[mi:mi+20]
                        try:
                            # titles must be built from the same chunk that is plotted
                            scv.pl.scatter(adata, basis=chunk, color='latent_time', ncols=4, show=False,
                                           title=[x+' '+k for x in chunk])
                            pdf.savefig()  # saves the current figure into a pdf page
                        except Exception as err:
                            print('Skipping phase plots for '+k+' (genes '+str(mi)+'-'+str(mi+len(chunk))+'): '+repr(err))
                        finally:
                            pl.close()
        if save_model:
            adata.write(data_path+name+'_savemodel.h5')

from pathlib import Path
# Create the folders dynamic_analysis writes its Excel tables and PDFs into.
for _out_dir in (data_path+'tables/', figure_path+'pdfs/'):
    Path(_out_dir).mkdir(parents=True, exist_ok=True)

## P013T: EGFRi/DMSO

Next step: fit P013T with DMSO alone, EGFRi alone, and both conditions together, and compare these trajectories with the joint velocities. Also pay attention to whether the previously observed contradictions between the dynamical, steady-state and classical steady-state models still show up.

In [7]:
# P013T: compare fits on DMSO only, EGFR only, and DMSO+EGFR jointly.
# save_model=True: the fitted models are written into the data path for later plots.

from scvelo.plotting.utils import default_size
from matplotlib.backends.backend_pdf import PdfPages

donor = 'P013T'
mode = 'SLAM'
do_ccreg = True
export_path = 'G:/data/scSLAMseq/revision/exports/26_01_repair_umap/'
names = ['DMSO_only', 'EGFR_only', 'DMSOandEGFR']
pert_combis = [['DMSO'], ['EGFR'], ['DMSO', 'EGFR']]

dynamic_analysis(
    donor=donor, names=names, pert_combis=pert_combis, do_ccreg=do_ccreg, mode=mode,
    sigs=sigs, flo_sigs=flo_sigs, hallmark_apo=hallmark_apo, single_genes=single_genes,
    yap_targets=yap_targets, export_path=export_path, save_model=True,
)

DMSO_only
Using 1000 HVgenes instead of 2000
High shared vars:  1000
EGFR_only
Using 1000 HVgenes instead of 2000
High shared vars:  1000
DMSOandEGFR
High shared vars:  2000


## OT227: MEKi/DMSO

In [8]:
from scvelo.plotting.utils import default_size
from matplotlib.backends.backend_pdf import PdfPages

# OT227: DMSO only, MEK only, and both perturbations fitted jointly.
donor = 'OT227'
mode = 'SLAM'
do_ccreg = True
export_path = 'G:/data/scSLAMseq/revision/exports/26_01_repair_umap/'
names = ['DMSO_only', 'MEK_only', 'DMSOandMEK']
pert_combis = [['DMSO'], ['MEK'], ['DMSO', 'MEK']]

dynamic_analysis(
    donor=donor, names=names, pert_combis=pert_combis, do_ccreg=do_ccreg, mode=mode,
    sigs=sigs, flo_sigs=flo_sigs, hallmark_apo=hallmark_apo, single_genes=single_genes,
    yap_targets=yap_targets, export_path=export_path,
)

DMSO_only
High shared vars:  1388
MEK_only
Using 1000 HVgenes instead of 2000
High shared vars:  1000
DMSOandMEK
Using 1000 HVgenes instead of 2000
High shared vars:  1000


## OT302: MEKi/DMSO

In [9]:
# OT302: DMSO only, MEK only, and both perturbations fitted jointly.
donor = 'OT302'
mode = 'SLAM'
do_ccreg = True
export_path = 'G:/data/scSLAMseq/revision/exports/26_01_repair_umap/'
names = ['DMSO_only', 'MEK_only', 'DMSOandMEK']
pert_combis = [['DMSO'], ['MEK'], ['DMSO', 'MEK']]

dynamic_analysis(
    donor=donor, names=names, pert_combis=pert_combis, do_ccreg=do_ccreg, mode=mode,
    sigs=sigs, flo_sigs=flo_sigs, hallmark_apo=hallmark_apo, single_genes=single_genes,
    yap_targets=yap_targets, export_path=export_path,
)

DMSO_only
Using 750 HVgenes instead of 2000
High shared vars:  750
MEK_only
Using 1000 HVgenes instead of 2000
High shared vars:  1000
DMSOandMEK
Using 1000 HVgenes instead of 2000
High shared vars:  1000


## P009T: EGFRi/DMSO

In [22]:
# P009T: DMSO only, EGFR only, and both perturbations fitted jointly.
donor = 'P009T'
mode = 'SLAM'
do_ccreg = True
export_path = 'G:/data/scSLAMseq/revision/exports/26_01_repair_umap/'
names = ['DMSO_only', 'EGFR_only', 'DMSOandEGFR']
pert_combis = [['DMSO'], ['EGFR'], ['DMSO', 'EGFR']]

dynamic_analysis(
    donor=donor, names=names, pert_combis=pert_combis, do_ccreg=do_ccreg, mode=mode,
    sigs=sigs, flo_sigs=flo_sigs, hallmark_apo=hallmark_apo, single_genes=single_genes,
    yap_targets=yap_targets, export_path=export_path,
)

DMSO_only
High shared vars:  1502


## B2-040: EGFRi+BRAFi/DMSO

In [11]:
# B2-040: DMSO only, combined EGFR + BRAF inhibition only, and both fitted jointly.
donor = 'B2-040'
mode = 'SLAM'
do_ccreg = True
export_path = 'G:/data/scSLAMseq/revision/exports/26_01_repair_umap/'
names = ['DMSO_only', 'EGFR_BRAF_only', 'DMSOandEGFR_BRAF']
pert_combis = [['DMSO'], ['EGFR + BRAF'], ['DMSO', 'EGFR + BRAF']]

dynamic_analysis(
    donor=donor, names=names, pert_combis=pert_combis, do_ccreg=do_ccreg, mode=mode,
    sigs=sigs, flo_sigs=flo_sigs, hallmark_apo=hallmark_apo, single_genes=single_genes,
    yap_targets=yap_targets, export_path=export_path,
)

DMSO_only
High shared vars:  1939
EGFR_BRAF_only
High shared vars:  1700
DMSOandEGFR_BRAF
Using 1000 HVgenes instead of 2000
High shared vars:  1000


## C2-019: Does not show response to inhibition

Hence, it is skipped. If one wants to compute it anyway, just adapt existing snippets.

## Add additional Umaps with metadata

In [11]:
from scvelo.plotting.utils import default_size
from matplotlib.backends.backend_pdf import PdfPages
# Module-level settings read by plot_umap below (ccreg, export_path, pert_colors).
# `mode` is assigned here but not referenced by plot_umap.
mode='SLAM'
do_ccreg=True
# reg cell cycle selection
ccreg='_ccreg' if do_ccreg else ''
# Output folder for the extra UMAP PDFs.
# NOTE(review): no visible cell creates this folder -- confirm it exists before running.
export_path='G:/data/scSLAMseq/revision/exports/07_01_investigation_dynamics/'
# Colours stored in adata.uns['perturbation_colors'] by plot_umap.
# NOTE(review): presumably one colour per perturbation category, in category order -- verify.
pert_colors=['#fff989', '#858585', '#c31f26', '#ff7f26', '#af3cb1', '#00a8f3']

def plot_umap(donor, names, pert_combis):
    """Write extra metadata UMAPs for one donor to `export_path+donor+'_extra_umaps.pdf'`.

    For each `(perts, name)` pair the donor's cells are subset to the given
    perturbations and a UMAP coloured by cell cycle phase is added to the PDF.
    When exactly two perturbations are selected, a second page coloured by
    perturbation follows. Reads the module-level `data_path`, `ccreg`,
    `export_path` and `pert_colors`.
    """
    donor_data = sc.read(data_path+'by_donors/processed/SLAMv2_'+donor+'_processed'+ccreg+'.h5')
    donor_data.uns['perturbation_colors'] = pert_colors
    if 'X_diffmap' in donor_data.obsm.keys():
        del donor_data.obsm['X_diffmap']

    with PdfPages(export_path+donor+'_extra_umaps.pdf') as pdf:
        for perts, name in zip(pert_combis, names):
            # subset to selected perts
            selected = np.isin(donor_data.obs.perturbation, perts)
            bdata = donor_data[selected].copy()

            umap_kwargs = dict(size=4 * default_size(bdata), alpha=0.4, ncols=4, show=False)

            # one page coloured by cell cycle phase
            scv.pl.scatter(bdata, color='phase', title=name+' cell cycle phase', **umap_kwargs)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # the perturbation colouring is only drawn for pairs of perturbations
            if len(perts) != 2:
                continue
            scv.pl.scatter(bdata, color='perturbation', title=name+' perturbations', **umap_kwargs)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

In [13]:
# One entry per donor: (donor, output names, perturbation combinations).
umap_jobs = [
    ('B2-040', ['DMSO_only', 'EGFR_BRAF_only', 'DMSOandEGFR_BRAF'], [['DMSO'], ['EGFR + BRAF'], ['DMSO', 'EGFR + BRAF']]),
    ('P013T', ['DMSO_only', 'EGFR_only', 'DMSOandEGFR'], [['DMSO'], ['EGFR'], ['DMSO', 'EGFR']]),
    ('P009T', ['DMSO_only', 'EGFR_only', 'DMSOandEGFR'], [['DMSO'], ['EGFR'], ['DMSO', 'EGFR']]),
    ('OT227', ['DMSO_only', 'MEK_only', 'DMSOandMEK'], [['DMSO'], ['MEK'], ['DMSO', 'MEK']]),
    ('OT302', ['DMSO_only', 'MEK_only', 'DMSOandMEK'], [['DMSO'], ['MEK'], ['DMSO', 'MEK']]),
]
for donor, names, pert_combis in umap_jobs:
    plot_umap(donor, names, pert_combis)


# (Fit all Samples) deprecated and takes too long

We can fit separate models for the different perturbations of a single donor, or we can fit them jointly. This code explores these possible configurations, both with and without cell-cycle-regressed data.

In [3]:
# Preload signatures

# Stem cell signatures: one column per signature; the first data row is dropped
# and empty cells are removed before the gene names are collected.
tab = pd.read_excel(signatures_path+'cell_type_markers/CRC-related_stem_cell_signatures.xlsx', header=0)
tab = tab.drop(0)
sigs = {}
for column in ['Lgr5_ISC-Munoz', 'Lgr5_ISC-Merlos']:
    sigs['Stem_'+column] = list(tab[column].dropna().values)

# Colonoid cell-type marker signatures (revised table, third sheet):
# one gene list per selected `cell_type_epi` label.
# tab=pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers.xlsx', header=1)
tab = pd.read_excel(signatures_path+'cell_type_markers/colonoid_cancer_uhlitz_markers_revised.xlsx', header=1, sheet_name=2)
flo_sigs = {}
for cell_type in ['Stem', 'Enterocytes 1', 'TC1', 'TC4', 'Goblet', 'Stem/TA 1']:
    flo_sigs[cell_type] = list(tab.loc[tab['cell_type_epi']==cell_type, 'gene'].values)

# Individual genes of interest, handed to the analysis as their own "signature".
single_genes = ['LGR5', 'OLFM4', 'TFF3', 'FABP1', 'EPHB2', 'AXIN1', 'AXIN2', 'EGR1']

# YAP target from Markus Morkel, Liberali Lab
# The combined list names CTGF, GGTA1, WWC2 and IL33 twice. Duplicates are
# dropped (first occurrence kept) so that len(yap_targets) -- used as
# `total_in_sig` in the overlap statistics -- counts every gene exactly once.
yap_targets = list(dict.fromkeys([
    'CTGF', 'GGTA1', 'WWC2', 'ANXA8', 'CLU', 'CXCL16', 'IL33', 'LY6A', 'LY6C1',
    'MSLN', 'TNFRSF12A', 'CTGF', 'GGTA1', 'WWC2', 'ANXA5', 'TACSTD2', 'ANXA10', 'EREG', 'IL33', 'ANXA1', 'ANXA3']))

## single perts, cc regressed

In [12]:
# single perts, cc regressed
# For every donor and every single perturbation: fit the dynamical model on the
# most dispersed, non-'RP'-prefixed genes and export a PDF plus an Excel workbook.

from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import zscore
from scvelo.plotting.utils import default_size
from seaborn import clustermap
export_path='G:/data/scSLAMseq/revision/exports/'

mode='SLAM'
do_ccreg=True

for donor in tqdm_notebook(donors):
    # read
    ccreg='_ccreg' if do_ccreg else ''
    data=sc.read(data_path+'by_donors/processed/SLAMv2_'+donor+'_processed'+ccreg+'.h5')
    if 'X_diffmap' in data.obsm.keys(): del data.obsm['X_diffmap']
    if mode == 'SLAM':
        data.layers['unspliced']=data.layers['new']
        data.layers['spliced']=data.layers['old']
        scv.pp.moments(data, n_pcs=30, n_neighbors=30)
    for x in ['new', 'old', 'real_ambiguous', 'real_spliced', 'real_unspliced']: del data.layers[x]

    for pert in tqdm_notebook(pd.unique(data.obs.perturbation)):
        name = donor + '_'+pert+ccreg
        # subset to a single perturbation
        adata = data[data.obs.perturbation==pert].copy()
        scv.pp.pca(adata)
        scv.pp.neighbors(adata)
        scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

        # filtering and select genes to fit (on a throw-away copy so adata keeps all genes)
        cdata=sc.AnnData(adata.X, var=adata.var[[]], layers={'unspliced': adata.layers['unspliced'],'spliced': adata.layers['spliced']})
        scv.pp.filter_genes(cdata, min_shared_counts=30)
        try:
            scv.pp.filter_genes_dispersion(cdata, n_top_genes=1000)
        except Exception:
            # the cause of this failure is not established; the perturbation is skipped
            print(donor, pert, ' filtering key error. Prolly not enough cells(?)')
            continue
        high_shared_vars=cdata.var_names.values
        del cdata
        print('High shared vars: ', len(high_shared_vars))
        # NOTE(review): drops every gene whose name starts with 'RP'; presumably meant
        # to remove ribosomal protein genes, but the prefix also matches other genes -- verify.
        non_ribo_genes_ids=np.array([x[:2]!='RP' for x in high_shared_vars])
        non_ribo_genes=high_shared_vars[non_ribo_genes_ids]

        # dynamical model and latent time computation
        scv.tl.recover_dynamics(adata, var_names=non_ribo_genes)
        scv.tl.velocity(adata, mode='dynamical')
        scv.tl.velocity_graph(adata)
        scv.tl.velocity_embedding(adata, basis='umap')
        scv.tl.recover_latent_time(adata)

        # Both writers are context managers, so the files are finalised and
        # closed even if one of the plots raises.
        with pd.ExcelWriter(export_path+name+'_tables.xlsx', engine='xlsxwriter') as writer, \
                PdfPages(export_path+name+'_dynamics.pdf') as pdf:
            # latent time series of signatures
            obs = ['Stem', 'Goblet', 'MAPK_progeny', 'YAP_targets', 'Stem_Lgr5_ISC-Merlos', 'TC1', 'TC4', 'percent_ribosomal']
            colors = ['red', 'blue', 'green', 'black', 'grey', 'orange', 'yellow', 'cyan']
            ax=None
            for o, c in zip(obs,colors):
                ax=scv.pl.scatter(adata, x=adata.obs.latent_time, y=zscore(adata.obs[o]), alpha=0.5, figsize=[10,5], size=50,
                                  color=c, n_convolve=50, show=False, xlabel='latent time', ylabel='zscore', ax=ax)
            ax.legend(obs)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # heatmap time series: cells ordered by latent time, z-scored per
            # signature and smoothed with a moving average over 50 cells
            time = adata.obs.latent_time
            Y=zscore(adata.obs[obs].iloc[np.argsort(time)])
            df = pd.DataFrame(Y, columns=obs)
            n_convolve=50
            weights = np.ones(n_convolve) / n_convolve
            for o in obs:
                try:
                    df[o] = np.convolve(df[o].values, weights, mode="same")
                except Exception:
                    pass  # e.g. all-zero counts or nans cannot be convolved
            cm = clustermap(df.T, row_cluster=True, col_cluster=False, vmin=-2, vmax=2, cmap='bwr')
            cm.ax_heatmap.set_xlabel(pert+' latent time')
            cm.ax_heatmap.set_title(pert+' '+ donor+' heatmap')
            cm.ax_heatmap.set_xticks([])
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # export Sigs vs time
            dat=adata.obs[['latent_time']+obs].sort_values('latent_time')
            dat.to_excel(writer, sheet_name='signatures_vs_latenttime')

            # umap
            scv.pl.scatter(adata, color=obs+['latent_time'], ncols=5, show=False)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # velocity plots (dynamical, steady state SLAM, steady state classical/real),
            # each drawn on top of the donor's remaining cells in light grey
            velocity_plots = [
                ({}, ' dynamical velocity'),
                ({'vkey': 'SLAM_velocity'}, ' steady state velocity'),
                ({'vkey': 'real_velocity'}, ' classical steady state velocity'),
            ]
            for vkey_kwargs, title_suffix in velocity_plots:
                ax=scv.pl.scatter(data[data.obs.perturbation!=pert], color='lightgrey', show=False, alpha=0.2, size=4 * default_size(adata))
                scv.pl.velocity_embedding_grid(adata, color='latent_time', arrow_length=5, arrow_size=5, density=0.4,
                                               arrow_color='black', show=False, title=donor+' '+pert+title_suffix, ax=ax, **vkey_kwargs)
                pdf.savefig()  # saves the current figure into a pdf page
                pl.close()

            # overall top likelihood genes; NaN likelihoods are dropped first
            # because sort_values() would otherwise place them last, i.e. on top
            fitted = adata.var.fit_likelihood[non_ribo_genes].dropna().sort_values()
            topgenes = list(fitted[-20:].index)
            scv.pl.scatter(adata, basis=topgenes, color='latent_time', ncols=4, show=False,
                           title=[x+' top'+str(len(topgenes)-i) for i,x in enumerate(topgenes)])
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # export gene fit likelihoods of highest quantile
            genes_likelihoods=adata.var[~pd.isna(adata.var['fit_likelihood'])]['fit_likelihood'].sort_values()
            top_genes_likelihoods=genes_likelihoods[genes_likelihoods>np.percentile(genes_likelihoods,75)]
            top_genes_likelihoods.to_excel(writer, sheet_name='top_quantile_genes_likelihoods')

            # calculate gene overlaps with sigs
            genes = top_genes_likelihoods #genes_likelihoods
            dat = []
            index = []
            intersets = []
            signatures = {**sigs, **flo_sigs, **{'yap_targets':yap_targets}, **{'single_genes': single_genes}}
            for k in signatures.keys():
                hits = np.isin(genes.index, signatures[k])
                dat.append([np.sum(hits), len(signatures[k])])
                index.append(k)
                intersets.append(genes.index.values[hits])
            tab=pd.DataFrame(data=dat, index=index, columns=['overlap', 'total_in_sig'])
            tab['percent_coverage']=np.round(tab.overlap/tab.total_in_sig*100,2)
            tab['intersecting_genes']=intersets
            tab.to_excel(writer, sheet_name='sigantures_overlap_statistics')

            # phase plots of the overlapping genes, 20 genes per page
            subtab=tab.loc[['Stem_Lgr5_ISC-Merlos', 'Stem_Lgr5_ISC-Munoz', 'Stem', 'Stem/TA 1', 'Enterocytes 1', 'Goblet', 'TC1', 'TC4', 'yap_targets', 'single_genes']]
            for k in tqdm_notebook(subtab.index):
                if tab.loc[k]['overlap']>0:
                    genes_plot = subtab.loc[k]['intersecting_genes']
                    for mi in range(0, len(genes_plot), 20):
                        chunk = genes_plot[mi:mi+20]
                        try:
                            # titles must be built from the same chunk that is plotted
                            scv.pl.scatter(adata, basis=chunk, color='latent_time', ncols=4, show=False,
                                           title=[x+' '+k for x in chunk])
                            pdf.savefig()  # saves the current figure into a pdf page
                        except Exception as err:
                            print('Skipping phase plots for '+k+' (genes '+str(mi)+'-'+str(mi+len(chunk))+'): '+repr(err))
                        finally:
                            pl.close()

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

High shared vars:  877


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

B2-040 DMSO  filtering key error. Prolly not enough cells(?)
B2-040 EGFR  filtering key error. Prolly not enough cells(?)
B2-040 EGFR + BRAF  filtering key error. Prolly not enough cells(?)
B2-040 EGFR + MEK  filtering key error. Prolly not enough cells(?)
B2-040 MEK  filtering key error. Prolly not enough cells(?)


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

High shared vars:  323


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  518


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  338


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  300


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  314


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  258


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

OT227 BRAF  filtering key error. Prolly not enough cells(?)
OT227 DMSO  filtering key error. Prolly not enough cells(?)
OT227 EGFR  filtering key error. Prolly not enough cells(?)
High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

OT227 EGFR + MEK  filtering key error. Prolly not enough cells(?)
OT227 MEK  filtering key error. Prolly not enough cells(?)


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

OT302 DMSO  filtering key error. Prolly not enough cells(?)
High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

OT302 EGFR + MEK  filtering key error. Prolly not enough cells(?)
OT302 MEK  filtering key error. Prolly not enough cells(?)


HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

P009T BRAF  filtering key error. Prolly not enough cells(?)
P009T DMSO  filtering key error. Prolly not enough cells(?)
High shared vars:  845


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

P009T EGFR + BRAF  filtering key error. Prolly not enough cells(?)
High shared vars:  556


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  713


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

P013T BRAF  filtering key error. Prolly not enough cells(?)
High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  1000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

## single perts, cc not regressed

In [None]:
# single perts, cc NOT regressed
#
# For every donor and, within it, every perturbation separately:
#   1. alias the 'new' / 'old' layers as 'unspliced' / 'spliced' (mode == 'SLAM'),
#   2. select up to 1000 high-dispersion genes with enough shared counts, drop genes starting with 'RP',
#   3. fit the dynamical scvelo model and recover latent time,
#   4. export plots to  <export_path><donor>_<pert>_dynamics.pdf
#      and tables to    <export_path><donor>_<pert>_tables.xlsx
# Requires `donors`, `data_path`, `sigs`, `flo_sigs`, `yap_targets` and `single_genes` from earlier cells.

from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import zscore
from seaborn import clustermap
from scvelo.plotting.utils import default_size

export_path='G:/data/scSLAMseq/revision/exports/'

mode='SLAM'
do_ccreg=False

for donor in tqdm_notebook(donors):
    # read
    ccreg='_ccreg' if do_ccreg else ''
    data=sc.read(data_path+'by_donors/processed/SLAMv2_'+donor+'_processed'+ccreg+'.h5')
    if 'X_diffmap' in data.obsm.keys(): del data.obsm['X_diffmap']
    if mode == 'SLAM':
        # alias the 'new' / 'old' layers under the layer names the velocity functions are run on
        data.layers['unspliced']=data.layers['new']
        data.layers['spliced']=data.layers['old']
        scv.pp.moments(data, n_pcs=30, n_neighbors=30)
    for x in ['new', 'old', 'real_ambiguous', 'real_spliced', 'real_unspliced']: del data.layers[x]

    for pert in tqdm_notebook(pd.unique(data.obs.perturbation)):
        name = donor + '_'+pert+ccreg
        # subset to a single perturbation and recompute pca / neighbors / moments on the subset
        adata = data[data.obs.perturbation==pert].copy()
        scv.pp.pca(adata)
        scv.pp.neighbors(adata)
        scv.pp.moments(adata, n_pcs=30, n_neighbors=30)

        # filtering and select genes to fit (done on a throw-away copy so adata keeps all genes)
        cdata=sc.AnnData(adata.X, var=adata.var[[]], layers={'unspliced': adata.layers['unspliced'],'spliced': adata.layers['spliced']})
        scv.pp.filter_genes(cdata, min_shared_counts=30)
        try:
            scv.pp.filter_genes_dispersion(cdata, n_top_genes=1000)
        except Exception as err:  # not a bare except: kernel interrupts must still stop the loop
            print(donor, pert, ' filtering key error. Prolly not enough cells(?)', repr(err))
            continue
        high_shared_vars=cdata.var_names.values
        del cdata
        print('High shared vars: ', len(high_shared_vars))
        # NOTE: drops every gene whose name starts with 'RP', not only RPL*/RPS*
        non_ribo_genes_ids=np.array([x[:2]!='RP' for x in high_shared_vars])
        non_ribo_genes=high_shared_vars[non_ribo_genes_ids]

        # dynamical model and latent time computation
        scv.tl.recover_dynamics(adata, var_names=non_ribo_genes)
        scv.tl.velocity(adata, mode='dynamical')
        scv.tl.velocity_graph(adata)
        scv.tl.velocity_embedding(adata, basis='umap')
        scv.tl.recover_latent_time(adata)

        # Both outputs are context-managed so they are finalised even if a plot below raises.
        with pd.ExcelWriter(export_path+name+'_tables.xlsx', engine='xlsxwriter') as writer, \
                PdfPages(export_path+name+'_dynamics.pdf') as pdf:
            # latent time series of signatures (all signatures drawn onto the same axes)
            obs = ['Stem', 'Goblet', 'MAPK_progeny', 'YAP_targets', 'Stem_Lgr5_ISC-Merlos', 'TC1', 'TC4', 'percent_ribosomal']
            colors = ['red', 'blue', 'green', 'black', 'grey', 'orange', 'yellow', 'cyan']
            ax=None
            for o, c in zip(obs,colors):
                ax=scv.pl.scatter(adata, x=adata.obs.latent_time, y=zscore(adata.obs[o]), alpha=0.5, figsize=[10,5], size=50,
                                  color=c, n_convolve=50, show=False, xlabel='latent time', ylabel='zscore', ax=ax)
            ax.legend(obs)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # heatmap time series: cells ordered by latent time, z-scored, then smoothed
            time = adata.obs.latent_time
            Y=adata.obs[obs].iloc[np.argsort(time)]
            #Y/=np.max(np.abs(Y), axis=0)
            Y=zscore(Y)
            df = pd.DataFrame(Y, columns=obs)
            n_convolve=50
            weights = np.ones(n_convolve) / n_convolve  # moving-average kernel
            for o in obs:
                try:
                    df[o] = np.convolve(df[o].values, weights, mode="same")
                except Exception:
                    pass  # deliberate best effort: e.g. all-zero counts or nans cannot be convolved
            cm = clustermap(df.T, row_cluster=True, col_cluster=False, vmin=-2, vmax=2, cmap='bwr')#, **kwargs
            cm.ax_heatmap.set_xlabel(pert+' latent time')
            cm.ax_heatmap.set_title(pert+' '+ donor+' heatmap')
            cm.ax_heatmap.set_xticks([])
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # export Sigs vs time
            dat=adata.obs[['latent_time']+obs].sort_values('latent_time')
            dat.to_excel(writer, sheet_name='signatures_vs_latenttime')

            # umap
            scv.pl.scatter(adata, color=obs+['latent_time'], ncols=5, show=False)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # dynamical velocity plot; cells of the other perturbations are drawn in grey as background
            ax=scv.pl.scatter(data[data.obs.perturbation!=pert], color='lightgrey', show=False, alpha=0.2, size=4 * default_size(adata))
            scv.pl.velocity_embedding_grid(adata, color='latent_time', arrow_length=5, arrow_size=5, density=0.4, arrow_color='black', show=False, title=donor+' '+pert+' dynamical velocity', ax=ax)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # steady state SLAM velocity plot
            # ('SLAM_velocity' is not computed in this cell; presumably stored by Prepare_data.ipynb - confirm)
            ax=scv.pl.scatter(data[data.obs.perturbation!=pert], color='lightgrey', show=False, alpha=0.2, size=4 * default_size(adata))
            scv.pl.velocity_embedding_grid(adata, vkey='SLAM_velocity', color='latent_time', arrow_length=5, arrow_size=5, density=0.4,
                                           arrow_color='black', show=False, title=donor+' '+pert+' steady state velocity', ax=ax)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # steady state classical/real RNA velocity plot
            # ('real_velocity' is not computed in this cell; presumably stored by Prepare_data.ipynb - confirm)
            ax=scv.pl.scatter(data[data.obs.perturbation!=pert], color='lightgrey', show=False, alpha=0.2, size=4 * default_size(adata))
            scv.pl.velocity_embedding_grid(adata, vkey='real_velocity', color='latent_time', arrow_length=5, arrow_size=5, density=0.4,
                                           arrow_color='black', show=False, title=donor+' '+pert+' classical steady state velocity', ax=ax)
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # overall top likelihood genes (ascending sort, so the last entry is rank 1)
            topgenes = list(adata.var.fit_likelihood[non_ribo_genes].sort_values()[-20:].index)
            # rank from the actual list length so labels stay correct when fewer than 20 genes were fitted
            scv.pl.scatter(adata, basis=topgenes, color='latent_time', ncols=4, show=False, title=[x+' top'+str(len(topgenes)-i) for i,x in enumerate(topgenes)])
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

            # export gene fit likelihoods of highest quantile (above the 75th percentile of fitted genes)
            genes_likelihoods=adata.var[~pd.isna(adata.var['fit_likelihood'])]['fit_likelihood'].sort_values()
            top_genes_likelihoods=genes_likelihoods[genes_likelihoods>np.percentile(genes_likelihoods,75)]
            top_genes_likelihoods.to_excel(writer, sheet_name='top_quantile_genes_likelihoods')

            # calculate gene overlaps with sigs
            genes = top_genes_likelihoods #genes_likelihoods
            #print('Selected fitted genes', '\t', len(genes))
            dat = []
            index = []
            intersets = []
            signatures = {**sigs, **flo_sigs, **{'yap_targets':yap_targets}, **{'single_genes': single_genes}}
            for k in signatures.keys():
                K = np.sum(np.isin(genes.index, signatures[k]))
                dat.append([K, len(signatures[k])])
                index.append(k)
                intersets.append(genes.index.values[np.isin(genes.index, signatures[k])])
            tab=pd.DataFrame(data=dat, index=index, columns=['overlap', 'total_in_sig'])
            tab['percent_coverage']=np.round(tab.overlap/tab.total_in_sig*100,2)
            tab['intersecting_genes']=intersets
            tab.to_excel(writer, sheet_name='sigantures_overlap_statistics')

            # phase plots of the overlapping genes of selected signatures, 20 genes per pdf page
            subtab=tab.loc[['Stem_Lgr5_ISC-Merlos', 'Stem_Lgr5_ISC-Munoz', 'Stem', 'Stem/TA 1', 'Enterocytes 1', 'Goblet', 'TC1', 'TC4', 'yap_targets', 'single_genes']]
            for k in tqdm_notebook(subtab.index):
                if tab.loc[k]['overlap']>0:
                    genes_plot = subtab.loc[k]['intersecting_genes']
                    for i in range(int(np.ceil(len(genes_plot) / 20))):
                        mi=i*20
                        ma=np.min([i*20+20, len(genes_plot)])
                        genes_page = genes_plot[mi:ma]
                        try:
                            # titles are built from the same slice as basis so every page is labelled with its own genes
                            scv.pl.scatter(adata, basis=genes_page, color='latent_time', ncols=4, show=False, title=[x+' '+k for x in genes_page])
                            pdf.savefig()  # saves the current figure into a pdf page
                            pl.close()
                        except Exception as err:
                            # best effort: skip this page, but report it and drop any half-drawn figure
                            print(donor, pert, k, 'genes', mi, 'to', ma, 'could not be plotted:', repr(err))
                            pl.close('all')

## jointly, cc regressed

In [4]:
# jointly, cc regressed
#
# For every donor, with all perturbations together:
#   1. alias the 'new' / 'old' layers as 'unspliced' / 'spliced' (mode == 'SLAM'),
#   2. select up to 2000 high-dispersion genes with enough shared counts and subset adata to them,
#   3. fit the dynamical scvelo model on these genes and recover latent time,
#   4. export plots to  <export_path><donor>_ccreg_dynamics.pdf
#      and tables to    <export_path><donor>_ccreg_tables.xlsx
# Requires `donors`, `data_path`, `sigs`, `flo_sigs`, `yap_targets` and `single_genes` from earlier cells.

from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import zscore
from seaborn import clustermap
from scvelo.plotting.utils import default_size  # unused in this cell; kept so the kernel namespace is unchanged

export_path='G:/data/scSLAMseq/revision/exports/03_01_complete/'

mode='SLAM'
do_ccreg=True

for donor in tqdm_notebook(donors):
    print(donor)
    # read
    ccreg='_ccreg' if do_ccreg else ''
    adata=sc.read(data_path+'by_donors/processed/SLAMv2_'+donor+'_processed'+ccreg+'.h5')
    if 'X_diffmap' in adata.obsm.keys(): del adata.obsm['X_diffmap']
    if mode == 'SLAM':
        # alias the 'new' / 'old' layers under the layer names the velocity functions are run on
        adata.layers['unspliced']=adata.layers['new']
        adata.layers['spliced']=adata.layers['old']
        scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    for x in ['new', 'old', 'real_ambiguous', 'real_spliced', 'real_unspliced']: del adata.layers[x]

    name = donor +ccreg

    # filtering and select genes to fit (done on a throw-away copy)
    cdata=sc.AnnData(adata.X, var=adata.var[[]], layers={'unspliced': adata.layers['unspliced'],'spliced': adata.layers['spliced']})
    scv.pp.filter_genes(cdata, min_shared_counts=20)
    try:
        scv.pp.filter_genes_dispersion(cdata, n_top_genes=2000)
    except Exception as err:  # not a bare except: kernel interrupts must still stop the loop
        print(donor, ' filtering key error. Prolly not enough cells(?)', repr(err))
        continue
    high_shared_vars=cdata.var_names.values
    del cdata
    print('High shared vars: ', len(high_shared_vars))
    # NOTE: matches every gene whose name starts with 'RP', not only RPL*/RPS*.
    # non_ribo_genes is only used for the "top likelihood genes" page; the model is fit on all selected genes.
    non_ribo_genes_ids=np.array([x[:2]!='RP' for x in high_shared_vars])
    non_ribo_genes=high_shared_vars[non_ribo_genes_ids]

    # dynamical model and latent time computation
    adata = adata[:, high_shared_vars].copy()  # subset to gene selection
    scv.tl.recover_dynamics(adata, var_names=high_shared_vars)
    scv.tl.velocity(adata, mode='dynamical')
    scv.tl.velocity_graph(adata)
    scv.tl.velocity_embedding(adata, basis='umap')
    scv.tl.recover_latent_time(adata)

    # Both outputs are context-managed so they are finalised even if a plot below raises.
    with pd.ExcelWriter(export_path+name+'_tables.xlsx', engine='xlsxwriter') as writer, \
            PdfPages(export_path+name+'_dynamics.pdf') as pdf:
        # latent time series of signatures (all signatures drawn onto the same axes)
        obs = ['Stem', 'Goblet', 'MAPK_progeny', 'YAP_targets', 'Stem_Lgr5_ISC-Merlos', 'TC1', 'TC4', 'percent_ribosomal']
        colors = ['red', 'blue', 'green', 'black', 'grey', 'orange', 'yellow', 'cyan']
        ax=None
        for o, c in zip(obs,colors):
            ax=scv.pl.scatter(adata, x=adata.obs.latent_time, y=zscore(adata.obs[o]), alpha=0.5, figsize=[10,5], size=50,
                              color=c, n_convolve=50, show=False, xlabel='latent time', ylabel='zscore', ax=ax)
        ax.legend(obs)
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # heatmap time series: cells ordered by latent time, z-scored, then smoothed
        time = adata.obs.latent_time
        Y=adata.obs[obs].iloc[np.argsort(time)]
        #Y/=np.max(np.abs(Y), axis=0)
        Y=zscore(Y)
        df = pd.DataFrame(Y, columns=obs)
        n_convolve=50
        weights = np.ones(n_convolve) / n_convolve  # moving-average kernel
        for o in obs:
            try:
                df[o] = np.convolve(df[o].values, weights, mode="same")
            except Exception:
                pass  # deliberate best effort: e.g. all-zero counts or nans cannot be convolved
        cm = clustermap(df.T, row_cluster=True, col_cluster=False, vmin=-2, vmax=2, cmap='bwr')#, **kwargs
        cm.ax_heatmap.set_xlabel('latent time')
        cm.ax_heatmap.set_title(donor+' heatmap')
        cm.ax_heatmap.set_xticks([])
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # export Sigs vs time
        dat=adata.obs[['latent_time']+obs].sort_values('latent_time')
        dat.to_excel(writer, sheet_name='signatures_vs_latenttime')

        # umap
        scv.pl.scatter(adata, color=obs+['latent_time'], ncols=5, show=False)
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # dynamical velocity plot
        scv.pl.velocity_embedding_grid(adata, color='latent_time', arrow_length=5, arrow_size=5, density=0.4, arrow_color='black', show=False, title=donor+' dynamical velocity')
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # steady state SLAM velocity plot
        # ('SLAM_velocity' is not computed in this cell; presumably stored by Prepare_data.ipynb - confirm)
        scv.pl.velocity_embedding_grid(adata, vkey='SLAM_velocity', color='latent_time', arrow_length=5, arrow_size=5, density=0.4,
                                       arrow_color='black', show=False, title=donor+' steady state velocity')
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # steady state classical/real RNA velocity plot
        # ('real_velocity' is not computed in this cell; presumably stored by Prepare_data.ipynb - confirm)
        scv.pl.velocity_embedding_grid(adata, vkey='real_velocity', color='latent_time', arrow_length=5, arrow_size=5, density=0.4,
                                       arrow_color='black', show=False, title=donor+' classical steady state velocity')
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # overall top likelihood genes (ascending sort, so the last entry is rank 1)
        topgenes = list(adata.var.fit_likelihood[non_ribo_genes].sort_values()[-20:].index)
        # rank from the actual list length so labels stay correct when fewer than 20 genes were fitted
        scv.pl.scatter(adata, basis=topgenes, color='latent_time', ncols=4, show=False, title=[x+' top'+str(len(topgenes)-i) for i,x in enumerate(topgenes)])
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # export gene fit likelihoods of highest quantile (above the 75th percentile of fitted genes)
        genes_likelihoods=adata.var[~pd.isna(adata.var['fit_likelihood'])]['fit_likelihood'].sort_values()
        top_genes_likelihoods=genes_likelihoods[genes_likelihoods>np.percentile(genes_likelihoods,75)]
        top_genes_likelihoods.to_excel(writer, sheet_name='top_quantile_genes_likelihoods')

        # calculate gene overlaps with sigs
        genes = top_genes_likelihoods #genes_likelihoods
        #print('Selected fitted genes', '\t', len(genes))
        dat = []
        index = []
        intersets = []
        signatures = {**sigs, **flo_sigs, **{'yap_targets':yap_targets}, **{'single_genes': single_genes}}
        for k in signatures.keys():
            K = np.sum(np.isin(genes.index, signatures[k]))
            dat.append([K, len(signatures[k])])
            index.append(k)
            intersets.append(genes.index.values[np.isin(genes.index, signatures[k])])
        tab=pd.DataFrame(data=dat, index=index, columns=['overlap', 'total_in_sig'])
        tab['percent_coverage']=np.round(tab.overlap/tab.total_in_sig*100,2)
        tab['intersecting_genes']=intersets
        tab.to_excel(writer, sheet_name='sigantures_overlap_statistics')

        # phase plots of the overlapping genes of selected signatures, 20 genes per pdf page
        subtab=tab.loc[['Stem_Lgr5_ISC-Merlos', 'Stem_Lgr5_ISC-Munoz', 'Stem', 'Stem/TA 1', 'Enterocytes 1', 'Goblet', 'TC1', 'TC4', 'yap_targets', 'single_genes']]
        for k in subtab.index:
            if tab.loc[k]['overlap']>0:
                genes_plot = subtab.loc[k]['intersecting_genes']
                for i in range(int(np.ceil(len(genes_plot) / 20))):
                    mi=i*20
                    ma=np.min([i*20+20, len(genes_plot)])
                    genes_page = genes_plot[mi:ma]
                    try:
                        # titles are built from the same slice as basis so every page is labelled with its own genes
                        scv.pl.scatter(adata, basis=genes_page, color='latent_time', ncols=4, show=False, title=[x+' '+k for x in genes_page])
                        pdf.savefig()  # saves the current figure into a pdf page
                        pl.close()
                    except Exception as err:
                        # best effort: skip this page, but report it and drop any half-drawn figure
                        print(donor, k, 'genes', mi, 'to', ma, 'could not be plotted:', repr(err))
                        pl.close('all')

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

High shared vars:  2000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  1538


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  2000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

High shared vars:  2000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

P009T  filtering key error. Prolly not enough cells(?)
High shared vars:  2000


HBox(children=(IntProgress(value=0, max=10), HTML(value='')))




## jointly, cc not regressed

In [5]:
# jointly, cc NOT regressed
#
# For every donor, with all perturbations together:
#   1. alias the 'new' / 'old' layers as 'unspliced' / 'spliced' (mode == 'SLAM'),
#   2. select up to 2000 high-dispersion genes with enough shared counts and subset adata to them,
#   3. fit the dynamical scvelo model on these genes and recover latent time,
#   4. export plots to  <export_path><donor>_dynamics.pdf
#      and tables to    <export_path><donor>_tables.xlsx
# Requires `donors`, `data_path`, `sigs`, `flo_sigs`, `yap_targets` and `single_genes` from earlier cells.

from matplotlib.backends.backend_pdf import PdfPages
from scipy.stats import zscore
from seaborn import clustermap
from scvelo.plotting.utils import default_size  # unused in this cell; kept so the kernel namespace is unchanged

export_path='G:/data/scSLAMseq/revision/exports/03_01_complete/'

mode='SLAM'
do_ccreg=False

for donor in tqdm_notebook(donors):
    print(donor)
    # read
    ccreg='_ccreg' if do_ccreg else ''
    adata=sc.read(data_path+'by_donors/processed/SLAMv2_'+donor+'_processed'+ccreg+'.h5')
    if 'X_diffmap' in adata.obsm.keys(): del adata.obsm['X_diffmap']
    if mode == 'SLAM':
        # alias the 'new' / 'old' layers under the layer names the velocity functions are run on
        adata.layers['unspliced']=adata.layers['new']
        adata.layers['spliced']=adata.layers['old']
        scv.pp.moments(adata, n_pcs=30, n_neighbors=30)
    for x in ['new', 'old', 'real_ambiguous', 'real_spliced', 'real_unspliced']: del adata.layers[x]

    name = donor +ccreg

    # filtering and select genes to fit (done on a throw-away copy)
    cdata=sc.AnnData(adata.X, var=adata.var[[]], layers={'unspliced': adata.layers['unspliced'],'spliced': adata.layers['spliced']})
    scv.pp.filter_genes(cdata, min_shared_counts=20)
    try:
        scv.pp.filter_genes_dispersion(cdata, n_top_genes=2000)
    except Exception as err:  # not a bare except: kernel interrupts must still stop the loop
        print(donor, ' filtering key error. Prolly not enough cells(?)', repr(err))
        continue
    high_shared_vars=cdata.var_names.values
    del cdata
    print('High shared vars: ', len(high_shared_vars))
    # NOTE: matches every gene whose name starts with 'RP', not only RPL*/RPS*.
    # non_ribo_genes is only used for the "top likelihood genes" page; the model is fit on all selected genes.
    non_ribo_genes_ids=np.array([x[:2]!='RP' for x in high_shared_vars])
    non_ribo_genes=high_shared_vars[non_ribo_genes_ids]

    # dynamical model and latent time computation
    adata = adata[:, high_shared_vars].copy()  # subset to gene selection
    scv.tl.recover_dynamics(adata, var_names=high_shared_vars)
    scv.tl.velocity(adata, mode='dynamical')
    scv.tl.velocity_graph(adata)
    scv.tl.velocity_embedding(adata, basis='umap')
    scv.tl.recover_latent_time(adata)

    # Both outputs are context-managed so they are finalised even if a plot below raises.
    with pd.ExcelWriter(export_path+name+'_tables.xlsx', engine='xlsxwriter') as writer, \
            PdfPages(export_path+name+'_dynamics.pdf') as pdf:
        # latent time series of signatures (all signatures drawn onto the same axes)
        obs = ['Stem', 'Goblet', 'MAPK_progeny', 'YAP_targets', 'Stem_Lgr5_ISC-Merlos', 'TC1', 'TC4', 'percent_ribosomal']
        colors = ['red', 'blue', 'green', 'black', 'grey', 'orange', 'yellow', 'cyan']
        ax=None
        for o, c in zip(obs,colors):
            ax=scv.pl.scatter(adata, x=adata.obs.latent_time, y=zscore(adata.obs[o]), alpha=0.5, figsize=[10,5], size=50,
                              color=c, n_convolve=50, show=False, xlabel='latent time', ylabel='zscore', ax=ax)
        ax.legend(obs)
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # heatmap time series: cells ordered by latent time, z-scored, then smoothed
        time = adata.obs.latent_time
        Y=adata.obs[obs].iloc[np.argsort(time)]
        #Y/=np.max(np.abs(Y), axis=0)
        Y=zscore(Y)
        df = pd.DataFrame(Y, columns=obs)
        n_convolve=50
        weights = np.ones(n_convolve) / n_convolve  # moving-average kernel
        for o in obs:
            try:
                df[o] = np.convolve(df[o].values, weights, mode="same")
            except Exception:
                pass  # deliberate best effort: e.g. all-zero counts or nans cannot be convolved
        cm = clustermap(df.T, row_cluster=True, col_cluster=False, vmin=-2, vmax=2, cmap='bwr')#, **kwargs
        cm.ax_heatmap.set_xlabel('latent time')
        cm.ax_heatmap.set_title(donor+' heatmap')
        cm.ax_heatmap.set_xticks([])
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # export Sigs vs time
        dat=adata.obs[['latent_time']+obs].sort_values('latent_time')
        dat.to_excel(writer, sheet_name='signatures_vs_latenttime')

        # umaps: signatures + latent time on one page, perturbation labels on the next
        scv.pl.scatter(adata, color=obs+['latent_time'], ncols=5, show=False)
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()
        scv.pl.scatter(adata, color='perturbation', ncols=5, show=False)
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # dynamical velocity plot
        scv.pl.velocity_embedding_grid(adata, color='latent_time', arrow_length=5, arrow_size=5, density=0.4, arrow_color='black', show=False, title=donor+' dynamical velocity')
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # steady state SLAM velocity plot
        # ('SLAM_velocity' is not computed in this cell; presumably stored by Prepare_data.ipynb - confirm)
        scv.pl.velocity_embedding_grid(adata, vkey='SLAM_velocity', color='latent_time', arrow_length=5, arrow_size=5, density=0.4,
                                       arrow_color='black', show=False, title=donor+' steady state velocity')
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # steady state classical/real RNA velocity plot
        # ('real_velocity' is not computed in this cell; presumably stored by Prepare_data.ipynb - confirm)
        scv.pl.velocity_embedding_grid(adata, vkey='real_velocity', color='latent_time', arrow_length=5, arrow_size=5, density=0.4,
                                       arrow_color='black', show=False, title=donor+' classical steady state velocity')
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # overall top likelihood genes (ascending sort, so the last entry is rank 1)
        topgenes = list(adata.var.fit_likelihood[non_ribo_genes].sort_values()[-20:].index)
        # rank from the actual list length so labels stay correct when fewer than 20 genes were fitted
        scv.pl.scatter(adata, basis=topgenes, color='latent_time', ncols=4, show=False, title=[x+' top'+str(len(topgenes)-i) for i,x in enumerate(topgenes)])
        pdf.savefig()  # saves the current figure into a pdf page
        pl.close()

        # export gene fit likelihoods of highest quantile (above the 75th percentile of fitted genes)
        genes_likelihoods=adata.var[~pd.isna(adata.var['fit_likelihood'])]['fit_likelihood'].sort_values()
        top_genes_likelihoods=genes_likelihoods[genes_likelihoods>np.percentile(genes_likelihoods,75)]
        top_genes_likelihoods.to_excel(writer, sheet_name='top_quantile_genes_likelihoods')

        # calculate gene overlaps with sigs
        genes = top_genes_likelihoods #genes_likelihoods
        #print('Selected fitted genes', '\t', len(genes))
        dat = []
        index = []
        intersets = []
        signatures = {**sigs, **flo_sigs, **{'yap_targets':yap_targets}, **{'single_genes': single_genes}}
        for k in signatures.keys():
            K = np.sum(np.isin(genes.index, signatures[k]))
            dat.append([K, len(signatures[k])])
            index.append(k)
            intersets.append(genes.index.values[np.isin(genes.index, signatures[k])])
        tab=pd.DataFrame(data=dat, index=index, columns=['overlap', 'total_in_sig'])
        tab['percent_coverage']=np.round(tab.overlap/tab.total_in_sig*100,2)
        tab['intersecting_genes']=intersets
        tab.to_excel(writer, sheet_name='sigantures_overlap_statistics')

        # phase plots of the overlapping genes of selected signatures, 20 genes per pdf page
        subtab=tab.loc[['Stem_Lgr5_ISC-Merlos', 'Stem_Lgr5_ISC-Munoz', 'Stem', 'Stem/TA 1', 'Enterocytes 1', 'Goblet', 'TC1', 'TC4', 'yap_targets', 'single_genes']]
        for k in subtab.index:
            if tab.loc[k]['overlap']>0:
                genes_plot = subtab.loc[k]['intersecting_genes']
                for i in range(int(np.ceil(len(genes_plot) / 20))):
                    mi=i*20
                    ma=np.min([i*20+20, len(genes_plot)])
                    genes_page = genes_plot[mi:ma]
                    try:
                        # titles are built from the same slice as basis so every page is labelled with its own genes
                        scv.pl.scatter(adata, basis=genes_page, color='latent_time', ncols=4, show=False, title=[x+' '+k for x in genes_page])
                        pdf.savefig()  # saves the current figure into a pdf page
                        pl.close()
                    except Exception as err:
                        # best effort: skip this page, but report it and drop any half-drawn figure
                        print(donor, k, 'genes', mi, 'to', ma, 'could not be plotted:', repr(err))
                        pl.close('all')

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

B2-040
High shared vars:  2000
C2-019
High shared vars:  1538
OT227
High shared vars:  2000
OT302
High shared vars:  2000
P009T
High shared vars:  2000
P013T
High shared vars:  2000



In [3]:
# add umaps
# Writes one single-page pdf per donor showing the UMAP colored by perturbation and by
# cell cycle phase; first for the cc-regressed objects, then for the non-regressed ones.
from matplotlib.backends.backend_pdf import PdfPages
export_path = 'G:/data/scSLAMseq/revision/exports/03_01_complete/'

for do_ccreg in (True, False):
    ccreg = {True: '_ccreg', False: ''}[do_ccreg]  # file-name suffix of the processed object
    for donor in tqdm_notebook(donors):
        umap_pdf_file = export_path + donor + '_additional_umaps_' + ccreg + '.pdf'
        with PdfPages(umap_pdf_file) as pdf:
            print(donor)
            processed_h5_file = data_path + 'by_donors/processed/SLAMv2_' + donor + '_processed' + ccreg + '.h5'
            adata = sc.read(processed_h5_file)
            umap_titles = [donor + ccreg + ' pertubations umap',
                           donor + ccreg + ' cellcycle umap']
            scv.pl.scatter(
                adata,
                color=['perturbation', 'phase'],
                ncols=5,
                show=False,
                title=umap_titles,
            )
            pdf.savefig()  # saves the current figure into a pdf page
            pl.close()

HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

B2-040
C2-019
OT227
OT302
P009T
P013T



HBox(children=(IntProgress(value=0, max=6), HTML(value='')))

B2-040
C2-019
OT227
OT302
P009T
P013T

