universal functions that apply to all datasets

In [None]:
import os
import csv
import numpy as np
import pandas as pd
import scipy as sp
import seaborn as sb
from shutil import rmtree

import matplotlib.pyplot as plt
from matplotlib import rcParams
from matplotlib import colors
import scanpy as sc

# Global scanpy / matplotlib defaults shared by every part of the workflow
sc.settings.verbosity = 0  # scanpy's lowest verbosity setting
sc.settings.set_figure_params(dpi=150, figsize=[5, 5])
rcParams['figure.figsize'] = (5, 5)  # `rcParams` is the same mapping that plt.rcParams exposes
plt.ioff()  # switch matplotlib interactive mode off

directory paths & file names

In [None]:
### DIRECTORY PATHS ###
# The working directory is expected to be "EMT-in-cancer/0-MultipleConditions".
# NOTE(review): runSettingsDir below points at "0-MultipleRuns" instead -- confirm
# which folder name is current.

# Main dataset directory (parent of the working directory)
mainDir = os.path.dirname(os.getcwd())

# Data directory -- should be reset (Part 1: Filtering & Normalization)
dataDir = f"{os.path.dirname(mainDir)}/EMT-in-cancer-datafiles/"  # note: don't change this

# Sample settings directory
runSettingsDir = f"{mainDir}/0-MultipleRuns/"

# Figure directory -- should be reset (each Part); scanpy saves figures here
figDir = f"{mainDir}/Figures - fix_directory/"
sc.settings.figdir = figDir

# Data summaries directory -- should be reset (Part 2: Clustering)
dataSumDir = f"{mainDir}/DataSummaries - fix_directory/"

# Everything below lives under the shared cross-dataset comparison folder
_crossDatasetDir = f"{mainDir}/Cross-Dataset Comparisons"
_log2fcSubDir = "Data - Log2FC from Conditions & Genes"

# EMT Marker directory
emtMarkerDir = f"{_crossDatasetDir}/Markers/"

# Cluster DE gene directories (one per EMT state)
epiStateDir = f"{_crossDatasetDir}/Epithelial State Genes/{_log2fcSubDir}/"
intStateDir = f"{_crossDatasetDir}/Intermediate State Genes/{_log2fcSubDir}/"
mesStateDir = f"{_crossDatasetDir}/Mesenchymal State Genes/{_log2fcSubDir}/"
nCellsDir = f"{_crossDatasetDir}/nCells/"

# ODE data directory -- should be reset (Part 3: Trajectory Inference)
odeDir = f"{mainDir}/ODE Model/Data/"

In [None]:
### FILE PATHS ###
# File names only; each is combined with one of the directories defined above.

# Run settings workbook (an .xlsx file despite the "Csv" in the variable name)
runSettingsCsv = "all_run_settings.xlsx"

# Cell cycle marker lists
cellCycle_g1S_csv = "Cell Cycle Markers - G1,S Genes.csv"
cellCycle_g2M_csv = "Cell Cycle Markers - G2,M Genes.csv"


# Cross-condition summaries of cluster state marker genes (appended to by each run)
epiStateCsv = "_Epithelial State Marker Genes, All Conditions.csv"
intStateCsv = "_Intermediate State Marker Genes, All Conditions.csv"
mesStateCsv = "_Mesenchymal State Marker Genes, All Conditions.csv"

# Per-run export; `dataset_name` and `run_name` must already be defined when this cell runs
nCellsCsv = f"_{dataset_name}_{run_name} - percent of cells with gene expression.csv"

# Specific marker lists
msigdbMarkersCsv = "Markers - MSigDB.csv"
panglaoMarkersCsv = "Markers - PanglaoDB, Epithelial.csv"
empCookMarkersCsv = "Markers - EMP Cook 2021.csv"

### Part 1: IMPORT DATA, FILTERING, AND NORMALIZATION

setDirectories

In [None]:
def setDirectories(dataset_name, run_name):
    """Point the data and figure directories at one dataset/run for Part 1.

    Rebinds the module-level ``dataDir`` and ``figDir`` and updates scanpy's
    figure directory and figure parameters to match.

    Parameters
    ----------
    dataset_name : str
        Name of the dataset folder (under both the data-files folder and mainDir).
    run_name : str
        Name of the run; used as the last component of the figure directory.
    """
    global dataDir
    global figDir

    # Rebuild dataDir from its fixed base instead of appending to the current
    # value: appending made a second call produce ".../<dataset>/<dataset>/".
    dataDir = os.path.dirname(mainDir)+"/EMT-in-cancer-datafiles/"+dataset_name+"/"
    figDir = mainDir+"/"+dataset_name+"/Figures/1-Filtering and Normalization/"+run_name+"/"
    sc.settings.figdir = figDir
    sc.settings.set_figure_params(dpi=150, figsize=[5,5])

importSettings

In [None]:
def importSettings(run_name):
    """Look up the settings for ``run_name`` in the run-settings workbook.

    Reads ``runSettingsDir + runSettingsCsv`` with the openpyxl engine, keeps
    the rows whose 'run_name' column equals ``run_name`` and squeezes the
    selection (a single matching row comes back as a Series).
    """
    settings_table = pd.read_excel(runSettingsDir+runSettingsCsv, engine='openpyxl')
    is_requested_run = settings_table['run_name'] == run_name
    return settings_table.loc[is_requested_run].squeeze()

importData

In [None]:
def importData(dataset_name, run_name):
    """Read the raw counts CSV for ``run_name`` and normalise its names.

    The file is ``<dataDir>/_rawData/<run_name>.csv``. Observation names are
    made unique; variable (gene) names are upper-cased and then made unique.
    ``dataset_name`` is accepted but not used here.
    """
    raw_counts_csv = f"{dataDir}/_rawData/{run_name}.csv"
    adata = sc.read_csv(raw_counts_csv)

    adata.obs_names_make_unique()

    # Upper-case first, then de-duplicate, so names differing only by case collide
    adata.var_names = adata.var_names.str.upper()
    adata.var_names_make_unique()

    return adata

filterData

In [None]:
def filterData(adata):
    """Drop ERCC genes, then apply minimum gene/cell filters.

    Returns a new object: genes whose name starts with 'ERCC' are removed,
    cells with fewer than 200 genes are removed, and genes seen in fewer than
    3 cells are removed.
    """
    # Keep every gene that is NOT an ERCC gene
    keep_gene = ~adata.var_names.str.startswith('ERCC')
    adata = adata[:, keep_gene].copy()

    # Minimum genes per cell, then minimum cells per gene
    sc.pp.filter_cells(adata, min_genes=200)
    sc.pp.filter_genes(adata, min_cells=3)

    return adata

find_Mito

In [None]:
def find_Mito(adata, mt_string=('MT-','MTRNR')):
    """Flag mitochondrial genes in ``adata.var["mito_gene"]``.

    A gene is flagged when its name starts with ``mt_string`` (a prefix or a
    tuple of prefixes). The same ``adata`` object is returned.
    """
    is_mito_gene = adata.var_names.str.startswith(mt_string)
    adata.var["mito_gene"] = is_mito_gene
    return adata

filter_MitoRibo

In [None]:
def filter_MitoRibo(adata, total_counts_cutoff, mito_cutoff, ribo_cutoff, run_name, dataset_name=""):
    """Compute QC metrics, plot them, and filter on counts, mito and ribo content.

    Parameters
    ----------
    adata
        Annotated data with ``var["mito_gene"]`` already set (see find_Mito)
        and ``var["n_cells"]`` present (presumably from sc.pp.filter_genes in
        filterData -- confirm against the calling notebook).
    total_counts_cutoff
        Cells with ``total_counts`` at or above this value are removed.
    mito_cutoff
        Maximum mitochondrial share per cell as a 0-1 fraction; it is scaled
        by 100 here to match ``pct_counts_mito_gene``.
    ribo_cutoff
        Maximum ribosomal share per cell as a 0-1 fraction
        (``pct_counts_ribo_gene`` is computed below as a 0-1 ratio).
    run_name, dataset_name
        Not used in the body; the export file name comes from the
        module-level ``nCellsCsv``.

    Returns
    -------
    The filtered copy of ``adata`` with mito and ribo genes removed.
    """

    ### GENE METRICS ###

    # sc.pp.calculate_qc_metrics generates pct_counts_mito_gene from 0-100 not 0-1
    mito_cutoff = mito_cutoff*100

    # Gene metrics. The keyword that disables the log-transformed columns is
    # `log1p`; the previous `t=False` is not an accepted keyword.
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mito_gene"], percent_top=None, log1p=False, inplace=True)
    adata.obs["n_genes"] = adata.obs["n_genes_by_counts"].copy() # Rename "n_genes_by_counts" to "n_genes"
    del adata.obs["n_genes_by_counts"]
    sc.pl.violin(adata, ['n_genes', 'total_counts', 'pct_counts_mito_gene'], jitter=0.4, multi_panel=True, save=" - n_genes, n_counts, perc_mito.png")

    # Ribosomal genes: per-cell share of counts in RPL*/RPS* genes (0-1).
    # Flattened to 1-D so a sparse X (whose row sums are 2-D) also works.
    ribo_genes = adata.var_names.str.startswith(('RPL','RPS'))
    ribo_counts = np.asarray(np.sum(adata[:, ribo_genes].X, axis=1)).ravel()
    all_counts = np.asarray(np.sum(adata.X, axis=1)).ravel()
    adata.obs["pct_counts_ribo_gene"] = ribo_counts / all_counts
    sc.pl.scatter(adata, x='total_counts', y='pct_counts_ribo_gene', save=" - pct_counts_ribo_gene vs n_counts.png")

    # Gene metric scatterplots
    sc.pl.scatter(adata, x='total_counts', y='n_genes', save=" - n_genes vs total_counts.png")
    sc.pl.scatter(adata, x='total_counts', y='pct_counts_mito_gene', save=" - pct_counts_mito_gene vs total_counts.png")
    sc.pl.scatter(adata, x='pct_counts_ribo_gene', y='pct_counts_mito_gene', save=" - pct_counts_mito_gene vs pct_counts_ribo_gene.png")
    sc.pl.scatter(adata, x='total_counts', y='n_genes', color='pct_counts_ribo_gene', save=" - n_genes vs total_counts, colored by ribo.png")


    ### FILTERING ###

    # Remove total_count outliers
    adata = adata[adata.obs["total_counts"] < total_counts_cutoff,:].copy()

    # Remove mito genes, remove high mito cells
    adata = adata[:, ~adata.var["mito_gene"]].copy() # Mt genes
    del adata.var["mito_gene"]
    adata = adata[adata.obs.pct_counts_mito_gene < mito_cutoff, :].copy() # MT cells

    # Remove ribo genes, remove high ribo cells (if desired).
    # The mask is recomputed because the gene axis changed above.
    ribo_genes = adata.var_names.str.startswith(('RPL','RPS'))
    adata = adata[:, ~ribo_genes].copy()
    adata = adata[adata.obs.pct_counts_ribo_gene < ribo_cutoff, :].copy()

    ### Figures after filtering

    # Gene metric scatterplot after filtering
    sc.pl.scatter(adata, x='total_counts', y='n_genes', color='pct_counts_mito_gene', save=" - n_genes vs total_counts filtered.png")
    if mito_cutoff < 100:
        sc.pl.scatter(adata, x='total_counts', y='pct_counts_mito_gene', save=" - pct_counts_mito_gene vs total_counts filtered.png")
    if ribo_cutoff < 1:
        sc.pl.scatter(adata, x='total_counts', y='pct_counts_ribo_gene', save=" - pct_counts_ribo_gene vs n_counts filtered.png")
        sc.pl.scatter(adata, x='pct_counts_mito_gene', y='pct_counts_ribo_gene', save=" - pct_counts_mito_gene vs pct_counts_ribo_gene filtered.png")
    # Genes with largest proportions, after filtering
    sc.pl.highest_expr_genes(adata, n_top=20, save=" - genes w highest expression.png")


    # Export percent cells expressed per gene
    adata.var["perc_cells_expressed"] = adata.var["n_cells"] / adata.n_obs
    adata.var["perc_cells_expressed"].to_csv(nCellsDir + nCellsCsv)

    return adata

saveFilteredData

In [None]:
def saveFilteredData(adata, run_name):
    """Write the filtered data to ``<dataDir>_h5ad/<run_name>/1-Filtered_allGenes.h5ad``."""
    filtered_h5ad = f"{dataDir}_h5ad/{run_name}/1-Filtered_allGenes.h5ad"
    adata.write(filtered_h5ad)

normalize_and_hiVar_Data

In [None]:
def normalize_and_hiVar_Data(adata, run_name):
    """Normalise, log-transform, score cell cycle, and build the hi-var dataset.

    Three .h5ad files are written under ``<dataDir>_h5ad/<run_name>/``: the
    normalised data, the log data with cell-cycle scores, and the regressed
    and scaled highly-variable-gene subset. The last one is also returned.
    """
    run_h5ad_dir = f"{dataDir}_h5ad/{run_name}/"

    # Total-count normalize to 10,000 counts per cell; snapshot as raw and save
    sc.pp.normalize_total(adata, target_sum=1e4)
    adata.raw = adata
    adata.write(run_h5ad_dir + "1-Filtered_and_Normalized_allGenes.h5ad")

    # Log-transform, then refresh the raw snapshot with the log data
    sc.pp.log1p(adata)
    adata.raw = adata

    # Cell cycle scores; X is held as float64 only for the scoring call
    s_phase_genes, g2m_phase_genes = cell_cycle_genes(adata)
    adata.X = adata.X.astype('<f8')
    sc.tl.score_genes_cell_cycle(adata, s_genes=s_phase_genes, g2m_genes=g2m_phase_genes, use_raw=False)
    adata.X = adata.X.astype('<f4')
    adata.write(run_h5ad_dir + "1-Filtered_and_Normalized_allGenes_cellCycleScore.h5ad")

    # Highly variable genes (note: expects log data)
    sc.pp.highly_variable_genes(adata)
    hi_var_subset = adata[:, adata.var.highly_variable].copy()

    # Regress out total counts, percent mito AND both cell cycle scores, then scale
    nuisance_covariates = ['total_counts','pct_counts_mito_gene', 'S_score', 'G2M_score']
    sc.pp.regress_out(hi_var_subset, nuisance_covariates)
    sc.pp.scale(hi_var_subset, max_value=10)
    hi_var_subset.write(run_h5ad_dir + "1-Filtered_and_Normalized_hiVarGenes.h5ad")

    return hi_var_subset

cell_cycle_genes \
The cell-cycle scoring used above (in normalize_and_hiVar_Data) follows: https://github.com/theislab/scanpy_usage/blob/master/180209_cell_cycle/cell_cycle.ipynb

In [None]:
def _cell_cycle_genes_in_data(csv_name, known_genes):
    """Return first-column entries of ``csv_name`` that occur in ``known_genes``."""
    with open(csv_name, newline="") as inputFile:
        # No header row. Blank lines are skipped rather than raising IndexError.
        return [row[0] for row in csv.reader(inputFile)
                if row and row[0] in known_genes]


def cell_cycle_genes(adata):
    """Load the G1/S and G2/M marker lists, restricted to genes in ``adata``.

    Side effect: the working directory is changed to ``emtMarkerDir`` (the
    marker files are opened by bare file name) and is left there.

    Returns
    -------
    (list, list)
        G1/S genes and G2/M genes, in file order, keeping only names found in
        ``adata.var_names``.
    """
    os.chdir(emtMarkerDir)

    cellCycle_g1S_genes = _cell_cycle_genes_in_data(cellCycle_g1S_csv, adata.var_names)
    cellCycle_g2M_genes = _cell_cycle_genes_in_data(cellCycle_g2M_csv, adata.var_names)

    return cellCycle_g1S_genes, cellCycle_g2M_genes

check_cell_cycle

In [None]:
def check_cell_cycle(run_name):
    """Cluster and plot the data WITHOUT regressing out the cell cycle scores.

    Repeats the hi-var / regress / scale / neighbours / UMAP / leiden steps on
    the saved cell-cycle-scored data, regressing out only total counts and
    percent mito, and saves UMAPs coloured by phase. Rebinds the module-level
    ``figDir``; reads the module-level ``dataset_name``.
    """
    global figDir

    # Figures go to the Part 2 folder of this run
    figDir = f"{mainDir}/{dataset_name}/Figures/2-EMTscore and Clustering/{run_name}/"
    sc.settings.figdir = figDir
    sc.settings.set_figure_params(dpi=80, figsize=[5,5])

    # Load the scored data and rebuild the hi-var subset
    scored = sc.read_h5ad(f"{dataDir}_h5ad/{run_name}/1-Filtered_and_Normalized_allGenes_cellCycleScore.h5ad")
    scored.uns['log1p']['base'] = None # 22.07 This is included because of a bug; it should already be None
    sc.pp.highly_variable_genes(scored)
    unregressed = scored[:, scored.var.highly_variable].copy()
    sc.pp.regress_out(unregressed, ['total_counts','pct_counts_mito_gene'])
    sc.pp.scale(unregressed, max_value=10)

    # Clustering
    sc.pp.neighbors(unregressed)
    sc.tl.umap(unregressed)
    sc.tl.leiden(unregressed, resolution=0.35, key_added='leiden')

    # Two figures: phase alongside clusters, then phase alone
    umap_figures = (
        (["phase", "leiden"], " - cell cycle, unregressed, clusters.png"),
        (["phase"], " - cell cycle, unregressed.png"),
    )
    for color_keys, file_suffix in umap_figures:
        sc.pl.umap(unregressed, color=color_keys, save=file_suffix)

### Part 2: EMTSCORE AND CLUSTERING

importProcessedData

In [None]:
def importProcessedData(dataset_name, run_name):
    """Set the Part 2 directories and load the Part 1 outputs for a run.

    Rebinds the module-level ``dataSumDir`` and ``figDir`` and updates
    scanpy's figure settings.

    Returns
    -------
    (adataHiVar, df_normalized)
        The saved highly-variable-gene data, and the saved normalised
        all-genes data as a DataFrame whose column names are upper-cased.
    """
    global dataSumDir
    global figDir

    # Set directories
    dataset_root = f"{mainDir}/{dataset_name}"
    dataSumDir = f"{dataset_root}/DataSummaries/2-EMTscore and Clustering/"
    figDir = f"{dataset_root}/Figures/2-EMTscore and Clustering/{run_name}/"
    sc.settings.figdir = figDir
    sc.settings.set_figure_params(dpi=150, figsize=[5,5])

    # Import data
    run_h5ad_dir = f"{dataDir}_h5ad/{run_name}/"
    adata_normalized = sc.read_h5ad(run_h5ad_dir + "1-Filtered_and_Normalized_allGenes.h5ad")
    adataHiVar = sc.read_h5ad(run_h5ad_dir + "1-Filtered_and_Normalized_hiVarGenes.h5ad")
    adataHiVar.uns['log1p']['base'] = None # 22.07 This is included because of a bug; it should already be None

    # pandas df with uppercase genes
    df_normalized = adata_normalized.to_df()
    df_normalized.columns = [gene.upper() for gene in df_normalized.columns]

    return adataHiVar, df_normalized

PCA

In [None]:
def PCA(adataHiVar):
    """Run scanpy's PCA (arpack solver) on ``adataHiVar`` and return it."""
    sc.tl.pca(adataHiVar, svd_solver="arpack")
    # Optional diagnostic, disabled:
    #     sc.pl.pca_variance_ratio(adataHiVar, log=True, save=".png")
    return adataHiVar

UMAP

In [None]:
def UMAP(adataHiVar):
    """Compute neighbours and UMAP, then save the standard UMAP figures.

    Always plots QC metrics and cell cycle phase. When the module-level
    ``run_E_markers`` / ``run_M_markers`` are non-empty, also plots all of
    those markers and, separately, the first one.
    """
    sc.pp.neighbors(adataHiVar)
    sc.tl.umap(adataHiVar)

    qc_and_phase = ["pct_counts_mito_gene","pct_counts_ribo_gene","phase"]
    sc.pl.umap(adataHiVar, color=qc_and_phase, save=" - perc_mito, perc_ribo, cell cycle.png")
    sc.pl.umap(adataHiVar, color=["phase"], save=" - cell cycle.png")

    def plot_selected_markers(markers, state_letter):
        # All selected markers in one figure, then the first marker alone
        sc.pl.umap(adataHiVar, color=markers, save=f" - selected {state_letter} markers.png")
        sc.pl.umap(adataHiVar, color=markers[0], save=f" - selected {state_letter} marker.png")

    if run_E_markers:
        plot_selected_markers(run_E_markers, "E")
    if run_M_markers:
        plot_selected_markers(run_M_markers, "M")

    return adataHiVar

leiden

In [None]:
def leiden(adataHiVar, leiden_resolution, EMT_clusters_in_order):
    """Run leiden clustering and label the clusters with EMT state names.

    Parameters
    ----------
    adataHiVar
        Data with neighbours and a UMAP already computed.
    leiden_resolution
        Passed to ``sc.tl.leiden`` as ``resolution``.
    EMT_clusters_in_order
        Leiden cluster ids ordered from epithelial to mesenchymal. Three ids
        map to E/I/M, four ids map to E/I1/I2/M.

    Returns
    -------
    ``adataHiVar`` with ``obs['leiden']``, and, when the number of clusters
    found does not exceed ``len(EMT_clusters_in_order)`` and that length is 3
    or 4, ``obs['leiden_label']`` plus matching colour lists in ``uns``.

    Notes
    -----
    The final plot colours by 'leiden_label', so it requires that column to
    exist; this function only creates it in the labelled case above.
    """
    sc.tl.leiden(adataHiVar, resolution=leiden_resolution, key_added='leiden')
    sc.pl.umap(adataHiVar, color=["leiden"]) # initializes uns['leiden_colors']

    # State names and cluster colours, keyed by how many EMT states are expected
    emt_schemes = {
        4: (['E','I1','I2','M'], ['#d62728', '#ff7f0e', '#279e68', '#1f77b4']),
        3: (['E','I','M'], ['#d62728', '#ff7f0e', '#1f77b4']),
    }
    n_states = len(EMT_clusters_in_order)

    # Annotate the labels of the clusters
    if len(adataHiVar.obs['leiden'].unique()) <= n_states and n_states in emt_schemes:
        state_names, colors_EMT = emt_schemes[n_states]
        leiden_labels = {str(cluster_id): state
                         for cluster_id, state in zip(EMT_clusters_in_order, state_names)}

        # rename_categories is the supported way to relabel a categorical;
        # Series.replace on categoricals is deprecated for this purpose.
        adataHiVar.obs['leiden_label'] = adataHiVar.obs['leiden'].cat.rename_categories(leiden_labels)

        # category reordering from https://scanpy.discourse.group/t/how-to-order-legend-in-sc-pl-umap/411
        # Results are assigned back: the `inplace=` argument no longer exists in pandas >= 2.0.
        adataHiVar.obs['leiden'] = adataHiVar.obs['leiden'].cat.reorder_categories(
            [str(i) for i in range(n_states)])
        adataHiVar.obs['leiden_label'] = adataHiVar.obs['leiden_label'].cat.reorder_categories(state_names)

        # Colors of clusters
        adataHiVar.uns['leiden_colors'] = colors_EMT
        adataHiVar.uns['leiden_label_colors'] = colors_EMT

    sc.pl.umap(adataHiVar, color=["leiden_label"], save=" - clusters.png")
    return adataHiVar

PAGA

In [None]:
# def PAGA(adataHiVar, paga_cutoff=0.1):
#     sc.tl.paga(adataHiVar, groups='leiden_label')
#     sc.pl.paga(adataHiVar, color=['leiden_label'], threshold=paga_cutoff, save=" - clusters.png")
#     return adataHiVar

leiden_marker_genes

In [None]:
def leiden_marker_genes(adataHiVar, dataset_name, run_name):
    """Rank marker genes per 'leiden_label' cluster, plot them, and export the top 200.

    Side effect: the working directory is changed to ``dataSumDir`` (and left
    there); the CSV "Cluster Marker Genes - <run_name>.csv" is written into it.
    ``dataset_name`` is accepted but not used.

    Returns
    -------
    ``adataHiVar`` with ``uns['rank_genes_groups']`` (Wilcoxon) and a
    dendrogram for 'leiden_label' added.
    """
    sc.tl.rank_genes_groups(adataHiVar, 'leiden_label', method='wilcoxon')
    sc.pl.rank_genes_groups(adataHiVar, n_genes=25, sharey=False, save=" - cluster markers.png")

    # Heatmaps of top marker genes
    sc.tl.dendrogram(adataHiVar, groupby='leiden_label')
    for n_top in (5, 20, 100):
        sc.pl.rank_genes_groups_heatmap(adataHiVar, n_genes=n_top,
                                        save=" - leiden marker genes, top "+str(n_top)+".png")

    # Output to CSV (the table is built once; the old discarded .head(10) preview is gone)
    os.chdir(dataSumDir)
    marker_names = pd.DataFrame(adataHiVar.uns['rank_genes_groups']['names'])
    marker_names.head(200).to_csv("Cluster Marker Genes - "+run_name+".csv")

    return adataHiVar

leiden_marker_genes_highlight - highlighting just for visualization purposes

In [None]:
def _marker_genes_in_data(csv_name, known_genes):
    """Return the set of first-column entries of ``csv_name`` found in ``known_genes``."""
    with open(csv_name, newline="") as inputFile:
        # No header row. Blank lines are skipped rather than raising IndexError.
        return {row[0] for row in csv.reader(inputFile)
                if row and row[0] in known_genes}


def leiden_marker_genes_highlight(run_name, df_normalized):
    """Write a colour-coded .xlsx copy of the cluster marker gene CSV.

    Reads "Cluster Marker Genes - <run_name>.csv" from ``dataSumDir`` and saves
    "Cluster Marker Genes - <run_name>, colored.xlsx" next to it. Cells (from
    row 2 / column 2 onward) are filled by the first list the gene belongs to:
    PanglaoDB (light red), then MSigDB (light blue), then EMP Cook (light
    purple). Only genes present in ``df_normalized.columns`` are considered.

    Side effect: the working directory is changed several times and ends at
    ``mainDir``.
    """
    import openpyxl

    # Import MSigDB genes, EMP Cook genes, and PanglaoDB E genes
    os.chdir(emtMarkerDir)
    df_normalized_genes = df_normalized.columns
    msigdbGenes = _marker_genes_in_data(msigdbMarkersCsv, df_normalized_genes)
    empCookGenes = _marker_genes_in_data(empCookMarkersCsv, df_normalized_genes)
    panglaoGenes = _marker_genes_in_data(panglaoMarkersCsv, df_normalized_genes)

    # Checked in this order; the first matching list decides the colour
    highlight_rules = (
        (panglaoGenes, openpyxl.styles.PatternFill("solid", fgColor="D98686")),  # PanglaoDB = light red
        (msigdbGenes, openpyxl.styles.PatternFill("solid", fgColor="ABC9DE")),   # MSigDB = light blue
        (empCookGenes, openpyxl.styles.PatternFill("solid", fgColor="DABEED")),  # EMP Cook = light purple
    )

    # Copy the marker gene CSV (written by leiden_marker_genes) into a workbook
    os.chdir(dataSumDir)
    wb = openpyxl.Workbook()
    ws = wb.active
    with open("Cluster Marker Genes - "+run_name+".csv", newline="") as f:
        for row in csv.reader(f):
            ws.append(row)

    # Highlight E genes (red) and M genes (blue / purple); skip header row and index column
    for row in ws.iter_rows(min_col=2, min_row=2):
        for cell in row:
            for gene_set, fill in highlight_rules:
                if cell.value in gene_set:
                    cell.fill = fill
                    break

    wb.save("Cluster Marker Genes - "+run_name+", colored.xlsx")
    wb.close()

    os.chdir(mainDir)

cluster_DE_genes_export

In [None]:
def cluster_DE_genes_export(adataHiVar, dataset_name, run_name, EMT_clusters_in_order, currCluster="E"):
    """Export the DE genes of one cluster to the cross-dataset comparison files.

    For ``currCluster``:
    - count genes with adjusted p-value < 0.01 (capped at 500) and take that
      many names from the top of the ranked list;
    - add them as a column "<dataset_name>_<run_name>, <currCluster>" to the
      state's "All Conditions" summary CSV;
    - write every ranked gene with its log fold change to a separate CSV.

    Parameters
    ----------
    adataHiVar
        Data holding ``uns['rank_genes_groups']`` (see leiden_marker_genes).
    dataset_name, run_name : str
        Used to build the column and file names.
    EMT_clusters_in_order
        Accepted but not used here.
    currCluster : str
        One of "E", "I", "I1", "I2", "M".

    Raises
    ------
    ValueError
        If ``currCluster`` is not one of the labels above.

    Side effect: the working directory is changed to the state's directory.
    """
    # Output locations per state label
    state_outputs = {
        "E": (epiStateDir, epiStateCsv, "Epithelial"),
        "I": (intStateDir, intStateCsv, "Intermediate"),
        "I1": (intStateDir, intStateCsv, "Intermediate"),
        "I2": (intStateDir, intStateCsv, "Intermediate"),
        "M": (mesStateDir, mesStateCsv, "Mesenchymal"),
    }
    try:
        currClusterDir, currClusterCsv, currClusterLabel = state_outputs[currCluster]
    except KeyError:
        raise ValueError("currCluster must be one of "+", ".join(state_outputs)
                         +"; got "+repr(currCluster)) from None

    ranked = adataHiVar.uns['rank_genes_groups']

    # Number of genes with adj-p-val < .01 in this cluster, at most 500
    pvals_adj = ranked["pvals_adj"][currCluster]
    num_DE_genes = min(int(np.count_nonzero(pvals_adj < 0.01)), 500)
    DE_genes = ranked["names"][currCluster][0:num_DE_genes]

    # Add DE genes to the state's "All Conditions" summary csv
    os.chdir(currClusterDir)
    currStateDf = pd.read_csv(currClusterCsv)
    if len(DE_genes) > len(currStateDf):
        # Column assignment aligns on the index, so a shorter table would
        # silently drop the extra genes; grow the table first.
        currStateDf = currStateDf.reindex(range(len(DE_genes)))
    currStateDf[dataset_name+"_"+run_name+", "+currCluster] = pd.Series(DE_genes)
    currStateDf.to_csv(currClusterCsv, index=False)

    # logFC of each gene in this cluster vs all other clusters
    print(os.getcwd())
    all_genes_log2FC = pd.Series(data=ranked['logfoldchanges'][currCluster],
                                 index=ranked['names'][currCluster])
    all_genes_log2FC.to_csv(currClusterLabel+" State Marker Genes - "+dataset_name+"_"+run_name+", "+currCluster+".csv", header=False)

UCell_export_import

In [None]:
def UCell_export_import(adataHiVar, run_name):
    """Export cluster labels for UCell, then load the UCell scores back.

    Writes "Cells with Cluster Labels - <run_name>.csv" to ``dataSumDir`` and
    immediately reads "Cells with Cluster Labels - <run_name>, UCellScore.csv"
    from the same folder. UCell itself is run in R, so the score file must
    already exist when this function is called. The 'M_MSigDB_UCell' column is
    stored as ``obs["UCell_EMTscore"]`` and plotted on the UMAP.
    """
    file_stem = dataSumDir+"Cells with Cluster Labels - "+run_name

    # Export cell names with cluster labels (UCell imports this and the counts matrix)
    adataHiVar.obs["leiden_label"].to_csv(file_stem+".csv", header=False)

    # UCell is run in R

    # Import UCell scores: second column is the index, the unnamed first column is dropped
    ucell_scores = pd.read_csv(file_stem+", UCellScore.csv", index_col=1)
    ucell_scores = ucell_scores.drop(labels="Unnamed: 0", axis=1)
    # Put the scores in the same cell order as the data
    ucell_scores = ucell_scores.reindex(adataHiVar.obs.index)
    adataHiVar.obs["UCell_EMTscore"] = ucell_scores["M_MSigDB_UCell"]

    sc.pl.umap(adataHiVar, color=["UCell_EMTscore", "leiden_label"], save=" - EMTscore UCell and leiden_label.png")
    sc.pl.umap(adataHiVar, color=["UCell_EMTscore"], show=False, save=" - EMTscore UCell.png")

    return adataHiVar

saveClusteredData

In [None]:
def saveClusteredData(adataHiVar, run_name):
    """Write the clustered data to ``<dataDir>_h5ad/<run_name>/2-EMTscore_and_Clustered.h5ad``."""
    clustered_h5ad = f"{dataDir}_h5ad/{run_name}/2-EMTscore_and_Clustered.h5ad"
    adataHiVar.write(clustered_h5ad)

### Part 3: PSEUDOTIME

importClusteredData

In [None]:
def importClusteredData(dataset_name, run_name):
    """Set the Part 3 directories and load the clustered data for a run.

    Rebinds the module-level ``figDir``, ``pseudotimeRootDir`` and ``odeDir``,
    points scanpy's figure directory at ``figDir``, and returns the data read
    from "2-EMTscore_and_Clustered.h5ad".
    """
    global pseudotimeRootDir
    global figDir
    global odeDir

    # Set directories (the root-node folder sits inside the run's figure folder)
    figDir = f"{mainDir}/{dataset_name}/Figures/3-Pseudotime/{run_name}/"
    pseudotimeRootDir = f"{figDir}Root Node/"
    odeDir = f"{mainDir}/ODE Model/Data/{dataset_name}/"
    sc.settings.figdir = figDir

    # Import data
    clustered_h5ad = f"{dataDir}_h5ad/{run_name}/2-EMTscore_and_Clustered.h5ad"
    return sc.read_h5ad(clustered_h5ad)

diffmap

In [None]:
def diffmap(adataHiVar, numNeighbors=10):
    """Compute the diffusion map, then rebuild neighbours on 'X_diffmap'.

    ``numNeighbors`` is the neighbour count passed to ``sc.pp.neighbors``.
    """
    sc.tl.diffmap(adataHiVar)
    sc.pp.neighbors(adataHiVar, n_neighbors=numNeighbors, use_rep='X_diffmap')
    return adataHiVar

pseudotime_rootNodes

In [None]:
def pseudotime_rootNodes(adataHiVar, pseudotime_DC, pseudotime_DC_rootNodes, numRoots=5):
    """Pick candidate root cells from the extremes of one diffusion component.

    Parameters
    ----------
    adataHiVar
        Object whose ``obsm['X_diffmap']`` is a 2-D (cells x components) array.
    pseudotime_DC : str
        "DC1", "DC2" or "DC3"; selects column 1, 2 or 3 of ``X_diffmap``.
    pseudotime_DC_rootNodes : str
        "low" takes the cells with the lowest values; anything else takes the
        cells with the highest values.
    numRoots : int
        Number of cells to return.

    Returns
    -------
    numpy.ndarray
        Positional indices of the selected cells, in ascending order of their
        component value.

    Raises
    ------
    ValueError
        If ``pseudotime_DC`` is not one of the supported names.
    """
    # THESE INDEXES ARE CURRENTLY BUGGED IN scanpy 1.9.1 (but not 1.8.2) -
    # (usually) indexes should start at 0 (bug) curr start at 1
    dc_columns = {"DC1": 1, "DC2": 2, "DC3": 3}
    try:
        column = dc_columns[pseudotime_DC]
    except KeyError:
        raise ValueError("pseudotime_DC must be one of "+", ".join(dc_columns)
                         +"; got "+repr(pseudotime_DC)) from None

    # One column slice instead of a Python loop over every cell
    diffmap_dim = np.asarray(adataHiVar.obsm['X_diffmap'])[:, column]

    ranked_cells = diffmap_dim.argsort()
    if pseudotime_DC_rootNodes == "low":
        root_nodes = ranked_cells[:numRoots] # lowest values
    else:
        root_nodes = ranked_cells[-numRoots:] # highest values
    return root_nodes

pseudotime_mean

In [None]:
def pseudotime_mean(adataHiVar, root_nodes):
    """Average diffusion pseudotime over several root cells and plot it.

    For each entry of ``root_nodes`` the root is stored in ``uns['iroot']``,
    ``sc.tl.dpt`` is run, and the resulting 'dpt_pseudotime' is kept as one
    column. The per-cell mean over those columns is stored in
    ``obs['dpt_pseudotime_mean']``.

    Returns
    -------
    (adataHiVar, DataFrame)
        The DataFrame has one "Root Node <i>" column per root, followed by
        "Mean" and "Cluster" (the cell's 'leiden_label').
    """
    # One pseudotime column per candidate root
    pseudotime_table = pd.DataFrame()
    for root in root_nodes:
        adataHiVar.uns['iroot'] = root # root cell assigned
        sc.tl.dpt(adataHiVar) # calculate pseudotime using this root
        pseudotime_table[f"Root Node {root!s}"] = adataHiVar.obs['dpt_pseudotime'].copy()

    # Mean per cell across roots, plus the cluster label of each cell
    pseudotime_table["Mean"] = pseudotime_table.mean(axis=1)
    pseudotime_table["Cluster"] = adataHiVar.obs['leiden_label'].copy()
    adataHiVar.obs['dpt_pseudotime_mean'] = pseudotime_table["Mean"].copy()

    # Figures
    sc.settings.figdir = figDir
    mean_key = ['dpt_pseudotime_mean']
    cluster_key = ['leiden_label']
    sc.pl.diffmap(adataHiVar, color=mean_key, save=" - Pseudotime Mean.png")
    sc.pl.diffmap(adataHiVar, color=mean_key, components=['2,3'], save=" - Pseudotime Mean (bug correction 22.08).png")
    sc.pl.diffmap(adataHiVar, color=cluster_key, save=" - clusters.png")
    sc.pl.diffmap(adataHiVar, color=cluster_key, components=['1,2', '1,3', '2,3'], save=" - clusters, 3 components.png")
    sc.pl.umap(adataHiVar, color=mean_key, save=" - Pseudotime Mean.png")

    return adataHiVar, pseudotime_table

pseudotime_per_cluster

In [None]:
def pseudotime_per_cluster(adataHiVar, EMT_clusters_in_order, run_name, df_pseudotime, numBins=15):
    
    increments = np.linspace(0,1,numBins+1)
    
    # Clusters with pseudotime cell numbers
    root_nodes_list = df_pseudotime.columns[:-2].tolist()
    numIntervals_list = list(range(0,numBins))
    if len(EMT_clusters_in_order) == 4:
        df_pseudotimeClusters_mean = pd.DataFrame(columns=['E','I1','I2','M','Pseudotime Interval'])
        df_pseudotimeClusters_E = pd.DataFrame(index=numIntervals_list, columns=root_nodes_list)
        df_pseudotimeClusters_I1 = pd.DataFrame(index=numIntervals_list, columns=root_nodes_list)
        df_pseudotimeClusters_I2 = pd.DataFrame(index=numIntervals_list, columns=root_nodes_list)
        df_pseudotimeClusters_M = pd.DataFrame(index=numIntervals_list, columns=root_nodes_list)
    if len(EMT_clusters_in_order) == 3:
        df_pseudotimeClusters_mean = pd.DataFrame(columns=['E','I', 'M','Pseudotime Interval'])
        df_pseudotimeClusters_E = pd.DataFrame(index=numIntervals_list, columns=root_nodes_list)
        df_pseudotimeClusters_I = pd.DataFrame(index=numIntervals_list, columns=root_nodes_list)
        df_pseudotimeClusters_M = pd.DataFrame(index=numIntervals_list, columns=root_nodes_list)
    
    # 4 separate matrices: E, I1, I2, M
    # Rows: pseudotime intervals
    # Columns: each of 10 root nodes
    # Contain cell counts from calculating per each root node
    
    # 5th matrix: mean matrix
    # Rows: pseudotime intervals
    # Columns: E, I, M
    # Contains cell counts from calculating each cell's mean pseudotime
    
    # Fill out matrices
    if len(EMT_clusters_in_order) == 4:
        # Mean matrix
        for i in range(1,len(increments)):
            df_currInterval = df_pseudotime[(df_pseudotime['Mean'] <= increments[i]) & (df_pseudotime['Mean'] > increments[i-1])]
            counts_E = np.sum(df_currInterval["Cluster"] == "E")
            counts_I1 = np.sum(df_currInterval["Cluster"] == "I1")
            counts_I2 = np.sum(df_currInterval["Cluster"] == "I2")
            counts_M = np.sum(df_currInterval["Cluster"] == "M")
            df_pseudotimeClusters_mean = pd.concat([df_pseudotimeClusters_mean,
                                                    pd.DataFrame(data={'E': [counts_E],
                                                                       'I1': [counts_I1],
                                                                       'I2': [counts_I2],
                                                                       'M': [counts_M],
                                                                       'Pseudotime Interval': [round(increments[i],3)]})],
                                                   ignore_index=True)
        # Root node E, I1, I2, M matrices
        for idx, currRootNode in enumerate(df_pseudotime.columns[:-2].tolist()):
            for i in range(1,len(increments)):
                df_currInterval = df_pseudotime[(df_pseudotime[currRootNode] <= increments[i]) & (df_pseudotime[currRootNode] > increments[i-1])]
                counts_E = np.sum(df_currInterval["Cluster"] == "E")
                counts_I1 = np.sum(df_currInterval["Cluster"] == "I1")
                counts_I2 = np.sum(df_currInterval["Cluster"] == "I2")
                counts_M = np.sum(df_currInterval["Cluster"] == "M")
                df_pseudotimeClusters_E.iat[i-1, idx] = counts_E
                df_pseudotimeClusters_I1.iat[i-1, idx] = counts_I1
                df_pseudotimeClusters_I2.iat[i-1, idx] = counts_I2
                df_pseudotimeClusters_M.iat[i-1, idx] = counts_M
        # Root node E, I1, I2, M matrices; standard deviation of # cells in each pseudotime interval
        df_pseudotimeClusters_stdev = pd.DataFrame()
        df_pseudotimeClusters_stdev["E"] = df_pseudotimeClusters_E.std(axis=1)
        df_pseudotimeClusters_stdev["I1"] = df_pseudotimeClusters_I1.std(axis=1)
        df_pseudotimeClusters_stdev["I2"] = df_pseudotimeClusters_I2.std(axis=1)
        df_pseudotimeClusters_stdev["M"] = df_pseudotimeClusters_M.std(axis=1)
    
    elif len(EMT_clusters_in_order) == 3:
        # Mean matrix
        for i in range(1,len(increments)):
            df_currInterval = df_pseudotime[(df_pseudotime['Mean'] <= increments[i]) & (df_pseudotime['Mean'] > increments[i-1])]
            counts_E = np.sum(df_currInterval["Cluster"] == "E")
            counts_I = np.sum(df_currInterval["Cluster"] == "I")
            counts_M = np.sum(df_currInterval["Cluster"] == "M")
            df_pseudotimeClusters_mean = pd.concat([df_pseudotimeClusters_mean,
                                                    pd.DataFrame(data={'E': [counts_E],
                                                                       'I': [counts_I],
                                                                       'M': [counts_M],
                                                                       'Pseudotime Interval': [round(increments[i],3)]})],
                                                   ignore_index=True)
        # Root node E, I, M matrices
        for idx, currRootNode in enumerate(df_pseudotime.columns[:-2].tolist()):
            for i in range(1,len(increments)):
                df_currInterval = df_pseudotime[(df_pseudotime[currRootNode] <= increments[i]) & (df_pseudotime[currRootNode] > increments[i-1])]
                counts_E = np.sum(df_currInterval["Cluster"] == "E")
                counts_I = np.sum(df_currInterval["Cluster"] == "I")
                counts_M = np.sum(df_currInterval["Cluster"] == "M")
                df_pseudotimeClusters_E.iat[i-1, idx] = counts_E
                df_pseudotimeClusters_I.iat[i-1, idx] = counts_I
                df_pseudotimeClusters_M.iat[i-1, idx] = counts_M
        # Root node E, I1, I2, M matrices; standard deviation of # cells in each pseudotime interval
        df_pseudotimeClusters_stdev = pd.DataFrame()
        df_pseudotimeClusters_stdev["E"] = df_pseudotimeClusters_E.std(axis=1)
        df_pseudotimeClusters_stdev["I"] = df_pseudotimeClusters_I.std(axis=1)
        df_pseudotimeClusters_stdev["M"] = df_pseudotimeClusters_M.std(axis=1)
            

    # Plot figure with cell numbers
    if len(EMT_clusters_in_order) == 4:
        colors_EMT_rainbow = ['#d62728', '#ff7f0e', '#279e68', '#1f77b4'] #ROGB
        y_plot = ["E","I1","I2","M"]
    elif len(EMT_clusters_in_order) == 3:
        colors_EMT_rainbow = ['#d62728', '#ff7f0e', '#1f77b4'] #ROB
        y_plot = ["E","I","M"]
    
    df_pseudotimeClusters_mean.reset_index().plot(x="Pseudotime Interval", y=y_plot,
                                                  yerr=[df_pseudotimeClusters_stdev[col] for col in df_pseudotimeClusters_stdev],
                                                  kind="bar", width=0.9, figsize=(8,6),
                                                  color=colors_EMT_rainbow)
    plt.ylim(bottom=0)
    plt.title("Pseudotime Intervals with Cluster Counts");
    plt.ylabel("# Cells");
    plt.savefig(figDir+"Pseudotime Intervals by Cluster - "+str(numBins)+" bins.png")
    plt.show()
    
    
    
    # ODE Data - proportions
    
    # Calculate by proportions per interval instead of numbers
    df_pseudotimeClusters_proportions = df_pseudotimeClusters_mean.iloc[:,:-1].copy()
    numCells_per_interval = df_pseudotimeClusters_proportions.sum(axis=1)
    
    df_pseudotimeClusters_proportions = df_pseudotimeClusters_proportions.div(numCells_per_interval.replace(0.0,1.0), axis=0)
    df_pseudotimeClusters_proportions["Pseudotime Interval"] = df_pseudotimeClusters_mean["Pseudotime Interval"].copy()
    
    # Trim, starting from last bin with 100% E cells
    model_start_bin = max(0, df_pseudotimeClusters_proportions.iloc[:,0][df_pseudotimeClusters_proportions.iloc[:,0] == 1.0].index[-1])
    df_pseudotimeClusters_proportions = df_pseudotimeClusters_proportions[model_start_bin:]
    
    
    # Plot figure with cell proportions
    if len(EMT_clusters_in_order) == 4:
        # Convert standard deviation # cells into proportions
        df_pseudotimeClusters_stdev_proportions = pd.DataFrame()
        df_pseudotimeClusters_stdev_proportions["E"] = (df_pseudotimeClusters_stdev["E"] / numCells_per_interval).fillna(0)[model_start_bin:]
        df_pseudotimeClusters_stdev_proportions["I1"] = (df_pseudotimeClusters_stdev["I1"] / numCells_per_interval).fillna(0)[model_start_bin:]
        df_pseudotimeClusters_stdev_proportions["I2"] = (df_pseudotimeClusters_stdev["I2"] / numCells_per_interval).fillna(0)[model_start_bin:]
        df_pseudotimeClusters_stdev_proportions["M"] = (df_pseudotimeClusters_stdev["M"] / numCells_per_interval).fillna(0)[model_start_bin:]
    
        plt.clf()
        plt.figure(figsize=(8, 5), dpi=150)
        plt.errorbar(df_pseudotimeClusters_proportions["Pseudotime Interval"], df_pseudotimeClusters_proportions["E"],
                     yerr = df_pseudotimeClusters_stdev_proportions["E"], fmt='-o', color="C3", lw=2)
        plt.errorbar(df_pseudotimeClusters_proportions["Pseudotime Interval"], df_pseudotimeClusters_proportions["I1"],
                     yerr = df_pseudotimeClusters_stdev_proportions["I1"], fmt='-o', color="C1", lw=2)
        plt.errorbar(df_pseudotimeClusters_proportions["Pseudotime Interval"], df_pseudotimeClusters_proportions["I2"],
                     yerr = df_pseudotimeClusters_stdev_proportions["I2"], fmt='-o', color="C2", lw=2)
        plt.errorbar(df_pseudotimeClusters_proportions["Pseudotime Interval"], df_pseudotimeClusters_proportions["M"],
                     yerr = df_pseudotimeClusters_stdev_proportions["M"], fmt='-o', color="C0", lw=2)
        plt.legend(["E", "I1", "I2", "M"], loc="center left")
        plt.xlabel("Pseudotime")
        plt.ylabel("Cell Population Proportion")
        plt.title("Cell Population across Pseudotime")
        plt.ylim(0,1);
        plt.savefig(figDir+"Pseudotime Intervals by Cluster - "+str(numBins)+" bins, Cell Proportions.png")
        plt.show()
        
    elif len(EMT_clusters_in_order) == 3:
        # Convert standard deviation # cells into proportions
        df_pseudotimeClusters_stdev_proportions = pd.DataFrame()
        df_pseudotimeClusters_stdev_proportions["E"] = (df_pseudotimeClusters_stdev["E"] / numCells_per_interval).fillna(0)[model_start_bin:]
        df_pseudotimeClusters_stdev_proportions["I"] = (df_pseudotimeClusters_stdev["I"] / numCells_per_interval).fillna(0)[model_start_bin:]
        df_pseudotimeClusters_stdev_proportions["M"] = (df_pseudotimeClusters_stdev["M"] / numCells_per_interval).fillna(0)[model_start_bin:]
        
        plt.clf()
        plt.figure(figsize=(8, 5), dpi=150)
        plt.errorbar(df_pseudotimeClusters_proportions["Pseudotime Interval"], df_pseudotimeClusters_proportions["E"],
                     yerr = df_pseudotimeClusters_stdev_proportions["E"], fmt='-o', color="C3", lw=2)
        plt.errorbar(df_pseudotimeClusters_proportions["Pseudotime Interval"], df_pseudotimeClusters_proportions["I"],
                     yerr = df_pseudotimeClusters_stdev_proportions["I"], fmt='-o', color="C1", lw=2)
        plt.errorbar(df_pseudotimeClusters_proportions["Pseudotime Interval"], df_pseudotimeClusters_proportions["M"],
                     yerr = df_pseudotimeClusters_stdev_proportions["M"], fmt='-o', color="C0", lw=2)
        plt.legend(["E", "I", "M"], loc="center left")
        plt.xlabel("Pseudotime")
        plt.ylabel("Cell Population Proportion")
        plt.title("Cell Population across Pseudotime")
        plt.ylim(0,1);
        plt.savefig(figDir+"Pseudotime Intervals by Cluster - "+str(numBins)+" bins, Cell Proportions.png")
        plt.show()
    
    
    # Save ODE data
    del df_pseudotimeClusters_proportions["Pseudotime Interval"]
    os.chdir(odeDir)
    df_pseudotimeClusters_proportions.to_csv(run_name+" - Pseudotime - "+str(numBins)+" bins, cropped.csv")
    df_pseudotimeClusters_stdev_proportions.to_csv(run_name+" - Pseudotime - "+str(numBins)+" bins, cropped, stdev.csv")

savePseudotimeData

In [None]:
def savePseudotimeData(adataHiVar, run_name):
    """Write `adataHiVar` to the run's "3-Pseudotime.h5ad" file.

    The destination is assembled from the module-level `dataDir`, the
    "_h5ad" subfolder, and `run_name`.

    Parameters
    ----------
    adataHiVar
        Object exposing a `.write(path)` method (presumably a scanpy
        AnnData -- confirm against the caller).
    run_name
        Run name; concatenated into the path as the per-run folder under
        "_h5ad".
    """
    # Build the destination first so the write call stays short and readable.
    pseudotime_h5ad_path = dataDir + "/_h5ad/" + run_name + "/3-Pseudotime.h5ad"
    adataHiVar.write(pseudotime_h5ad_path)