In [None]:
import os
# Remember the directory the notebook was started in; later cells change the working
# directory temporarily and use this value to switch back.
currDir = os.getcwd()

In [None]:
# Import universal dataset functions
# The shared notebook sits in "0-MultipleRuns/" under the grandparent of the current
# working directory; change into that folder so %run can locate it by its bare file name.
os.chdir(os.path.dirname(os.path.dirname(currDir))+"/0-MultipleRuns/")
# NOTE(review): the function cells further down are labelled "(overwrite)", so they
# presumably replace same-named definitions loaded by this %run - confirm.
%run universal_dataset_functions.ipynb
# Switch back to the directory the notebook was started in.
os.chdir(currDir)

### Part 1: IMPORT DATA, FILTERING, AND NORMALIZATION

importData (overwrite)

In [None]:
def importData(dataset_name, run_name):
    """Load one run's count matrix and annotate it with the Cook metadata.

    Reads ``<run_name>.csv`` and ``<run_name>_metadata.csv`` from
    ``dataDir + "/_rawData/"``, copies the timepoint, author pseudotime and
    batch ("Mix") columns onto ``adata.obs`` and keeps only the cells that
    belong to the forward timepoints (0d, 8h, 1d, 3d, 7d).

    Parameters
    ----------
    dataset_name
        Not used by this function.
    run_name
        Base name of the two CSV files to read.

    Returns
    -------
    A copy of the AnnData object restricted to forward-timepoint cells.
    """
    raw_prefix = dataDir+"/_rawData/"+run_name

    # Expression matrix
    adata = sc.read_csv(raw_prefix+".csv")
    adata.var_names_make_unique()


    ### COOK UNIQUE ###

    # Per-cell metadata: timepoint, original pseudotime, batch
    metadata = pd.read_csv(raw_prefix+"_metadata.csv")

    # Sortable display labels; the "_rm" timepoints are the stimulus-removal samples
    forward_labels = {'0d': '0) 0d',
                      '8h': '1) 8h',
                      '1d': '2) 1d',
                      '3d': '3) 3d',
                      '7d': '4) 7d'}
    removal_labels = {'8h_rm': '5) 8h no stim',
                      '1d_rm': '6) 1d no stim',
                      '3d_rm': '7) 3d no stim'}
    metadata["Time_edited"] = metadata["Time"].replace(to_replace={**forward_labels, **removal_labels})

    # Values are attached by position (plain lists), not by cell name.
    # NOTE(review): assumes metadata rows are in the same order as the cells in adata - confirm.
    obs_sources = (("Timepoint", "Time_edited"),
                   ("Authors_Pseudotime", "Pseudotime"),
                   ("Batch", "Mix"))
    for obs_column, metadata_column in obs_sources:
        adata.obs[obs_column] = metadata[metadata_column].tolist()

    # Only keep forward timepoints
    is_forward = adata.obs["Timepoint"].isin(list(forward_labels.values())).to_numpy()
    adata = adata[is_forward].copy()

    ### /COOK UNIQUE ###


    return adata

filter_MitoRibo (overwrite)

In [None]:
def filter_MitoRibo(adata, total_counts_cutoff, mito_cutoff, ribo_cutoff, run_name, dataset_name=""):
    """Plot QC metrics, then remove outlier cells and mitochondrial/ribosomal genes.

    Expects ``adata.var["mito_gene"]`` and ``adata.var["n_cells"]`` to exist already;
    both are read here but not created here.

    Parameters
    ----------
    adata
        AnnData object to filter.
    total_counts_cutoff
        Cells whose ``total_counts`` is not below this value are dropped.
    mito_cutoff
        Upper limit on the mitochondrial share, given on a 0-1 scale (it is
        multiplied by 100 before being compared with ``pct_counts_mito_gene``).
    ribo_cutoff
        Upper limit on ``pct_counts_ribo_gene``, which is computed below as a
        plain ratio (RPL/RPS counts divided by all counts).
    run_name
        Only the value "MCF7-TNF" has an effect: those runs additionally drop
        cells with fewer than 1000 genes.
    dataset_name
        Not used by this function.

    Returns
    -------
    The filtered AnnData object. As a side effect, figures are saved through
    scanpy and the per-gene share of expressing cells is written to
    ``nCellsDir + nCellsCsv``.
    """
    ribo_prefixes = ('RPL','RPS')

    ### GENE METRICS ###

    # scanpy reports pct_counts_mito_gene on a 0-100 scale, so rescale the 0-1 cutoff
    mito_cutoff_pct = mito_cutoff*100

    # Gene metrics
    sc.pp.calculate_qc_metrics(adata, qc_vars=["mito_gene"], percent_top=None, log1p=False, inplace=True)
    # Store scanpy's "n_genes_by_counts" under the shorter name "n_genes"
    adata.obs["n_genes"] = adata.obs.pop("n_genes_by_counts")
    sc.pl.violin(adata, ['n_genes', 'total_counts', 'pct_counts_mito_gene'], jitter=0.4, multi_panel=True, save=" - n_genes, n_counts, perc_mito.png")

    # Ribosomal genes: share of each cell's counts that comes from RPL*/RPS* genes
    is_ribo = adata.var_names.str.startswith(ribo_prefixes)
    ribo_counts = np.sum(adata[:, is_ribo].X, axis=1)
    all_counts = np.sum(adata.X, axis=1)
    adata.obs["pct_counts_ribo_gene"] = ribo_counts / all_counts

    # Scatterplots of the metrics before any filtering: (x, y, file suffix)
    unfiltered_scatters = (
        ('total_counts', 'pct_counts_ribo_gene', " - pct_counts_ribo_gene vs n_counts.png"),
        ('total_counts', 'n_genes', " - n_genes vs total_counts.png"),
        ('total_counts', 'pct_counts_mito_gene', " - pct_counts_mito_gene vs total_counts.png"),
        ('pct_counts_ribo_gene', 'pct_counts_mito_gene', " - pct_counts_mito_gene vs pct_counts_ribo_gene.png"),
    )
    for x_key, y_key, png in unfiltered_scatters:
        sc.pl.scatter(adata, x=x_key, y=y_key, save=png)


    ### FILTERING ###

    # Remove total_count outliers
    below_count_limit = adata.obs["total_counts"] < total_counts_cutoff
    adata = adata[below_count_limit,:].copy()

    ### COOK UNIQUE

    if run_name == "MCF7-TNF":
        sc.pp.filter_cells(adata, min_genes=1000)

    ### /COOK UNIQUE

    # Mitochondrial: drop the genes first, then the cells above the cutoff
    non_mito_genes = adata.var["mito_gene"].eq(False)
    adata = adata[:, non_mito_genes].copy()
    del adata.var["mito_gene"]
    low_mito_cells = adata.obs["pct_counts_mito_gene"] < mito_cutoff_pct
    adata = adata[low_mito_cells, :].copy()

    # Ribosomal: same two steps; the gene mask is rebuilt because the gene set has changed
    is_ribo = adata.var_names.str.startswith(ribo_prefixes)
    adata = adata[:, ~is_ribo].copy()
    low_ribo_cells = adata.obs["pct_counts_ribo_gene"] < ribo_cutoff
    adata = adata[low_ribo_cells, :].copy()

    ### Figures after filtering

    # Gene metric scatterplot after filtering
    sc.pl.scatter(adata, x='total_counts', y='n_genes', color='pct_counts_mito_gene', save=" - n_genes vs total_counts filtered.png")
    # The mito/ribo plots are only drawn when the corresponding cutoff is below its maximum
    if mito_cutoff_pct < 100:
        sc.pl.scatter(adata, x='total_counts', y='pct_counts_mito_gene', save=" - pct_counts_mito_gene vs total_counts filtered.png")
    if ribo_cutoff < 1:
        sc.pl.scatter(adata, x='total_counts', y='pct_counts_ribo_gene', save=" - pct_counts_ribo_gene vs n_counts filtered.png")
        sc.pl.scatter(adata, x='pct_counts_mito_gene', y='pct_counts_ribo_gene', save=" - pct_counts_mito_gene vs pct_counts_ribo_gene filtered.png")
    # Genes with largest proportions, after filtering
    sc.pl.highest_expr_genes(adata, n_top=20, save=" - genes w highest expression.png")


    # Export, per gene, n_cells divided by the number of cells left after filtering
    adata.var["perc_cells_expressed"] = adata.var["n_cells"] / adata.n_obs
    adata.var["perc_cells_expressed"].to_csv(nCellsDir + nCellsCsv)

    return adata

normalize_and_hiVar_Data (overwrite)

In [None]:
def normalize_and_hiVar_Data(adata, run_name):
    """Normalize, log-transform, batch-correct and reduce to highly variable genes.

    Steps, in order: total-count normalization (target 1e4), checkpoint to disk,
    log1p, cell-cycle scoring, ComBat on the "Batch" column, second checkpoint,
    highly-variable-gene selection, regression of technical/cell-cycle covariates,
    scaling (clipped at 10) and a final checkpoint. All three files are written
    under ``dataDir + "_h5ad/" + run_name``.

    Parameters
    ----------
    adata
        AnnData object; it is modified in place by the steps above.
    run_name
        Name of the output sub-folder.

    Returns
    -------
    A new AnnData object holding only the highly variable genes, regressed and scaled.
    """
    out_prefix = dataDir+"_h5ad/"+run_name

    # Total-count normalize to 10,000 counts per cell (not yet logged at this point)
    sc.pp.normalize_total(adata, target_sum=1e4)
    # Snapshot the normalized values as .raw before saving
    adata.raw = adata
    adata.write(out_prefix+"/1-Filtered_and_Normalized_allGenes.h5ad")

    # Natural-log transform; .raw is replaced with the logged values
    sc.pp.log1p(adata)
    adata.raw = adata

    # Cell cycle score, computed on a float64 copy of X which is cast back to float32 afterwards
    s_phase_genes, g2m_phase_genes = cell_cycle_genes(adata)
    adata.X = adata.X.astype('<f8')
    sc.tl.score_genes_cell_cycle(adata, s_genes=s_phase_genes, g2m_genes=g2m_phase_genes, use_raw=False)
    adata.X = adata.X.astype('<f4')

    ### COOK UNIQUE
    # Regress out batch effects with Combat
    sc.pp.combat(adata, key="Batch")
    ### /COOK UNIQUE

    adata.write(out_prefix+"/1-Filtered_and_Normalized_allGenes_cellCycleScore.h5ad")

    # Highly variable genes (note: expects log data)
    sc.pp.highly_variable_genes(adata)
    hivar_mask = adata.var.highly_variable
    adataHiVar = adata[:, hivar_mask].copy()

    # Regress out total counts per cell, percent mito and both cell cycle scores, then scale
    covariates = ['total_counts','pct_counts_mito_gene', 'S_score', 'G2M_score']
    sc.pp.regress_out(adataHiVar, covariates)
    sc.pp.scale(adataHiVar, max_value=10)
    adataHiVar.write(out_prefix+"/1-Filtered_and_Normalized_hiVarGenes.h5ad")

    return adataHiVar

### Part 2: EMTSCORE AND CLUSTERING

Cook_unique_filter (overwrite)

In [None]:
def _drop_lowest_diffmap_cells(adataHiVar, dc_column, n_remove, components):
    """Remove the n_remove cells with the lowest value in one column of X_diffmap."""
    # Diffusion map, followed by a neighbor graph built on the diffusion components
    sc.pp.neighbors(adataHiVar)
    sc.tl.diffmap(adataHiVar)
    sc.pp.neighbors(adataHiVar, n_neighbors=10, use_rep='X_diffmap')

    # Flag the cells with the smallest values on the chosen component
    dc_values = np.asarray(adataHiVar.obsm['X_diffmap'])[:, dc_column]
    flagged = np.zeros(dc_values.shape[0], dtype=bool)
    flagged[dc_values.argsort()[:n_remove]] = True

    # Stored as "0"/"1" strings ("1" = to be removed) and shown next to the timepoints
    adataHiVar.obs["diffmap_filter"] = np.where(flagged, "1", "0").tolist()
    sc.pl.diffmap(adataHiVar, color=["diffmap_filter","Timepoint"], components=components)

    # Remove filtered cells
    return adataHiVar[adataHiVar.obs["diffmap_filter"]=="0"].copy()


def Cook_unique_filter(adataHiVar, run_name):
    """Drop a fixed number of diffusion-map outlier cells for selected Cook runs.

    For "A549-TGFB1" the 5 cells with the lowest value in column 1 of
    ``X_diffmap`` are removed; for "OVCA420-TGFB1" the 13 cells with the lowest
    value in column 2. For these runs the neighbor graph and diffusion map are
    recomputed on ``adataHiVar`` and a ``diffmap_filter`` column is added to
    ``obs``. Any other ``run_name`` returns ``adataHiVar`` untouched.

    Parameters
    ----------
    adataHiVar
        AnnData object with a "Timepoint" column in ``obs``.
    run_name
        Name of the run; selects which rule (if any) is applied.

    Returns
    -------
    A filtered copy for the two runs listed above, otherwise the input object.
    """
    # run_name -> (column of X_diffmap, number of cells to remove, components to plot)
    # NOTE(review): the A549 comment used to say "lowest 9 cells" while the code removed 5;
    # the count is kept at 5 to preserve results - confirm which was intended.
    outlier_rules = {
        "A549-TGFB1": (1, 5, ['1,2','1,3','2,3']),
        "OVCA420-TGFB1": (2, 13, ['1,3']),
    }

    rule = outlier_rules.get(run_name)
    if rule is None:
        return adataHiVar

    dc_column, n_remove, components = rule
    return _drop_lowest_diffmap_cells(adataHiVar, dc_column, n_remove, components)

UMAP (overwrite)

In [None]:
def UMAP(adataHiVar):
    """Compute a UMAP embedding and save the standard set of coloured plots.

    Builds the neighbor graph, runs UMAP and saves plots coloured by QC
    metrics and cell-cycle phase, by the selected E/M marker genes (taken from
    the notebook-level ``run_E_markers`` / ``run_M_markers``; an empty value
    skips that pair of plots), by batch and by timepoint / author pseudotime.

    Parameters
    ----------
    adataHiVar
        AnnData object; the graph and embedding are added to it.

    Returns
    -------
    The same AnnData object.
    """
    sc.pp.neighbors(adataHiVar)
    sc.tl.umap(adataHiVar)

    # QC metrics and cell cycle
    sc.pl.umap(adataHiVar, color=["pct_counts_mito_gene","pct_counts_ribo_gene","phase"], save=" - perc_mito, perc_ribo, cell cycle.png")
    sc.pl.umap(adataHiVar, color=["phase"], save=" - cell cycle.png")

    # Selected markers: one figure with every marker, one with only the first marker
    marker_panels = (
        (run_E_markers, " - selected E markers.png", " - selected E marker.png"),
        (run_M_markers, " - selected M markers.png", " - selected M marker.png"),
    )
    for markers, all_markers_png, first_marker_png in marker_panels:
        if not markers:
            continue
        sc.pl.umap(adataHiVar, color=markers, save=all_markers_png)
        sc.pl.umap(adataHiVar, color=markers[0], save=first_marker_png)


    ### COOK UNIQUE ###

    # Batch
    sc.pl.umap(adataHiVar, color=["Batch"], save=" - batch.png")

    # Timepoints, with and without the authors' pseudotime alongside
    timepoint_panels = (
        (["Timepoint","Authors_Pseudotime"], " - Timepoint and Authors_Pseudotime.png"),
        (["Timepoint"], " - Timepoint.png"),
    )
    for color_keys, png in timepoint_panels:
        sc.pl.umap(adataHiVar, color=color_keys, palette="coolwarm_r", save=png)

    ### /COOK UNIQUE ###


    return adataHiVar

leiden (overwrite)

In [None]:
def leiden(adataHiVar, leiden_resolution, EMT_clusters_in_order, run_name):
    """Run Leiden clustering and label the clusters along the E -> M axis.

    Clusters are labelled only when ``EMT_clusters_in_order`` has 3 or 4 entries
    and Leiden found no more clusters than that. The i-th entry of
    ``EMT_clusters_in_order`` is the Leiden cluster id that receives the i-th
    label ('E', 'I', 'M' or 'E', 'I1', 'I2', 'M'). Matching colors are stored in
    ``uns['leiden_colors']`` and ``uns['leiden_label_colors']``.

    Parameters
    ----------
    adataHiVar
        AnnData object with a neighbor graph and UMAP embedding.
    leiden_resolution
        Resolution passed to ``sc.tl.leiden``.
    EMT_clusters_in_order
        Leiden cluster ids ordered from epithelial to mesenchymal.
    run_name
        Not used by this function.

    Returns
    -------
    The same AnnData object with ``obs['leiden']`` and ``obs['leiden_label']``.

    Raises
    ------
    KeyError
        If no ``leiden_label`` column is available to plot, i.e. the labelling
        conditions above were not met and the column did not already exist.
    """
    # Number of EMT states -> (labels in E->M order, one color per label)
    emt_schemes = {
        4: (['E','I1','I2','M'], ['#d62728', '#ff7f0e', '#279e68', '#1f77b4']),
        3: (['E','I','M'], ['#d62728', '#ff7f0e', '#1f77b4']),
    }

    sc.tl.leiden(adataHiVar, resolution=leiden_resolution, key_added='leiden')
    sc.pl.umap(adataHiVar, color=["leiden"]) # initializes uns['leiden_colors']

    # Annotate the labels of the clusters
    n_expected = len(EMT_clusters_in_order)
    n_found = len(adataHiVar.obs['leiden'].unique())
    scheme = emt_schemes.get(n_expected)
    if n_found <= n_expected and scheme is not None:
        label_names, colors = scheme
        id_to_label = {str(cluster_id): label
                       for cluster_id, label in zip(EMT_clusters_in_order, label_names)}

        # category reordering from https://scanpy.discourse.group/t/how-to-order-legend-in-sc-pl-umap/411
        # The results are assigned back: the `inplace` argument of reorder_categories
        # no longer exists in pandas >= 2.0.
        cluster_ids = [str(i) for i in range(n_expected)]
        clusters = adataHiVar.obs['leiden'].cat.reorder_categories(cluster_ids)
        adataHiVar.obs['leiden'] = clusters
        # rename_categories is the supported way to relabel a categorical
        # (Series.replace on categoricals is deprecated for this purpose).
        labels = clusters.cat.rename_categories(id_to_label)
        adataHiVar.obs['leiden_label'] = labels.cat.reorder_categories(label_names)

        # Colors of clusters (separate lists so the two entries are not aliased)
        adataHiVar.uns['leiden_colors'] = list(colors)
        adataHiVar.uns['leiden_label_colors'] = list(colors)

    if 'leiden_label' not in adataHiVar.obs:
        raise KeyError(
            "leiden_label was not assigned: Leiden found %d clusters but "
            "EMT_clusters_in_order has %d entries (3 or 4 are supported, and it must "
            "not be shorter than the number of clusters found)" % (n_found, n_expected)
        )

    sc.pl.umap(adataHiVar, color=["leiden_label"], save=" - clusters.png")
    return adataHiVar

### Part 3: PSEUDOTIME

pseudotime_rootNodes (overwrite)

In [None]:
def pseudotime_rootNodes(adataHiVar, pseudotime_DC, pseudotime_DC_rootNodes, run_name, numRoots=5):
    """Pick candidate root cells from the extremes of one diffusion component.

    Parameters
    ----------
    adataHiVar
        Object whose ``obsm['X_diffmap']`` holds one row per cell.
    pseudotime_DC
        Component name of the form "DC<k>" (e.g. "DC1"); column ``k`` of
        ``X_diffmap`` is used.
    pseudotime_DC_rootNodes
        "low" selects the cells with the smallest values; any other value
        selects the cells with the largest values.
    run_name
        Not used by this function.
    numRoots
        Number of root cells to return (default 5).

    Returns
    -------
    numpy array of cell positions, ordered by ascending component value.

    Raises
    ------
    ValueError
        If ``pseudotime_DC`` is not "DC<k>" with ``k`` a valid column >= 1, or
        if ``numRoots`` is negative.
    """
    diffmap = np.asarray(adataHiVar.obsm['X_diffmap'])

    # "DC<k>" maps to column k, i.e. column 0 is never used as a DC.
    # NOTE(review): the original author noted this indexing differs between scanpy 1.8.2
    # and 1.9.1 (whether components start at column 0 or 1) - confirm for the installed version.
    dc_name = str(pseudotime_DC)
    dc_number = dc_name[2:]
    valid_name = dc_name.startswith("DC") and dc_number.isascii() and dc_number.isdigit()
    if not valid_name or not 1 <= int(dc_number) < diffmap.shape[1]:
        raise ValueError(
            "pseudotime_DC must be 'DC<k>' with 1 <= k < %d, got %r" % (diffmap.shape[1], pseudotime_DC)
        )
    if numRoots < 0:
        raise ValueError("numRoots must be non-negative, got %r" % (numRoots,))

    ranked_cells = diffmap[:, int(dc_number)].argsort()

    # A slice like [-0:] would return every cell, so zero roots is handled explicitly
    if numRoots == 0:
        return ranked_cells[:0]

    if pseudotime_DC_rootNodes == "low":
        return ranked_cells[:numRoots] # lowest values
    return ranked_cells[-numRoots:] # highest values

pseudotime_mean (overwrite)

In [None]:
def pseudotime_mean(adataHiVar, root_nodes):
    """Average diffusion pseudotime over several root cells and plot the result.

    For every entry of ``root_nodes`` the root is stored in ``uns['iroot']``,
    ``sc.tl.dpt`` is run and the resulting ``dpt_pseudotime`` column is kept.
    The per-cell mean over all roots is written to ``obs['dpt_pseudotime_mean']``.
    Figures are saved into ``figDir``.

    Parameters
    ----------
    adataHiVar
        AnnData object with ``leiden_label`` and ``Timepoint`` in ``obs``.
    root_nodes
        Iterable of root cell positions.

    Returns
    -------
    (adataHiVar, df_pseudotime) where ``df_pseudotime`` has one "Root Node <i>"
    column per root plus the "Mean" and "Cluster" columns.
    """
    # One pseudotime column per root; dpt overwrites obs['dpt_pseudotime'] on each run,
    # so a copy is stored every time
    per_root = {}
    for root in root_nodes:
        adataHiVar.uns['iroot'] = root # root cell assigned
        sc.tl.dpt(adataHiVar)
        per_root["Root Node "+str(root)] = adataHiVar.obs['dpt_pseudotime'].copy()
    df_pseudotime = pd.DataFrame(per_root)

    # Mean over the root columns only (the cluster column is added afterwards)
    df_pseudotime["Mean"] = df_pseudotime.mean(axis=1)
    df_pseudotime["Cluster"] = adataHiVar.obs['leiden_label'].copy()
    adataHiVar.obs['dpt_pseudotime_mean'] = df_pseudotime["Mean"].copy()

    # Figures
    sc.settings.figdir = figDir
    diffmap_panels = (
        {"color": ['dpt_pseudotime_mean'], "save": " - Pseudotime Mean.png"},
        {"color": ['leiden_label'], "save": " - clusters.png"},
        {"color": ['leiden_label'], "components": ['1,2', '1,3', '2,3'], "save": " - clusters, 3 components.png"},
    )
    for panel in diffmap_panels:
        sc.pl.diffmap(adataHiVar, **panel)
    sc.pl.umap(adataHiVar, color=['dpt_pseudotime_mean'], save=" - Pseudotime Mean.png")


    ### COOK UNIQUE ###
    sc.pl.diffmap(adataHiVar, color=["Timepoint"], save=" - Timepoint.png")
    # The comparison with the authors' pseudotime is only drawn when that column exists
    if "Authors_Pseudotime" in adataHiVar.obs:
        sc.pl.umap(adataHiVar, color=['dpt_pseudotime_mean', 'Authors_Pseudotime'], save=" - Pseudotime Mean vs Authors Pseudotime.png")
    ### /COOK UNIQUE ###


    return adataHiVar, df_pseudotime