## PAGA Analysis (with timeseries z-score Dataframe ready)

- This is an attempt at compiling a final version of the community detection and downstream steps (i.e. the main analysis)

- Later, this should be merged with all relevant upstream processing to get z-score timeseries

- Try to cut down imports at that point

- That final notebook should also include real (not z-score) timeseries information

- Note that there are fluctuations in the illumination intensity which may be resulting in pathological behavior from the reporter

- This has been normalized out in the upstream processing, but try to fix long term

- Also consider a flat field correction for the final experiment

In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da
import dask
import warnings
import copy
import random
from sklearn.metrics.pairwise import (
    euclidean_distances,
    manhattan_distances,
    cosine_distances,
)

from sklearn.metrics import silhouette_score
import scipy.stats
from sklearn.linear_model import LinearRegression
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering

import scanpy as sc
import anndata
import scipy as sp
import scipy.sparse
import dask.array as da
from igraph.drawing.text import TextDrawer
from tslearn.neighbors import KNeighborsTimeSeries
from tslearn.metrics import cdist_soft_dtw_normalized, cdist_soft_dtw
import networkx as nx
import igraph as ig
import leidenalg
import umap
from scanpy.plotting.palettes import default_20, vega_20_scanpy
from matplotlib import pyplot as plt
import ast
from tslearn.barycenters import (
    softdtw_barycenter,
    dtw_barycenter_averaging,
    euclidean_barycenter,
)


import pylab
import scipy.cluster.hierarchy as sch

import matplotlib.gridspec as gridspec
import matplotlib as mpl

import holoviews as hv

# Use the Bokeh backend for HoloViews output.
hv.extension("bokeh")
# Seed Python's and NumPy's global random number generators.
random.seed(42)
np.random.seed(42)

# Show each distinct UserWarning only once.
warnings.filterwarnings(action="once", category=UserWarning)

### Initial Data Processing

Here, I am going to try to replicate (to some extent) the corrections from "Genomewide phenotypic analysis of growth, cell morphogenesis, and cell cycle events in Escherichia coli"

#### Start Dask

In [None]:
# Path to this run's "Barcodes" directory; the Dask working directory is
# created underneath it.
headpath = (
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Barcodes"
)

In [None]:
# Configure trenchripper's Dask controller (non-local, 40 workers, 4 h
# walltime) with <headpath>/dask as its working directory, then start it.
dask_controller = tr.trcluster.dask_controller(
    walltime="04:00:00",
    local=False,
    n_workers=40,
    memory="16GB",
    working_directory=headpath + "/dask",
)
dask_controller.startdask()

In [None]:
# Show the dashboard of the Dask controller created in the previous cell.
dask_controller.displaydashboard()

In [None]:
# dask_controller.shutdown()

In [None]:
# Load the unfiltered gene/cluster table.
# NOTE(review): unpickling can execute arbitrary code; only load trusted files.
gene_cluster_df_full = pd.read_pickle(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/2021-08-24_gene_cluster_df_no_filter.pkl"
)
# Drop every row that has no "Gene" value.
gene_cluster_df_full = gene_cluster_df_full.dropna(subset=["Gene"])  # no control genes

## 1) Preprocessing

In [None]:
def parallel_norm_soft_dtw(X, chunk_size=200):
    """Compute a normalized pairwise soft-DTW matrix for ``X`` using Dask.

    ``X`` is a 3-D array that is chunked along its first axis only; every
    pair of chunks is handed to ``cdist_soft_dtw`` and the results are
    assembled into one square matrix.  The raw matrix is then normalized as

        normalized[i, k] = raw[i, k] - 0.5 * (raw[i, i] + raw[k, k])

    Parameters
    ----------
    X : array of rank 3
        Presumably (n_series, n_timepoints, n_dims), matching the ``itd``
        index labels used below -- TODO confirm against callers.
    chunk_size : int
        Number of series per Dask chunk along the first axis.

    Returns
    -------
    numpy.ndarray
        The normalized matrix, indexed by (series, series).
    """
    # Only the first axis is split; the other two stay in a single chunk.
    chunked = da.from_array(X, chunks=(chunk_size, X.shape[1], X.shape[2]))
    raw = da.blockwise(
        cdist_soft_dtw, "ik", chunked, "itd", chunked, "ktd", concatenate=True
    ).compute()

    # Subtract the average of the two self-distances from each entry.
    self_dist = np.diag(raw)
    pair_offset = 0.5 * (self_dist.reshape((-1, 1)) + self_dist.reshape((1, -1)))
    return raw - pair_offset

### Relabel timeseries (correct this upstream later)

In [None]:
# Replace the long upstream "Kernel Trace: ..." column names with short labels.
# The first six entries are the Yeo-Johnson z-score traces ("<name> Z-score");
# the last six are the corresponding untransformed traces ("<name>").
gene_cluster_df_full = gene_cluster_df_full.rename(
    columns={
        "Kernel Trace: Division: major_axis_length: Yeo-Johnson: z score": "Division Length Z-score",
        "Kernel Trace: Mean Linear Growth Rate: Volume: Yeo-Johnson: z score": "Linear Growth Rate Z-score",
        "Kernel Trace: Mean Exponential Growth Rate: Volume: Yeo-Johnson: z score": "Exponential Growth Rate Z-score",
        "Kernel Trace: Mean: minor_axis_length: Yeo-Johnson: z score": "Width Z-score",
        "Kernel Trace: Mean: mCherry Intensity: Yeo-Johnson: z score": "mCherry Intensity Z-score",
        "Kernel Trace: Delta t: Yeo-Johnson: z score": "Doubling Time Z-score",
        "Kernel Trace: Division: major_axis_length": "Division Length",
        "Kernel Trace: Mean Linear Growth Rate: Volume": "Linear Growth Rate",
        "Kernel Trace: Mean Exponential Growth Rate: Volume": "Exponential Growth Rate",
        "Kernel Trace: Mean: minor_axis_length": "Width",
        "Kernel Trace: Mean: mCherry Intensity": "mCherry Intensity",
        "Kernel Trace: Delta t": "Doubling Time",
    }
)

### Take mean z-scores over the timeseries

In [None]:
# Short labels of the raw timeseries columns.
traces = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# Matching z-score timeseries columns.
zscore_traces = [trace + " Z-score" for trace in traces]

# Each cell of these columns holds one timeseries; store its mean in a new
# "<label>: Mean" column (raw traces first, then z-score traces).
# Iterating the column directly avoids DataFrame.apply(axis=1), which builds
# a full row Series for every record just to read a single cell.
for label in traces + zscore_traces:
    gene_cluster_df_full[label + ": Mean"] = [
        np.mean(timeseries) for timeseries in gene_cluster_df_full[label]
    ]

## 2) Transiently Elongated Cells

In [None]:
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.preprocessing import TimeSeriesScalerMeanVariance


def plot_cluster_timeseries(
    df,
    cluster_label,
    feature_labels,
    displayed_labels,
    feature_range_list,
    agg_fn=np.mean,
    x_ticks=[0, 10, 20],
    cluster_subset=None,
    figsize=(10, 10),
    wspace=0.0,
    hspace=0.0,
    fontsize=14,
    linewidth=5,
    color_list=None,
):
    """Plot one column of stacked timeseries panels per cluster.

    For every cluster (group of ``df[cluster_label]``) and every feature, the
    per-row timeseries are stacked and reduced with ``agg_fn(..., axis=0)``.
    The figure has one column per cluster and one row per feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose ``feature_labels`` columns hold one timeseries per cell.
    cluster_label : str
        Column to group by.
    feature_labels : list of str
        Timeseries columns to plot, one panel row each.
    displayed_labels : list of str
        Y-axis labels for the rows (shown on the first column only).
    feature_range_list : list of (low, high)
        Y-limits for each row.
    agg_fn : callable
        Reduction applied across the rows of a cluster (called with ``axis=0``).
    x_ticks : list
        X tick positions for the bottom panel of each column.  The default
        list is never mutated here.
    cluster_subset : iterable or None
        If given, only clusters whose label is in this collection are drawn.
    figsize, wspace, hspace, fontsize, linewidth
        Layout and styling options passed to matplotlib.
    color_list : sequence or None
        Line color per cluster column, indexed by column position;
        ``"tab:blue"`` is used for every column when None.

    Returns
    -------
    matplotlib.figure.Figure
    """
    if cluster_subset is not None:
        # Boolean indexing already returns a new frame; the input is untouched.
        df = df[df[cluster_label].isin(cluster_subset)]

    # One column per feature; each cell is that cluster's aggregated series.
    timeseries_list = []
    for feature_label in feature_labels:
        agg_cluster_timeseries = (
            df.groupby([cluster_label])
            .apply(lambda x: agg_fn(np.array(x[feature_label].tolist()), axis=0))
            .to_frame()
            .rename(columns={0: feature_label})
        )
        timeseries_list.append(agg_cluster_timeseries)
    timeseries_df = pd.concat(timeseries_list, axis=1)

    n_clusters = len(timeseries_df)
    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, n_clusters, wspace=wspace)

    for i in range(n_clusters):
        # Rows of clust_arr are features, in feature_labels order.
        clust_arr = np.array(timeseries_df.iloc[i].tolist())
        # ``is None`` (not ``== None``) so array-like color lists are accepted.
        color = "tab:blue" if color_list is None else color_list[i]
        is_first_column = i == 0

        inner_gs = gs[0, i].subgridspec(
            clust_arr.shape[0], 1, wspace=0, hspace=hspace
        )
        # squeeze=False always yields a 2-D Axes array, so a single feature
        # works too (a lone squeezed Axes cannot be indexed by row).
        axes = inner_gs.subplots(sharex=True, squeeze=False)[:, 0]
        for row, ax in enumerate(axes):
            feature_range = feature_range_list[row]
            ax.plot(clust_arr[row], linewidth=linewidth, color=color)
            ax.set_ylim(feature_range[0], feature_range[1])
            if is_first_column:
                # Only the first column keeps y ticks and carries row labels.
                ax.set(xticks=[])
                ax.set_ylabel(
                    displayed_labels[row],
                    rotation=0,
                    labelpad=30,
                    fontsize=fontsize,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # ``ax`` is now the bottom panel.  Label it with the actual cluster
        # key so panels stay correctly identified when cluster_subset is used.
        ax.set_xlabel(str(timeseries_df.index[i]), fontsize=fontsize)
        ax.set(xticks=x_ticks)

    # NOTE(review): tight_layout on a constrained_layout figure is kept from
    # the original so the rendered layout does not change; consider using
    # only one of the two layout mechanisms.
    plt.tight_layout()
    return fig


def get_braycenters(df, columns=None, max_iter=50, tol=0.001):
    """Return the soft-DTW barycenter of the timeseries stored in ``df``.

    (The name is a misspelling of "barycenters"; it is kept for callers.)

    Parameters
    ----------
    df : pandas.DataFrame
        Each cell of the selected columns holds one timeseries; all are
        assumed to have equal length so they stack into a regular array.
    columns : list of str or None
        Columns to use as the dimensions of the multivariate series.  None
        selects every column of ``df`` (previously ``df[None]`` raised).
    max_iter, tol
        Passed through to ``softdtw_barycenter``.

    Returns
    -------
    numpy.ndarray
        Barycenter with time on axis 0 and one column per selected feature.
    """
    if columns is None:
        columns = list(df.columns)
    df = df[columns]

    # rows -> list of per-column series: (n_rows, n_columns, n_timepoints)
    X = np.array(df.apply(lambda x: x.tolist(), axis=1).tolist())
    # Move time before the feature axis: (n_rows, n_timepoints, n_columns)
    X = np.swapaxes(X, 1, 2)

    Y = softdtw_barycenter(X, max_iter=max_iter, tol=tol)  # T X D

    return Y


def plot_cluster_timeseries_braycenters(
    df,
    cluster_label,
    feature_labels,
    displayed_labels,
    feature_range_list,
    x_ticks=[0, 10, 20],
    cluster_subset=None,
    figsize=(10, 10),
    wspace=0.0,
    hspace=0.0,
    fontsize=14,
    linewidth=3,
    color_list=None,
):
    """Plot each cluster's soft-DTW barycenter, one panel row per feature.

    For every cluster (group of ``df[cluster_label]``) a joint barycenter of
    the ``feature_labels`` timeseries is computed with ``get_braycenters``.
    The figure has one column per cluster and one row per feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose ``feature_labels`` columns hold one timeseries per cell.
    cluster_label : str
        Column to group by.
    feature_labels : list of str
        Timeseries columns to plot, one panel row each.
    displayed_labels : list of str
        Y-axis labels for the rows (shown on the first column only).
    feature_range_list : list of (low, high)
        Y-limits for each row.
    x_ticks : list
        X tick positions for the bottom panel of each column.  The default
        list is never mutated here.
    cluster_subset : iterable or None
        If given, only clusters whose label is in this collection are drawn.
    figsize, wspace, hspace, fontsize, linewidth
        Layout and styling options passed to matplotlib.
    color_list : sequence or None
        Line color per cluster column, indexed by column position;
        ``"tab:blue"`` is used for every column when None.

    Returns
    -------
    matplotlib.figure.Figure
    """
    if cluster_subset is not None:
        # Boolean indexing already returns a new frame; the input is untouched.
        df = df[df[cluster_label].isin(cluster_subset)]

    # One barycenter array (time x feature) per cluster.
    cluster_barycenters = df.groupby([cluster_label]).apply(
        lambda x: get_braycenters(x, columns=feature_labels)
    )

    # Split every barycenter into one column per feature.
    timeseries_list = []
    for feature_idx, feature_label in enumerate(feature_labels):
        selected_feature_agg = (
            cluster_barycenters.apply(lambda x: x[:, feature_idx])
            .to_frame()
            .rename(columns={0: feature_label})
        )
        timeseries_list.append(selected_feature_agg)
    timeseries_df = pd.concat(timeseries_list, axis=1)

    n_clusters = len(timeseries_df)
    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, n_clusters, wspace=wspace)

    for i in range(n_clusters):
        # Rows of clust_arr are features, in feature_labels order.
        clust_arr = np.array(timeseries_df.iloc[i].tolist())
        # ``is None`` (not ``== None``) so array-like color lists are accepted.
        color = "tab:blue" if color_list is None else color_list[i]
        is_first_column = i == 0

        inner_gs = gs[0, i].subgridspec(
            clust_arr.shape[0], 1, wspace=0, hspace=hspace
        )
        # squeeze=False always yields a 2-D Axes array, so a single feature
        # works too (a lone squeezed Axes cannot be indexed by row).
        axes = inner_gs.subplots(sharex=True, squeeze=False)[:, 0]
        for row, ax in enumerate(axes):
            feature_range = feature_range_list[row]
            ax.plot(clust_arr[row], linewidth=linewidth, color=color)
            ax.set_ylim(feature_range[0], feature_range[1])
            if is_first_column:
                # Only the first column keeps y ticks and carries row labels.
                ax.set(xticks=[])
                ax.set_ylabel(
                    displayed_labels[row],
                    rotation=0,
                    labelpad=30,
                    fontsize=fontsize,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # ``ax`` is now the bottom panel.  Label it with the actual cluster
        # key so panels stay correctly identified when cluster_subset is used.
        ax.set_xlabel(str(timeseries_df.index[i]), fontsize=fontsize)
        ax.set(xticks=x_ticks)

    # NOTE(review): tight_layout on a constrained_layout figure is kept from
    # the original so the rendered layout does not change; consider using
    # only one of the two layout mechanisms.
    plt.tight_layout()
    return fig

### Renormalization

In [None]:
# Minimum peak z-score a trace must exceed to be kept.
norm_threshold = 1
# Simpson-rule integral of each Division Length z-score trace
# (computed here but not used by the filter below).
feature_integrated_norm = gene_cluster_df_full["Division Length Z-score"].apply(
    lambda x: sp.integrate.simpson(x)
)
# Peak value of each Division Length z-score trace.
feature_max_norm = gene_cluster_df_full["Division Length Z-score"].apply(
    lambda x: np.max(x)
)
# Keep rows whose peak exceeds the threshold.  .copy() makes the result an
# independent frame, so adding columns to it later does not raise
# SettingWithCopyWarning against gene_cluster_df_full.
feature_filtered_df = gene_cluster_df_full[feature_max_norm > norm_threshold].copy()

In [None]:
# Stack the raw "Division Length" traces into one array and append a trailing
# length-1 axis.
X_feature = np.array(feature_filtered_df["Division Length"].tolist())[:, :, np.newaxis]

In [None]:
# Per-trace maximum, minimum and first value (all reduced over axis 1).
V_max = np.max(X_feature, axis=1, keepdims=True)
V_min = np.min(X_feature, axis=1, keepdims=True)
V_i = X_feature[:, 0:1]
# Offset each trace by its FIRST value (not its minimum) and scale by its
# range, so every trace starts at 0; values can be negative.
# NOTE(review): a perfectly flat trace (V_max == V_min) divides by zero here.
min_max_norm = (X_feature - V_i) / (V_max - V_min)
# Store one normalized 1-D trace per row.
feature_filtered_df["Division Length Feature Norm"] = [
    item for item in min_max_norm[:, :, 0]
]

In [None]:
# Quick visual check: normalized trace of the first filtered row.
plt.plot(min_max_norm[0])

In [None]:
# Pairwise normalized soft-DTW matrix between all normalized traces.
dtw_feature_norm = parallel_norm_soft_dtw(min_max_norm)

### Initialize Anndata Object

In [None]:
# X: each normalized trace flattened to one row; obs: the filtered table.
an_df_feature = anndata.AnnData(
    X=min_max_norm.reshape(min_max_norm.shape[0], -1), obs=feature_filtered_df
)  # AnnData container to use scanpy functions with unwrapped time vector

### Compute KNN Graph

In [None]:
n_neighbors = 15
n_pcs = 20  # This shouldn't affect anything

# Run scanpy's standard neighbors step first; the graph it stores in
# uns["neighbors"] is overwritten below with one built from the precomputed
# soft-DTW matrix (presumably this call is only needed to create the
# uns["neighbors"] entry -- TODO confirm).
sc.pp.neighbors(an_df_feature, n_neighbors=n_neighbors, n_pcs=n_pcs)
# NOTE(review): compute_neighbors_umap and _compute_connectivities_umap are
# scanpy internals (the latter is underscore-private) and may not exist in
# other scanpy versions -- confirm against the pinned version.
knn_indices, knn_dists, forest = sc.neighbors.compute_neighbors_umap(
    dtw_feature_norm, n_neighbors=n_neighbors, metric="precomputed"
)
(
    an_df_feature.uns["neighbors"]["distances"],
    an_df_feature.uns["neighbors"]["connectivities"],
) = sc.neighbors._compute_connectivities_umap(
    knn_indices,
    knn_dists,
    an_df_feature.shape[0],
    n_neighbors,  # change to neighbors you plan to use
)
# Mirror the graph into obsp and keep the full soft-DTW matrix alongside it.
an_df_feature.obsp["distances"] = an_df_feature.uns["neighbors"]["distances"]
an_df_feature.obsp["connectivities"] = an_df_feature.uns["neighbors"]["connectivities"]
an_df_feature.obsp["soft_dtw"] = dtw_feature_norm

### Computing Leiden, PAGA and UMAP

In [None]:
# Run Leiden + PAGA on an independent deep copy of the AnnData object at
# three resolutions, keyed by resolution.
feature_paga_df_dict = {}
for resolution in [0.25, 1.0, 1.5]:
    feature_paga_df_dict[resolution] = copy.deepcopy(an_df_feature)
    sc.tl.leiden(
        feature_paga_df_dict[resolution], resolution=resolution, n_iterations=-1
    )
    sc.tl.paga(feature_paga_df_dict[resolution], groups="leiden")
    # add_pos=True stores the PAGA layout, which init_pos="paga" uses below.
    sc.pl.paga(feature_paga_df_dict[resolution], add_pos=True, show=False)
# The UMAP is computed only for the resolution-1.0 object ...
sc.tl.umap(feature_paga_df_dict[1.0], init_pos="paga", min_dist=0.25, spread=5.0)
# ... and the other two clusterings are attached to it as extra obs columns.
feature_paga_df_dict[1.0].obs["leiden_lowres"] = feature_paga_df_dict[0.25].obs[
    "leiden"
]
feature_paga_df_dict[1.0].obs["leiden_highres"] = feature_paga_df_dict[1.5].obs[
    "leiden"
]
feature_paga_df = feature_paga_df_dict[1.0]

In [None]:
# One UMAP panel per Leiden resolution, with cluster labels drawn on the data
# and graph edges shown.
fig = sc.pl.umap(
    feature_paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres"],
    title=["Leiden Resolution=0.25", "Leiden Resolution=1.", "Leiden Resolution=1.5"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)

In [None]:
# Matches = 20 minus mismatches.
# NOTE(review): 20.0 is presumably the full target length -- confirm.
feature_paga_df.obs["N Match"] = 20.0 - feature_paga_df.obs["N Mismatch"]
# Within each TargetID, express N Match relative to that group's minimum.
feature_del_N_match_series = feature_paga_df.obs.groupby("TargetID").apply(
    lambda x: x["N Match"] - np.min(x["N Match"])
)
# Drop the TargetID level so the values align with obs' own index on assignment.
feature_del_N_match_series = feature_del_N_match_series.droplevel("TargetID")
feature_paga_df.obs["Delta N Match"] = feature_del_N_match_series

In [None]:
# Last timepoint of each normalized trace.
feature_paga_df.obs["Division Length Feature Norm Final Value"] = feature_paga_df.obs[
    "Division Length Feature Norm"
].apply(lambda x: x[-1])

In [None]:
# Index of the first timepoint at which the trace reaches at least half of
# its own maximum.
feature_paga_df.obs["Division Length Feature Norm t Half Max"] = feature_paga_df.obs[
    "Division Length Feature Norm"
].apply(lambda x: np.where(x >= (np.max(x) / 2.0))[0][0])

In [None]:
# Distributions of the half-max index and of the final normalized value.
plt.hist(feature_paga_df.obs["Division Length Feature Norm t Half Max"], bins=20)
plt.show()
plt.hist(feature_paga_df.obs["Division Length Feature Norm Final Value"], bins=20)
plt.show()

In [None]:
# Genes (with row counts) whose half-max index lies in [3, 4).
np.unique(
    feature_paga_df.obs[
        (feature_paga_df.obs["Division Length Feature Norm t Half Max"] >= 3.0)
        & (feature_paga_df.obs["Division Length Feature Norm t Half Max"] < 4.0)
    ]["Gene"].tolist(),
    return_counts=True,
)

In [None]:
# Rows whose normalized trace ends above 0.5.
df_low_compensation = feature_paga_df.obs[
    feature_paga_df.obs["Division Length Feature Norm Final Value"] > 0.5
]
# Per-gene NaN-ignoring MEDIAN of the half-max index (despite the "mean" in
# the variable name).
gene_groupby_mean = df_low_compensation.groupby("Gene")[
    "Division Length Feature Norm t Half Max"
].apply(lambda x: np.nanmedian(x))

In [None]:
# Distribution of the per-gene median half-max index.
plt.hist(gene_groupby_mean, bins=20)
plt.show()

In [None]:
# Genes whose median half-max index is below 7 (displayed as cell output).
gene_groupby_mean[gene_groupby_mean < 7.0]

In [None]:
# Raw Division Length trace of the first row annotated with gene "tff".
# .iloc[0] selects by position explicitly; `[0]` on this string-indexed
# Series relied on pandas' deprecated integer-key positional fallback.
plt.plot(
    feature_paga_df.obs[feature_paga_df.obs["Gene"] == "tff"]["Division Length"].iloc[0]
)

In [None]:
# Raw Width trace of the first row annotated with gene "xseB".
# .iloc[0] selects by position explicitly; `[0]` on this string-indexed
# Series relied on pandas' deprecated integer-key positional fallback.
plt.plot(feature_paga_df.obs[feature_paga_df.obs["Gene"] == "xseB"]["Width"].iloc[0])

In [None]:
# Raw Division Length trace of the first "xseB" row (displayed as cell output).
# .iloc[0] selects by position explicitly; `[0]` on this string-indexed
# Series relied on pandas' deprecated integer-key positional fallback.
feature_paga_df.obs[feature_paga_df.obs["Gene"] == "xseB"]["Division Length"].iloc[0]

In [None]:
# UMAP coloured by the final normalized value and by the half-max index.
fig = sc.pl.umap(
    feature_paga_df,
    color=[
        "Division Length Feature Norm Final Value",
        "Division Length Feature Norm t Half Max",
    ],
    show=False,
    legend_loc="on data",
    add_outline=False,
    size=50,
    return_fig=True,
    cmap="RdBu_r",
    wspace=0.25,
)

In [None]:
# Per-cluster soft-DTW barycenters of the normalized trace, y-range (-1, 1).
# The same feature is passed twice, presumably because the plotting helper
# needs at least two panel rows -- TODO confirm.
fig = plot_cluster_timeseries_braycenters(
    feature_paga_df.obs,
    "leiden",
    ["Division Length Feature Norm", "Division Length Feature Norm"],
    ["Division Length Feature Norm", "Division Length Feature Norm"],
    [(-1, 1), (-1, 1)],
    figsize=(15, 6),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)

### GO Term Enrichment

In [None]:
import goatools
import goatools.base
from goatools.base import download_go_basic_obo

from goatools.obo_parser import GODag
from goatools.anno.gaf_reader import GafReader
from goatools.semantic import semantic_similarity
from goatools.semantic import TermCounts, get_info_content

from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.go_enrichment import GOEnrichmentStudy


def search_go(ns2assoc, obodag, inv_gene_to_id, go_term):
    """List the genes annotated with ``go_term`` or any of its descendants.

    Parameters
    ----------
    ns2assoc : dict
        Maps a namespace abbreviation ("BP", "MF", "CC") to a mapping of
        gene ID -> collection of GO term IDs.
    obodag : mapping
        Maps a GO term ID to a term object exposing ``name``, ``namespace``
        and ``get_all_children()``.
    inv_gene_to_id : dict
        Maps a gene ID to the gene name that is returned.
    go_term : str
        GO term ID to search for.

    Returns
    -------
    list of str
        Sorted, de-duplicated gene names.

    Raises
    ------
    KeyError
        If a matching gene ID is missing from ``inv_gene_to_id``.
    """
    namespace_abbv = {
        "biological_process": "BP",
        "molecular_function": "MF",
        "cellular_component": "CC",
    }

    term = obodag[go_term]
    print("Searching for " + str(term.name))
    namespace = namespace_abbv[term.namespace]

    # Match against the term and all of its descendants in a single pass over
    # the associations, instead of rescanning them once per descendant.
    target_terms = {go_term}
    target_terms.update(term.get_all_children())

    gene_names = {
        inv_gene_to_id[gene_id]
        for gene_id, annotated_terms in ns2assoc[namespace].items()
        if not target_terms.isdisjoint(annotated_terms)
    }
    return sorted(gene_names)


def get_enriched_GO_terms(
    background_gene_list, gene_list, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
):
    """Return the GO terms significantly enriched in ``gene_list``.

    Gene names are translated to annotation IDs through the symbols and
    synonyms found in ``objanno.associations``; names without an ID are
    silently skipped.  A ``GOEnrichmentStudy`` is run with
    ``background_gene_list`` as the population and Benjamini-Hochberg FDR
    correction.

    Parameters
    ----------
    background_gene_list, gene_list : iterable of str
        Gene names of the population and of the study set.
    obodag, objanno, ns2assoc
        GO DAG, annotation reader, and namespace -> associations mapping.
    pval : float
        Used both as the study ``alpha`` and as the cut-off on ``p_fdr_bh``.
    GO_type : str
        Key into ``ns2assoc``.

    Returns
    -------
    list
        Result records with ``p_fdr_bh < pval`` and ``enrichment == "e"``.
    """
    associations = objanno.associations

    # Name -> ID lookup: primary symbols first, then synonyms.
    # NOTE(review): a synonym that equals another gene's primary symbol
    # overrides that symbol's ID here -- confirm this precedence is intended.
    gene_to_id = {assoc.DB_Symbol: assoc.DB_ID for assoc in associations}
    for assoc in associations:
        for synonym in assoc.DB_Synonym:
            gene_to_id[synonym] = assoc.DB_ID

    def to_ids(gene_names):
        # Translate names to IDs, dropping names without an annotation.
        return [gene_to_id[name] for name in gene_names if name in gene_to_id]

    all_genes_uniprot = to_ids(background_gene_list)
    selected_genes_uniprot = to_ids(gene_list)

    # Report how many background / study genes could be mapped.
    print(len(all_genes_uniprot))
    print(len(selected_genes_uniprot))

    goeaobj = GOEnrichmentStudy(
        all_genes_uniprot,  # background (population) gene IDs
        ns2assoc[GO_type],  # gene ID -> GO term associations
        obodag,  # ontology DAG
        propagate_counts=True,
        alpha=pval,  # significance cut-off
        methods=["fdr_bh"],  # multiple-testing correction
    )

    goea_results_all = goeaobj.run_study(selected_genes_uniprot, prt=None)
    return [
        record
        for record in goea_results_all
        if record.p_fdr_bh < pval and record.enrichment == "e"
    ]


def pick_exemplar(go1, go2, termcounts, obodag, info_thr, pval_factor=2.0):
    """Choose which of two enrichment records should represent their group.

    The decision rules are applied in order:

    1. If exactly one term has information content below ``info_thr``, the
       other one wins; if both are below it, ``go1`` wins.
    2. If the corrected p-values differ by more than ``pval_factor``, the
       record with the smaller p-value wins.
    3. If one term is an ancestor of the other, the ancestor wins.
    4. Otherwise ``go1`` wins.

    Parameters
    ----------
    go1, go2
        Enrichment records exposing ``GO`` and ``p_fdr_bh``.
    termcounts
        Term counts passed to ``get_info_content``.
    obodag : mapping
        Maps a GO ID to a term object exposing ``get_all_parents()``.
    info_thr : float
        Information-content threshold.
    pval_factor : float
        Minimum p-value ratio treated as a meaningful difference.

    Returns
    -------
    ``go1`` or ``go2``.
    """
    info_1_low = get_info_content(go1.GO, termcounts) < info_thr
    info_2_low = get_info_content(go2.GO, termcounts) < info_thr
    if info_1_low and not info_2_low:
        return go2
    if info_1_low or info_2_low:
        # Either only go2 is low, or both are low: go1 wins in both cases.
        return go1

    # Compare by multiplication rather than forming p1 / p2, so a p-value of
    # exactly zero cannot raise ZeroDivisionError.
    p1, p2 = go1.p_fdr_bh, go2.p_fdr_bh
    if p1 > pval_factor * p2:
        return go2
    if p1 * pval_factor < p2:
        return go1

    go1_parents = obodag[go1.GO].get_all_parents()
    go2_parents = obodag[go2.GO].get_all_parents()

    if go2.GO in go1_parents:
        return go2
    if go1.GO in go2_parents:
        return go1

    return go1


def get_filtered_go_terms(
    obodag, objanno, goea_list, sim_thr=0.05, info_thr=1.0, GO_type="BP"
):
    """Group semantically similar enriched GO terms and return one exemplar each.

    A pairwise ``semantic_similarity`` matrix is built for the terms in
    ``goea_list``.  Terms are then visited in order: each is paired with its
    most similar term and, when that similarity exceeds ``sim_thr``, either
    joins the group already containing that partner or starts a new group
    with it.  ``pick_exemplar`` keeps one representative per group.

    NOTE(review): a term whose best similarity is not above ``sim_thr`` is
    never placed in a group, so it is absent from the result -- confirm that
    dropping such singleton terms is intended.

    Parameters
    ----------
    obodag, objanno
        GO DAG and annotation reader.
    goea_list : list
        Enrichment records exposing ``GO``.
    sim_thr : float
        Minimum similarity for two terms to be grouped.
    info_thr : float
        Information-content threshold forwarded to ``pick_exemplar``.
    GO_type : str
        Namespace key used to build the term counts.

    Returns
    -------
    list
        One exemplar record per group, in group-creation order.
    """
    termcounts = TermCounts(obodag, objanno.get_ns2assc()[GO_type])

    go_ids = [record.GO for record in goea_list]
    n_terms = len(go_ids)

    # Full pairwise similarity matrix; self-similarity is zeroed so a term is
    # never selected as its own best match.
    sim_arr = np.zeros((n_terms, n_terms))
    for row, go_a in enumerate(go_ids):
        for col, go_b in enumerate(go_ids):
            sim_arr[row, col] = semantic_similarity(go_a, go_b, obodag)
    np.fill_diagonal(sim_arr, 0.0)

    next_group_idx = 0
    grouped_terms = {}  # group index -> member term indices
    group_exemplars = {}  # group index -> current exemplar record
    pending = list(range(n_terms))

    while pending:
        i = pending[0]
        partner = np.argmax(sim_arr[i])
        if sim_arr[i, partner] > sim_thr:
            # With no groups yet this list is empty, which falls through to
            # the "start a new group" branch below.
            owner_keys = [
                key for key, members in grouped_terms.items() if partner in members
            ]
            if len(owner_keys) == 1:
                # The partner already belongs to a group: join it.
                owner = owner_keys[0]
                grouped_terms[owner] = grouped_terms[owner] + [i]
                group_exemplars[owner] = pick_exemplar(
                    group_exemplars[owner],
                    goea_list[i],
                    termcounts,
                    obodag,
                    info_thr,
                )
            else:
                # Start a new group from this pair and retire the partner.
                grouped_terms[next_group_idx] = [i, partner]
                group_exemplars[next_group_idx] = pick_exemplar(
                    goea_list[i],
                    goea_list[partner],
                    termcounts,
                    obodag,
                    info_thr,
                )
                next_group_idx += 1
                pending.remove(partner)
        pending.remove(i)

    return list(group_exemplars.values())


def get_GO_assign_dict(selected_goea, cluster_genes_uniprot):
    """Assign every gene ID to the deepest selected GO term that contains it.

    Terms are visited from the greatest ``depth`` to the smallest; terms of
    equal depth keep their order in ``selected_goea``.  The first term that
    lists a gene among its ``study_items`` claims it.  IDs claimed by no term
    map to ``"Unassigned"``.

    Parameters
    ----------
    selected_goea : iterable
        Records exposing ``depth``, ``name`` and ``study_items``.
    cluster_genes_uniprot : list
        Hashable gene IDs to assign.  Not modified.  Duplicates are handled:
        a claimed ID is no longer overwritten with ``"Unassigned"``.

    Returns
    -------
    dict
        Gene ID -> GO term name or ``"Unassigned"``.
    """
    # A set gives O(1) membership tests and collapses duplicate IDs.
    unclaimed = set(cluster_genes_uniprot)
    assign_dict = {}

    # sorted() is stable (also with reverse=True), so equal-depth terms stay
    # in their original order.
    deepest_first = sorted(selected_goea, key=lambda item: item.depth, reverse=True)
    for go_term in deepest_first:
        for study_item in go_term.study_items:
            if study_item in unclaimed:
                assign_dict[study_item] = go_term.name
                unclaimed.discard(study_item)

    # Everything left over, in input order.
    for gene_id in cluster_genes_uniprot:
        assign_dict.setdefault(gene_id, "Unassigned")

    return assign_dict

In [None]:
# Get ontologies
obo_fname = download_go_basic_obo()

# Get ecoli association file (ecocyc)
# NOTE(review): fetched over plain http; consider https if the host supports it.
gaf_handle = goatools.base.http_get(
    "http://current.geneontology.org/annotations/ecocyc.gaf.gz", fout="./ecocyc.gaf.gz"
)
gaf_fname = goatools.base.gunzip("./ecocyc.gaf.gz")

## Load the ontology DAG, the annotations, and the per-namespace associations

obodag = GODag(obo_fname)
objanno = GafReader(gaf_fname)
ns2assoc = objanno.get_ns2assc()

# Gene symbol -> ID, and the inverse ID -> symbol lookup.
gene_to_id = {assoc.DB_Symbol: assoc.DB_ID for assoc in objanno.associations}
inv_gene_to_id = {assoc.DB_ID: assoc.DB_Symbol for assoc in objanno.associations}
# Synonyms are added to the symbol lookup and override clashing symbols.
synonym_dict = {
    synonym: assoc.DB_ID
    for assoc in objanno.associations
    for synonym in assoc.DB_Synonym
}
gene_to_id.update(synonym_dict)

In [None]:
# background gene set

# Background: every gene present in the filtered dataset.
all_genes = feature_paga_df.obs["Gene"].unique().tolist()

# Study set: genes of one high-resolution Leiden cluster (hard-coded ID).
clust_id = str(17)
cluster_genes = sorted(
    feature_paga_df.obs[feature_paga_df.obs["leiden_highres"] == clust_id]["Gene"]
    .unique()
    .tolist()
)

# Enrichment, then collapse similar terms to one exemplar per group.
goea_quiet_enriched = get_enriched_GO_terms(
    all_genes, cluster_genes, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
)
filtered_go_terms = get_filtered_go_terms(
    obodag, objanno, goea_quiet_enriched, sim_thr=0.3, info_thr=1.0
)
# Term name -> first element of its ratio_in_study.
go_term_dict = {
    go_term.name: go_term.ratio_in_study[0] for go_term in filtered_go_terms
}
# ttl_terms = np.sum(list(go_term_dict.values()))
# go_term_dict = {key:val/ttl_terms for key,val in go_term_dict.items()}

# Print the exemplar terms, then the cluster's genes five per line.
print()
for key, value in go_term_dict.items():
    print(key, " : ", value)
print()
for i in range(0, len(cluster_genes), 5):
    print(cluster_genes[i : i + 5])

In [None]:
# UMAP coloured by each mean z-score, with the colormap centred on 0.
labels = [zscore_trace + ": Mean" for zscore_trace in zscore_traces]

fig = sc.pl.umap(
    feature_paga_df,
    color=labels,
    show=False,
    legend_loc="on data",
    add_outline=False,
    size=50,
    return_fig=True,
    vcenter=0.0,
    cmap="RdBu_r",
    wspace=0.25,
)

In [None]:
# Per-cluster soft-DTW barycenters of the normalized trace, y-range (0, 1).
# The same feature is passed twice, presumably because the plotting helper
# needs at least two panel rows -- TODO confirm.
fig = plot_cluster_timeseries_braycenters(
    feature_paga_df.obs,
    "leiden",
    ["Division Length Feature Norm", "Division Length Feature Norm"],
    ["Division Length Feature Norm", "Division Length Feature Norm"],
    [(0, 1), (0, 1)],
    figsize=(15, 6),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)

### Filter for strong effects by taking max over integrated zscores

In [None]:
# Keep rows whose "Integrated Feature Max" exceeds the threshold.
# NOTE(review): this column is not created anywhere in the code shown here;
# presumably it comes with the loaded table -- confirm.
min_feature_thr = 20

gene_cluster_df_filtered = gene_cluster_df_full[
    gene_cluster_df_full["Integrated Feature Max"] > min_feature_thr
]

In [None]:
# Distribution of "Integrated Feature Max" over the unfiltered table.
plt.hist(gene_cluster_df_full["Integrated Feature Max"], bins=50)
plt.show()

### soft-DTW Calculation

In [None]:
# Stack the per-row feature vectors and swap axes 1 and 2 before the
# soft-DTW computation (presumably to put time before features -- TODO confirm).
X = np.array(gene_cluster_df_filtered["Feature Vector"].tolist())
X = np.swapaxes(X, 1, 2)
norm_soft_dtw_arr = parallel_norm_soft_dtw(X)

### Initialize Anndata Object

In [None]:
# X: each timeseries array flattened to one row; obs: the filtered table.
an_df = anndata.AnnData(
    X=X.reshape(X.shape[0], -1), obs=gene_cluster_df_filtered
)  # AnnData container to use scanpy functions with unwrapped time vector

### Compute KNN Graph

In [None]:
n_neighbors = 15
n_pcs = 20  # This shouldn't affect anything

# Run scanpy's standard neighbors step first; the graph it stores in
# uns["neighbors"] is overwritten below with one built from the precomputed
# soft-DTW matrix (presumably this call is only needed to create the
# uns["neighbors"] entry -- TODO confirm).
sc.pp.neighbors(an_df, n_neighbors=n_neighbors, n_pcs=n_pcs)
# NOTE(review): compute_neighbors_umap and _compute_connectivities_umap are
# scanpy internals (the latter is underscore-private) and may not exist in
# other scanpy versions -- confirm against the pinned version.
knn_indices, knn_dists, forest = sc.neighbors.compute_neighbors_umap(
    norm_soft_dtw_arr, n_neighbors=n_neighbors, metric="precomputed"
)
(
    an_df.uns["neighbors"]["distances"],
    an_df.uns["neighbors"]["connectivities"],
) = sc.neighbors._compute_connectivities_umap(
    knn_indices,
    knn_dists,
    an_df.shape[0],
    n_neighbors,  # change to neighbors you plan to use
)
# Mirror the graph into obsp and keep the full soft-DTW matrix alongside it.
an_df.obsp["distances"] = an_df.uns["neighbors"]["distances"]
an_df.obsp["connectivities"] = an_df.uns["neighbors"]["connectivities"]
an_df.obsp["soft_dtw"] = norm_soft_dtw_arr

### Computing Leiden, PAGA and UMAP

Note that the lower resolution UMAP was set to the same UMAP positions as the higher resolution UMAP

In [None]:
# Run Leiden + PAGA on an independent deep copy of the AnnData object at
# three resolutions, keyed by resolution.
paga_df_dict = {}
for resolution in [0.25, 1.0, 1.5]:
    paga_df_dict[resolution] = copy.deepcopy(an_df)
    sc.tl.leiden(paga_df_dict[resolution], resolution=resolution, n_iterations=-1)
    sc.tl.paga(paga_df_dict[resolution], groups="leiden")
    # add_pos=True stores the PAGA layout, which init_pos="paga" uses below.
    sc.pl.paga(paga_df_dict[resolution], add_pos=True, show=False)
# The UMAP is computed only for the resolution-1.0 object; the other two
# clusterings are attached to it as extra obs columns.
sc.tl.umap(paga_df_dict[1.0], init_pos="paga", min_dist=0.25, spread=5.0)
paga_df_dict[1.0].obs["leiden_lowres"] = paga_df_dict[0.25].obs["leiden"]
paga_df_dict[1.0].obs["leiden_highres"] = paga_df_dict[1.5].obs["leiden"]
paga_df = paga_df_dict[1.0]

In [None]:
# One UMAP panel per Leiden resolution, with cluster labels drawn on the data
# and graph edges shown.
fig = sc.pl.umap(
    paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres"],
    title=["Leiden Resolution=0.25", "Leiden Resolution=1.", "Leiden Resolution=1.5"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)
# fig.savefig("./1_Global_Analysis/Global_PAGA.png",dpi=150)

### Plotting Mean Z-scores, Euclidean Norm, and N Match

In [None]:
# Matches = 20 minus mismatches.
# NOTE(review): 20.0 is presumably the full target length -- confirm.
paga_df.obs["N Match"] = 20.0 - paga_df.obs["N Mismatch"]
# Within each TargetID, express N Match relative to that group's minimum.
del_N_match_series = paga_df.obs.groupby("TargetID").apply(
    lambda x: x["N Match"] - np.min(x["N Match"])
)
# Drop the TargetID level so the values align with obs' own index on assignment.
del_N_match_series = del_N_match_series.droplevel("TargetID")
paga_df.obs["Delta N Match"] = del_N_match_series

In [None]:
# UMAP coloured by each mean z-score, with the colormap centred on 0.
labels = [zscore_trace + ": Mean" for zscore_trace in zscore_traces]

fig = sc.pl.umap(
    paga_df,
    color=labels,
    show=False,
    legend_loc="on data",
    add_outline=False,
    size=50,
    return_fig=True,
    vcenter=0.0,
    cmap="RdBu_r",
    wspace=0.25,
)
# The ./1_Global_Analysis directory must already exist for this save to work.
fig.savefig("./1_Global_Analysis/Mean_zscores.png", dpi=300)

In [None]:
# UMAP coloured by the mean of each raw (non z-score) trace.
labels = [trace + ": Mean" for trace in traces]

fig = sc.pl.umap(
    paga_df,
    color=labels,
    show=False,
    legend_loc="on data",
    add_outline=False,
    size=50,
    return_fig=True,
    cmap="RdBu_r",
)
# The ./1_Global_Analysis directory must already exist for this save to work.
fig.savefig("./1_Global_Analysis/Mean.png", dpi=300)

In [None]:
# UMAP coloured by "Delta N Match" and "Integrated Euclidean Norm", with the
# colormap centred on 0.
labels = ["Delta N Match", "Integrated Euclidean Norm"]

fig = sc.pl.umap(
    paga_df,
    color=labels,
    show=False,
    legend_loc="on data",
    add_outline=False,
    size=50,
    return_fig=True,
    vcenter=0.0,
    cmap="RdBu_r",
)
# The ./1_Global_Analysis directory must already exist for this save to work.
fig.savefig("./1_Global_Analysis/Match_and_Euc_Norm.png", dpi=300)

### Highlight Genes of Interest (by name)

In [None]:
def selection_fn(item, gene_name):
    """Return ``item["TargetID"]`` if ``item["Gene"]`` equals ``gene_name``, else 0."""
    return item["TargetID"] if item["Gene"] == gene_name else 0


def highlight_gene_group(an_df, selection_list):
    """Plot one UMAP panel per selected gene plus a combined "All Genes" panel.

    Works on a deep copy of *an_df*, so the caller's object is not modified.

    Parameters
    ----------
    an_df
        Annotated data object with a "Gene" column in ``.obs`` that
        ``sc.pl.umap`` can plot.
    selection_list
        Gene names to highlight. Names absent from ``an_df.obs["Gene"]`` are
        dropped; the remaining names are plotted in sorted order.

    Returns
    -------
    The figure returned by ``sc.pl.umap(..., return_fig=True)``.
    """
    highlight_genes_df = copy.deepcopy(an_df)
    gene_column = highlight_genes_df.obs["Gene"]

    # Keep only requested genes that actually occur in the data.
    selection_list = sorted(set(gene_column.unique().tolist()) & set(selection_list))

    def _as_highlight(mask):
        # set_categories (rather than reorder_categories) also works when the
        # mask is all-True or all-False, i.e. when one of the two categories
        # is missing after the astype("category") conversion.
        return mask.astype("category").cat.set_categories([True, False])

    color_keys = []
    for i, selected_gene in enumerate(selection_list):
        key = "Selected Genes: " + str(i)
        highlight_genes_df.obs[key] = _as_highlight(gene_column == selected_gene)
        color_keys.append(key)

    highlight_genes_df.obs["All Genes"] = _as_highlight(
        gene_column.isin(selection_list)
    )
    color_keys.append("All Genes")

    fig = sc.pl.umap(
        highlight_genes_df,
        title=selection_list + ["All Genes"],
        color=color_keys,
        groups=[True],
        show=False,
        legend_loc="right margin",
        add_outline=False,
        size=50,
        return_fig=True,
        palette={True: "red", False: "lightgrey"},
    )

    return fig

In [None]:
# Highlight every gene whose name contains the substring "fts".
fts_genes = (
    paga_df.obs["Gene"][paga_df.obs["Gene"].apply(lambda x: "fts" in x)]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, fts_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/fts_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains the substring "rps".
rps_genes = (
    paga_df.obs["Gene"][paga_df.obs["Gene"].apply(lambda x: "rps" in x)]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, rps_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/rps_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains the substring "rpm".
rpm_genes = (
    paga_df.obs["Gene"][paga_df.obs["Gene"].apply(lambda x: "rpm" in x)]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, rpm_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/rpm_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains the substring "rpl".
rpl_genes = (
    paga_df.obs["Gene"][paga_df.obs["Gene"].apply(lambda x: "rpl" in x)]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, rpl_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/rpl_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains "sec", "bam", "yidC" or "yajC".
sec_and_bam_genes = (
    paga_df.obs["Gene"][
        paga_df.obs["Gene"].apply(
            lambda x: ("sec" in x) or ("bam" in x) or ("yidC" in x) or ("yajC" in x)
        )
    ]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, sec_and_bam_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/sec_and_bam_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains "rrf", "rrs" or "rrl".
rr_genes = (
    paga_df.obs["Gene"][
        paga_df.obs["Gene"].apply(
            lambda x: ("rrf" in x) or ("rrs" in x) or ("rrl" in x)
        )
    ]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, rr_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/rr_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains the substring "dna".
dna_genes = (
    paga_df.obs["Gene"][paga_df.obs["Gene"].apply(lambda x: ("dna" in x))]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, dna_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/dna_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains the substring "gyr".
gyr_genes = (
    paga_df.obs["Gene"][paga_df.obs["Gene"].apply(lambda x: ("gyr" in x))]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, gyr_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/gyr_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains the substring "hol".
hol_genes = (
    paga_df.obs["Gene"][paga_df.obs["Gene"].apply(lambda x: ("hol" in x))]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, hol_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/hol_genes.png", dpi=150)

In [None]:
# Highlight every gene whose name contains the substring "rpo".
rpo_genes = (
    paga_df.obs["Gene"][paga_df.obs["Gene"].apply(lambda x: ("rpo" in x))]
    .unique()
    .tolist()
)
fig = highlight_gene_group(paga_df, rpo_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/rpo_genes.png", dpi=150)

In [None]:
# Plot every gene in the dataset, in batches of `step` genes per figure.
all_genes = paga_df.obs["Gene"].unique().tolist()
step = 75
for idx, i in enumerate(range(0, len(all_genes), step)):
    all_genes_sub = all_genes[i : i + step]
    fig = highlight_gene_group(paga_df, all_genes_sub)
    fig.savefig(
        "./1_Global_Analysis/Highlight_Genes/All_Genes/all_genes_" + str(idx) + ".png",
        dpi=75,
    )
    # Each figure is only needed on disk; close it so pyplot does not keep
    # one large multi-panel figure alive per batch.
    plt.close(fig)

### Highlight Genes of Interest (by GO)

In [None]:
import goatools
import goatools.base
from goatools.base import download_go_basic_obo

from goatools.obo_parser import GODag
from goatools.anno.gaf_reader import GafReader
from goatools.semantic import semantic_similarity
from goatools.semantic import TermCounts, get_info_content

from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.go_enrichment import GOEnrichmentStudy


def search_go(ns2assoc, obodag, inv_gene_to_id, go_term):
    """Return the sorted, de-duplicated names of genes annotated with a GO term.

    A gene matches when its annotation set contains *go_term* itself or any
    term returned by ``obodag[go_term].get_all_children()``.

    Parameters
    ----------
    ns2assoc
        Mapping ``{"BP" | "MF" | "CC": {gene_id: iterable of GO ids}}``.
    obodag
        Mapping from GO id to a term object exposing ``name``, ``namespace``
        and ``get_all_children()``.
    inv_gene_to_id
        Mapping from gene id to the gene name to report.
    go_term
        GO id to search for.

    Returns
    -------
    list
        Sorted gene names. The searched term's name is printed as a side effect.
    """
    namespace_abbv = {
        "biological_process": "BP",
        "molecular_function": "MF",
        "cellular_component": "CC",
    }

    term = obodag[go_term]
    print("Searching for " + str(term.name))
    namespace = namespace_abbv[term.namespace]

    # Collect the term and all of its descendants once, then make a single
    # pass over the associations instead of one pass per descendant.
    target_terms = {go_term} | set(term.get_all_children())
    return sorted(
        {
            inv_gene_to_id[gene_id]
            for gene_id, annotated_terms in ns2assoc[namespace].items()
            if not target_terms.isdisjoint(annotated_terms)
        }
    )

In [None]:
# Get ontologies
# Get ontologies
obo_fname = download_go_basic_obo()

# Get ecoli association file (ecocyc)
gaf_handle = goatools.base.http_get(
    "http://current.geneontology.org/annotations/ecocyc.gaf.gz", fout="./ecocyc.gaf.gz"
)
gaf_fname = goatools.base.gunzip("./ecocyc.gaf.gz")

## Load the ontology DAG and the annotations, then build lookup tables

obodag = GODag(obo_fname)
objanno = GafReader(gaf_fname)
ns2assoc = objanno.get_ns2assc()

# gene_to_id: symbol (and synonym) -> DB_ID; inv_gene_to_id: DB_ID -> symbol.
gene_to_id = {assoc.DB_Symbol: assoc.DB_ID for assoc in objanno.associations}
inv_gene_to_id = {assoc.DB_ID: assoc.DB_Symbol for assoc in objanno.associations}
synonym_dict = {
    synonym: assoc.DB_ID
    for assoc in objanno.associations
    for synonym in assoc.DB_Synonym
}
# NOTE(review): update() lets a synonym overwrite an identical primary symbol
# already present in gene_to_id -- confirm that precedence is intended.
gene_to_id.update(synonym_dict)

In [None]:
# Genes found by the GO search for GO:0003746. The list is named after the
# output file (Elongation_factor_genes.png) rather than reusing the
# tRNA-aminoacylation name, which belongs to the GO:0043039 search.
elongation_factor_genes = search_go(ns2assoc, obodag, inv_gene_to_id, "GO:0003746")
fig = highlight_gene_group(paga_df, elongation_factor_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/Elongation_factor_genes.png", dpi=150)

In [None]:
# Highlight the genes returned by the GO search for GO:0043039 (the term
# itself plus its descendants, as collected by search_go).
tRNA_aminoacylation_genes = search_go(ns2assoc, obodag, inv_gene_to_id, "GO:0043039")
fig = highlight_gene_group(paga_df, tRNA_aminoacylation_genes)
fig.savefig(
    "./1_Global_Analysis/Highlight_Genes/tRNA_aminoacylation_genes.png", dpi=150
)

In [None]:
# Highlight the genes returned by the GO search for GO:0051301.
division_genes = search_go(ns2assoc, obodag, inv_gene_to_id, "GO:0051301")
fig = highlight_gene_group(paga_df, division_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/division_genes.png", dpi=150)

In [None]:
# Highlight the genes returned by the GO search for GO:0005840.
ribosome_genes = search_go(ns2assoc, obodag, inv_gene_to_id, "GO:0005840")
fig = highlight_gene_group(paga_df, ribosome_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/ribosome_genes.png", dpi=150)

In [None]:
# Highlight the genes returned by the GO search for GO:0000270.
peptidoglycan_genes = search_go(ns2assoc, obodag, inv_gene_to_id, "GO:0000270")
fig = highlight_gene_group(paga_df, peptidoglycan_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/peptidoglycan_genes.png", dpi=150)

In [None]:
# Highlight the genes returned by the GO search for GO:0006260.
replication_genes = search_go(ns2assoc, obodag, inv_gene_to_id, "GO:0006260")
fig = highlight_gene_group(paga_df, replication_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/replication_genes.png", dpi=150)

In [None]:
# Highlight the genes returned by the GO search for GO:0006270.
initiation_genes = search_go(ns2assoc, obodag, inv_gene_to_id, "GO:0006270")
fig = highlight_gene_group(paga_df, initiation_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/initiation_genes.png", dpi=150)

In [None]:
# Highlight the genes returned by the GO search for GO:0008360.
shape_genes = search_go(ns2assoc, obodag, inv_gene_to_id, "GO:0008360")
fig = highlight_gene_group(paga_df, shape_genes)
fig.savefig("./1_Global_Analysis/Highlight_Genes/shape_genes.png", dpi=150)

## 2) Cluster Analysis

### Average Timeseries

In [None]:
def plot_cluster_timeseries(
    df,
    cluster_label,
    feature_labels,
    displayed_labels,
    feature_range_list,
    agg_fn=np.mean,
    x_ticks=(0, 10, 20),
    cluster_subset=None,
    figsize=(10, 10),
    wspace=0.0,
    hspace=0.0,
    fontsize=14,
    linewidth=5,
    color_list=None,
):
    """Plot the aggregated timeseries of each feature for each cluster.

    The figure has one column per cluster and, inside each column, one row
    per feature. Only the first column carries y-axis labels and ticks.

    Parameters
    ----------
    df
        DataFrame whose *feature_labels* columns hold one sequence per row
        (assumes all sequences of a feature have equal length -- TODO confirm).
    cluster_label
        Column of *df* holding the cluster assignment.
    feature_labels
        Columns to aggregate and plot, one subplot row each.
    displayed_labels
        Y-axis label for each feature, same order as *feature_labels*.
    feature_range_list
        ``(ymin, ymax)`` for each feature, same order as *feature_labels*.
    agg_fn
        Called as ``agg_fn(array, axis=0)`` on the stacked sequences of a cluster.
    x_ticks
        Tick positions for the bottom axis of every column.
    cluster_subset
        If given, only rows whose cluster is in this collection are used.
    figsize, wspace, hspace, fontsize, linewidth
        Layout and styling options passed to matplotlib.
    color_list
        Colours indexed by column position; cycled if there are more
        clusters than colours. ``None`` draws every line in "tab:blue".

    Returns
    -------
    The matplotlib figure.
    """
    if cluster_subset is not None:
        # Boolean indexing already yields a new frame; no explicit copy needed.
        df = df[df[cluster_label].isin(cluster_subset)]

    # One single-column frame per feature, indexed by cluster, each cell
    # holding that cluster's aggregated sequence.
    timeseries_df = pd.concat(
        [
            df.groupby([cluster_label])
            .apply(
                lambda x, label=feature_label: agg_fn(
                    np.array(x[label].tolist()), axis=0
                )
            )
            .to_frame(name=feature_label)
            for feature_label in feature_labels
        ],
        axis=1,
    )

    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, len(timeseries_df), wspace=wspace)

    for i in range(len(timeseries_df)):
        cluster_id = timeseries_df.index[i]
        clust_arr = np.array(timeseries_df.iloc[i].tolist())
        if color_list is None:
            color = "tab:blue"
        else:
            # Cycle so that more clusters than colours does not raise.
            color = color_list[i % len(color_list)]

        inner_gs = gs[0, i].subgridspec(clust_arr.shape[0], 1, wspace=0, hspace=hspace)
        # squeeze=False keeps a 2-D array even for a single feature row.
        axes = inner_gs.subplots(sharex=True, squeeze=False)[:, 0]
        for row, ax in enumerate(axes):
            feature_range = feature_range_list[row]
            ax.plot(clust_arr[row], linewidth=linewidth, color=color)
            ax.set_ylim(feature_range[0], feature_range[1])
            if i == 0:
                # First column keeps its y ticks and carries the feature labels.
                ax.set(xticks=[])
                ax.set_ylabel(
                    displayed_labels[row],
                    rotation=0,
                    labelpad=30,
                    fontsize=fontsize,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # `ax` is the bottom axis of this column: label it with the actual
        # cluster id so the label stays correct when cluster_subset is used.
        ax.set_xlabel(str(cluster_id), fontsize=fontsize)
        ax.set(xticks=list(x_ticks))

    fig.tight_layout()
    return fig

In [None]:
# Re-draw the UMAP coloured by the three Leiden label columns as a visual
# reference for the cluster ids used in the timeseries plots below.
fig = sc.pl.umap(
    paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres"],
    title=["Leiden Resolution=0.25", "Leiden Resolution=1.", "Leiden Resolution=1.5"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)

In [None]:
# Mean timeseries of the six raw features for every "leiden_highres" cluster.
# The two identical lists are the obs column names and their display labels;
# the tuple list gives the y-limits of each feature in the same order.
fig = plot_cluster_timeseries(
    paga_df.obs,
    "leiden_highres",
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [(3, 12), (0, 20), (0.5, 2.5), (1.2, 1.6), (0, 6000), (0, 20)],
    figsize=(25, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
# plot_cluster_timeseries already calls tight_layout; this repeats it.
fig.tight_layout()
fig.savefig("./2_Cluster_Analysis/leiden_highres_timeseries.png", dpi=150)

In [None]:
# Mean timeseries of the six raw features for every "leiden" cluster.
# The two identical lists are the obs column names and their display labels;
# the tuple list gives the y-limits of each feature in the same order.
fig = plot_cluster_timeseries(
    paga_df.obs,
    "leiden",
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [(3, 12), (0, 20), (0.5, 2.5), (1.2, 1.6), (0, 6000), (0, 20)],
    figsize=(20, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
# plot_cluster_timeseries already calls tight_layout; this repeats it.
fig.tight_layout()
fig.savefig("./2_Cluster_Analysis/leiden_timeseries.png", dpi=150)

In [None]:
# Mean timeseries of the six raw features for every "leiden_lowres" cluster.
# The two identical lists are the obs column names and their display labels;
# the tuple list gives the y-limits of each feature in the same order.
fig = plot_cluster_timeseries(
    paga_df.obs,
    "leiden_lowres",
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [(3, 12), (0, 20), (0.5, 2.5), (1.2, 1.6), (0, 6000), (0, 20)],
    figsize=(7, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
# plot_cluster_timeseries already calls tight_layout; this repeats it.
fig.tight_layout()
fig.savefig("./2_Cluster_Analysis/leiden_lowres_timeseries.png", dpi=150)

### Barycenter Timeseries

In [None]:
from tslearn.barycenters import (
    softdtw_barycenter,
    dtw_barycenter_averaging,
    euclidean_barycenter,
)

In [None]:
def get_braycenters(df, columns=None, max_iter=50, tol=0.001):
    """Compute the soft-DTW barycenter of the multivariate timeseries in *df*.

    Parameters
    ----------
    df
        DataFrame with one timeseries per row; each selected column holds one
        feature's sequence (assumes equal-length sequences -- TODO confirm).
    columns
        Columns to use as features. ``None`` uses every column of *df*
        (previously ``None`` raised a KeyError on ``df[None]``).
    max_iter, tol
        Passed through to ``softdtw_barycenter``.

    Returns
    -------
    The array returned by ``softdtw_barycenter`` (time x feature).
    """
    if columns is not None:
        df = df[columns]
    # Stack to (row, feature, time), then swap to (row, time, feature) as
    # passed to softdtw_barycenter.
    X = np.array(df.apply(lambda x: x.tolist(), axis=1).tolist())
    X = np.swapaxes(X, 1, 2)

    return softdtw_barycenter(X, max_iter=max_iter, tol=tol)  # T X D


def plot_cluster_timeseries_braycenters(
    df,
    cluster_label,
    feature_labels,
    displayed_labels,
    feature_range_list,
    x_ticks=(0, 10, 20),
    cluster_subset=None,
    figsize=(10, 10),
    wspace=0.0,
    hspace=0.0,
    fontsize=14,
    linewidth=3,
    color_list=None,
    max_iter=50,
    tol=0.001,
):
    """Plot the soft-DTW barycenter of each feature for each cluster.

    The figure has one column per cluster and, inside each column, one row
    per feature. Only the first column carries y-axis labels and ticks.

    Parameters
    ----------
    df
        DataFrame whose *feature_labels* columns hold one sequence per row.
    cluster_label
        Column of *df* holding the cluster assignment.
    feature_labels
        Columns fed jointly to ``get_braycenters``; one subplot row each.
    displayed_labels
        Y-axis label for each feature, same order as *feature_labels*.
    feature_range_list
        ``(ymin, ymax)`` for each feature, same order as *feature_labels*.
    x_ticks
        Tick positions for the bottom axis of every column.
    cluster_subset
        If given, only rows whose cluster is in this collection are used.
    figsize, wspace, hspace, fontsize, linewidth
        Layout and styling options passed to matplotlib.
    color_list
        Colours indexed by column position; cycled if there are more
        clusters than colours. ``None`` draws every line in "tab:blue".
    max_iter, tol
        Forwarded to ``get_braycenters``; the defaults match its own defaults.

    Returns
    -------
    The matplotlib figure.
    """
    if cluster_subset is not None:
        # Boolean indexing already yields a new frame; no explicit copy needed.
        df = df[df[cluster_label].isin(cluster_subset)]

    # One barycenter array per cluster, indexed [:, k] by feature below.
    agg_cluster_timeseries = df.groupby([cluster_label]).apply(
        lambda x: get_braycenters(
            x, columns=feature_labels, max_iter=max_iter, tol=tol
        )
    )

    # Split the per-cluster barycenter into one column per feature.
    timeseries_df = pd.concat(
        [
            agg_cluster_timeseries.apply(lambda x, k=k: x[:, k]).to_frame(
                name=feature_label
            )
            for k, feature_label in enumerate(feature_labels)
        ],
        axis=1,
    )

    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, len(timeseries_df), wspace=wspace)

    for i in range(len(timeseries_df)):
        cluster_id = timeseries_df.index[i]
        clust_arr = np.array(timeseries_df.iloc[i].tolist())
        if color_list is None:
            color = "tab:blue"
        else:
            # Cycle so that more clusters than colours does not raise.
            color = color_list[i % len(color_list)]

        inner_gs = gs[0, i].subgridspec(clust_arr.shape[0], 1, wspace=0, hspace=hspace)
        # squeeze=False keeps a 2-D array even for a single feature row.
        axes = inner_gs.subplots(sharex=True, squeeze=False)[:, 0]
        for row, ax in enumerate(axes):
            feature_range = feature_range_list[row]
            ax.plot(clust_arr[row], linewidth=linewidth, color=color)
            ax.set_ylim(feature_range[0], feature_range[1])
            if i == 0:
                # First column keeps its y ticks and carries the feature labels.
                ax.set(xticks=[])
                ax.set_ylabel(
                    displayed_labels[row],
                    rotation=0,
                    labelpad=30,
                    fontsize=fontsize,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # `ax` is the bottom axis of this column: label it with the actual
        # cluster id so the label stays correct when cluster_subset is used.
        ax.set_xlabel(str(cluster_id), fontsize=fontsize)
        ax.set(xticks=list(x_ticks))

    fig.tight_layout()
    return fig

In [None]:
# Soft-DTW barycenters of the six z-score features for every
# "leiden_highres" cluster; every feature shares the y-range (-4, 4).
fig = plot_cluster_timeseries_braycenters(
    paga_df.obs,
    "leiden_highres",
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [(-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4)],
    figsize=(25, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
fig.savefig("./2_Cluster_Analysis/leiden_highres_barycenters.png", dpi=150)

In [None]:
# Soft-DTW barycenters of the six z-score features for every "leiden"
# cluster; every feature shares the y-range (-4, 4).
fig = plot_cluster_timeseries_braycenters(
    paga_df.obs,
    "leiden",
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [(-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4)],
    figsize=(20, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
fig.savefig("./2_Cluster_Analysis/leiden_barycenters.png", dpi=150)

In [None]:
# Soft-DTW barycenters of the six z-score features for every
# "leiden_lowres" cluster; every feature shares the y-range (-4, 4).
fig = plot_cluster_timeseries_braycenters(
    paga_df.obs,
    "leiden_lowres",
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [(-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4)],
    figsize=(7, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
fig.savefig("./2_Cluster_Analysis/leiden_lowres_barycenters.png", dpi=150)

### GO Term Enrichment

In [None]:
import goatools
import goatools.base
from goatools.base import download_go_basic_obo

from goatools.obo_parser import GODag
from goatools.anno.gaf_reader import GafReader
from goatools.semantic import semantic_similarity
from goatools.semantic import TermCounts, get_info_content

from goatools.goea.go_enrichment_ns import GOEnrichmentStudyNS
from goatools.go_enrichment import GOEnrichmentStudy


def search_go(ns2assoc, obodag, inv_gene_to_id, go_term):
    """Return the sorted, de-duplicated names of genes annotated with a GO term.

    A gene matches when its annotation set contains *go_term* itself or any
    term returned by ``obodag[go_term].get_all_children()``.

    Parameters
    ----------
    ns2assoc
        Mapping ``{"BP" | "MF" | "CC": {gene_id: iterable of GO ids}}``.
    obodag
        Mapping from GO id to a term object exposing ``name``, ``namespace``
        and ``get_all_children()``.
    inv_gene_to_id
        Mapping from gene id to the gene name to report.
    go_term
        GO id to search for.

    Returns
    -------
    list
        Sorted gene names. The searched term's name is printed as a side effect.
    """
    namespace_abbv = {
        "biological_process": "BP",
        "molecular_function": "MF",
        "cellular_component": "CC",
    }

    term = obodag[go_term]
    print("Searching for " + str(term.name))
    namespace = namespace_abbv[term.namespace]

    # Collect the term and all of its descendants once, then make a single
    # pass over the associations instead of one pass per descendant.
    target_terms = {go_term} | set(term.get_all_children())
    return sorted(
        {
            inv_gene_to_id[gene_id]
            for gene_id, annotated_terms in ns2assoc[namespace].items()
            if not target_terms.isdisjoint(annotated_terms)
        }
    )


def get_enriched_GO_terms(
    background_gene_list, gene_list, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
):
    """Return the GO terms significantly enriched in *gene_list*.

    Gene names are translated to annotation ids through the symbols and
    synonyms found in ``objanno.associations``; names without a match are
    silently dropped. A ``GOEnrichmentStudy`` with *background_gene_list* as
    the population is then run on *gene_list* (Benjamini-Hochberg FDR), and
    only records with ``p_fdr_bh < pval`` and ``enrichment == "e"`` are kept.

    The number of mapped background ids and mapped study ids is printed.
    """
    # Primary symbols first, synonyms second, so a synonym wins on collision.
    # NOTE(review): confirm that synonyms should take precedence over an
    # identical primary symbol.
    name_to_id = {}
    for assoc in objanno.associations:
        name_to_id[assoc.DB_Symbol] = assoc.DB_ID
    for assoc in objanno.associations:
        for synonym in assoc.DB_Synonym:
            name_to_id[synonym] = assoc.DB_ID

    def _to_ids(names):
        # Translate names to ids, skipping names with no known id.
        return [name_to_id[name] for name in names if name in name_to_id]

    population_ids = _to_ids(background_gene_list)
    study_ids = _to_ids(gene_list)

    print(len(population_ids))
    print(len(study_ids))

    study = GOEnrichmentStudy(
        population_ids,  # background population (ids of the background genes)
        ns2assoc[GO_type],  # id -> GO term associations for this namespace
        obodag,  # ontology DAG
        propagate_counts=True,
        alpha=pval,  # significance cut-off
        methods=["fdr_bh"],  # multiple-testing correction method
    )

    results = study.run_study(study_ids, prt=None)
    return [
        record
        for record in results
        if record.p_fdr_bh < pval and record.enrichment == "e"
    ]


def pick_exemplar(go1, go2, termcounts, obodag, info_thr, pval_factor=2.0):
    """Choose which of two enrichment records should represent their group.

    Rules, applied in order:

    1. If exactly one record's information content is below *info_thr*, the
       other record wins; if both are below it, *go1* wins.
    2. If one record's ``p_fdr_bh`` is more than *pval_factor* times smaller
       than the other's, the record with the smaller p-value wins.
    3. If ``go2.GO`` is among the ancestors of ``go1.GO`` (per
       ``get_all_parents``), *go2* wins; otherwise *go1* wins.

    Parameters
    ----------
    go1, go2
        Records exposing ``GO`` and ``p_fdr_bh``.
    termcounts
        Passed to ``get_info_content``.
    obodag
        Mapping from GO id to a term exposing ``get_all_parents()``.
    info_thr
        Information-content threshold for rule 1.
    pval_factor
        Ratio of p-values regarded as decisive in rule 2.
    """
    info_1_low = get_info_content(go1.GO, termcounts) < info_thr
    info_2_low = get_info_content(go2.GO, termcounts) < info_thr
    if info_1_low and not info_2_low:
        return go2
    elif info_2_low and not info_1_low:
        return go1
    elif info_2_low and info_1_low:
        return go1

    # Same comparisons as go1.p / go2.p > factor and < 1 / factor, written
    # without the division so a p-value of exactly 0 cannot divide by zero.
    if go1.p_fdr_bh > pval_factor * go2.p_fdr_bh:
        return go2
    elif go1.p_fdr_bh * pval_factor < go2.p_fdr_bh:
        return go1

    # Tie on p-value: take go2 only when it is an ancestor of go1. Every
    # other case (including go1 being an ancestor of go2) resolves to go1.
    if go2.GO in obodag[go1.GO].get_all_parents():
        return go2

    return go1


def get_filtered_go_terms(
    obodag, objanno, goea_list, sim_thr=0.05, info_thr=1.0, GO_type="BP"
):
    """Group semantically similar enriched GO terms and return one exemplar per group.

    Pairwise ``semantic_similarity`` is computed between all terms of
    *goea_list*. Terms are then visited in order; a term whose most similar
    partner scores above *sim_thr* either joins the partner's existing group
    or founds a new group with it. Each group's exemplar is maintained with
    ``pick_exemplar``.

    NOTE(review): a term whose best similarity is not above *sim_thr* joins
    no group and is therefore absent from the result -- confirm intended.

    Returns
    -------
    list
        The exemplar record of every group, in group-creation order.
    """
    termcounts = TermCounts(obodag, objanno.get_ns2assc()[GO_type])

    go_ids = [record.GO for record in goea_list]
    n_terms = len(go_ids)

    # Full pairwise similarity matrix; self-similarity is zeroed so a term is
    # never picked as its own nearest neighbour through the diagonal.
    sim_arr = np.zeros((n_terms, n_terms))
    for row, go_a in enumerate(go_ids):
        for col, go_b in enumerate(go_ids):
            sim_arr[row, col] = semantic_similarity(go_a, go_b, obodag)
    np.fill_diagonal(sim_arr, 0.0)

    grouped_terms = {}
    group_exemplars = {}
    next_group = 0
    pending = list(range(n_terms))

    while pending:
        current = pending[0]
        nearest = np.argmax(sim_arr[current])
        if sim_arr[current, nearest] > sim_thr:
            # Groups that already contain the nearest neighbour (none exist
            # while grouped_terms is still empty).
            owners = [
                group
                for group, members in grouped_terms.items()
                if nearest in members
            ]
            if len(owners) == 1:
                owner = owners[0]
                grouped_terms[owner] = grouped_terms[owner] + [current]
                group_exemplars[owner] = pick_exemplar(
                    group_exemplars[owner],
                    goea_list[current],
                    termcounts,
                    obodag,
                    info_thr,
                )
            else:
                # Found a new group from the pair and retire the neighbour too.
                grouped_terms[next_group] = [current, nearest]
                group_exemplars[next_group] = pick_exemplar(
                    goea_list[current],
                    goea_list[nearest],
                    termcounts,
                    obodag,
                    info_thr,
                )
                next_group += 1
                pending.remove(nearest)
        pending.remove(current)

    return list(group_exemplars.values())


def get_GO_assign_dict(selected_goea, cluster_genes_uniprot):
    """Assign each study item to the deepest selected GO term that contains it.

    Terms are visited from the greatest ``depth`` to the smallest; within one
    depth they are visited in the order given. An item is assigned to the
    first visited term whose ``study_items`` contains it. Items contained in
    no term are mapped to ``"Unassigned"``.

    Parameters
    ----------
    selected_goea
        Records exposing ``depth``, ``name`` and ``study_items``.
    cluster_genes_uniprot
        Iterable of hashable item ids; it is not modified. Duplicate ids are
        treated as one item (previously a duplicate of an assigned id was
        overwritten with ``"Unassigned"``).

    Returns
    -------
    dict
        Mapping from item id to GO term name or ``"Unassigned"``.
    """
    # Ordered set of the ids still waiting for an assignment.
    pending = dict.fromkeys(cluster_genes_uniprot)
    assign_dict = {}
    for depth in sorted({item.depth for item in selected_goea}, reverse=True):
        for go_term in selected_goea:
            if go_term.depth != depth:
                continue
            for study_item in list(go_term.study_items):
                if study_item in pending:
                    assign_dict[study_item] = go_term.name
                    del pending[study_item]

    for remaining_item in pending:
        assign_dict[remaining_item] = "Unassigned"

    return assign_dict

In [None]:
# Get ontologies
# Get ontologies
obo_fname = download_go_basic_obo()

# Get ecoli association file (ecocyc)
gaf_handle = goatools.base.http_get(
    "http://current.geneontology.org/annotations/ecocyc.gaf.gz", fout="./ecocyc.gaf.gz"
)
gaf_fname = goatools.base.gunzip("./ecocyc.gaf.gz")

## Load the ontology DAG and the annotations, then build lookup tables

obodag = GODag(obo_fname)
objanno = GafReader(gaf_fname)
ns2assoc = objanno.get_ns2assc()

# gene_to_id: symbol (and synonym) -> DB_ID; inv_gene_to_id: DB_ID -> symbol.
gene_to_id = {assoc.DB_Symbol: assoc.DB_ID for assoc in objanno.associations}
inv_gene_to_id = {assoc.DB_ID: assoc.DB_Symbol for assoc in objanno.associations}
synonym_dict = {
    synonym: assoc.DB_ID
    for assoc in objanno.associations
    for synonym in assoc.DB_Synonym
}
# NOTE(review): update() lets a synonym overwrite an identical primary symbol
# already present in gene_to_id -- confirm that precedence is intended.
gene_to_id.update(synonym_dict)

In [None]:
# Re-draw the UMAP coloured by the three Leiden label columns as a visual
# reference for the cluster ids used in the enrichment cells below.
fig = sc.pl.umap(
    paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres"],
    title=["Leiden Resolution=0.25", "Leiden Resolution=1.", "Leiden Resolution=1.5"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)

#### Division Cluster

In [None]:
# GO (BP) enrichment of the genes in "leiden_highres" cluster 6, using every
# gene in the dataset as the background gene set.

all_genes = paga_df.obs["Gene"].unique().tolist()

clust_id = str(6)
cluster_genes = sorted(
    paga_df.obs[paga_df.obs["leiden_highres"] == clust_id]["Gene"].unique().tolist()
)

goea_quiet_enriched = get_enriched_GO_terms(
    all_genes, cluster_genes, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
)
# Collapse semantically similar terms to one exemplar per group.
filtered_go_terms = get_filtered_go_terms(
    obodag, objanno, goea_quiet_enriched, sim_thr=0.3, info_thr=1.0
)
# Term name -> first element of ratio_in_study.
go_term_dict = {
    go_term.name: go_term.ratio_in_study[0] for go_term in filtered_go_terms
}
# ttl_terms = np.sum(list(go_term_dict.values()))
# go_term_dict = {key:val/ttl_terms for key,val in go_term_dict.items()}

# Print the exemplar terms, then the cluster's genes five per line.
print()
for key, value in go_term_dict.items():
    print(key, " : ", value)
print()
for i in range(0, len(cluster_genes), 5):
    print(cluster_genes[i : i + 5])

#### Starvation Cluster

In [None]:
# GO (BP) enrichment of the genes in "leiden_lowres" cluster 1, using every
# gene in the dataset as the background gene set.

all_genes = paga_df.obs["Gene"].unique().tolist()

clust_id = str(1)
cluster_genes = sorted(
    paga_df.obs[paga_df.obs["leiden_lowres"] == clust_id]["Gene"].unique().tolist()
)

goea_quiet_enriched = get_enriched_GO_terms(
    all_genes, cluster_genes, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
)
# Collapse semantically similar terms to one exemplar per group.
filtered_go_terms = get_filtered_go_terms(
    obodag, objanno, goea_quiet_enriched, sim_thr=0.3, info_thr=1.0
)
# Term name -> first element of ratio_in_study.
go_term_dict = {
    go_term.name: go_term.ratio_in_study[0] for go_term in filtered_go_terms
}
# ttl_terms = np.sum(list(go_term_dict.values()))
# go_term_dict = {key:val/ttl_terms for key,val in go_term_dict.items()}

# Print the exemplar terms, then the cluster's genes five per line.
print()
for key, value in go_term_dict.items():
    print(key, " : ", value)
print()
for i in range(0, len(cluster_genes), 5):
    print(cluster_genes[i : i + 5])

#### Ribosome Clusters

In [None]:
# GO (BP) enrichment of the genes in "leiden" clusters 5 and 7 combined,
# using every gene in the dataset as the background gene set.

all_genes = paga_df.obs["Gene"].unique().tolist()

clust_ids = ["5", "7"]
cluster_genes = sorted(
    paga_df.obs[paga_df.obs["leiden"].isin(clust_ids)]["Gene"].unique().tolist()
)

goea_quiet_enriched = get_enriched_GO_terms(
    all_genes, cluster_genes, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
)
# Collapse semantically similar terms to one exemplar per group.
filtered_go_terms = get_filtered_go_terms(
    obodag, objanno, goea_quiet_enriched, sim_thr=0.3, info_thr=1.0
)
# Term name -> first element of ratio_in_study.
go_term_dict = {
    go_term.name: go_term.ratio_in_study[0] for go_term in filtered_go_terms
}
# ttl_terms = np.sum(list(go_term_dict.values()))
# go_term_dict = {key:val/ttl_terms for key,val in go_term_dict.items()}

# Print the exemplar terms, then the clusters' genes five per line.
print()
for key, value in go_term_dict.items():
    print(key, " : ", value)
print()
for i in range(0, len(cluster_genes), 5):
    print(cluster_genes[i : i + 5])

#### Replication Cluster

In [None]:
# GO (BP) enrichment of the genes in "leiden" cluster 13, using every gene
# in the dataset as the background gene set.

all_genes = paga_df.obs["Gene"].unique().tolist()

clust_id = str(13)
cluster_genes = sorted(
    paga_df.obs[paga_df.obs["leiden"] == clust_id]["Gene"].unique().tolist()
)

goea_quiet_enriched = get_enriched_GO_terms(
    all_genes, cluster_genes, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
)
# Collapse semantically similar terms to one exemplar per group.
filtered_go_terms = get_filtered_go_terms(
    obodag, objanno, goea_quiet_enriched, sim_thr=0.3, info_thr=1.0
)
# Term name -> first element of ratio_in_study.
go_term_dict = {
    go_term.name: go_term.ratio_in_study[0] for go_term in filtered_go_terms
}
# ttl_terms = np.sum(list(go_term_dict.values()))
# go_term_dict = {key:val/ttl_terms for key,val in go_term_dict.items()}

# Print the exemplar terms, then the cluster's genes five per line.
print()
for key, value in go_term_dict.items():
    print(key, " : ", value)
print()
for i in range(0, len(cluster_genes), 5):
    print(cluster_genes[i : i + 5])

#### Wide Cluster

In [None]:
# GO (BP) enrichment of the genes in "leiden" cluster 10, using every gene
# in the dataset as the background gene set.

all_genes = paga_df.obs["Gene"].unique().tolist()

clust_id = str(10)
cluster_genes = sorted(
    paga_df.obs[paga_df.obs["leiden"] == clust_id]["Gene"].unique().tolist()
)

goea_quiet_enriched = get_enriched_GO_terms(
    all_genes, cluster_genes, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
)
# Collapse semantically similar terms to one exemplar per group.
filtered_go_terms = get_filtered_go_terms(
    obodag, objanno, goea_quiet_enriched, sim_thr=0.3, info_thr=1.0
)
# Term name -> first element of ratio_in_study.
go_term_dict = {
    go_term.name: go_term.ratio_in_study[0] for go_term in filtered_go_terms
}
# ttl_terms = np.sum(list(go_term_dict.values()))
# go_term_dict = {key:val/ttl_terms for key,val in go_term_dict.items()}

# Print the exemplar terms, then the cluster's genes five per line.
print()
for key, value in go_term_dict.items():
    print(key, " : ", value)
print()
for i in range(0, len(cluster_genes), 5):
    print(cluster_genes[i : i + 5])

#### Very Wide Cluster

In [None]:
# GO (BP) enrichment of the genes in "leiden_highres" cluster 11, using
# every gene in the dataset as the background gene set.

all_genes = paga_df.obs["Gene"].unique().tolist()

clust_id = str(11)
cluster_genes = sorted(
    paga_df.obs[paga_df.obs["leiden_highres"] == clust_id]["Gene"].unique().tolist()
)

goea_quiet_enriched = get_enriched_GO_terms(
    all_genes, cluster_genes, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
)
# Collapse semantically similar terms to one exemplar per group.
filtered_go_terms = get_filtered_go_terms(
    obodag, objanno, goea_quiet_enriched, sim_thr=0.3, info_thr=1.0
)
# Term name -> first element of ratio_in_study.
go_term_dict = {
    go_term.name: go_term.ratio_in_study[0] for go_term in filtered_go_terms
}
# ttl_terms = np.sum(list(go_term_dict.values()))
# go_term_dict = {key:val/ttl_terms for key,val in go_term_dict.items()}

# Print the exemplar terms, then the cluster's genes five per line.
print()
for key, value in go_term_dict.items():
    print(key, " : ", value)
print()
for i in range(0, len(cluster_genes), 5):
    print(cluster_genes[i : i + 5])

#### Segmentation Error Cluster

In [None]:
# GO (BP) enrichment of the genes in "leiden" cluster 14, using every gene
# in the dataset as the background gene set.

all_genes = paga_df.obs["Gene"].unique().tolist()

clust_id = str(14)
cluster_genes = sorted(
    paga_df.obs[paga_df.obs["leiden"] == clust_id]["Gene"].unique().tolist()
)

goea_quiet_enriched = get_enriched_GO_terms(
    all_genes, cluster_genes, obodag, objanno, ns2assoc, pval=0.05, GO_type="BP"
)
# Collapse semantically similar terms to one exemplar per group.
filtered_go_terms = get_filtered_go_terms(
    obodag, objanno, goea_quiet_enriched, sim_thr=0.3, info_thr=1.0
)
# Term name -> first element of ratio_in_study.
go_term_dict = {
    go_term.name: go_term.ratio_in_study[0] for go_term in filtered_go_terms
}
# ttl_terms = np.sum(list(go_term_dict.values()))
# go_term_dict = {key:val/ttl_terms for key,val in go_term_dict.items()}

# Print the exemplar terms, then the cluster's genes five per line.
print()
for key, value in go_term_dict.items():
    print(key, " : ", value)
print()
for i in range(0, len(cluster_genes), 5):
    print(cluster_genes[i : i + 5])

## 3) Divergence Detection

In [None]:
# Work on a private copy of the pairwise "soft_dtw" matrix and zero out
# everything below the diagonal.
dist_mat = copy.deepcopy(paga_df.obsp["soft_dtw"])
dist_mat = np.triu(dist_mat)
# The mask is value-based: it selects strictly positive entries of the upper
# triangle, so zero, negative and NaN distances are excluded as well.
# NOTE(review): confirm that no valid pair has a distance <= 0.
upper_tri_mask = dist_mat > 0.0
gene_list = sorted(paga_df.obs["Gene"].unique().tolist())

In [None]:
# All-True boolean mask with the shape of dist_mat. The following cell
# re-creates this variable before using it.
unmatched_gene_mask = np.ones(dist_mat.shape, dtype=bool)

In [None]:
import itertools

# Split the upper-triangle distances into "matched" pairs (both sgRNAs target
# the same gene) and "unmatched" pairs (different genes).
unmatched_gene_mask = upper_tri_mask.copy()

all_matched_vals = []
gene_groups = {}  # gene -> list of its within-gene distances

for gene_i in gene_list:
    gene_idx = np.flatnonzero((paga_df.obs["Gene"] == gene_i).values)
    # Address only this gene's (n_i x n_i) sub-block instead of building a full
    # N x N outer-product mask for every gene.
    gene_block = np.ix_(gene_idx, gene_idx)
    masked_vals = dist_mat[gene_block][upper_tri_mask[gene_block]]
    masked_vals = masked_vals[~np.isnan(masked_vals)].tolist()
    all_matched_vals += masked_vals
    gene_groups[gene_i] = masked_vals

    # Same-gene pairs are removed from the unmatched pool.
    unmatched_gene_mask[gene_block] = False

unmatched_arr = dist_mat[unmatched_gene_mask]
all_unmatched_vals = unmatched_arr.tolist()
# Subsample for plotting. The sample size is capped at the number of available
# values because sampling without replacement raises when asked for more.
sampled_unmatched_vals = np.random.choice(
    unmatched_arr, replace=False, size=(min(100000, unmatched_arr.size),)
)

In [None]:
# Histogram the matched and the (subsampled) unmatched distances with identical
# binning so the two distributions can be compared directly.
for distance_vals in (all_matched_vals, sampled_unmatched_vals):
    plt.hist(distance_vals, bins=50, range=(0, 1000))
    plt.show()

In [None]:
# Fraction of all retained pairs that are same-gene (matched) vs. different-gene.
n_matched_vals, n_unmatched_vals = len(all_matched_vals), len(all_unmatched_vals)
p_match = n_matched_vals / (n_matched_vals + n_unmatched_vals)
p_unmatch = 1.0 - p_match

### t-test with multihypothesis correction

In [None]:
import statsmodels.stats.multitest

# Reference value: mean distance over all same-gene pairs.
mean_match = np.mean(all_matched_vals)

# One-sided one-sample t-test per gene: are its within-gene distances greater
# than the global matched mean? Genes with fewer than two distances are skipped.
tested_groups = [(gene, vals) for gene, vals in gene_groups.items() if len(vals) > 1]

gene_group_ttest_gene = np.array([gene for gene, _ in tested_groups])
gene_group_ttest_pval = np.array(
    [
        sp.stats.ttest_1samp(vals, mean_match, alternative="greater").pvalue
        for _, vals in tested_groups
    ]
)

# FDR correction across genes at alpha = 0.01.
rejected, pval_corr = statsmodels.stats.multitest.fdrcorrection(
    gene_group_ttest_pval, alpha=0.01, method="indep", is_sorted=False
)

In [None]:
gene_group_ttest_gene[rejected]

### Highlight Genes of Interest

In [None]:
fig = highlight_gene_group(paga_df, gene_group_ttest_gene[rejected])
fig.savefig("./3_Divergent_Genes/divergent_genes.png", dpi=150)

### Examining Individuals

 - The nanopore output is imported directly here as a temporary measure, because the coordinate fix has not yet been propagated through the intermediate steps of the pipeline

In [None]:
import pandas as ps
import ast

In [None]:
nanopore_df = pd.read_csv(
    "/home/de64/paulssonlab/paulssonlab/src/paulssonlab/deaton/nanopore/dev_notebooks/2021-07-11_snakemake_lDE20/2021-04-20_Essential_KO_Lib_df_coords_corrected.tsv",
    sep="\t",
)
# "Target Sites" holds the text of a Python literal; parse every non-missing
# entry and leave missing entries untouched. The column is replaced in one
# assignment: the previous chained form (df[col][mask] = ...) can end up
# writing to a temporary copy and does nothing under pandas copy-on-write.
nanopore_df["Target Sites"] = nanopore_df["Target Sites"].apply(
    lambda x: ast.literal_eval(x) if pd.notna(x) else x
)

#### Get RegulonDB Files

In [None]:
import urllib.request
from dna_features_viewer import BiopythonTranslator
from Bio import SeqIO

In [None]:
# Download the GenBank file U00096.3.gbk from RegulonDB into the working
# directory. It is read back by SeqIO.read in the "Make Reference SeqRecord"
# cell. The download runs every time this cell is executed, over plain HTTP.
urllib.request.urlretrieve(
    "http://regulondb.ccg.unam.mx/menu/download/datasets/files/U00096.3.gbk",
    "./U00096.3.gbk",
)

In [None]:
# Download RegulonDB's PromoterSet.txt into the working directory. It is parsed
# with pd.read_csv in the "Make Reference SeqRecord" cell. The download runs
# every time this cell is executed, over plain HTTP.
urllib.request.urlretrieve(
    "http://regulondb.ccg.unam.mx/menu/download/datasets/files/PromoterSet.txt",
    "./PromoterSet.txt",
)

In [None]:
from dna_features_viewer import BiopythonTranslator
from Bio import SeqIO
from Bio.SeqFeature import SeqFeature, FeatureLocation


class sgRNA_Explorer(BiopythonTranslator):
    """Translator that colors genome features by type for sgRNA locus plots.

    Colors: CDS blue, terminator green, promoter purple, sgRNA red, and every
    other feature type light blue.

    Parameters
    ----------
    ignored_features_types : list of str, optional
        Stored on the instance as ``ignored_features_types``; defaults to
        ``["CDS"]``. Presumably read by ``BiopythonTranslator`` to drop those
        feature types from the plot -- confirm against dna_features_viewer.
    """

    _FEATURE_COLORS = {
        "CDS": "#1f77b4",
        "terminator": "#279e68",
        "promoter": "#aa40fc",
        "sgRNA": "#d62728",
    }
    _DEFAULT_COLOR = "#aec7e8"

    def __init__(self, ignored_features_types=None):
        # A fresh list per instance: a list literal as the default argument
        # would be shared (and mutable) across all instances.
        if ignored_features_types is None:
            ignored_features_types = ["CDS"]
        self.ignored_features_types = ignored_features_types
        super().__init__()

    def compute_feature_color(self, feature):
        """Return the hex color for *feature* based on its ``type``."""
        return self._FEATURE_COLORS.get(feature.type, self._DEFAULT_COLOR)


def add_promoters_to_genbank(genome_record, promoter_df):
    """Append one ``promoter`` feature per row of *promoter_df* to *genome_record*.

    Each feature is a zero-length location at the row's ``TSS`` value, on
    strand +1 when ``Strand`` is ``"forward"`` and -1 otherwise, with the row's
    ``Name`` stored in the ``gene`` qualifier.

    Parameters
    ----------
    genome_record : Bio.SeqRecord.SeqRecord
        Record to annotate. It is modified in place.
    promoter_df : pandas.DataFrame
        Must provide the columns ``Name``, ``Strand`` and ``TSS``.

    Returns
    -------
    The same *genome_record* object that was passed in.

    NOTE(review): ``TSS`` is used as-is as a location coordinate; confirm
    whether the source table is 0- or 1-based.
    """
    promoter_feature_list = []
    for name, strand_label, tss in zip(
        promoter_df["Name"], promoter_df["Strand"], promoter_df["TSS"]
    ):
        strand = 1 if strand_label == "forward" else -1
        # The strand is set on the location rather than passed to SeqFeature,
        # which works on old and new Biopython releases alike.
        promoter_feature = SeqFeature(
            location=FeatureLocation(tss, tss, strand=strand),
            type="promoter",
        )
        promoter_feature.qualifiers["gene"] = name
        promoter_feature_list.append(promoter_feature)
    genome_record.features = genome_record.features + promoter_feature_list
    return genome_record


def display_target_sites(genome_record, target_site_list, translator, view_pad=1000):
    """Plot one genome window per target site with the sgRNA drawn as a feature.

    Parameters
    ----------
    genome_record : Bio.SeqRecord.SeqRecord
        Annotated reference record; it is sliced, not modified.
    target_site_list : iterable
        Items indexed as ``(start, end, strand)`` with strand ``"+"`` or ``"-"``.
    translator : BiopythonTranslator
        Converts each sub-record into a plottable graphic record.
    view_pad : int
        Number of bases shown on each side of the target site.
    """
    strand_dict = {"+": 1, "-": -1}

    for target_site in target_site_list:
        target_start, target_end, target_strand = (
            target_site[0],
            target_site[1],
            target_site[2],
        )
        # Clamp at 0: a negative slice start would index from the end of the
        # record and return the wrong (usually empty) window.
        start_coord = max(target_start - view_pad, 0)
        end_coord = target_end + view_pad

        sub_genome_record = genome_record[start_coord:end_coord]
        # Place the sgRNA at its real offset and length inside the window
        # (instead of assuming a 20 bp site exactly view_pad from the left edge).
        sgRNA_start = target_start - start_coord
        sgRNA_end = target_end - start_coord
        sgRNA = SeqFeature(
            location=FeatureLocation(
                sgRNA_start, sgRNA_end, strand=strand_dict[target_strand]
            ),
            type="sgRNA",
        )
        sgRNA.qualifiers["gene"] = "sgRNA"

        sub_genome_record.features = sub_genome_record.features + [sgRNA]
        graphic_record = translator.translate_record(sub_genome_record)
        graphic_record.plot(figure_width=10, strand_in_label_threshold=7)


def display_target_sites_single_locus(
    genome_record, target_site_dict, translator, view_pad=2000, outer_context_pad=20000
):
    """Plot all target sites of one locus on a single genome window.

    The window is centred on the FIRST entry of *target_site_dict*; every entry
    is added as an ``sgRNA`` feature labelled with its key.

    Parameters
    ----------
    genome_record : Bio.SeqRecord.SeqRecord
        Annotated reference record; it is sliced, not modified.
    target_site_dict : dict
        Maps a target id to an item indexed as ``(start, end, strand)`` with
        strand ``"+"`` or ``"-"``.
    translator : BiopythonTranslator
        Converts the sub-record into a plottable graphic record.
    view_pad : int
        Bases shown on each side of the first target site.
    outer_context_pad : int
        Bases extracted on each side before cropping to the view.

    Raises
    ------
    ValueError
        If *target_site_dict* is empty.

    NOTE(review): sites lying outside the extracted window get out-of-range
    coordinates; this assumes all sites of a gene are near the first one.
    """
    if not target_site_dict:
        raise ValueError("target_site_dict is empty: there is no target site to display")

    strand_dict = {"+": 1, "-": -1}
    first_site = next(iter(target_site_dict.values()))

    # Clamp at 0: a negative slice start would index from the end of the record.
    outer_start_coord = max(first_site[0] - outer_context_pad, 0)
    outer_end_coord = first_site[1] + outer_context_pad

    # Coordinates of the first site relative to the extracted window.
    start_coord = first_site[0] - outer_start_coord
    end_coord = first_site[1] - outer_start_coord

    sub_genome_record = genome_record[outer_start_coord:outer_end_coord]

    sgRNA_features = []
    for targetid, target_site in target_site_dict.items():
        sgRNA = SeqFeature(
            location=FeatureLocation(
                target_site[0] - outer_start_coord,
                target_site[1] - outer_start_coord,
                strand=strand_dict[target_site[2]],
            ),
            type="sgRNA",
        )
        sgRNA.qualifiers["gene"] = str(targetid)
        sgRNA_features.append(sgRNA)
    sub_genome_record.features = sub_genome_record.features + sgRNA_features

    graphic_record = translator.translate_record(sub_genome_record)

    # Keep the crop range inside the extracted window.
    crop_start = max(start_coord - view_pad, 0)
    crop_end = min(end_coord + view_pad, len(sub_genome_record))
    cropped_record = graphic_record.crop((crop_start, crop_end))

    cropped_record.plot(figure_width=10, strand_in_label_threshold=7)


def gene_to_target_dict(df, nanopore_df, gene_name):
    """Collect one target site per TargetID for the sgRNAs of *gene_name*.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns ``Gene`` and ``TargetID``.
    nanopore_df : pandas.DataFrame
        Must provide ``TargetID`` and ``Target Sites``; each non-missing
        ``Target Sites`` entry is an iterable of sites.
    gene_name : str
        Gene whose rows are selected from *df*.

    Returns
    -------
    target_site_dict : dict
        TargetID -> site, ordered by ascending TargetID. For a TargetID with
        several rows in *nanopore_df* the first row is used; when that row
        lists several sites, only the LAST site is kept. TargetIDs whose
        ``Target Sites`` entry is missing are skipped.
    subset_df : pandas.DataFrame
        The rows of *df* belonging to *gene_name*.
    """
    subset_df = df[df["Gene"] == gene_name]
    subset_targetids = subset_df["TargetID"].dropna().unique()

    # First nanopore row per TargetID, sorted by TargetID. Plain column
    # operations are used instead of groupby(...).apply on whole frames, which
    # stops exposing the grouping column in newer pandas.
    matched_df = nanopore_df[nanopore_df["TargetID"].isin(subset_targetids)]
    first_sites = (
        matched_df.drop_duplicates(subset="TargetID", keep="first")
        .sort_values("TargetID")
        .set_index("TargetID")["Target Sites"]
        .dropna()  # a missing entry has no sites to unwrap
    )

    # Unwrap the per-TargetID site lists; a later site overwrites an earlier one.
    target_site_dict = {}
    for targetid, sites in first_sites.items():
        for site in sites:
            target_site_dict[targetid] = site

    return target_site_dict, subset_df

#### Make Reference SeqRecord

In [None]:
# Load the reference genome and annotate it with RegulonDB promoters whose
# confidence is "Strong".
genome_record = SeqIO.read("./U00096.3.gbk", "genbank")

promoter_columns = [
    "ID",
    "Name",
    "Strand",
    "TSS",
    "Sigma Factor",
    "Sequence",
    "Evidence",
    "Confidence",
]
# skiprows=37 presumably skips the header/comment lines of PromoterSet.txt --
# verify if the file is re-downloaded in a newer release.
promoter_df = pd.read_csv(
    "./PromoterSet.txt", sep="\t", skiprows=37, names=promoter_columns
)
promoter_df = promoter_df[promoter_df["Confidence"] == "Strong"]

# add_promoters_to_genbank modifies genome_record in place and returns it, so
# genome_record and genome_record_merged refer to the same annotated record.
genome_record_merged = add_promoters_to_genbank(genome_record, promoter_df)

In [None]:
genome_record_merged

#### Initialize Viewer

In [None]:
translator = sgRNA_Explorer()

In [None]:
divergent_df = paga_df.obs[paga_df.obs["Gene"].isin(gene_group_ttest_gene[rejected])]

In [None]:
divergent_df["Gene"].unique().tolist()

In [None]:
# UMAP of paga_df, one panel per Leiden partition (low / default / high
# resolution), drawn with graph edges and cluster labels on the data.
# show=False with return_fig=True is used so the figure object is kept in `fig`.
fig = sc.pl.umap(
    paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres"],
    title=["Leiden Resolution=0.25", "Leiden Resolution=1.", "Leiden Resolution=1.5"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)
# fig.savefig("./1_Global_Analysis/Global_PAGA.png",dpi=150)

In [None]:
# For every divergent gene: highlight it on the embedding, draw its locus with
# the sgRNA target sites, and print the low-resolution cluster of each TargetID.
for gene in divergent_df["Gene"].unique().tolist():
    target_site_dict, subset_df = gene_to_target_dict(paga_df.obs, nanopore_df, gene)

    highlight_gene_group(paga_df, [gene])
    plt.show()

    display_target_sites_single_locus(
        genome_record, target_site_dict, translator, view_pad=1000
    )
    by_targetid = subset_df.reset_index().set_index("TargetID").sort_index()
    print(by_targetid["leiden_lowres"])
    plt.show()

In [None]:
def plot_selected_timeseries(
    df, figsize=(30, 10), feature_labels=None, feature_ranges=None
):
    """Plot per-(Gene, TargetID) timeseries, one row of panels per feature.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide ``Gene``, ``TargetID`` and one column per feature label;
        each feature cell is expected to hold one timeseries (array-like).
    figsize : tuple
        Size of the matplotlib figure.
    feature_labels : list of str, optional
        Columns to plot. Defaults to the six standard growth features.
    feature_ranges : list of (low, high), optional
        y-axis limits, index-aligned with *feature_labels*. Defaults to the
        limits for the six standard features.

    Raises
    ------
    ValueError
        If *feature_labels* and *feature_ranges* differ in length.
    """
    if feature_labels is None:
        feature_labels = [
            "Linear Growth Rate",
            "Exponential Growth Rate",
            "Division Length",
            "Width",
            "mCherry Intensity",
            "Doubling Time",
        ]
    if feature_ranges is None:
        feature_ranges = [
            (0, 15),
            (0.5, 1.5),
            (4.0, 15.0),
            (1.2, 1.4),
            (1200, 3000),
            (6, 15),
        ]
    if len(feature_labels) != len(feature_ranges):
        raise ValueError("feature_labels and feature_ranges must have the same length")

    # One Series per feature: index = (Gene, TargetID), value = list holding
    # the timeseries of every row in that group.
    grouped = df.groupby(["Gene", "TargetID"])
    feature_series_list = [
        grouped[feature_label].apply(lambda s: s.tolist())
        for feature_label in feature_labels
    ]

    n_rows = len(feature_labels)
    # Every Series shares the same groups, so the first one gives the number of
    # columns; an empty label list simply yields an empty figure.
    n_cols = len(feature_series_list[0]) if feature_series_list else 0

    plt.figure(figsize=figsize)

    for j, feature_series in enumerate(feature_series_list):
        # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
        for i, (idx, row) in enumerate(feature_series.items()):
            plot_arr = np.array(row).T  # time along axis 0, one line per row
            ax = plt.subplot(n_rows, n_cols, (j * n_cols) + i + 1)
            ax.set_title(str(idx) + ": " + feature_labels[j])
            ax.set_ylim(feature_ranges[j])
            ax.plot(plot_arr, color="tab:blue")
    plt.tight_layout()
    plt.show()

In [None]:
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
divergent_goi_df = divergent_df[divergent_df["Gene"].isin(["minC"])].copy()
divergent_goi_df["Gene"] = divergent_goi_df["Gene"].astype(str)
plot_selected_timeseries(divergent_goi_df, figsize=(15, 15))

In [None]:
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
divergent_goi_df = divergent_df[divergent_df["Gene"].isin(["fabH"])].copy()
divergent_goi_df["Gene"] = divergent_goi_df["Gene"].astype(str)
plot_selected_timeseries(divergent_goi_df, figsize=(15, 15))

In [None]:
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
divergent_goi_df = divergent_df[divergent_df["Gene"].isin(["fabG"])].copy()
divergent_goi_df["Gene"] = divergent_goi_df["Gene"].astype(str)
plot_selected_timeseries(divergent_goi_df, figsize=(15, 15))

In [None]:
divergent_goi_df.columns

In [None]:
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
divergent_goi_df = divergent_df[divergent_df["Gene"].isin(["rne"])].copy()
divergent_goi_df["Gene"] = divergent_goi_df["Gene"].astype(str)
plot_selected_timeseries(divergent_goi_df, figsize=(15, 15))

## 4) Titrations


In [None]:
import scanpy as sc
import anndata
import scipy as sp
import scipy.sparse
import dask.array as da
from igraph.drawing.text import TextDrawer
from tslearn.neighbors import KNeighborsTimeSeries
from tslearn.metrics import cdist_soft_dtw_normalized, cdist_soft_dtw
import networkx as nx
import igraph as ig
import leidenalg
import umap
from scanpy.plotting.palettes import default_20, vega_20_scanpy

In [None]:
def get_pearson_df(titration_df, variable_name, pearson_p_val=0.05):
    """Correlate *variable_name* with ``N Match`` separately for each TargetID.

    Parameters
    ----------
    titration_df : pandas.DataFrame
        Must provide the columns ``TargetID``, ``N Match`` and *variable_name*.
    variable_name : str
        Column correlated against ``N Match``.
    pearson_p_val : float
        Unused; kept so existing calls keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by TargetID with four columns: the per-target list of
        *variable_name* values, the per-target list of ``N Match`` values,
        ``"<variable_name>: Pearson R"`` and ``"<variable_name>: Pearson P-val"``.
    """
    grouped = titration_df.groupby("TargetID")
    var_df = pd.DataFrame(
        {
            variable_name: grouped[variable_name].apply(lambda s: s.tolist()),
            "N Match": grouped["N Match"].apply(lambda s: s.tolist()),
        }
    )

    # Run pearsonr once per TargetID and keep both outputs (previously it was
    # evaluated twice, once for R and once for the p-value).
    r_vals = []
    p_vals = []
    for n_match, values in zip(var_df["N Match"], var_df[variable_name]):
        pearson_result = sp.stats.pearsonr(n_match, values)
        r_vals.append(pearson_result[0])
        p_vals.append(pearson_result[1])

    var_df[variable_name + ": Pearson R"] = r_vals
    var_df[variable_name + ": Pearson P-val"] = p_vals

    return var_df

### Looking at all targetids (including filtered out and unclustered)

In [None]:
import statsmodels.stats.multitest

# Per-TargetID test for a titration effect: correlate each phenotype mean with
# the number of matched bases and keep TargetIDs significant for any phenotype.
min_titration = 4  # minimum number of rows a TargetID needs to be tested
fdr_p_val_thr = 0.1
var_list = [
    "Linear Growth Rate: Mean",
    "Exponential Growth Rate: Mean",
    "Division Length: Mean",
    "Width: Mean",
    "mCherry Intensity: Mean",
    "Doubling Time: Mean",
]
pearson_p_labels = [var_label + ": Pearson P-val" for var_label in var_list]
pearson_r_labels = [var_label + ": Pearson R" for var_label in var_list]

# Adds an "N Match" column to gene_cluster_df_full in place (20 minus the
# number of mismatches).
gene_cluster_df_full["N Match"] = 20 - gene_cluster_df_full["N Mismatch"]

# Restrict to TargetIDs with at least min_titration rows.
target_count_series = gene_cluster_df_full.groupby("TargetID").apply(len)
targetid_above_thr = target_count_series[
    target_count_series >= min_titration
].index.tolist()
titration_df = gene_cluster_df_full[
    gene_cluster_df_full["TargetID"].isin(targetid_above_thr)
]

# titration_df_pearson = titration_df.dropna(subset=['N Match'] + var_list)
# One representative (first) row per TargetID, extended with the statistics.
pearson_df = titration_df.groupby("TargetID").apply(lambda x: x.iloc[0])

for var_label, r_label, p_label in zip(var_list, pearson_r_labels, pearson_p_labels):
    var_df = get_pearson_df(titration_df, var_label)
    pearson_df[r_label] = var_df[r_label]
    pearson_df[p_label] = var_df[p_label]

# FDR correction per phenotype; a TargetID counts as significant when it is
# rejected for at least one phenotype.
fdr_rejections = []
for pearson_p_label in pearson_p_labels:
    fdr_rejected, _ = statsmodels.stats.multitest.fdrcorrection(
        pearson_df[pearson_p_label].tolist(),
        fdr_p_val_thr,
        method="indep",
        is_sorted=False,
    )
    fdr_rejections.append(fdr_rejected)
pearson_r_sig = np.any(fdr_rejections, axis=0)
pearson_r_sig_df = pearson_df[pearson_r_sig]

n_targetids = len(gene_cluster_df_full["TargetID"].unique())
n_titration_targetids = np.sum(pearson_r_sig)

# pos_arr = (pearson_r_sig_df[pearson_r_labels]>0.).values
# neg_arr = (pearson_r_sig_df[pearson_r_labels]<=0.).values
# sig_arr = (pearson_r_sig_df[pearson_p_labels]<pearson_p_val_thr).values

# sig_pos_arr = sig_arr*pos_arr
# sig_neg_arr = sig_arr*neg_arr

# pearson_r_sig_df["Pearson Significant Positive"] = [item for item in sig_pos_arr]
# pearson_r_sig_df["Pearson Significant Negative"] = [item for item in sig_neg_arr]

In [None]:
pearson_r_sig

In [None]:
print(n_titration_targetids / n_targetids)

In [None]:
# Genes with at least one significant TargetID, all genes, and their difference.
all_genes = sorted(gene_cluster_df_full["Gene"].unique().tolist())
genes_with_titration = sorted(pearson_r_sig_df["Gene"].unique().tolist())
genes_wo_titration = sorted(set(all_genes).difference(genes_with_titration))

In [None]:
len(genes_with_titration)

In [None]:
len(all_genes)

In [None]:
len(genes_wo_titration)

### Looking at clustered targetids

In [None]:
titratable_targetids = pearson_r_sig_df.index.unique().tolist()
titratable_mask = paga_df.obs["TargetID"].isin(titratable_targetids)
titration_paga_df = paga_df[titratable_mask]

In [None]:
cluster_labels = ["leiden_lowres", "leiden", "leiden_highres"]

# Columns to plot, their axis labels and their y-limits (index-aligned).
# Defined once rather than on every loop iteration.
var_names = [
    "Linear Growth Rate: Mean",
    "Exponential Growth Rate: Mean",
    "Division Length: Mean",
    "Width: Mean",
    "mCherry Intensity: Mean",
    "Doubling Time: Mean",
]

display_var_names = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Mean Width",
    "mCherry Intensity",
    "Doubling Time",
]

feature_ranges = [
    (0, 15),
    (0.5, 1.5),
    (4.0, 15.0),
    (1.2, 1.4),
    (1200, 3000),
    (6, 15),
]

wspace = 0.25
hspace = 0.25
fontsize = 14

step_size = 20  # maximum number of TargetIDs (columns) per saved figure

# For every cluster of every partition, save scatter plots of each phenotype
# against "N Match", one column per titratable TargetID.
for cluster_label in cluster_labels:
    cluster_ids = titration_paga_df.obs[cluster_label].cat.categories.tolist()
    for cluster_id in cluster_ids:

        cluster_df = titration_paga_df.obs[
            titration_paga_df.obs[cluster_label] == cluster_id
        ]
        targetid_list = sorted(cluster_df["TargetID"].unique().tolist())

        n_targetids = len(targetid_list)

        for idx, n in enumerate(range(0, n_targetids, step_size)):

            sub_targetid_list = targetid_list[n : n + step_size]

            # Width follows the columns drawn in THIS figure; sizing it from the
            # whole cluster made figures of large clusters needlessly huge.
            figsize = (int(2.5 * len(sub_targetid_list)) + 1, 15)
            # No constrained_layout here: it is incompatible with the
            # tight_layout() call below, which is the layout that takes effect.
            fig = plt.figure(figsize=figsize)
            gs = fig.add_gridspec(1, len(sub_targetid_list), wspace=wspace)

            for i, targetid in enumerate(sub_targetid_list):
                selected_targetid_df = gene_cluster_df_full[
                    gene_cluster_df_full["TargetID"] == targetid
                ]

                inner_gs = gs[0, i].subgridspec(
                    len(var_names), 1, wspace=0, hspace=hspace
                )
                inner_axes = np.atleast_1d(inner_gs.subplots(sharex=True))

                for j, ax in enumerate(inner_axes):
                    ax.scatter(
                        selected_targetid_df["N Match"],
                        selected_targetid_df[var_names[j]],
                    )
                    ax.set_ylabel(display_var_names[j], fontsize=fontsize)
                    ax.set_ylim(feature_ranges[j])

                # Only the bottom panel of each column carries the x-label.
                inner_axes[-1].set_xlabel(
                    selected_targetid_df["Gene"].iloc[0]
                    + " TargetID: "
                    + str(targetid),
                    fontsize=fontsize,
                )

            fig.tight_layout()
            fig.savefig(
                "4_Titrations/"
                + cluster_label
                + "_"
                + str(cluster_id)
                + "_Part_"
                + str(idx)
                + ".png",
                dpi=75,
            )
            # Display, then release the figure so that looping over many
            # clusters does not accumulate open figures in memory.
            plt.show()
            plt.close(fig)

## 5) Examining Hits

In [None]:
# UMAP of paga_df, one panel per Leiden partition (low / default / high
# resolution), drawn with graph edges and cluster labels on the data.
# show=False with return_fig=True is used so the figure object is kept in `fig`.
fig = sc.pl.umap(
    paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres"],
    title=["Leiden Resolution=0.25", "Leiden Resolution=1.", "Leiden Resolution=1.5"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)

### Gene Browser

In [None]:
# Independent copy of the per-sgRNA metadata with the index turned into a column.
gene_browser_df = feature_paga_df.obs.copy(deep=True).reset_index()

In [None]:
# Keep only the columns shown in the gene browser. .copy() makes the selection
# an independent frame so the dtype change below does not trigger pandas'
# SettingWithCopyWarning.
gene_browser_df = gene_browser_df[
    [
        "sgRNAid",
        "TargetID",
        "Gene",
        "phenotype trenchids",
        "N Mismatch",
        "N Target Sites",
        "Category",
        "Strand",
        "leiden_lowres",
        "leiden",
        "leiden_highres",
    ]
].copy()
gene_browser_df["sgRNAid"] = gene_browser_df["sgRNAid"].astype(int)

In [None]:
# Open the same Growth_Division kymograph dataset twice: once with the default
# settings and once with unwrap=False.
growth_division_path = "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Growth_Division"

kymo_xarr = tr.kymo_xarr(growth_division_path)
wrapped_kymo_xarr = tr.kymo_xarr(growth_division_path, unwrap=False)

In [None]:
# Build the interactive gene table from gene_browser_df, indexed by "Gene".
# The four returned objects are reused by the kymograph viewer cell below.
# The commented line is the alternative that indexes the table by "leiden".
# gene_table_layout,select_gene,select_trenchid,select_unpacked_trenchid = tr.linked_table(gene_browser_df,index_key='leiden',trenchids_as_list=True,trenchid_column='phenotype trenchids')
(
    gene_table_layout,
    select_gene,
    select_trenchid,
    select_unpacked_trenchid,
) = tr.linked_table(
    gene_browser_df,
    index_key="Gene",
    trenchids_as_list=True,
    trenchid_column="phenotype trenchids",
)

In [None]:
gene_table_layout

In [None]:
# Kymograph viewer driven by the selection objects of the gene table above.
# index_key and trenchid_column are the same values passed to tr.linked_table.
output_display, save_button = tr.linked_kymograph_for_table(
    kymo_xarr,
    wrapped_kymo_xarr,
    gene_browser_df,
    select_gene,
    select_trenchid,
    index_key="Gene",
    select_unpacked_trenchid=select_unpacked_trenchid,
    trenchid_column="phenotype trenchids",
    y_scale=3,
    x_window_size=300,
)

In [None]:
output_display

In [None]:
save_button

In [None]:
target_site_dict, subset_df = gene_to_target_dict(paga_df.obs, nanopore_df, "ftsZ")
display_target_sites_single_locus(
    genome_record, target_site_dict, translator, view_pad=2000
)

### Replication Initiation Defects

Genes: ['dnaA', 'dnaB', 'infA', 'rne']

In [None]:
# Per-gene timeseries panels for the sgRNAs in leiden cluster "13".
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
dna_rep_df = paga_df.obs[paga_df.obs["leiden"] == "13"].copy()
dna_rep_df["Gene"] = dna_rep_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 10), (1.0, 1.5), (6.0, 10.0), (1.2, 1.4), (1200, 2000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = dna_rep_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(10, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

In [None]:
target_site_dict, subset_df = gene_to_target_dict(paga_df.obs, nanopore_df, "dnaA")
display_target_sites_single_locus(
    genome_record, target_site_dict, translator, view_pad=3000
)

In [None]:
target_site_dict, subset_df = gene_to_target_dict(paga_df.obs, nanopore_df, "dnaB")
display_target_sites_single_locus(
    genome_record, target_site_dict, translator, view_pad=3000
)

In [None]:
target_site_dict, subset_df = gene_to_target_dict(paga_df.obs, nanopore_df, "rne")
display_target_sites_single_locus(
    genome_record, target_site_dict, translator, view_pad=3000
)

In [None]:
target_site_dict, subset_df = gene_to_target_dict(paga_df.obs, nanopore_df, "infA")
display_target_sites_single_locus(
    genome_record, target_site_dict, translator, view_pad=1000
)

#### Notes

- DnaAB and rne all have decreased fluorescence accompanied by increasing cell length

- infA also has decreased fluorescence, but with significantly less impairment of length

- infA interacts directly with the small ribosomal subunit, so it may have some odd effect on the rpsL reporter

- Alternatively, it may reduce translational efficiency more broadly, but one would expect a more deleterious effect on division

- These data also suggest rne has a role in regulating replication initiation, given its similarity to dnaAB

### Division Defects

['acnB', 'bamB', 'bisC', 'comR', 'ddlB']

['dnaK', 'dnaN', 'dxs', 'fixX', 'folC']

['folP', 'ftsK', 'ftsL', 'ftsN', 'ftsQ']

['ftsW', 'ftsY', 'ftsZ', 'gyrA', 'gyrB']

['holA', 'infC', 'kefB', 'lapA', 'lexA']

['lpxC', 'minC', 'minE', 'mraY', 'mraZ']

['mukB', 'mukE', 'mukF', 'murC', 'murD']

['murE', 'murF', 'murG', 'murI', 'nagE']

['nrdA', 'nrdB', 'nusG', 'parC', 'parE']

['pfkA', 'pgsA', 'prmB', 'pssA', 'pyrG']

['pyrH', 'rho', 'rhoL', 'rimP', 'rnhB']

['rnpA', 'rnpB', 'rplK', 'rplN', 'rplS']

['rplU', 'rpmB', 'rpmJ', 'rpoA', 'rpoC']

['rsgA', 'rsmH', 'secE', 'secY', 'ssb']

['tamA', 'tgt', 'tmk', 'ubiJ esrE', 'yagH']

['ybcN', 'yebQ', 'yecM', 'yeiH', 'ygfZ']

['ypaB']


In [None]:
# Per-gene timeseries panels for selected genes of leiden cluster "6".
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"]
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
division_df = division_df[
    division_df["Gene"].isin(
        ["ubiJ esrE", "yagH", "ybcN", "yebQ", "yecM", "yeiH", "ygfZ", "ypaB"]
    )
].copy()
division_df["Gene"] = division_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 15), (1.0, 1.5), (6.0, 15.0), (1.2, 1.4), (1200, 3000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = division_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(20, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

#### Notes

- yebQ seems like it is more similar to a ribosome defect, given increased fluorescence

- All other y genes seemingly good hits

- esrE is subtle; maybe worth validating given how weird it is though

In [None]:
# Per-gene timeseries panels for selected genes of leiden cluster "6".
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"]
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
division_df = division_df[
    division_df["Gene"].isin(
        [
            "acnB",
            "bamB",
            "bisC",
            "comR",
            "ddlB",
            "dnaK",
            "dnaN",
            "dxs",
            "fixX",
            "folC",
            "folP",
        ]
    )
].copy()
division_df["Gene"] = division_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 15), (1.0, 1.5), (6.0, 15.0), (1.2, 1.4), (1200, 3000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = division_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(30, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

#### Notes

- acnB, bamB, folC increase length without much of a linear growth rate increase

- dnaK, dnaN, folC have increased mCherry reporter

- bisC, comR, ddlB, dxs, fixX, folP seemingly good hits

- esrE is subtle; maybe worth validating given how weird it is though

In [None]:
# Per-gene timeseries panels for selected genes of leiden cluster "6".
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"]
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
division_df = division_df[
    division_df["Gene"].isin(
        ["ftsK", "ftsL", "ftsN", "ftsQ", "ftsW", "ftsY", "ftsZ", "gyrA", "gyrB"]
    )
].copy()
division_df["Gene"] = division_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 15), (1.0, 1.5), (6.0, 15.0), (1.2, 1.4), (1200, 3000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = division_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(30, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

In [None]:
# Per-gene timeseries panels for selected genes of leiden cluster "6".
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"]
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
division_df = division_df[
    division_df["Gene"].isin(
        ["holA", "infC", "kefB", "lapA", "lexA", "lpxC", "minC", "minE", "mraY", "mraZ"]
    )
].copy()
division_df["Gene"] = division_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 15), (1.0, 1.5), (6.0, 15.0), (1.2, 1.4), (1200, 3000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = division_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(30, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

In [None]:
# Per-gene timeseries panels for selected genes of leiden cluster "6".
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"]
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
division_df = division_df[
    division_df["Gene"].isin(
        ["mukB", "mukE", "mukF", "murC", "murD", "murE", "murF", "murG", "murI", "nagE"]
    )
].copy()
division_df["Gene"] = division_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 15), (1.0, 1.5), (6.0, 15.0), (1.2, 1.4), (1200, 3000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = division_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(30, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

In [None]:
# Per-gene timeseries panels for selected genes of leiden cluster "6".
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"]
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
division_df = division_df[
    division_df["Gene"].isin(
        ["nrdA", "nrdB", "nusG", "parC", "parE", "pfkA", "pgsA", "prmB", "pssA", "pyrG"]
    )
].copy()
division_df["Gene"] = division_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 15), (1.0, 1.5), (6.0, 15.0), (1.2, 1.4), (1200, 3000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = division_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(30, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

In [None]:
# Per-gene timeseries panels for selected genes of leiden cluster "6".
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"]
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
division_df = division_df[
    division_df["Gene"].isin(
        [
            "pyrH",
            "rho",
            "rhoL",
            "rimP",
            "rnhB",
            "rnpA",
            "rnpB",
            "rplK",
            "rplN",
            "rplS",
            "rplU",
            "rpmB",
            "rpmJ",
        ]
    )
].copy()
division_df["Gene"] = division_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 15), (1.0, 1.5), (6.0, 15.0), (1.2, 1.4), (1200, 3000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = division_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(30, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

In [None]:
# Per-gene timeseries panels for selected genes of leiden cluster "6".
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"]
# .copy() makes the selection an independent frame, so the "Gene" conversion
# below does not trigger pandas' SettingWithCopyWarning.
division_df = division_df[
    division_df["Gene"].isin(
        ["rpoA", "rpoC", "rsgA", "rsmH", "secE", "secY", "ssb", "tamA", "tgt", "tmk"]
    )
].copy()
division_df["Gene"] = division_df["Gene"].astype(str)

feature_labels = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# y-axis limits, index-aligned with feature_labels.
feature_ranges = [(5, 15), (1.0, 1.5), (6.0, 15.0), (1.2, 1.4), (1200, 3000), (6, 12)]

len_labels = len(feature_labels)

# One Series per feature: index = gene, value = list with that gene's rows.
feature_series_list = []
for feature_label in feature_labels:
    feature_series = division_df.groupby("Gene").apply(
        lambda x: x[feature_label].tolist()
    )
    feature_series_list.append(feature_series)

# Number of genes = number of subplot columns.
len_series = len(feature_series_list[0])

fig = plt.figure(figsize=(30, 10))

for j, feature_series in enumerate(feature_series_list):
    # Series.items() replaces Series.iteritems(), removed in pandas 2.0.
    for i, (idx, row) in enumerate(feature_series.items()):
        plot_arr = np.array(row).T
        ax = plt.subplot(len_labels, len_series, (j * len_series) + i + 1)
        ax.set_title(idx + ": " + feature_labels[j])
        ax.set_ylim(feature_ranges[j])
        ax.plot(plot_arr, color="tab:blue")
plt.tight_layout()
plt.show()

In [None]:
["rpoA", "rpoC", "rsgA", "rsmH", "secE", "secY", "ssb", "tamA", "tgt", "tmk"]

["tamA", "tgt", "tmk", "ubiJ esrE", "yagH"]

["ybcN", "yebQ", "yecM", "yeiH", "ygfZ"]

["ypaB"]

## 6) Examining Transients

In [None]:
from tslearn.preprocessing import TimeSeriesScalerMinMax
from tslearn.preprocessing import TimeSeriesScalerMeanVariance

### First: Long Cells

### Renormalization

In [None]:
# Select "long cell" trajectories: rows whose Division Length z-score trace
# exceeds norm_threshold at one or more timepoints.
norm_threshold = 1
# Simpson integral of each z-score trace; computed but not used by the filter.
feature_integrated_norm = gene_cluster_df_full["Division Length Z-score"].apply(
    lambda x: sp.integrate.simpson(x)
)
feature_max_norm = gene_cluster_df_full["Division Length Z-score"].apply(
    lambda x: np.max(x)
)
# .copy() so the column assignment below writes to an independent frame rather
# than to a slice of gene_cluster_df_full (avoids SettingWithCopyWarning).
feature_filtered_df = gene_cluster_df_full[feature_max_norm > norm_threshold].copy()

# Stack the raw traces and append a trailing axis of length 1 for tslearn.
X_feature = np.array(feature_filtered_df["Division Length"].tolist())[:, :, np.newaxis]
X_feature_norm = TimeSeriesScalerMinMax().fit_transform(X_feature)
# Store each rescaled trace back on its row as a 1-D array.
feature_filtered_df["Division Length Feature Norm"] = list(X_feature_norm[:, :, 0])

In [None]:
# Histogram of the per-trace maximum z-score over all rows of
# gene_cluster_df_full (i.e. before the norm_threshold filter is applied).
plt.hist(feature_max_norm, bins=50)

In [None]:
# Pairwise matrix over the rescaled traces; it is passed below as a
# metric="precomputed" input to the kNN search. parallel_norm_soft_dtw is
# defined elsewhere in the notebook (presumably normalized soft-DTW — confirm).
dtw_feature_norm = parallel_norm_soft_dtw(X_feature_norm)

### Initialize Anndata Object

In [None]:
# X is the rescaled trace array flattened to 2-D (one row per trace); obs is
# the filtered dataframe, so all per-row metadata travels with the AnnData.
an_df_feature = anndata.AnnData(
    X=X_feature_norm.reshape(X_feature_norm.shape[0], -1), obs=feature_filtered_df
)  # AnnData container to use scanpy functions with unwrapped time vector

### Compute KNN Graph

In [None]:
n_neighbors = 15
n_pcs = 20  # This shouldn't affect anything

# Standard scanpy neighbors call on X. Its "distances"/"connectivities"
# results are overwritten below — presumably it is run only to populate
# an_df_feature.uns["neighbors"]; confirm.
sc.pp.neighbors(an_df_feature, n_neighbors=n_neighbors, n_pcs=n_pcs)
# kNN search on the precomputed pairwise matrix instead of on X.
knn_indices, knn_dists, forest = sc.neighbors.compute_neighbors_umap(
    dtw_feature_norm, n_neighbors=n_neighbors, metric="precomputed"
)
# NOTE(review): _compute_connectivities_umap is a private scanpy helper —
# confirm it exists in the pinned scanpy version.
(
    an_df_feature.uns["neighbors"]["distances"],
    an_df_feature.uns["neighbors"]["connectivities"],
) = sc.neighbors._compute_connectivities_umap(
    knn_indices,
    knn_dists,
    an_df_feature.shape[0],
    n_neighbors,  # change to neighbors you plan to use
)
# Mirror the graph into obsp, and keep the full pairwise matrix alongside it.
an_df_feature.obsp["distances"] = an_df_feature.uns["neighbors"]["distances"]
an_df_feature.obsp["connectivities"] = an_df_feature.uns["neighbors"]["connectivities"]
an_df_feature.obsp["soft_dtw"] = dtw_feature_norm

### Computing Leiden, PAGA and UMAP

In [None]:
# Cluster the same kNN graph at three Leiden resolutions, keeping one
# independent AnnData copy per resolution.
feature_paga_df_dict = {}
for resolution in [0.25, 1.0, 1.5]:
    feature_paga_df_dict[resolution] = copy.deepcopy(an_df_feature)
    sc.tl.leiden(
        feature_paga_df_dict[resolution], resolution=resolution, n_iterations=-1
    )
    sc.tl.paga(feature_paga_df_dict[resolution], groups="leiden")
    # Called with add_pos=True and show=False; the UMAP below is initialised
    # with init_pos="paga". NOTE(review): confirm this call is what provides
    # the PAGA positions for that initialisation.
    sc.pl.paga(feature_paga_df_dict[resolution], add_pos=True, show=False)
# Only the resolution-1.0 object gets a UMAP embedding; the other two
# clusterings are attached to it as extra obs columns.
sc.tl.umap(feature_paga_df_dict[1.0], init_pos="paga", min_dist=0.25, spread=5.0)
feature_paga_df_dict[1.0].obs["leiden_lowres"] = feature_paga_df_dict[0.25].obs[
    "leiden"
]
feature_paga_df_dict[1.0].obs["leiden_highres"] = feature_paga_df_dict[1.5].obs[
    "leiden"
]
feature_paga_df = feature_paga_df_dict[1.0]

In [None]:
# Three UMAP panels on the same embedding, coloured by the low-, mid- and
# high-resolution Leiden labels; titles are matched positionally to `color`.
fig = sc.pl.umap(
    feature_paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres"],
    title=["Leiden Resolution=0.25", "Leiden Resolution=1.", "Leiden Resolution=1.5"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)

In [None]:
# "N Match" = 20 - "N Mismatch".
# NOTE(review): 20 is presumably the guide length — confirm.
feature_paga_df.obs["N Match"] = 20.0 - feature_paga_df.obs["N Mismatch"]
# "Delta N Match" = N Match relative to the minimum N Match within the same
# TargetID. groupby.transform keeps the original obs index, so no droplevel is
# needed and the result does not depend on how groupby.apply labels its output
# (which varies across pandas versions and with the number of groups).
feature_del_N_match_series = feature_paga_df.obs["N Match"] - (
    feature_paga_df.obs.groupby("TargetID")["N Match"].transform("min")
)
feature_paga_df.obs["Delta N Match"] = feature_del_N_match_series

In [None]:
# One obs column name per entry of zscore_traces (defined elsewhere in the
# notebook), each suffixed with ": Mean".
labels = [zscore_trace + ": Mean" for zscore_trace in zscore_traces]

# One UMAP panel per label, on a diverging colormap centred at 0.
fig = sc.pl.umap(
    feature_paga_df,
    color=labels,
    show=False,
    legend_loc="on data",
    add_outline=False,
    size=50,
    return_fig=True,
    vcenter=0.0,
    cmap="RdBu_r",
    wspace=0.25,
)

In [None]:
# plot_cluster_timeseries_braycenters is defined elsewhere in the notebook.
# The same column, "Division Length Feature Norm", is listed twice in both
# list arguments, with a (0, 1) range for each entry.
# NOTE(review): confirm the duplicate entry is intentional.
fig = plot_cluster_timeseries_braycenters(
    feature_paga_df.obs,
    "leiden",
    ["Division Length Feature Norm", "Division Length Feature Norm"],
    ["Division Length Feature Norm", "Division Length Feature Norm"],
    [(0, 1), (0, 1)],
    figsize=(15, 6),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)

In [None]:
# Gene names present in Leiden cluster "5" and how many rows each contributes.
np.unique(
    feature_paga_df.obs[feature_paga_df.obs["leiden"] == "5"]["Gene"].tolist(),
    return_counts=True,
)

### Small Cells

### Renormalization

In [None]:
# Select "small cell" trajectories: rows whose Division Length z-score trace
# drops below norm_threshold at one or more timepoints.
norm_threshold = -1
# Simpson integral of each z-score trace; computed but not used by the filter.
feature_integrated_norm = gene_cluster_df_full["Division Length Z-score"].apply(
    lambda x: sp.integrate.simpson(x)
)
# NOTE: despite its name, feature_max_norm holds the per-trace MINIMUM in this
# section; the name is kept because later cells reference it.
feature_max_norm = gene_cluster_df_full["Division Length Z-score"].apply(
    lambda x: np.min(x)
)
# .copy() so the column assignment below writes to an independent frame rather
# than to a slice of gene_cluster_df_full (avoids SettingWithCopyWarning).
feature_filtered_df = gene_cluster_df_full[feature_max_norm < norm_threshold].copy()

# Stack the raw traces and append a trailing axis of length 1 for tslearn.
X_feature = np.array(feature_filtered_df["Division Length"].tolist())[:, :, np.newaxis]
X_feature_norm = TimeSeriesScalerMinMax().fit_transform(X_feature)
# Store each rescaled trace back on its row as a 1-D array.
feature_filtered_df["Division Length Feature Norm"] = list(X_feature_norm[:, :, 0])

In [None]:
# Histogram over all rows of gene_cluster_df_full (before thresholding). In
# this section feature_max_norm holds the per-trace minimum z-score (np.min).
plt.hist(feature_max_norm, bins=50)

In [None]:
# Pairwise matrix over the rescaled traces; it is passed below as a
# metric="precomputed" input to the kNN search. parallel_norm_soft_dtw is
# defined elsewhere in the notebook (presumably normalized soft-DTW — confirm).
dtw_feature_norm = parallel_norm_soft_dtw(X_feature_norm)

### Initialize Anndata Object

In [None]:
# X is the rescaled trace array flattened to 2-D (one row per trace); obs is
# the filtered dataframe, so all per-row metadata travels with the AnnData.
an_df_feature = anndata.AnnData(
    X=X_feature_norm.reshape(X_feature_norm.shape[0], -1), obs=feature_filtered_df
)  # AnnData container to use scanpy functions with unwrapped time vector

### Compute KNN Graph

In [None]:
# Note: 10 neighbours here, versus 15 in the long-cell section above.
n_neighbors = 10
n_pcs = 20  # This shouldn't affect anything

# Standard scanpy neighbors call on X. Its "distances"/"connectivities"
# results are overwritten below — presumably it is run only to populate
# an_df_feature.uns["neighbors"]; confirm.
sc.pp.neighbors(an_df_feature, n_neighbors=n_neighbors, n_pcs=n_pcs)
# kNN search on the precomputed pairwise matrix instead of on X.
knn_indices, knn_dists, forest = sc.neighbors.compute_neighbors_umap(
    dtw_feature_norm, n_neighbors=n_neighbors, metric="precomputed"
)
# NOTE(review): _compute_connectivities_umap is a private scanpy helper —
# confirm it exists in the pinned scanpy version.
(
    an_df_feature.uns["neighbors"]["distances"],
    an_df_feature.uns["neighbors"]["connectivities"],
) = sc.neighbors._compute_connectivities_umap(
    knn_indices,
    knn_dists,
    an_df_feature.shape[0],
    n_neighbors,  # change to neighbors you plan to use
)
# Mirror the graph into obsp, and keep the full pairwise matrix alongside it.
an_df_feature.obsp["distances"] = an_df_feature.uns["neighbors"]["distances"]
an_df_feature.obsp["connectivities"] = an_df_feature.uns["neighbors"]["connectivities"]
an_df_feature.obsp["soft_dtw"] = dtw_feature_norm

### Computing Leiden, PAGA and UMAP

In [None]:
# Cluster the same kNN graph at three Leiden resolutions, keeping one
# independent AnnData copy per resolution.
feature_paga_df_dict = {}
for resolution in [0.25, 1.0, 1.5]:
    feature_paga_df_dict[resolution] = copy.deepcopy(an_df_feature)
    sc.tl.leiden(
        feature_paga_df_dict[resolution], resolution=resolution, n_iterations=-1
    )
    sc.tl.paga(feature_paga_df_dict[resolution], groups="leiden")
    # Called with add_pos=True and show=False; the UMAP below is initialised
    # with init_pos="paga". NOTE(review): confirm this call is what provides
    # the PAGA positions for that initialisation.
    sc.pl.paga(feature_paga_df_dict[resolution], add_pos=True, show=False)
# Only the resolution-1.0 object gets a UMAP embedding; the other two
# clusterings are attached to it as extra obs columns.
sc.tl.umap(feature_paga_df_dict[1.0], init_pos="paga", min_dist=0.25, spread=5.0)
feature_paga_df_dict[1.0].obs["leiden_lowres"] = feature_paga_df_dict[0.25].obs[
    "leiden"
]
feature_paga_df_dict[1.0].obs["leiden_highres"] = feature_paga_df_dict[1.5].obs[
    "leiden"
]
feature_paga_df = feature_paga_df_dict[1.0]

In [None]:
# Three UMAP panels on the same embedding, coloured by the low-, mid- and
# high-resolution Leiden labels; titles are matched positionally to `color`.
fig = sc.pl.umap(
    feature_paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres"],
    title=["Leiden Resolution=0.25", "Leiden Resolution=1.", "Leiden Resolution=1.5"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)

In [None]:
# "N Match" = 20 - "N Mismatch".
# NOTE(review): 20 is presumably the guide length — confirm.
feature_paga_df.obs["N Match"] = 20.0 - feature_paga_df.obs["N Mismatch"]
# "Delta N Match" = N Match relative to the minimum N Match within the same
# TargetID. groupby.transform keeps the original obs index, so no droplevel is
# needed and the result does not depend on how groupby.apply labels its output
# (which varies across pandas versions and with the number of groups).
feature_del_N_match_series = feature_paga_df.obs["N Match"] - (
    feature_paga_df.obs.groupby("TargetID")["N Match"].transform("min")
)
feature_paga_df.obs["Delta N Match"] = feature_del_N_match_series

In [None]:
# One obs column name per entry of zscore_traces (defined elsewhere in the
# notebook), each suffixed with ": Mean".
labels = [zscore_trace + ": Mean" for zscore_trace in zscore_traces]

# One UMAP panel per label, on a diverging colormap centred at 0.
fig = sc.pl.umap(
    feature_paga_df,
    color=labels,
    show=False,
    legend_loc="on data",
    add_outline=False,
    size=50,
    return_fig=True,
    vcenter=0.0,
    cmap="RdBu_r",
    wspace=0.25,
)

In [None]:
# plot_cluster_timeseries_braycenters is defined elsewhere in the notebook.
# The same column, "Division Length Feature Norm", is listed twice in both
# list arguments, with a (0, 1) range for each entry.
# NOTE(review): confirm the duplicate entry is intentional.
fig = plot_cluster_timeseries_braycenters(
    feature_paga_df.obs,
    "leiden",
    ["Division Length Feature Norm", "Division Length Feature Norm"],
    ["Division Length Feature Norm", "Division Length Feature Norm"],
    [(0, 1), (0, 1)],
    figsize=(15, 6),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)

In [None]:
# Gene names present in Leiden cluster "1" and how many rows each contributes.
np.unique(
    feature_paga_df.obs[feature_paga_df.obs["leiden"] == "1"]["Gene"].tolist(),
    return_counts=True,
)

In [None]:
# Spot check: the rescaled trace at row index 1.
plt.plot(X_feature_norm[1])

In [None]:
# Distribution of the last timepoint of every rescaled trace.
plt.hist(X_feature_norm[:, -1, 0], bins=20)

In [None]:
# Boolean mask (displayed only, not stored): traces whose rescaled final value
# is below 0.05.
X_feature_norm[:, -1, 0] < 0.05

In [None]:
# First-timepoint raw value of every trace (displayed only).
X_feature[:, 0, 0]

In [None]:
# Fold change of each trace relative to its own first timepoint.
# X_feature is (n_series, n_timepoints, 1). Dividing by X_feature[:, 0, 0]
# (shape (n_series,)) would broadcast against the LAST axis and produce an
# (n_series, n_timepoints, n_series) array, in which [..., 0] divides every
# trace by the first trace's initial value. Slicing with :1 keeps the axes
# aligned so each trace is divided by its own initial value.
fold_change = X_feature / X_feature[:, :1, :]

In [None]:
# Distribution of fold_change at the last timepoint, taken at index 0 of the
# last axis.
plt.hist(fold_change[:, -1, 0], bins=30)
plt.show()

In [None]:
# Alternative normalization: shift each trace to start at 0 and scale by its
# own range (max - min along the time axis).
V_max = np.max(X_feature, axis=1, keepdims=True)
V_min = np.min(X_feature, axis=1, keepdims=True)
V_i = X_feature[:, 0:1]  # initial value of each trace, time axis kept
# NOTE(review): a constant trace has V_max == V_min, giving 0/0 = nan here.
test_norm = (X_feature - V_i) / (V_max - V_min)
# The first timepoint is already 0 after subtracting V_i above, so this
# re-zeroing leaves the values unchanged.
test_norm = test_norm - test_norm[:, :1]

In [None]:
# Spot check: the first trace under the alternative normalization.
plt.plot(test_norm[0, :, :])

In [None]:
# Distribution of the final value of every trace under the alternative
# normalization.
plt.hist(test_norm[:, -1].flatten(), bins=30)
plt.show()

In [None]:
# Keep the traces whose final normalized value lies strictly inside
# (-0.25, 0.25).
moo = test_norm[np.abs(test_norm[:, -1, 0]) < 0.25]

In [None]:
# Shape of the filtered array; the first entry is the number of traces kept.
moo.shape

In [None]:
# Spot check: the kept trace at row index 15 (.T is a no-op on this 1-D slice).
plt.plot(moo[15, :, 0].T, c="lightgrey", alpha=0.7)
plt.show()

In [None]:
# Same shape check as the earlier cell (repeated).
moo.shape

### Testing Fitting with Likelihood Ratio

In [None]:
from scipy.optimize import curve_fit

In [None]:
# T is the index that splits the trace into a leading segment y[:T], which is
# summarised by its mean, and the segment y[T:] that the models are fitted to.
T = 0

y = X_feature_norm[1, :, 0]

const_segment = y[:T]
variable_segment = y[T:]

# Integer time axis 0, 1, ..., len(variable_segment) - 1 for curve fitting.
variable_t = np.arange(len(variable_segment))

# With T = 0 the leading segment is empty; np.mean of an empty slice emits a
# RuntimeWarning and returns nan, so produce nan directly in that case.
const_est = np.mean(const_segment) if len(const_segment) > 0 else np.nan

In [None]:
# Display the leading-segment mean (nan while T = 0, since y[:0] is empty).
const_est

In [None]:
def func_null(t, c0, c1, l1):
    """Null model: one exponential term plus a constant offset.

    Evaluates ``c1 * exp(l1 * t) + c0``; ``t`` may be a scalar or a NumPy
    array, in which case the result is computed elementwise.
    """
    exp_term = c1 * np.exp(l1 * t)
    return exp_term + c0


def func_hyp(t, c0, c1, c2, l1, l2):
    """Alternative model: two exponential terms plus a constant offset.

    Evaluates ``c1 * exp(l1 * t) + c2 * exp(l2 * t) + c0``; ``t`` may be a
    scalar or a NumPy array, in which case the result is computed elementwise.
    """
    first_term = c1 * np.exp(l1 * t)
    second_term = c2 * np.exp(l2 * t)
    return (first_term + second_term) + c0

In [None]:
# The segment that the two models are fitted to, as a blue line.
plt.plot(variable_t, variable_segment, "b-", label="data")

In [None]:
# Unconstrained fit of the single-exponential model with default initial
# guesses; popt/pcov are overwritten by the bounded fit in the next cell.
popt, pcov = curve_fit(func_null, variable_t, variable_segment)

In [None]:
# Bounded fit of the single-exponential model. Bounds are in parameter order
# (c0, c1, l1): c0 in [0, 1], c1 in [-1, 1], l1 unconstrained.
popt, pcov = curve_fit(
    func_null,
    variable_t,
    variable_segment,
    bounds=([0.0, -1.0, -np.inf], [1.0, 1.0, np.inf]),
)
# Data in blue, fitted curve in red.
plt.plot(variable_t, variable_segment, "b-", label="data")
plt.plot(variable_t, func_null(variable_t, *popt), "r-")

In [None]:
# Negative residual sum of squares of the null fit, used as a stand-in for its
# log-likelihood. A Gaussian log-likelihood would be -RSS / (2 * sigma**2) plus
# a constant; sigma is not estimated here.
log_L_null = -np.sum(
    (func_null(variable_t, *popt) - variable_segment) ** 2
)  # open question from the author: how to handle the noise scale sigma

In [None]:
# Bounded fit of the two-exponential model (overwrites popt/pcov from the null
# fit). Bounds are in parameter order (c0, c1, c2, l1, l2): c0 in [0, 1],
# c1 and c2 unconstrained, l1 <= 0 and l2 >= 0.
popt, pcov = curve_fit(
    func_hyp,
    variable_t,
    variable_segment,
    bounds=([0.0, -np.inf, -np.inf, -np.inf, 0], [1.0, np.inf, np.inf, 0, np.inf]),
)
# Data in blue, fitted curve in red.
plt.plot(variable_t, variable_segment, "b-", label="data")
plt.plot(variable_t, func_hyp(variable_t, *popt), "r-")

In [None]:
# Negative residual sum of squares of the two-exponential fit, computed the
# same way as log_L_null (no noise scale sigma is applied).
log_L_hyp = -np.sum((func_hyp(variable_t, *popt) - variable_segment) ** 2)

In [None]:
# Display the alternative-model value.
log_L_hyp

In [None]:
# Likelihood-ratio statistic: -2 * (log L_null - log L_hyp).
LR = -2 * (log_L_null - log_L_hyp)

In [None]:
# Display the likelihood-ratio statistic.
LR

In [None]:
# Reference distribution for the likelihood-ratio statistic. The degrees of
# freedom equal the number of extra free parameters in the alternative model:
# func_hyp has (c0, c1, c2, l1, l2) versus func_null's (c0, c1, l1), i.e. 2.
chi_dist = sp.stats.chi2(df=2)

In [None]:
# p-value of the likelihood-ratio statistic. sf() is the survival function
# (1 - cdf) evaluated directly, which keeps precision when the p-value is
# small, where 1.0 - cdf(...) loses it to cancellation.
chi_dist.sf(LR)

In [None]:
# Synthetic two-exponential curve from hand-picked parameters, to see what
# shape func_hyp produces. Note this overwrites y (previously the data trace).
c0, c1, c2 = 0.0, -5.0, 4.0

l1, l2 = -1.0, -0.5

y = func_hyp(variable_t, c0, c1, c2, l1, l2)

plt.plot(y)

In [None]:
y = X_feature_norm[0, :, 0]
# All-zero single-feature design matrix, so a linear fit can only learn an
# intercept. The row count is taken from y itself (previously from the
# unrelated test_timeseries) so X and y always have matching lengths.
X = np.zeros((len(y), 1))

In [None]:
# Least-squares fit of y on the design matrix X built in the previous cell.
lr_model = LinearRegression()
lr_model.fit(X, y)

In [None]:
# Score of the fit on its own training data (R^2 for sklearn regressors).
lr_model.score(X, y)

In [None]:
# Fitted intercept of the linear model.
lr_model.intercept_

In [None]:
# Mean of y, for comparison with the intercept displayed above.
np.mean(y)

In [None]:
# test_timeseries is defined elsewhere in the notebook.
plt.plot(test_timeseries)

In [None]:
# Same series on a log scale; the 0.001 offset presumably guards against
# log(0) — confirm.
plt.plot(np.log(test_timeseries + 0.001))

In [None]:
def highlight_gene_group_property(an_df, selection_list, color=None):
    """Draw UMAP panels highlighting selected genes.

    One panel is drawn per selected gene, plus a final "All Genes" panel that
    highlights every selected gene at once. Highlighted points are red, all
    others light grey.

    Parameters
    ----------
    an_df
        Object accepted by ``sc.pl.umap`` whose ``.obs`` has a "Gene" column.
        It is deep-copied, so the caller's object is not modified.
    selection_list
        Gene names to highlight. Names absent from ``an_df.obs["Gene"]`` are
        dropped, and the remainder is sorted.
    color
        Unused; kept so existing calls that pass it keep working.

    Returns
    -------
    The figure returned by ``sc.pl.umap(..., return_fig=True)``.
    """
    highlight_genes_df = copy.deepcopy(an_df)
    gene_column = highlight_genes_df.obs["Gene"]

    selection_list = sorted(set(gene_column.unique().tolist()) & set(selection_list))

    # Cast through a fixed [True, False] categorical dtype. Building the
    # categories from the data and then reordering them raises when a mask is
    # all True or all False (e.g. no requested gene is present).
    highlight_dtype = pd.CategoricalDtype(categories=[True, False])

    for i, selected_gene in enumerate(selection_list):
        highlight_genes_df.obs["Selected Genes: " + str(i)] = (
            gene_column == selected_gene
        ).astype(highlight_dtype)

    highlight_genes_df.obs["All Genes"] = gene_column.isin(selection_list).astype(
        highlight_dtype
    )

    per_gene_columns = ["Selected Genes: " + str(i) for i in range(len(selection_list))]

    fig = sc.pl.umap(
        highlight_genes_df,
        title=selection_list + ["All Genes"],
        color=per_gene_columns + ["All Genes"],
        groups=[True],
        show=False,
        legend_loc="right margin",
        add_outline=False,
        size=50,
        return_fig=True,
        palette={True: "red", False: "lightgrey"},
    )

    return fig

In [None]:
# NOTE(review): this calls highlight_gene_group, not the
# highlight_gene_group_property defined in the cell above — confirm which
# function is intended (highlight_gene_group must be defined elsewhere).
fig = highlight_gene_group(feature_paga_df, ["dnaB"])

In [None]:
# Every point of the UMAP embedding, as small light-grey markers.
plt.scatter(
    feature_paga_df.obsm["X_umap"][:, 0],
    feature_paga_df.obsm["X_umap"][:, 1],
    s=5,
    c="lightgrey",
)

In [None]:
selection_list = ["dnaB"]

highlight_genes_df = copy.deepcopy(feature_paga_df)

# Keep only the requested genes that are actually present in the data.
selection_list = sorted(
    set(highlight_genes_df.obs["Gene"].unique().tolist()) & set(selection_list)
)

for selected_gene in selection_list:
    # Grey background: every point of the embedding.
    plt.scatter(
        feature_paga_df.obsm["X_umap"][:, 0],
        feature_paga_df.obsm["X_umap"][:, 1],
        s=5,
        c="lightgrey",
    )
    plt.title(selected_gene)
    selected_series = highlight_genes_df.obs["Gene"] == selected_gene
    selected_df = highlight_genes_df[selected_series]
    targetid_list = selected_df.obs["TargetID"].unique().tolist()
    # One arrow chain per TargetID, stepping through its rows in order of
    # increasing "N Match".
    for i, targetid in enumerate(targetid_list):
        selected_targetid_df = selected_df[selected_df.obs["TargetID"] == targetid]
        # Reorder the AnnData rows themselves. Re-indexing and sorting only
        # .obs leaves .obsm in its original row order, so the UMAP coordinates
        # would not follow the "N Match" ordering.
        match_order = np.argsort(
            selected_targetid_df.obs["N Match"].to_numpy(), kind="stable"
        )
        selected_targetid_df = selected_targetid_df[match_order]
        x = selected_targetid_df.obsm["X_umap"][:, 0]
        y = selected_targetid_df.obsm["X_umap"][:, 1]
        plt.quiver(
            x[:-1],
            y[:-1],
            x[1:] - x[:-1],
            y[1:] - y[:-1],
            scale_units="xy",
            angles="xy",
            scale=1,
            # Wrap around the palette so more TargetIDs than colours cannot
            # raise an IndexError.
            color=vega_20_scanpy[i % len(vega_20_scanpy)],
            label=targetid,
        )
    plt.legend()
    plt.show()

In [None]:
# Inspect obs for the last TargetID processed by the loop above.
selected_targetid_df.obs

In [None]:
# Summary of the copied AnnData object.
highlight_genes_df

In [None]:
# NOTE(review): the quiver cell above leaves selected_series as a boolean
# comparison result, which has no .cat accessor; this only works if
# selected_series was last assigned a categorical elsewhere — confirm.
selected_series.cat.categories

In [None]:
# Display the colour palette imported from scanpy.plotting.palettes.
vega_20_scanpy

In [None]:
# Min-max scaler with an explicit (0, 1) target range; applied per trace below.
renormalizer = TimeSeriesScalerMinMax(value_range=(0.0, 1.0))

In [None]:
# Rows of Leiden cluster "6", without the old clustering columns.
# DataFrame.drop returns a new frame, so the result has to be assigned; when it
# was discarded the three columns stayed in division_df. The returned frame is
# also independent of paga_df.obs, so later column assignments are safe.
division_df = paga_df.obs[paga_df.obs["leiden"] == "6"].drop(
    columns=["leiden", "leiden_lowres", "leiden_highres"]
)
division_df

In [None]:
# Rescale every z-score trace on its own: wrap it as a one-series dataset,
# run the scaler, then unwrap back to a 1-D array via [0][:, 0].
division_df["Normalized Division Length Z-score"] = division_df[
    "Division Length Z-score"
].apply(lambda x: renormalizer.fit_transform([x])[0][:, 0])

In [None]:
# Stack the rescaled traces and add a trailing axis: (n_series, n_timepoints, 1).
X = np.array(division_df["Normalized Division Length Z-score"].tolist())[
    :, :, np.newaxis
]
# Swap the last two axes: (n_series, 1, n_timepoints).
# NOTE(review): the feature-norm sections above pass an unswapped
# (n_series, n_timepoints, 1) array to parallel_norm_soft_dtw — confirm which
# layout the helper expects.
X = np.swapaxes(X, 1, 2)
norm_soft_dtw_arr = parallel_norm_soft_dtw(X)

In [None]:
# X is the trace array flattened to 2-D (one row per trace); obs carries the
# cluster-"6" metadata from division_df.
an_df_division = anndata.AnnData(
    X=X.reshape(X.shape[0], -1), obs=division_df
)  # AnnData container to use scanpy functions with unwrapped time vector

In [None]:
# Summary of the sub-clustering AnnData object.
an_df_division

In [None]:
n_neighbors = 15
n_pcs = 20  # This shouldn't affect anything
resolution = 0.25

# Standard scanpy neighbors call on X. Its "distances"/"connectivities"
# results are overwritten below — presumably it is run only to populate
# an_df_division.uns["neighbors"]; confirm.
sc.pp.neighbors(an_df_division, n_neighbors=n_neighbors, n_pcs=n_pcs)
# kNN search on the precomputed pairwise matrix instead of on X.
knn_indices, knn_dists, forest = sc.neighbors.compute_neighbors_umap(
    norm_soft_dtw_arr, n_neighbors=n_neighbors, metric="precomputed"
)
# NOTE(review): _compute_connectivities_umap is a private scanpy helper —
# confirm it exists in the pinned scanpy version.
(
    an_df_division.uns["neighbors"]["distances"],
    an_df_division.uns["neighbors"]["connectivities"],
) = sc.neighbors._compute_connectivities_umap(
    knn_indices,
    knn_dists,
    an_df_division.shape[0],
    n_neighbors,  # change to neighbors you plan to use
)
# Mirror the graph into obsp.
an_df_division.obsp["distances"] = an_df_division.uns["neighbors"]["distances"]
an_df_division.obsp["connectivities"] = an_df_division.uns["neighbors"][
    "connectivities"
]

# Leiden clustering at the single resolution set above, then PAGA on those
# clusters.
sc.tl.leiden(
    an_df_division,
    resolution=resolution,
    n_iterations=-1,
)
sc.tl.paga(an_df_division, groups="leiden")
# Called with add_pos=True; the UMAP below is initialised with init_pos="paga".
# NOTE(review): confirm this call is what provides the PAGA positions.
sc.pl.paga(an_df_division, add_pos=True, show=True)

sc.tl.umap(an_df_division, init_pos="paga", min_dist=0.25, spread=5.0)

In [None]:
# UMAP of the cluster-"6" sub-clustering, coloured by its Leiden labels.
# The title is built from `resolution` (set to 0.25 in the clustering cell)
# rather than a hard-coded "1.", which did not match the resolution used.
fig = sc.pl.umap(
    an_df_division,
    color=["leiden"],
    title=[f"Leiden Resolution={resolution}"],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)

In [None]:
# Columns to plot per Leiden cluster. The same names are passed for both list
# arguments of plot_cluster_timeseries (defined elsewhere in the notebook),
# and the ranges are matched to them positionally.
division_plot_features = [
    "Division Length",
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]
division_plot_ranges = [(3, 12), (0, 20), (0.5, 2.5), (1.2, 1.6), (0, 6000), (0, 20)]

fig = plot_cluster_timeseries(
    an_df_division.obs,
    "leiden",
    division_plot_features,
    list(division_plot_features),  # separate list object, as in the original call
    division_plot_ranges,
    figsize=(8, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)

In [None]:
# plot_cluster_timeseries_braycenters is defined elsewhere in the notebook.
# Two columns per cluster: the z-score trace with range (-4, 6) and its
# per-trace rescaled version with range (0, 1).
fig = plot_cluster_timeseries_braycenters(
    an_df_division.obs,
    "leiden",
    ["Division Length Z-score", "Normalized Division Length Z-score"],
    ["Division Length Z-score", "Normalized Division Length Z-score"],
    [(-4, 6), (0, 1)],
    figsize=(8, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)

In [None]:
# Set of genes observed in each of the sub-clusters "0", "1" and "2".
group_0, group_1, group_2 = (
    set(
        an_df_division.obs.loc[
            an_df_division.obs["leiden"] == cluster_label, "Gene"
        ].unique()
    )
    for cluster_label in ("0", "1", "2")
)

# Genes of cluster "1": all of them, and those found in neither "0" nor "2".
strong_recovery_genes = list(group_1)
strong_recovery_genes_only = list(group_1 - group_0 - group_2)

# Genes of cluster "2": all of them, and those found in neither "0" nor "1".
diverging_genes = list(group_2)
diverging_genes_only = list(group_2 - group_0 - group_1)

In [None]:
# Rows belonging to genes found only in cluster "1" ...
an_df_division_strong_recovery_only = an_df_division[
    an_df_division.obs["Gene"].isin(strong_recovery_genes_only)
]
# ... and rows belonging to any gene that has at least one row in cluster "1".
an_df_division_strong_recovery = an_df_division[
    an_df_division.obs["Gene"].isin(strong_recovery_genes)
]

In [None]:
# Overlay every rescaled trace of each gene subset, one figure per subset:
# first the cluster-"1"-only genes, then all cluster-"1" genes.
for subset_adata, line_alpha in (
    (an_df_division_strong_recovery_only, 0.3),
    (an_df_division_strong_recovery, 0.2),
):
    subset_traces = np.array(
        subset_adata.obs["Normalized Division Length Z-score"].tolist()
    )
    plt.plot(subset_traces.T, c="grey", alpha=line_alpha)
    plt.show()

In [None]:
# Alphabetical listing of the genes found only in cluster "1".
sorted(strong_recovery_genes_only)

In [None]:
# Display the obs table of the main analysis object.
paga_df.obs