## PAGA Basic Analysis (Only Main Analysis)

- This notebook contains basic Leiden clustering, with no downstream time-series analysis

- Mean parameter values are projected onto the embedding space for visualization

- Distributions of genes of known function should be assembled in this notebook

- Finally, basic clustering and ontology enrichment are present at the end

- This notebook should be used to decide on clustering parameters; the resulting `paga_df` should then be saved to disk

In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da
import dask
import warnings
import copy
import random
from sklearn.metrics.pairwise import (
    euclidean_distances,
    manhattan_distances,
    cosine_distances,
)

from sklearn.metrics import silhouette_score
import scipy.stats
from sklearn.linear_model import LinearRegression
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering

import scanpy as sc
import anndata
import scipy as sp
import scipy.sparse
import dask.array as da
from igraph.drawing.text import TextDrawer
from tslearn.neighbors import KNeighborsTimeSeries
from tslearn.metrics import cdist_soft_dtw_normalized, cdist_soft_dtw
import networkx as nx
import igraph as ig
import leidenalg
import umap
from scanpy.plotting.palettes import default_20, vega_20_scanpy
from matplotlib import pyplot as plt
import ast
from tslearn.barycenters import (
    softdtw_barycenter,
    dtw_barycenter_averaging,
    euclidean_barycenter,
)


import pylab
import scipy.cluster.hierarchy as sch

import matplotlib.gridspec as gridspec
import matplotlib as mpl

import holoviews as hv

# Load the bokeh plotting backend for holoviews.
hv.extension("bokeh")
# Seed Python's and NumPy's global random number generators with a fixed value.
random.seed(42)
np.random.seed(42)

# Apply the "once" filter action to warnings of category UserWarning.
warnings.filterwarnings(action="once", category=UserWarning)

### Load Data From First Notebook

In [None]:
# Load the pickled per-observation table and the AnnData object from disk.
# NOTE(review): unpickling can execute arbitrary code; only load pickle files
# from a trusted source.
paga_df_only = pd.read_pickle("./2021-12-07_paga_df_only.pkl")
paga_df = sc.read("./2021-12-07_paga_df.h5ad")
# Replace the .obs table read from the h5ad file with the pickled DataFrame.
# NOTE(review): presumably the pickled frame has the same rows, in the same
# order, as the AnnData object — confirm against the notebook that wrote both.
paga_df.obs = paga_df_only

## 2) Cluster Analysis

In [None]:
def plot_cluster_timeseries(
    df,
    cluster_label,
    feature_labels,
    displayed_labels,
    feature_range_list,
    agg_fn=np.mean,
    x_ticks=[0, 10, 20],
    cluster_subset=None,
    figsize=(10, 10),
    wspace=0.0,
    hspace=0.0,
    fontsize=14,
    linewidth=5,
    color_list=None,
):
    """Plot one aggregated trace per (cluster, feature) as a grid of panels.

    Rows of ``df`` are grouped by ``cluster_label``.  For every feature the
    per-row values of a group are stacked into an array and reduced with
    ``agg_fn(..., axis=0)``.  Each cluster becomes one column of panels and
    each feature one row; the cluster label is written under the bottom panel
    of its column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the column ``cluster_label`` and every column named in
        ``feature_labels``.  Assumes each feature cell holds an equal-length
        sequence (one time series per row) — TODO confirm against the caller.
    cluster_label : str
        Column used to group rows.
    feature_labels : list of str
        Feature columns to aggregate, one panel row each.
    displayed_labels : list of str
        Y-axis captions, parallel to ``feature_labels``; drawn only on the
        first cluster column.
    feature_range_list : list of (low, high)
        Y-axis limits, parallel to ``feature_labels``.
    agg_fn : callable
        Reduction called as ``agg_fn(array, axis=0)``.
    x_ticks : list
        X tick positions set on each cluster column.
    cluster_subset : iterable, optional
        If given, only rows whose cluster label is in this collection are kept.
    figsize, wspace, hspace, fontsize, linewidth
        Passed through to matplotlib.
    color_list : sequence, optional
        One colour per cluster column, indexed by column position and reused
        cyclically when there are more clusters than colours.  ``None`` or an
        empty sequence draws every trace in ``"tab:blue"``.

    Returns
    -------
    matplotlib.figure.Figure

    Raises
    ------
    ValueError
        If there are no rows to plot.
    """
    if cluster_subset is not None:
        # Boolean indexing already returns a new frame, so the caller's
        # DataFrame is left untouched without an explicit copy.
        df = df[df[cluster_label].isin(cluster_subset)]

    if len(df) == 0:
        raise ValueError("plot_cluster_timeseries: no rows to plot")

    timeseries_list = []
    for feature_label in feature_labels:
        agg_cluster_timeseries = (
            # observed=True: for a categorical cluster column, only clusters
            # that actually occur in df are aggregated.
            df.groupby([cluster_label], observed=True)
            .apply(
                lambda x, label=feature_label: agg_fn(
                    np.array(x[label].tolist()), axis=0
                )
            )
            .to_frame(name=feature_label)
        )
        timeseries_list.append(agg_cluster_timeseries)
    # One row per cluster, one column per feature; rows with a missing
    # aggregate are dropped.
    timeseries_df = pd.concat(timeseries_list, axis=1).dropna(axis=0)
    n_clusters = len(timeseries_df)

    use_default_color = color_list is None or len(color_list) == 0

    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, n_clusters, wspace=wspace)

    for i, (cluster_name, cluster_row) in enumerate(timeseries_df.iterrows()):
        # Stack the per-feature traces of this cluster: first axis = feature.
        clust_arr = np.array(cluster_row.tolist())
        if use_default_color:
            color = "tab:blue"
        else:
            # Wrap around instead of raising when clusters outnumber colours.
            color = color_list[i % len(color_list)]

        inner_gs = gs[0, i].subgridspec(
            clust_arr.shape[0], 1, wspace=0, hspace=hspace
        )
        # squeeze=False always yields a 2-D array of Axes, so a single
        # feature works the same way as several.
        feature_axes = inner_gs.subplots(sharex=True, squeeze=False)[:, 0]
        is_first_column = i == 0

        for k, ax in enumerate(feature_axes):
            feature_range = feature_range_list[k]
            ax.plot(clust_arr[k], linewidth=linewidth, color=color)
            ax.set_ylim(feature_range[0], feature_range[1])
            if is_first_column:
                # Only the left-most column keeps y ticks and feature names.
                ax.set(xticks=[])
                ax.set_ylabel(
                    displayed_labels[k],
                    rotation=0,
                    labelpad=30,
                    fontsize=fontsize,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # Label the column with its cluster and restore x ticks on the
        # bottom panel (the x axis is shared within the column).
        bottom_ax = feature_axes[-1]
        bottom_ax.set_xlabel(str(cluster_name), fontsize=fontsize)
        bottom_ax.set(xticks=x_ticks)

    plt.tight_layout()
    return fig

In [None]:
# Draw the UMAP embedding once per Leiden clustering column, with cluster
# labels placed on the data and graph edges overlaid.
# NOTE(review): the resolution values in the titles are hard-coded strings;
# presumably they match the resolutions used when the clusterings were
# computed — verify against the notebook that produced paga_df.
fig = sc.pl.umap(
    paga_df,
    color=["leiden_lowres", "leiden", "leiden_highres", "leiden_ultrahighres"],
    title=[
        "Leiden Resolution=0.25",
        "Leiden Resolution=1.",
        "Leiden Resolution=1.5",
        "Leiden Resolution=3.",
    ],
    show=False,
    legend_loc="on data",
    edges=True,
    add_outline=False,
    size=50,
    return_fig=True,
    palette=vega_20_scanpy,
)

In [None]:
# Aggregated (default agg_fn: np.mean) time series of each feature for every
# "leiden_lowres" cluster.  Arguments, in order: feature columns, displayed
# y-axis labels, and one (low, high) y-range per feature.
fig = plot_cluster_timeseries(
    paga_df.obs,
    "leiden_lowres",
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [(3, 16), (0, 20), (0.5, 2.5), (1.2, 1.6), (0, 12000), (0, 8000)],
    figsize=(8, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
fig.tight_layout()
# NOTE(review): the filename below says "highres" although this cell plots
# "leiden_lowres"; rename before uncommenting so outputs do not collide.
# fig.savefig("./2_Cluster_Analysis/leiden_highres_timeseries.png",dpi=150)

In [None]:
# Aggregated (default agg_fn: np.mean) time series of each feature for every
# "leiden" cluster.  Arguments, in order: feature columns, displayed y-axis
# labels, and one (low, high) y-range per feature.
fig = plot_cluster_timeseries(
    paga_df.obs,
    "leiden",
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [(3, 16), (0, 20), (0.5, 2.5), (1.2, 1.6), (0, 12000), (0, 8000)],
    figsize=(20, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
fig.tight_layout()
# NOTE(review): the filename below says "highres" although this cell plots
# the "leiden" column; rename before uncommenting so outputs do not collide.
# fig.savefig("./2_Cluster_Analysis/leiden_highres_timeseries.png",dpi=150)

In [None]:
# Keep only observations assigned to "leiden" cluster "10" ...
sub_cluster_df = paga_df.obs[(paga_df.obs["leiden"] == "10")]

# ... and plot their aggregated time series split by the
# "leiden_ultrahighres" labels.
# NOTE(review): the Width y-range here is (1.2, 1.8), while the other
# timeseries cells in this notebook use (1.2, 1.6) — confirm this is intended.
fig = plot_cluster_timeseries(
    sub_cluster_df,
    "leiden_ultrahighres",
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [
        "Division Length",
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ],
    [(3, 16), (0, 20), (0.5, 2.5), (1.2, 1.8), (0, 12000), (0, 8000)],
    figsize=(8, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
fig.tight_layout()
# NOTE(review): the filename below says "leiden_highres" although this cell
# plots "leiden_ultrahighres" sub-clusters; rename before uncommenting.
# fig.savefig("./2_Cluster_Analysis/leiden_highres_timeseries.png",dpi=150)

### Barycenter Timeseries

In [None]:
from tslearn.barycenters import (
    softdtw_barycenter,
    dtw_barycenter_averaging,
    euclidean_barycenter,
)

In [None]:
def get_braycenters(df, columns=None, max_iter=50, tol=0.001):
    """Compute the soft-DTW barycenter of the multivariate series in ``df``.

    Every row of ``df`` is treated as one multivariate time series and every
    selected column as one dimension of it.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose selected cells each hold one sequence.  Assumes all
        sequences have the same length — TODO confirm against the caller.
    columns : str or list of str, optional
        Columns to use as dimensions.  ``None`` uses every column of ``df``;
        a single string is treated as a one-element list.
    max_iter : int
        Forwarded to ``softdtw_barycenter``.
    tol : float
        Forwarded to ``softdtw_barycenter``.

    Returns
    -------
    The value returned by ``softdtw_barycenter``; indexed by the callers in
    this notebook as (time, dimension).
    """
    if isinstance(columns, str):
        columns = [columns]
    if columns is not None:
        df = df[columns]

    # Nested lists (row -> column -> sequence) become a 3-D array with axes
    # (row, column, position-in-sequence).
    X = np.array(df.to_numpy().tolist())
    # Swap the last two axes so the layout is (row, position, column).
    X = np.swapaxes(X, 1, 2)

    Y = softdtw_barycenter(X, max_iter=max_iter, tol=tol)  # T X D

    return Y


def plot_cluster_timeseries_braycenters(
    df,
    cluster_label,
    feature_labels,
    displayed_labels,
    feature_range_list,
    x_ticks=[0, 10, 20],
    cluster_subset=None,
    figsize=(10, 10),
    wspace=0.0,
    hspace=0.0,
    fontsize=14,
    linewidth=3,
    color_list=None,
):
    """Plot the soft-DTW barycenter of each cluster as a grid of panels.

    Rows of ``df`` are grouped by ``cluster_label`` and ``get_braycenters``
    is called once per group on the ``feature_labels`` columns.  Each cluster
    becomes one column of panels and each feature one row; the cluster label
    is written under the bottom panel of its column.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with the column ``cluster_label`` and every column named in
        ``feature_labels``.
    cluster_label : str
        Column used to group rows.
    feature_labels : list of str
        Feature columns passed to ``get_braycenters``, one panel row each.
    displayed_labels : list of str
        Y-axis captions, parallel to ``feature_labels``; drawn only on the
        first cluster column.
    feature_range_list : list of (low, high)
        Y-axis limits, parallel to ``feature_labels``.
    x_ticks : list
        X tick positions set on each cluster column.
    cluster_subset : iterable, optional
        If given, only rows whose cluster label is in this collection are kept.
    figsize, wspace, hspace, fontsize, linewidth
        Passed through to matplotlib.
    color_list : sequence, optional
        One colour per cluster column, indexed by column position and reused
        cyclically when there are more clusters than colours.  ``None`` or an
        empty sequence draws every trace in ``"tab:blue"``.

    Returns
    -------
    matplotlib.figure.Figure

    Raises
    ------
    ValueError
        If there are no rows to plot.
    """
    if cluster_subset is not None:
        # Boolean indexing already returns a new frame, so the caller's
        # DataFrame is left untouched without an explicit copy.
        df = df[df[cluster_label].isin(cluster_subset)]

    if len(df) == 0:
        raise ValueError("plot_cluster_timeseries_braycenters: no rows to plot")

    # observed=True: for a categorical cluster column, only clusters that
    # actually occur in df get a barycenter (e.g. after cluster_subset).
    cluster_groupby = df.groupby([cluster_label], observed=True)
    agg_cluster_timeseries = cluster_groupby.apply(
        lambda x: get_braycenters(x, columns=feature_labels)
    )

    timeseries_list = []
    for i, feature_label in enumerate(feature_labels):
        # Column i of each barycenter is the trace of feature i.
        selected_feature_agg = agg_cluster_timeseries.apply(
            lambda x, col=i: x[:, col]
        ).to_frame(name=feature_label)
        timeseries_list.append(selected_feature_agg)

    # One row per cluster, one column per feature.
    timeseries_df = pd.concat(timeseries_list, axis=1)
    n_clusters = len(timeseries_df)

    use_default_color = color_list is None or len(color_list) == 0

    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, n_clusters, wspace=wspace)

    for i, (cluster_name, cluster_row) in enumerate(timeseries_df.iterrows()):
        # Stack the per-feature traces of this cluster: first axis = feature.
        clust_arr = np.array(cluster_row.tolist())
        if use_default_color:
            color = "tab:blue"
        else:
            # Wrap around instead of raising when clusters outnumber colours.
            color = color_list[i % len(color_list)]

        inner_gs = gs[0, i].subgridspec(
            clust_arr.shape[0], 1, wspace=0, hspace=hspace
        )
        # squeeze=False always yields a 2-D array of Axes, so a single
        # feature works the same way as several.
        feature_axes = inner_gs.subplots(sharex=True, squeeze=False)[:, 0]
        is_first_column = i == 0

        for k, ax in enumerate(feature_axes):
            feature_range = feature_range_list[k]
            ax.plot(clust_arr[k], linewidth=linewidth, color=color)
            ax.set_ylim(feature_range[0], feature_range[1])
            if is_first_column:
                # Only the left-most column keeps y ticks and feature names.
                ax.set(xticks=[])
                ax.set_ylabel(
                    displayed_labels[k],
                    rotation=0,
                    labelpad=30,
                    fontsize=fontsize,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # Label the column with the actual cluster label (not its position),
        # matching plot_cluster_timeseries, and restore x ticks on the bottom
        # panel (the x axis is shared within the column).
        bottom_ax = feature_axes[-1]
        bottom_ax.set_xlabel(str(cluster_name), fontsize=fontsize)
        bottom_ax.set(xticks=x_ticks)

    plt.tight_layout()
    return fig

In [None]:
# Barycenter time series of the Z-scored features for every "leiden_lowres"
# cluster.  Arguments, in order: feature columns, displayed y-axis labels,
# and one (low, high) y-range per feature.
fig = plot_cluster_timeseries_braycenters(
    paga_df.obs,
    "leiden_lowres",
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [(-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4)],
    figsize=(8, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
# NOTE(review): the filename below says "highres" although this cell plots
# "leiden_lowres"; rename before uncommenting so outputs do not collide.
# fig.savefig("./2_Cluster_Analysis/leiden_highres_barycenters.png",dpi=150)

In [None]:
# Barycenter time series of the Z-scored features for every "leiden"
# cluster.  Arguments, in order: feature columns, displayed y-axis labels,
# and one (low, high) y-range per feature.
fig = plot_cluster_timeseries_braycenters(
    paga_df.obs,
    "leiden",
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [
        "Division Length Z-score",
        "Linear Growth Rate Z-score",
        "Exponential Growth Rate Z-score",
        "Width Z-score",
        "mCherry Intensity Z-score",
        "Doubling Time Z-score",
    ],
    [(-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4), (-4, 4)],
    figsize=(25, 8),
    wspace=0.25,
    hspace=0.25,
    color_list=vega_20_scanpy,
)
# NOTE(review): the filename below says "highres" although this cell plots
# the "leiden" column; rename before uncommenting so outputs do not collide.
# fig.savefig("./2_Cluster_Analysis/leiden_highres_barycenters.png",dpi=150)

### Timeseries of Interest

In [None]:
def plot_selected_timeseries(df, figsize=(30, 10)):
    """Plot the raw time series of every (Gene, TargetID) group in ``df``.

    The figure has one row of panels per feature and one column per
    (Gene, TargetID) group.  Within a panel, every row of the group is drawn
    as its own blue trace.  The figure is shown with ``plt.show()``; nothing
    is returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with "Gene" and "TargetID" columns plus the six feature columns
        listed in the body.  Assumes each feature cell holds an equal-length
        sequence — TODO confirm against the caller.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    """
    feature_labels = [
        "Linear Growth Rate",
        "Exponential Growth Rate",
        "Division Length",
        "Width",
        "mCherry Intensity",
        "Doubling Time",
    ]

    # Y-axis limits, parallel to feature_labels.
    feature_ranges = [(0, 15), (0.5, 1.5), (3, 16), (1.2, 1.6), (0, 12000), (0, 8000)]

    # One Series per feature: index = (Gene, TargetID), value = list of the
    # group's per-row sequences.  observed=True skips combinations that never
    # occur when a grouping column is categorical.
    feature_series_list = [
        df.groupby(["Gene", "TargetID"], observed=True).apply(
            lambda x, label=feature_label: x[label].tolist()
        )
        for feature_label in feature_labels
    ]

    # Every Series shares the same grouping, hence the same length.
    len_series = len(feature_series_list[-1])

    fig = plt.figure(figsize=figsize)

    for j, feature_series in enumerate(feature_series_list):
        # .items() replaces Series.iteritems(), which pandas 2.0 removed.
        for i, (idx, row) in enumerate(feature_series.items()):
            # Transpose so each column (one original row) is drawn as a line.
            plot_arr = np.array(row).T
            ax = plt.subplot(len(feature_labels), len_series, (j * len_series) + i + 1)
            ax.set_title(str(idx) + ": " + feature_labels[j])
            ax.set_ylim(feature_ranges[j])
            ax.plot(plot_arr, color="tab:blue")
    plt.tight_layout()
    plt.show()

In [None]:
# Every distinct gene name present in the observation table.
all_genes = paga_df.obs["Gene"].unique().tolist()

# Cluster to inspect and the minimum number of observations a gene must have
# in that cluster to be kept.
clust_id = 0
freq_thr = 8

# Cluster labels are compared as strings below.
clust_id = str(clust_id)

# (gene names, counts) among the observations in the chosen "leiden" cluster.
unique_genes = np.unique(
    paga_df.obs[paga_df.obs["leiden"] == clust_id]["Gene"], return_counts=True
)
cluster_genes = sorted(unique_genes[0][unique_genes[1] >= freq_thr].tolist())

# All observations (from any cluster) of the genes that passed the threshold.
# The explicit .copy() makes the column assignment below act on an
# independent frame instead of a slice of paga_df.obs, which would trigger
# pandas' SettingWithCopyWarning.
sub_cluster_df = paga_df.obs[(paga_df.obs["Gene"].isin(cluster_genes))].copy()
sub_cluster_df["Gene"] = sub_cluster_df["Gene"].astype(str)

In [None]:
# Plot the time series of every observation whose "Gene" is "minC".
plot_selected_timeseries(paga_df.obs[paga_df.obs["Gene"] == "minC"], figsize=(8, 10))