## PAGA Basic Analysis (Only Main Analysis)

- In this notebook, I include basic Leiden clustering with no downstream timeseries analysis

- Mean parameter values are projected onto the space to visualize

- Distributions of genes of known function should be assembled in this notebook

- Finally, basic clustering and ontology enrichment are present at the end

- This notebook should be used to decide on clustering parameters and the resulting paga_df saved to disk

In [None]:
import ast
import copy
import os
import random
import warnings

import anndata
import dask
import dask.array as da
import dask.dataframe as dd
import holoviews as hv
import igraph as ig
import leidenalg
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import networkx as nx
import numpy as np
import pandas as pd
import pylab
import scanpy as sc
import scipy as sp
import scipy.cluster.hierarchy as sch
import scipy.sparse
import scipy.stats
import seaborn as sns
import sklearn as skl
import umap
from igraph.drawing.text import TextDrawer
from matplotlib import pyplot as plt
from scanpy.plotting.palettes import default_20, vega_20_scanpy
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import (
    cosine_distances,
    euclidean_distances,
    manhattan_distances,
)
from tslearn.barycenters import (
    dtw_barycenter_averaging,
    euclidean_barycenter,
    softdtw_barycenter,
)
from tslearn.metrics import cdist_soft_dtw, cdist_soft_dtw_normalized
from tslearn.neighbors import KNeighborsTimeSeries

import paulssonlab.deaton.trenchripper.trenchripper as tr

# Notebook-wide configuration: plotting backend, RNG seeds, warning policy.
# Activate the bokeh backend for holoviews.
hv.extension("bokeh")
# Seed both the stdlib and the legacy global numpy RNGs with the same fixed value.
random.seed(42)
np.random.seed(42)

# "once" filter for UserWarning: repeated identical warnings are not re-printed.
warnings.filterwarnings(action="once", category=UserWarning)

### Load Data From First Notebook

In [None]:
# Load the unfiltered gene-cluster table (per the section heading, produced by the
# first notebook).
# NOTE(review): this is an absolute, user-specific path, and unpickling can execute
# arbitrary code -- only load files from a trusted source.
gene_cluster_pickle_path = "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/2021-12-07_gene_cluster_df_no_filter.pkl"
gene_cluster_df_full_w_control = pd.read_pickle(gene_cluster_pickle_path)

# Drop every row without a "Gene" entry (no control genes).
gene_cluster_df_full = gene_cluster_df_full_w_control.dropna(subset=["Gene"])

In [None]:
# Short display names for the verbose "Kernel Trace: ..." columns: the six
# Yeo-Johnson z-score traces first, then the corresponding untransformed traces.
trace_column_renames = {
    "Kernel Trace: Division: major_axis_length: Yeo-Johnson: z score": "Division Length Z-score",
    "Kernel Trace: Mean Linear Growth Rate: Volume: Yeo-Johnson: z score": "Linear Growth Rate Z-score",
    "Kernel Trace: Mean Exponential Growth Rate: Volume: Yeo-Johnson: z score": "Exponential Growth Rate Z-score",
    "Kernel Trace: Mean: minor_axis_length: Yeo-Johnson: z score": "Width Z-score",
    "Kernel Trace: Mean: mCherry Intensity: Yeo-Johnson: z score": "mCherry Intensity Z-score",
    "Kernel Trace: Delta time (s): Yeo-Johnson: z score": "Doubling Time Z-score",
    "Kernel Trace: Division: major_axis_length": "Division Length",
    "Kernel Trace: Mean Linear Growth Rate: Volume": "Linear Growth Rate",
    "Kernel Trace: Mean Exponential Growth Rate: Volume": "Exponential Growth Rate",
    "Kernel Trace: Mean: minor_axis_length": "Width",
    "Kernel Trace: Mean: mCherry Intensity": "mCherry Intensity",
    "Kernel Trace: Delta time (s)": "Doubling Time",
}

# `rename` returns a new frame; the name is rebound to the renamed copy.
gene_cluster_df_full = gene_cluster_df_full.rename(columns=trace_column_renames)

### Take mean values and mean z-scores over the timeseries

In [None]:
# Names of the per-row timeseries columns to summarise.
traces = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Width",
    "mCherry Intensity",
    "Doubling Time",
]

# The z-scored counterpart of every trace carries a " Z-score" suffix.
zscore_traces = [trace + " Z-score" for trace in traces]

# Add a "<column>: Mean" column holding np.mean of each cell of the trace column.
# `Series.map` visits only the one column that is needed; the previous row-wise
# `DataFrame.apply(axis=1)` materialised a full row Series per row just to read a
# single cell from it.
for trace in traces:
    gene_cluster_df_full[trace + ": Mean"] = gene_cluster_df_full[trace].map(np.mean)

for zscore_trace in zscore_traces:
    gene_cluster_df_full[zscore_trace + ": Mean"] = gene_cluster_df_full[
        zscore_trace
    ].map(np.mean)

## 4) Titrations


In [None]:
import anndata
import dask.array as da
import igraph as ig
import leidenalg
import networkx as nx
import scanpy as sc
import scipy as sp
import scipy.sparse
import umap
from igraph.drawing.text import TextDrawer
from scanpy.plotting.palettes import default_20, vega_20_scanpy
from tslearn.metrics import cdist_soft_dtw, cdist_soft_dtw_normalized
from tslearn.neighbors import KNeighborsTimeSeries

In [None]:
def get_pearson_df(titration_df, variable_name, pearson_p_val=0.05):
    """Correlate `variable_name` with "N Match" separately for every TargetID.

    Parameters
    ----------
    titration_df : pandas.DataFrame
        Must contain the columns "TargetID", "N Match" and `variable_name`.
    variable_name : str
        Name of the column to correlate against "N Match".
    pearson_p_val : float, optional
        Unused by this function; kept so existing callers keep working.

    Returns
    -------
    pandas.DataFrame
        Indexed by TargetID, with four columns in this order:
        `variable_name` (list of values in the group), "N Match" (list of
        values in the group), "<variable_name>: Pearson R" and
        "<variable_name>: Pearson P-val".

    Notes
    -----
    `scipy.stats.pearsonr` needs at least two observations per TargetID;
    callers are expected to filter out smaller groups beforehand.
    """
    grouped = titration_df.groupby("TargetID")

    # Collect the per-TargetID values as plain Python lists. Selecting the column
    # before `apply` keeps the grouping column out of the applied function.
    var_df = (
        grouped[variable_name]
        .apply(lambda values: values.tolist())
        .rename(variable_name)
        .to_frame()
    )
    var_df["N Match"] = grouped["N Match"].apply(lambda values: values.tolist())

    # Run the test once per TargetID and reuse the result for both output columns.
    pearson_results = [
        sp.stats.pearsonr(n_match, values)
        for n_match, values in zip(var_df["N Match"], var_df[variable_name])
    ]
    var_df[variable_name + ": Pearson R"] = [result[0] for result in pearson_results]
    var_df[variable_name + ": Pearson P-val"] = [
        result[1] for result in pearson_results
    ]

    return var_df

paga_df_only

### Looking at all targetids (including filtered out and unclustered)

In [None]:
import statsmodels.stats.multitest

# Strict cutoff: a TargetID needs at least `min_titration` rows, and significance is
# called at FDR level `fdr_p_val_thr`.
min_titration = 6
fdr_p_val_thr = 0.01
var_list = [
    "Linear Growth Rate: Mean",
    "Exponential Growth Rate: Mean",
    "Division Length: Mean",
    "Width: Mean",
    "mCherry Intensity: Mean",
    "Doubling Time: Mean",
]

# "N Match" is defined as 20 minus "N Mismatch".
gene_cluster_df_full["N Match"] = 20 - gene_cluster_df_full["N Mismatch"]

# Number of rows per TargetID, then the TargetIDs that reach the minimum count.
target_count_series = gene_cluster_df_full.groupby("TargetID").size()
targetid_above_thr = target_count_series.index[
    (target_count_series >= min_titration).to_numpy()
].tolist()
has_enough_rows = gene_cluster_df_full["TargetID"].isin(targetid_above_thr)
titration_df = gene_cluster_df_full.loc[has_enough_rows]

# One representative row (the first) per retained TargetID, indexed by TargetID.
pearson_df = titration_df.groupby("TargetID").apply(lambda x: x.iloc[0])

pearson_r_labels = [f"{var_label}: Pearson R" for var_label in var_list]
pearson_p_labels = [f"{var_label}: Pearson P-val" for var_label in var_list]

# Attach the per-TargetID Pearson R and p-value of every variable against "N Match".
for var_label, r_label, p_label in zip(var_list, pearson_r_labels, pearson_p_labels):
    var_df = get_pearson_df(titration_df, var_label)
    pearson_df[r_label] = var_df[r_label]
    pearson_df[p_label] = var_df[p_label]

# FDR correction is run separately for each variable; a TargetID is kept when it is
# rejected for at least one of them.
fdr_reject_per_variable = []
for p_label in pearson_p_labels:
    fdr_result = statsmodels.stats.multitest.fdrcorrection(
        pearson_df[p_label].tolist(),
        alpha=fdr_p_val_thr,
        method="indep",
        is_sorted=False,
    )
    fdr_reject_per_variable.append(fdr_result[0])
pearson_r_sig = np.any(fdr_reject_per_variable, axis=0)
pearson_r_sig_df = pearson_df[pearson_r_sig]

n_targetids = len(gene_cluster_df_full["TargetID"].unique())
n_titration_targetids = pearson_r_sig.sum()

# NOTE(review): a commented-out sketch that split the significant correlations into
# "Pearson Significant Positive" / "Pearson Significant Negative" columns used to
# live here; it referenced an undefined `pearson_p_val_thr` and was never executed.

In [None]:
# Display the number of rows per TargetID.
target_count_series

In [None]:
# Ratio of TargetIDs flagged significant for at least one variable to `n_targetids`.
# NOTE(review): `n_targetids` is also reassigned by the plotting cell further down,
# so this ratio is only meaningful when the cells are run in order.
print(n_titration_targetids / n_targetids)

In [None]:
# Sorted gene names: every gene in the table, the genes with at least one TargetID in
# `pearson_r_sig_df`, and the genes that appear only in the former.
all_genes = sorted(gene_cluster_df_full["Gene"].unique().tolist())
genes_with_titration = sorted(pearson_r_sig_df["Gene"].unique().tolist())
genes_wo_titration = sorted(set(all_genes).difference(genes_with_titration))

In [None]:
# Number of genes with at least one TargetID in `pearson_r_sig_df`.
len(genes_with_titration)

In [None]:
# Number of distinct genes in `gene_cluster_df_full`.
len(all_genes)

In [None]:
# Number of genes with no TargetID in `pearson_r_sig_df`.
len(genes_wo_titration)

### Looking at clustered targetids

In [None]:
# Index values of `pearson_r_sig_df` (the TargetIDs that passed the significance filter).
titratable_targetids = pearson_r_sig_df.index.unique().tolist()
# NOTE(review): `paga_df` is not defined in any cell visible in this notebook;
# presumably it is the clustered AnnData-like object (it exposes `.obs`) produced or
# loaded elsewhere -- confirm it is in scope after Restart Kernel -> Run All.
titratable_mask = paga_df.obs["TargetID"].isin(titratable_targetids)
# Keep only the observations whose TargetID is in the list above.
titration_paga_df = paga_df[titratable_mask].obs
# Move the old index into a column, then index by "Gene" and sort by it.
titration_paga_df = titration_paga_df.reset_index().set_index("Gene").sort_index()

In [None]:
# Plot, for every clustered TargetID that passed the titration filter, each mean
# parameter against "N Match". TargetIDs are laid out `step_size` per figure (one
# column of stacked panels per TargetID) and every figure is written to disk.
targetid_list = titration_paga_df["TargetID"].unique().tolist()

# NOTE(review): this reuses the name `n_targetids`, which the titration-analysis cells
# use for the count of *all* TargetIDs; re-running earlier cells after this one will
# pick up this value instead.
n_targetids = len(targetid_list)

var_names = [
    "Linear Growth Rate: Mean",
    "Exponential Growth Rate: Mean",
    "Division Length: Mean",
    "Width: Mean",
    "mCherry Intensity: Mean",
    "Doubling Time: Mean",
]

# y-axis labels, one per entry of `var_names`.
display_var_names = [
    "Linear Growth Rate",
    "Exponential Growth Rate",
    "Division Length",
    "Mean Width",
    "mCherry Intensity",
    "Doubling Time",
]

# Fixed y-axis limits, one (min, max) pair per entry of `var_names`.
feature_ranges = [(0, 15), (0.5, 1.5), (3, 16), (1.2, 1.6), (0, 12000), (0, 8000)]

wspace = 0.25
hspace = 0.25
fontsize = 14

# Number of TargetIDs (columns) per figure; the figure width scales with it.
step_size = 15
figsize = (int(2.5 * step_size) + 1, 15)

# Make sure the output directory exists so savefig does not fail on a fresh checkout.
output_dir = "4_Titrations"
os.makedirs(output_dir, exist_ok=True)

for idx, n in enumerate(range(0, n_targetids, step_size)):
    sub_targetid_list = targetid_list[n : n + step_size]

    # NOTE(review): constrained_layout=True is combined with plt.tight_layout() below;
    # matplotlib warns about mixing the two. Left unchanged to keep the rendered
    # layout identical -- consider keeping only one of them.
    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, len(sub_targetid_list), wspace=wspace)

    for i, targetid in enumerate(sub_targetid_list):
        selected_targetid_df = gene_cluster_df_full[
            gene_cluster_df_full["TargetID"] == targetid
        ]

        # One stacked panel per variable, sharing the "N Match" x-axis.
        inner_gs = gs[0, i].subgridspec(len(var_names), 1, wspace=0, hspace=hspace)
        inner_grid_sub = inner_gs.subplots(sharex=True)

        for ax, var_name, display_name, y_range in zip(
            inner_grid_sub, var_names, display_var_names, feature_ranges
        ):
            ax.scatter(selected_targetid_df["N Match"], selected_targetid_df[var_name])
            ax.set_ylabel(display_name, fontsize=fontsize)
            ax.set_ylim(y_range)

        # `ax` is now the last (bottom) panel of this column; label it with the gene
        # name and TargetID.
        ax.set_xlabel(
            selected_targetid_df["Gene"].iloc[0] + " TargetID: " + str(targetid),
            fontsize=fontsize,
        )

    plt.tight_layout()
    fig.savefig(os.path.join(output_dir, str(idx) + ".png"), dpi=75)

    # Render the figure in the notebook output, then release it so open figures do
    # not accumulate in memory across iterations.
    plt.show()
    plt.close(fig)

### Less Strict Cutoff

In [None]:
import statsmodels.stats.multitest

# Less strict cutoff: fewer rows required per TargetID and a looser FDR level.
# This cell reassigns the same global names as the strict-cutoff analysis cell
# earlier in the notebook.
min_titration = 5
fdr_p_val_thr = 0.05
var_list = [
    "Linear Growth Rate: Mean",
    "Exponential Growth Rate: Mean",
    "Division Length: Mean",
    "Width: Mean",
    "mCherry Intensity: Mean",
    "Doubling Time: Mean",
]

# "N Match" is defined as 20 minus "N Mismatch".
gene_cluster_df_full["N Match"] = 20 - gene_cluster_df_full["N Mismatch"]

# Number of rows per TargetID, then the TargetIDs that reach the minimum count.
target_count_series = gene_cluster_df_full.groupby("TargetID").size()
targetid_above_thr = target_count_series.index[
    (target_count_series >= min_titration).to_numpy()
].tolist()
has_enough_rows = gene_cluster_df_full["TargetID"].isin(targetid_above_thr)
titration_df = gene_cluster_df_full.loc[has_enough_rows]

# One representative row (the first) per retained TargetID, indexed by TargetID.
pearson_df = titration_df.groupby("TargetID").apply(lambda x: x.iloc[0])

pearson_r_labels = [f"{var_label}: Pearson R" for var_label in var_list]
pearson_p_labels = [f"{var_label}: Pearson P-val" for var_label in var_list]

# Attach the per-TargetID Pearson R and p-value of every variable against "N Match".
for var_label, r_label, p_label in zip(var_list, pearson_r_labels, pearson_p_labels):
    var_df = get_pearson_df(titration_df, var_label)
    pearson_df[r_label] = var_df[r_label]
    pearson_df[p_label] = var_df[p_label]

# FDR correction is run separately for each variable; a TargetID is kept when it is
# rejected for at least one of them.
fdr_reject_per_variable = []
for p_label in pearson_p_labels:
    fdr_result = statsmodels.stats.multitest.fdrcorrection(
        pearson_df[p_label].tolist(),
        alpha=fdr_p_val_thr,
        method="indep",
        is_sorted=False,
    )
    fdr_reject_per_variable.append(fdr_result[0])
pearson_r_sig = np.any(fdr_reject_per_variable, axis=0)
pearson_r_sig_df = pearson_df[pearson_r_sig]

n_targetids = len(gene_cluster_df_full["TargetID"].unique())
n_titration_targetids = pearson_r_sig.sum()

# NOTE(review): a commented-out sketch that split the significant correlations into
# "Pearson Significant Positive" / "Pearson Significant Negative" columns used to
# live here; it referenced an undefined `pearson_p_val_thr` and was never executed.

In [None]:
# Ratio of TargetIDs flagged significant for at least one variable (less strict
# cutoff) to the number of distinct TargetIDs in `gene_cluster_df_full`.
print(n_titration_targetids / n_targetids)

In [None]:
# Sorted gene names under the less strict cutoff: every gene in the table, the genes
# with at least one TargetID in `pearson_r_sig_df`, and the genes only in the former.
all_genes = sorted(gene_cluster_df_full["Gene"].unique().tolist())
genes_with_titration = sorted(pearson_r_sig_df["Gene"].unique().tolist())
genes_wo_titration = sorted(set(all_genes).difference(genes_with_titration))

In [None]:
# Number of genes with at least one TargetID in `pearson_r_sig_df` (less strict cutoff).
len(genes_with_titration)