## Steady-state Analysis of lDE20 (with lineage Dataframe ready)

- Note that there are fluctuations in the illumination intensity which may be resulting in pathological behavior from the reporter

- Consider either normalizing this out or fixing the underlying problem

In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da
import dask
import warnings
import copy
import random
from sklearn.metrics.pairwise import (
    euclidean_distances,
    manhattan_distances,
    cosine_distances,
)

from sklearn.metrics import silhouette_score
import scipy.stats
from sklearn.linear_model import LinearRegression
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering

from matplotlib import pyplot as plt
import ast


import pylab
import scipy.cluster.hierarchy as sch

import matplotlib.gridspec as gridspec
import matplotlib as mpl

import holoviews as hv

# Use Bokeh as the HoloViews rendering backend for this notebook.
hv.extension("bokeh")
# Seed Python's and NumPy's global random generators with a fixed value.
random.seed(42)
np.random.seed(42)

# Register a filter with action "once" for UserWarning.
warnings.filterwarnings(action="once", category=UserWarning)

In [None]:
def get_timepoint_values(
    df,
    label,
    min_timepoint,
    max_timepoint,
    time_label="final cell timepoints list",
    flatten_vals=True,
):
    """Select the values of ``label`` recorded inside a timepoint window.

    For every row, the entries of ``row[label]`` whose matching entry in
    ``row[time_label]`` lies in ``[min_timepoint, max_timepoint]`` (both
    ends inclusive) are kept.

    Args:
        df: DataFrame whose ``label`` and ``time_label`` cells hold
            equal-length sequences.
        label: Column holding the per-row value sequences.
        min_timepoint: Lower bound of the window (inclusive).
        max_timepoint: Upper bound of the window (inclusive).
        time_label: Column holding the per-row timepoint sequences.
        flatten_vals: When true, pool the kept values of all rows into one
            flat list; otherwise return the per-row Series of arrays.

    Returns:
        A flat list of values, or a Series of arrays (one per row).
    """

    def _values_in_window(row):
        timepoints = np.array(row[time_label])
        in_window = (timepoints >= min_timepoint) * (timepoints <= max_timepoint)
        return np.array(row[label])[in_window]

    windowed_series = df.apply(_values_in_window, axis=1)
    if not flatten_vals:
        return windowed_series

    pooled_vals = []
    for row_vals in windowed_series.tolist():
        pooled_vals.extend(row_vals)
    return pooled_vals


def get_feature_stats(df, feature_label, min_timepoint, max_timepoint):
    """Return the pooled median and interquartile range of a feature.

    The values of ``feature_label`` from every row of ``df`` that fall in
    the ``[min_timepoint, max_timepoint]`` window are pooled together
    before the two statistics are computed.

    Returns:
        Tuple ``(median, iqr)``.
    """
    pooled_vals = get_timepoint_values(
        df, feature_label, min_timepoint, max_timepoint
    )
    return np.median(pooled_vals), sp.stats.iqr(pooled_vals)


def get_feature_median_bytrench(df, feature_label, min_timepoint, max_timepoint):
    """Return the per-row NaN-ignoring median of a feature inside a window.

    Args:
        df: DataFrame to read from. Previously this argument was ignored
            and the notebook global ``final_output_df_pd_filtered`` was
            used instead, so the result silently depended on cell order.
        feature_label: Column holding the per-row value sequences.
        min_timepoint: Lower bound of the window (inclusive).
        max_timepoint: Upper bound of the window (inclusive).

    Returns:
        Series with one median per row of ``df``.
    """
    masked_label_series = get_timepoint_values(
        df,
        feature_label,
        min_timepoint,
        max_timepoint,
        flatten_vals=False,
    )
    trench_median_series = masked_label_series.apply(lambda x: np.nanmedian(x))
    return trench_median_series


def get_feature_scores(
    df,
    feature_label,
    trench_median_series,
    feature_median,
    feature_iqr,
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Scale per-row feature values into robust z-score-like values.

    Each value ``v`` of a row becomes
    ``1.35 * (feature_median / feature_iqr) * (v / row_median - 1)``,
    where ``row_median`` is that row's entry in ``trench_median_series``.

    Args:
        df: DataFrame whose ``feature_label`` cells hold value sequences.
        feature_label: Column to score.
        trench_median_series: Per-row normalising medians, aligned with
            ``df`` by index.
        feature_median: Pooled median of the feature.
        feature_iqr: Pooled interquartile range of the feature.
        time_label: Column holding the per-row timepoint sequences; only
            read when ``timepoint_range`` is given.
        timepoint_range: Optional ``(min, max)`` pair (inclusive). When
            given, only values whose timepoint lies in the range are
            scored.

    Returns:
        Series of arrays with the scaled scores, one array per row.
    """
    scaling_factor = 1.35 * (feature_median / feature_iqr)

    if timepoint_range is None:
        feature_vals = df[feature_label].apply(lambda x: np.array(x))
    else:
        min_timepoint, max_timepoint = timepoint_range[0], timepoint_range[1]

        def _values_in_range(row):
            # The mask has to be built from the row's timepoint column; the
            # previous code indexed the feature list itself with
            # ``time_label`` and raised a TypeError.
            timepoints = np.array(row[time_label])
            in_range = (timepoints >= min_timepoint) * (timepoints <= max_timepoint)
            return np.array(row[feature_label])[in_range]

        feature_vals = df.apply(_values_in_range, axis=1)

    feature_scores = (feature_vals / trench_median_series) - 1.0
    return scaling_factor * feature_scores


def get_avg_feature_score(
    df,
    feature_label,
    init_timepoint_range=(0, 20),
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Return the NaN-ignoring mean score of one feature for every row.

    The pooled median/IQR and the per-row medians are computed over
    ``init_timepoint_range``; the scores themselves are computed over
    ``timepoint_range`` (all timepoints when it is ``None``) and then
    averaged per row.

    Returns:
        Series with one averaged score per row of ``df``.
    """
    init_start = init_timepoint_range[0]
    init_end = init_timepoint_range[1]

    pooled_median, pooled_iqr = get_feature_stats(
        df, feature_label, init_start, init_end
    )
    per_trench_median = get_feature_median_bytrench(
        df, feature_label, init_start, init_end
    )
    per_trench_scores = get_feature_scores(
        df,
        feature_label,
        per_trench_median,
        pooled_median,
        pooled_iqr,
        time_label=time_label,
        timepoint_range=timepoint_range,
    )
    return per_trench_scores.apply(lambda scores: np.nanmean(scores))


def get_all_avg_feature_scores(
    df,
    feature_labels,
    init_timepoint_range=(0, 20),
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Add a ``"<feature>: score"`` column to ``df`` for every feature.

    Each feature label is printed as it is processed. ``df`` is modified
    in place and also returned.
    """
    score_kwargs = {
        "init_timepoint_range": init_timepoint_range,
        "time_label": time_label,
        "timepoint_range": timepoint_range,
    }
    for feature_label in feature_labels:
        print(feature_label)
        df[feature_label + ": score"] = get_avg_feature_score(
            df, feature_label, **score_kwargs
        )
    return df


def get_sgrnadf_from_scoredf(
    scoredf, feature_labels, score_agg=np.nanmedian, score_agg_name="median"
):
    """Collapse a per-trench score table into one row per sgRNA.

    Args:
        scoredf: DataFrame with an ``"sgRNA"`` column, a
            ``"phenotype trenchid"`` column, one ``"<feature>: score"``
            column per feature label, and the ``"Gene"``, ``"TargetID"``,
            ``"N Mismatch"`` and ``"Category"`` columns.
        feature_labels: Feature names whose score columns are aggregated.
        score_agg: Function applied to each group's array of scores.
        score_agg_name: Suffix used in the aggregated column names.

    Returns:
        DataFrame indexed by sgRNA holding, in this column order: the list
        of trench ids, the aggregated scores, ``Gene``, ``TargetID``,
        ``N Mismatch``, ``N Observations`` (group size) and ``Category``.
        The four metadata columns take the first value of each group.
    """
    by_sgrna = scoredf.groupby("sgRNA")

    def _first_value(column):
        return by_sgrna.apply(lambda grp: grp[column].iloc[0])

    def _aggregate_scores(score_column):
        return by_sgrna.apply(
            lambda grp: score_agg(np.array(grp[score_column].tolist()))
        )

    trenchid_lists = by_sgrna.apply(lambda grp: grp["phenotype trenchid"].tolist())
    sgrnadf = trenchid_lists.to_frame().rename(columns={0: "phenotype trenchid"})

    for feature_label in feature_labels:
        out_column = feature_label + ": score " + score_agg_name
        sgrnadf[out_column] = _aggregate_scores(feature_label + ": score")

    for meta_column in ("Gene", "TargetID", "N Mismatch"):
        sgrnadf[meta_column] = _first_value(meta_column)
    sgrnadf["N Observations"] = by_sgrna.apply(
        lambda grp: len(grp["phenotype trenchid"].tolist())
    )
    sgrnadf["Category"] = _first_value("Category")

    return sgrnadf


# No longer using this
# def filter_strong_KOs(df,sampling_thr = 4, n_strongest=2):

#     for i in range(sampling_thr,0,-1):
#         sampling_mask = df["N Observations"]>=sampling_thr
#         mismatch_series = df[sampling_mask]["N Mismatch"]

#         for n in range(n_strongest,0,-1):
#             if len(mismatch_series)>=n:
#                 keep_indices = np.argsort(mismatch_series)[:n]
#                 out_df = df[sampling_mask].iloc[keep_indices]

#                 return out_df

### Initial Data Processing

Here, I am going to try to replicate (to some extent) the corrections from "Genomewide phenotypic analysis of growth, cell morphogenesis, and cell cycle events in Escherichia coli"

#### Start Dask

In [None]:
# Base directory for this run; the Dask working directory is placed
# underneath it (headpath + "/dask") when the cluster is started.
headpath = (
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Barcodes"
)

In [None]:
# Create and start a Dask cluster through trenchripper's controller.
# NOTE(review): local=False presumably requests workers from a job
# scheduler rather than the local machine -- confirm against trenchripper.
dask_controller = tr.trcluster.dask_controller(
    walltime="04:00:00",
    local=False,
    n_workers=10,
    memory="16GB",
    working_directory=headpath + "/dask",
)
dask_controller.startdask()

In [None]:
# Show the Dask dashboard for the cluster started above.
dask_controller.displaydashboard()

#### Import Dataframe

In [None]:
# Load the lineage analysis table.
# NOTE: unpickling executes arbitrary code; only load files from a trusted
# source.
final_output_df_pd = pd.read_pickle(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/2021-07-26_lDE20_Lineage_Analysis.pkl"
)
# Drop rows that have no "final cell timepoints list" entry.
final_output_df_pd = final_output_df_pd[
    ~final_output_df_pd["final cell timepoints list"].isna()
]

#### Filter for "Normal" Sizes at Start

1) Fit a gaussian model to each of the specified feature params during the first t timepoints of the experiment (using a subsample for speed) 
2) Compute a normalized probability trenchwise for these features under the gaussian model, during the first t timepoints of the experiment
3) Eliminate trenches that are under some p percentile value of this probability for each feature
4) Display histograms for each property as well as the resulting threshold

Note that these features should be the only features examined in the resulting analysis. For the notebook, I am looking at:
- Birth length (Lb)
- Division length (Ld)
- Mean Area Increment
- Mean Length Increment
- Mean Width
- Cell cycle duration (Delta t)
- Mean mCherry Intensity

In [None]:
# Only values recorded at timepoints strictly below this cutoff are used
# for the "normal at start" filter.
early_timepoint_cutoff = 30
# Fraction of rows sampled when fitting each feature's Gaussian.
gaussian_subsample = 0.2
# Rows whose probability is not above this percentile are filtered out.
percentile_threshold = 10

# Features that are filtered on (one Gaussian model per feature).
filter_params = [
    "Lb list",
    "Ld list",
    "Mean Area Increment list",
    "Mean Length Increment list",
    "Mean Width list",
    "Mean mCherry Intensity list",
    "Delta t list",
]

# Distribute the table over 100 partitions and wait until it is loaded.
final_output_df_pd_dask = dd.from_pandas(final_output_df_pd, npartitions=100).persist()
dask.distributed.wait(final_output_df_pd_dask)
# Per-row boolean mask selecting the early timepoints. Entries that are not
# exactly a Python int are replaced by 10000000 so they never pass the
# cutoff.
# NOTE(review): the mask reads "cell timepoints list" while the scoring
# helpers default to "final cell timepoints list" -- confirm this is
# intended.
final_output_df_pd_dask["Early Timepoint Mask"] = final_output_df_pd_dask[
    "cell timepoints list"
].apply(
    lambda x: np.array([item if (type(item) is int) else 10000000 for item in x])
    < early_timepoint_cutoff,
    meta=(None, "object"),
)

for filter_param in filter_params:
    # Early-timepoint values of this feature for every row; rows whose cell
    # is not a list contribute an empty array.
    early_param_series = final_output_df_pd_dask.apply(
        lambda x: np.array(x[filter_param])[x["Early Timepoint Mask"]]
        if isinstance(x[filter_param], list)
        else np.array([]),
        axis=1,
        meta=(None, "object"),
    )
    # Fit a Gaussian to the pooled values of a random subsample of rows.
    all_param_values = [
        val
        for item in early_param_series.sample(frac=gaussian_subsample)
        .compute()
        .tolist()
        for val in item
    ]
    gaussian_fit = sp.stats.norm.fit(all_param_values)
    gaussian_fit = sp.stats.norm(loc=gaussian_fit[0], scale=gaussian_fit[1])

    # Length-normalised likelihood per row: exp(mean log-pdf). Rows without
    # early values get NaN explicitly instead of through a 0/0 division.
    # The result is computed into a pandas Series so that the assignment is
    # aligned on the index; assigning the lazy Dask series (as returned by
    # ``persist``) would be materialised positionally with no alignment.
    final_output_df_pd[filter_param + ": Probability"] = early_param_series.apply(
        lambda x: np.exp(np.sum(gaussian_fit.logpdf(x)) / len(x))
        if len(x) > 0
        else np.nan,
        meta=float,
    ).compute()

# One histogram panel per feature: rows below the percentile threshold are
# drawn first, rows at or above it second, on a shared bin range. The
# per-feature threshold conditions are collected for the final filter.
query_list = []
plt.figure(figsize=(22, 16))
for i, filter_param in enumerate(filter_params):
    prob_col = filter_param + ": Probability"
    prob_series = final_output_df_pd[prob_col]

    prob_threshold = np.nanpercentile(prob_series.tolist(), percentile_threshold)
    query = "`" + prob_col + "` > " + str(prob_threshold)
    query_list.append(query)

    min_v, max_v = np.min(prob_series), np.max(prob_series)

    plt.subplot(3, 5, i + 1)
    plt.title(filter_param)
    for row_mask in (prob_series < prob_threshold, prob_series >= prob_threshold):
        plt.hist(
            prob_series[row_mask].tolist(),
            bins=50,
            range=(min_v, max_v),
        )
plt.show()

# Keep only rows that exceed the threshold for every feature.
compiled_query = " and ".join(query_list)
final_output_df_pd_filtered = final_output_df_pd.query(compiled_query)

In [None]:
# Fraction of rows that survive the probability filter.
len(final_output_df_pd_filtered) / len(final_output_df_pd)

### Convert properties to z-scores

1) Apply Yeo-Johnson transform to all properties to stabilize ranges and make distributions more Gaussian
2) Convert transformed values to z-scores using the following formula:

$$ z = 1.35 \times \frac{median_{t\in \tau}(F_{i,t})}{iqr_{t\in \tau}(F_{i,t})}\Bigg(\frac{mean_{t\in T}(F_{i,k,t})}{median_{t\in \tau}(F_{i,k,t})} - 1\Bigg) $$

where $F_{i,k,t}$ are the Yeo-Johnson-transformed feature values for feature i, trench k at time t. $\tau$ is the set of initial pre-induction timepoints, while $T$ is the set of timepoints from the whole time series.

Essentially, this is a z-score using the more outlier-robust median and interquartile range to define the differences from normal behavior. The 1.35 factor scales the values such that z-scores represent the number of standard deviations from the mean for a normal distribution. Finally, the values are normalized by initial behaviors trenchwise by the $median_{t\in \tau}(F_{i,k,t})$ factor.

In [None]:
# Features that are Yeo-Johnson transformed and then scored.
params_to_transform = [
    "Lb list",
    "Ld list",
    "Mean Area Increment list",
    "Mean Length Increment list",
    "Mean Width list",
    "Mean mCherry Intensity list",
    "Delta t list",
]
# Fraction of rows sampled when estimating each transform's lambda.
yeo_subsample = 0.1

final_output_df_pd_filtered_dask = dd.from_pandas(
    final_output_df_pd_filtered, npartitions=100
).persist()
dask.distributed.wait(final_output_df_pd_filtered_dask)

for param in params_to_transform:
    # Estimate the Yeo-Johnson lambda from the pooled values of a subsample.
    sampled_rows = (
        final_output_df_pd_filtered_dask[param]
        .sample(frac=yeo_subsample)
        .compute()
        .tolist()
    )
    all_param_values = [float(val) for item in sampled_rows for val in item]
    l_norm = sp.stats.yeojohnson_normmax(all_param_values)
    # Bind this iteration's lambda as a default argument so the transform
    # cannot pick up a later iteration's value if it is evaluated lazily.
    final_output_df_pd_filtered_dask[param + ": Yeo-Johnson"] = (
        final_output_df_pd_filtered_dask[param]
        .apply(
            lambda x, l_norm=l_norm: sp.stats.yeojohnson(
                np.array(x).astype(float), lmbda=l_norm
            ),
            meta="object",
        )
        .persist()
    )
final_output_df_pd_filtered = final_output_df_pd_filtered_dask.compute()

# Score every transformed feature per trench, then aggregate per sgRNA.
yeo_johnson_labels = [param + ": Yeo-Johnson" for param in params_to_transform]
scoredf = get_all_avg_feature_scores(
    final_output_df_pd_filtered,
    yeo_johnson_labels,
)
sgrnadf = get_sgrnadf_from_scoredf(scoredf, yeo_johnson_labels)

### sgRNA Effect Size Filtering (within Gene groups)

1) Threshold sgRNAs to include by number of observations
2) Vectorize feature z scores and apply a euclidean norm to measure effect size (this can also be done with a manhattan norm)
3) Threshold sgRNAs for strong effects by applying a threshold to the euclidean norm that will be displayed with a histogram
4) Display a histogram for the sgRNA number per gene

In [None]:
# Minimum number of trenches an sgRNA needs to be kept.
sampling_thr = 4
# Minimum euclidean norm of the score vector to count as a strong effect.
strong_effect_threshold = 1.75

# Name the score columns explicitly instead of relying on their position
# (columns[1:8]) in sgrnadf.
score_columns = [
    param + ": Yeo-Johnson: score median" for param in params_to_transform
]

# Work on a copy so that adding columns below neither touches sgrnadf nor
# triggers pandas' chained-assignment warning.
sgrnadf_wellsampled = sgrnadf[sgrnadf["N Observations"] >= sampling_thr].copy()

feature_vector_series = sgrnadf_wellsampled.apply(
    lambda x: x[score_columns].values, axis=1
)
sgrnadf_wellsampled["Feature Vector"] = feature_vector_series
# Effect size = euclidean distance of each score vector from the origin.
zero_vector = np.zeros((1, feature_vector_series.iloc[0].shape[0]))
sgrnadf_wellsampled["Euclidean Norm"] = euclidean_distances(
    np.array(feature_vector_series.tolist()), zero_vector
)[:, 0]


sgrnadf_strong_effect = sgrnadf_wellsampled[
    sgrnadf_wellsampled["Euclidean Norm"] >= strong_effect_threshold
]
min_v, max_v = np.min(sgrnadf_wellsampled["Euclidean Norm"]), np.max(
    sgrnadf_wellsampled["Euclidean Norm"]
)

# Histogram of the norms, split at the strong-effect threshold.
plt.figure(figsize=(8, 8))
plt.title("Euclidean Norm")
plt.hist(
    sgrnadf_wellsampled[
        sgrnadf_wellsampled["Euclidean Norm"] < strong_effect_threshold
    ]["Euclidean Norm"].tolist(),
    bins=50,
    range=(min_v, max_v),
)
plt.hist(
    sgrnadf_wellsampled[
        sgrnadf_wellsampled["Euclidean Norm"] >= strong_effect_threshold
    ]["Euclidean Norm"].tolist(),
    bins=50,
    range=(min_v, max_v),
)
plt.show()

# Histogram of the number of strong-effect sgRNAs per gene.
unique_genes, gene_counts = np.unique(sgrnadf_strong_effect["Gene"], return_counts=True)
plt.title("sgRNAs per Gene")
plt.xticks(range(0, 20, 2), labels=range(0, 20, 2))
plt.hist(gene_counts, bins=np.arange(20) - 0.5)
plt.show()

### Pick Representative Effect per TargetID
~~1) For each target, pick the sgRNA that is most representative of the set by cosine distance (i.e. minimizes the sum of the distances to the sgRNAs in the group)~~
1) For each target, pick the sgRNA that has the strongest phenotype (highest euclidean norm)
2) Additionally identify any targets with titration information by saving a dataframe with TargetIDs that possess at least N sgRNAs
    - this is in a preliminary form; transfer to a full notebook later

In [None]:
# Display the sgRNAs that passed the strong-effect threshold.
sgrnadf_strong_effect

In [None]:
def _strongest_row(target_group):
    """Return the row of a TargetID group with the largest euclidean norm."""
    return target_group.iloc[np.argmax(target_group["Euclidean Norm"])]


# One representative sgRNA per TargetID, re-indexed by sgRNA.
strong_effect_by_target = sgrnadf_strong_effect.reset_index(drop=False).groupby(
    "TargetID"
)
most_rep_example_series = (
    strong_effect_by_target.apply(_strongest_row)
    .reset_index(drop=True)
    .set_index("sgRNA", drop=True)
)

In [None]:
# most_rep_example_series = sgrnadf_strong_effect.reset_index(drop=False).groupby("TargetID").apply(lambda x:  \
# x.iloc[np.argmin(np.sum(cosine_distances(np.array(x["Feature Vector"].tolist())),axis=0))]).reset_index(drop=True).set_index("sgRNA", drop=True)

In [None]:
def get_all_stat_correlations(df, stat_list):
    """Correlate the mismatch count with each requested statistic.

    Args:
        df: DataFrame with an ``"N Mismatch"`` column and one column per
            entry of ``stat_list``.
        stat_list: Names of the columns to correlate against.

    Returns:
        List of Pearson r values, in the order of ``stat_list``.
    """
    return [
        sp.stats.pearsonr(df["N Mismatch"], df[stat])[0] for stat in stat_list
    ]


def get_genes_with_titration(df, stat_list, N_sgRNAs_thr, correlation_magnitude):
    """Select TargetIDs whose phenotype tracks the sgRNA mismatch count.

    A TargetID is kept when it has at least ``N_sgRNAs_thr`` rows in ``df``
    and the largest absolute Pearson correlation between ``"N Mismatch"``
    and any statistic in ``stat_list`` exceeds ``correlation_magnitude``.

    Returns:
        The rows of ``df`` that belong to the selected TargetIDs.
    """
    # Step 1: TargetIDs that have enough sgRNAs.
    sgrna_counts = (
        df.reset_index(drop=False).groupby("TargetID").apply(lambda x: len(x))
    )
    enough_sgrnas = sgrna_counts >= N_sgRNAs_thr
    well_sampled_ids = sgrna_counts[enough_sgrnas].index.tolist()
    candidate_df = df[df["TargetID"].isin(well_sampled_ids)]

    # Step 2: per-TargetID correlations with the mismatch count.
    candidates_by_target = candidate_df.reset_index(drop=False).groupby("TargetID")
    correlations = candidates_by_target.apply(
        lambda x: get_all_stat_correlations(x, stat_list)
    )
    strongest_abs_r = correlations.apply(lambda x: np.max(abs(np.array(x))))

    # Step 3: keep targets above the requested correlation magnitude.
    is_strong = strongest_abs_r > correlation_magnitude
    strong_ids = correlations[is_strong].index.tolist()
    return candidate_df[candidate_df["TargetID"].isin(strong_ids)]

In [None]:
# Minimum number of sgRNAs a TargetID needs to be considered.
N_sgRNAs_thr = 4
# A target is kept when its largest |Pearson r| exceeds this value.
correlation_magnitude = 0.9

# Titration candidates, judged on the width score only.
strong_titration_df = get_genes_with_titration(
    sgrnadf_strong_effect,
    ["Mean Width list: Yeo-Johnson: score median"],
    N_sgRNAs_thr,
    correlation_magnitude,
)

### Effect Distance Metrics

Now, I want to evaluate the performance of different distance metrics on the data with respect to separating it maximally while also preserving similarity within replicates

- manhattan distance
- cosine similarity (same as pearson for z-scores)
- euclidean distance

In the end cosine similarity was chosen as it produced superior silhouette scores for sets of targets from genes with different phenotypes.

In [None]:
# Representative sgRNAs of four genes, used to compare distance metrics.
sgrnadf_examples_for_distance_metric = most_rep_example_series[
    most_rep_example_series["Gene"].isin(["ftsN", "rplA", "mreB", "tufB"])
]

In [None]:
# Silhouette score of the gene labels under each candidate metric, computed
# on the same feature matrix and label list.
example_vectors = np.array(
    sgrnadf_examples_for_distance_metric["Feature Vector"].tolist()
)
example_genes = sgrnadf_examples_for_distance_metric["Gene"].tolist()

euclidean_silhouette = silhouette_score(
    example_vectors, example_genes, metric="euclidean"
)
print("euclidean_silhouette : " + str(euclidean_silhouette))

manhattan_silhouette = silhouette_score(
    example_vectors, example_genes, metric="manhattan"
)
print("manhattan_silhouette : " + str(manhattan_silhouette))

cosine_silhouette = silhouette_score(example_vectors, example_genes, metric="cosine")
print("cosine_silhouette : " + str(cosine_silhouette))

### Detecting different effects against single genes

1) Plot a histogram of minimum cosine similarity within groups of TargetIDs against the same genes (for genes with more than one targetID)
2) Use affinity propagation to select the number of phenotype clusters to use per gene (preference set to 0.6 based on toy examples)
3) Among each cluster, represent the final effect as the strongest effect (euc norm) of the members of the cluster

~~3) Among each cluster, represent the final effect as the median of the members of the cluster~~


In [None]:
def get_upper_right_vals(a):
    """Return ``a`` with everything but the strict upper triangle set to NaN.

    The mask is built from positions rather than from values, so genuine
    zeros above the diagonal (for example a cosine similarity of exactly 0)
    are preserved instead of being turned into NaN. Integer input is
    accepted; the result is always a new float array and ``a`` is left
    untouched.

    Args:
        a: Array-like whose last two axes form the matrix.

    Returns:
        Float array of the same shape as ``a``.
    """
    arr = np.array(a, dtype=float)
    strictly_upper = np.triu(np.ones(arr.shape[-2:], dtype=bool), k=1)
    return np.where(strictly_upper, arr, np.nan)


def get_sgRNA_clusters(df, preference=0.6):
    """Cluster the sgRNAs of each gene by cosine similarity.

    For every gene with more than one row, affinity propagation is run on
    the pairwise cosine-similarity matrix of the rows' feature vectors;
    genes with a single row get cluster 0.

    Args:
        df: DataFrame indexed by sgRNA (index named ``"sgRNA"``) with
            ``"Gene"``, ``"Feature Vector"`` and ``"TargetID"`` columns.
        preference: Affinity propagation preference. It is now passed
            through to the clusterer; previously the argument was ignored
            and 0.6 was always used.

    Returns:
        DataFrame indexed by gene (sorted) with the columns ``"sgRNA"``,
        ``"Feature Vector"``, ``"TargetID"`` and the integer
        ``"sgRNA Cluster"`` label within each gene.
    """
    gene_indexed_df = (
        df.reset_index(drop=False)
        .set_index("Gene")[["sgRNA", "Feature Vector", "TargetID"]]
        .sort_index()
    )
    gene_indexed_df["sgRNA Cluster"] = 0

    # Visit each gene once; iterating the raw index would re-cluster a gene
    # once for every row it owns.
    for gene in gene_indexed_df.index.unique():
        gene_df = gene_indexed_df.loc[[gene]]
        if len(gene_df) <= 1:
            continue
        X = np.array(gene_df["Feature Vector"].tolist()).astype(float)
        X_sim = 1.0 - cosine_distances(X)
        af_labels = (
            AffinityPropagation(
                affinity="precomputed", preference=preference, random_state=42
            )
            .fit_predict(X_sim)
            .astype(int)
        )
        gene_indexed_df.loc[gene, "sgRNA Cluster"] = af_labels

    gene_indexed_df["sgRNA Cluster"] = gene_indexed_df["sgRNA Cluster"].astype(int)
    return gene_indexed_df

In [None]:
# Display the representative sgRNA table.
most_rep_example_series

In [None]:
# Minimum number of representative sgRNAs a gene needs to enter the
# within-gene similarity histogram.
n_sgrna_replicate_thr = 2

gene_list, counts_list = np.unique(most_rep_example_series["Gene"], return_counts=True)
genes_with_many_replicate_sgRNAs = gene_list[counts_list >= n_sgrna_replicate_thr]
sgrnadf_many_copies_per_gene = most_rep_example_series[
    most_rep_example_series["Gene"].isin(genes_with_many_replicate_sgRNAs)
]

# Smallest pairwise cosine similarity among each gene's feature vectors.
min_similarity_within_gene = sgrnadf_many_copies_per_gene.groupby("Gene").apply(
    lambda x: np.nanmin(
        get_upper_right_vals(
            1.0 - cosine_distances(np.array(x["Feature Vector"].tolist()))
        )
    )
)
plt.title("Minimum Cosine Similarity per Gene")
plt.hist(min_similarity_within_gene, bins=50)
plt.show()

# Cluster the sgRNAs within each gene and label every row "<Gene>-<cluster>".
gene_df = get_sgRNA_clusters(most_rep_example_series)
most_rep_example_series["sgRNA Cluster"] = gene_df.set_index("sgRNA")["sgRNA Cluster"]
most_rep_example_series["sgRNA Cluster Label"] = most_rep_example_series.apply(
    lambda x: str(x["Gene"]) + "-" + str(x["sgRNA Cluster"]), axis=1
)
gene_cluster_df = most_rep_example_series[
    ["sgRNA Cluster Label", "Feature Vector", "Gene", "Euclidean Norm"]
    + [param + ": Yeo-Johnson: score median" for param in params_to_transform]
].reset_index(drop=True)
gene_cluster_groupby = gene_cluster_df.groupby("sgRNA Cluster Label")
# median_feature_series = gene_cluster_groupby.apply(lambda x: np.median(np.stack(x["Feature Vector"]).astype(float), axis=0)).to_frame().rename(columns={0:"Feature Vector"})
# Feature vector of each cluster = vector of its member with the largest
# euclidean norm.
feature_series = (
    gene_cluster_groupby.apply(
        lambda x: x.iloc[np.argmax(x["Euclidean Norm"])]["Feature Vector"]
    )
    .to_frame()
    .rename(columns={0: "Feature Vector"})
)

# NOTE(review): the gene name and scalar score columns are taken from the
# FIRST member of each cluster, whereas the feature vector above comes from
# the strongest member -- the two can disagree for multi-member clusters.
gene_cluster_df = gene_cluster_groupby.apply(
    lambda x: x.iloc[0][
        ["Gene"]
        + [param + ": Yeo-Johnson: score median" for param in params_to_transform]
    ]
)
gene_cluster_df = gene_cluster_df.join(feature_series)

### Clustering: TSNE and Affinity Propagation

In [None]:
# Feature matrix (one row per gene cluster) and its cosine-similarity matrix.
X = np.array(gene_cluster_df["Feature Vector"].tolist(), dtype=float)
X_sim = 1.0 - cosine_distances(X)

# 2-D embedding for visualisation. random_state is fixed, as in
# get_sgRNA_clusters, so the result does not depend on how much global
# random state earlier cells consumed.
X_embedded = TSNE(
    n_components=2,
    init="pca",
    perplexity=5.0,
    early_exaggeration=50.0,
    metric="cosine",
    random_state=42,
).fit_transform(X)
gene_cluster_df["TSNE Coords"] = list(X_embedded)

# Affinity propagation on the precomputed similarities.
af_labels = (
    AffinityPropagation(affinity="precomputed", preference=0.0, random_state=42)
    .fit_predict(X_sim)
    .astype(int)
)
gene_cluster_df["Affinity Clusts"] = af_labels

# Embedding coloured by affinity-propagation cluster.
plt.scatter(
    X_embedded[:, 0],
    X_embedded[:, 1],
    s=3,
    alpha=1,
    c=gene_cluster_df["Affinity Clusts"],
)

In [None]:
def _parse_feature_vector(text):
    """Parse a stringified numeric vector such as ``"[0.1 -2.3\\n 4.5]"``.

    Brackets and commas are treated as separators and the remainder is
    split on any run of whitespace, so padded output (several consecutive
    spaces), embedded newlines and ``nan`` entries are all accepted.
    """
    cleaned = text.replace("[", " ").replace("]", " ").replace(",", " ")
    return np.array([float(token) for token in cleaned.split()])


# Reload a previously exported table; this replaces the in-memory
# gene_cluster_df computed above.
gene_cluster_df = pd.read_csv(
    "2021-07-31_for_ethan/2021-07-31_Steady_State_Analysis.csv"
)
# The CSV stores each feature vector as text; convert it back to an array.
gene_cluster_df["Feature Vector"] = gene_cluster_df["Feature Vector"].apply(
    _parse_feature_vector
)

### Hierarchical Clustering

In [None]:
# Display names for the entries of each feature vector, used as y tick
# labels in the plots below.
# NOTE(review): the order appears to mirror params_to_transform (Lb, Ld,
# area increment, length increment, width, mCherry, Delta t) -- confirm
# before reordering either list.
feature_labels = [
    "Birth Length",
    "Division Length",
    "Area Growth Rate",
    "Length Growth Rate",
    "Average Width",
    "mCherry Intensity",
    "Cell Cycle Duration",
]

# Index labels of gene_cluster_df at the time this cell runs.
hierarchical_labels = gene_cluster_df.index.tolist()


def get_leaf_children(tree, leaf_id):
    """Return the ids collected by a pre-order walk below one tree node.

    Args:
        tree: Indexable collection of nodes that offer ``pre_order`` (for
            example the node list returned by ``sch.to_tree(..., rd=True)``).
        leaf_id: Position of the node of interest in ``tree``.

    Returns:
        Whatever ``pre_order`` yields when asked for each visited node's
        ``id``.
    """
    return tree[leaf_id].pre_order(lambda node: node.id)


def assign_dendro_clusts(df, children_labels):
    """Return a copy of ``df`` with a ``"Dendrogram Clusters"`` column.

    Args:
        df: DataFrame whose rows are addressed by position.
        children_labels: One sequence of row positions per cluster; the
            rows listed in the ``i``-th sequence receive cluster id ``i``.

    Returns:
        Deep copy of ``df`` with an integer ``"Dendrogram Clusters"``
        column. Rows not listed in any sequence are marked ``-1``.
    """
    df_out = copy.deepcopy(df)
    # Fill a plain integer array and assign it once. The previous code
    # initialised the column from a one-element Series and then wrote
    # through a chained ``df[col].iloc[...]`` assignment, which pandas does
    # not guarantee to propagate back to the frame.
    cluster_ids = np.full(len(df_out), -1, dtype=int)
    for clust_i, indices in enumerate(children_labels):
        cluster_ids[np.asarray(indices, dtype=int)] = clust_i
    df_out["Dendrogram Clusters"] = cluster_ids
    return df_out


# Notebook-level plotting settings. min_zscore / max_zscore are the default
# colour-scale limits of plot_subset and make_subset_dendrogram;
# suppress_thr is not a default of any function visible here.
suppress_thr = 15
min_zscore = -2
max_zscore = 2


def compute_and_plot_dendrogram(
    df, feature_labels, suppress_thr, min_zscore, max_zscore, cmap=mpl.cm.coolwarm
):
    """Plot a truncated dendrogram with one mean-feature heat strip per leaf.

    The rows of ``df`` are linked hierarchically (weighted linkage on
    cosine distance) and the dendrogram is truncated to its last
    ``suppress_thr`` merged clusters. Under each displayed leaf, the mean
    feature vector of the rows below that leaf is drawn as a one-column
    image.

    Args:
        df: DataFrame with a ``"Feature Vector"`` column.
        feature_labels: Y tick labels for the first heat strip.
        suppress_thr: Number of dendrogram leaves to display; also the
            number of grid columns.
        min_zscore: Lower limit of the colour scale.
        max_zscore: Upper limit of the colour scale.
        cmap: Colormap of the heat strips and the colorbar.

    Returns:
        List with, for each displayed leaf (left to right), the row
        positions of ``df`` that fall under it.
    """
    color_norm = mpl.colors.Normalize(vmin=min_zscore, vmax=max_zscore)
    feature_matrix = np.array(df["Feature Vector"].tolist())

    # Figure layout: dendrogram across the top row, heat strips below.
    fig = plt.figure(constrained_layout=True, figsize=(20, 10))
    grid = fig.add_gridspec(2, suppress_thr)
    dendro_ax = fig.add_subplot(grid[0, :])

    linkage = sch.linkage(
        feature_matrix, method="weighted", metric="cosine", optimal_ordering=True
    )
    node_lookup = sch.to_tree(linkage, rd=True)[1]

    dendro = sch.dendrogram(
        linkage,
        p=suppress_thr,
        truncate_mode="lastp",
        orientation="top",
        show_leaf_counts=True,
        show_contracted=True,
        leaf_rotation=90.0,
        leaf_font_size=12.0,
        no_labels=True,
        ax=dendro_ax,
    )
    children_labels = [
        get_leaf_children(node_lookup, leaf) for leaf in dendro["leaves"]
    ]

    fig.colorbar(
        mpl.cm.ScalarMappable(norm=color_norm, cmap=cmap),
        ax=dendro_ax,
        orientation="vertical",
        label="Z-score",
        use_gridspec=True,
        location="left",
        pad=-0.05,
        aspect=10,
    )

    first_strip_ax = None
    for i, children in enumerate(children_labels):
        members = np.array(df.iloc[children]["Feature Vector"].tolist(), dtype=float)
        mean_vector = np.mean(members, axis=0).reshape(-1, 1)

        is_first = first_strip_ax is None
        if is_first:
            strip_ax = fig.add_subplot(grid[1, i])
        else:
            # Later strips share the y axis of the first one.
            strip_ax = fig.add_subplot(grid[1, i], sharey=first_strip_ax)
        strip_ax.imshow(mean_vector, cmap=cmap, norm=color_norm)
        if not is_first:
            plt.setp(strip_ax.get_yticklabels(), visible=False)

        strip_ax.tick_params(
            axis="x", which="both", bottom=False, top=False, labelbottom=False
        )
        strip_ax.tick_params(
            axis="y", which="both", left=False, right=False, labelbottom=False
        )
        strip_ax.set_xlabel(str(i), fontsize=18)

        if is_first:
            # Only the first strip carries the feature names.
            strip_ax.set_yticks(range(len(feature_labels)))
            strip_ax.set_yticklabels(
                feature_labels,
                fontsize=18,
            )
            first_strip_ax = strip_ax

    return children_labels


def plot_subset(
    df_subset,
    min_zscore=min_zscore,
    max_zscore=max_zscore,
    feature_labels=feature_labels,
    figsize=(10, 10),
    wspace=1.5,
):
    """Draw one heat strip per row of ``df_subset``, ordered by cluster.

    Args:
        df_subset: DataFrame with ``"Dendrogram Clusters"``,
            ``"sgRNA Cluster Label"`` and ``"Feature Vector"`` columns.
        min_zscore: Lower limit of the colour scale.
        max_zscore: Upper limit of the colour scale.
        feature_labels: Y tick labels for the first strip.
        figsize: Figure size passed to matplotlib.
        wspace: Horizontal spacing between the strips.
    """
    by_cluster = (
        df_subset.sort_index()
        .reset_index(drop=False)
        .set_index("Dendrogram Clusters")
    )
    strips = by_cluster[["sgRNA Cluster Label", "Feature Vector"]].sort_index()

    strip_cmap = mpl.cm.coolwarm
    strip_norm = mpl.colors.Normalize(vmin=min_zscore, vmax=max_zscore)

    # One grid column per row of the subset.
    fig = plt.figure(constrained_layout=True, figsize=figsize)
    grid = fig.add_gridspec(1, len(strips), wspace=wspace)

    first_strip_ax = None
    for i in range(len(strips)):
        is_first = first_strip_ax is None
        if is_first:
            strip_ax = fig.add_subplot(grid[0, i])
        else:
            # Later strips share the y axis of the first one.
            strip_ax = fig.add_subplot(grid[0, i], sharey=first_strip_ax)

        strip_ax.imshow(
            strips["Feature Vector"].iloc[i].astype(float).reshape(-1, 1),
            cmap=strip_cmap,
            norm=strip_norm,
        )
        if not is_first:
            plt.setp(strip_ax.get_yticklabels(), visible=False)

        strip_ax.tick_params(
            axis="x", which="both", bottom=False, top=False, labelbottom=False
        )
        strip_ax.tick_params(
            axis="y", which="both", left=False, right=False, labelbottom=False
        )
        strip_ax.set_xlabel(
            strips["sgRNA Cluster Label"].iloc[i]
            + "\n Cluster "
            + str(strips.index[i]),
            fontsize=14,
        )

        if is_first:
            # Only the first strip carries the feature names.
            strip_ax.set_yticks(range(len(feature_labels)))
            strip_ax.set_yticklabels(
                feature_labels,
                fontsize=18,
            )
            first_strip_ax = strip_ax


def make_subset_dendrogram(
    sub_df,
    title,
    feature_labels=feature_labels,
    min_zscore=min_zscore,
    max_zscore=max_zscore,
    figsize=(10, 10),
    fontsize=18,
):
    """Plot a dendrogram of ``sub_df`` above one heatmap column per row.

    The rows are clustered on their "Feature Vector" entries (weighted
    linkage on cosine distance with optimal leaf ordering). Below the tree,
    every row is drawn in dendrogram leaf order as a single-column heatmap
    on a shared color scale, labeled with the row's index value.

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Rows to plot. Must have a "Feature Vector" column whose entries are
        equal-length numeric sequences.
    title : str
        Figure title.
    feature_labels : sequence of str
        Y tick labels for the heatmap rows. The default is the module-level
        ``feature_labels`` as it was when this function was defined.
    min_zscore, max_zscore : float
        Limits of the color normalization. The defaults are likewise bound
        at definition time.
    figsize : tuple
        Figure size passed to ``plt.figure``.
    fontsize : int
        Font size of the leaf labels and the feature tick labels.

    Raises
    ------
    ValueError
        If ``sub_df`` has no rows.

    Notes
    -----
    A single-row ``sub_df`` cannot be clustered, so it is drawn as one
    heatmap column under an empty tree axis instead of failing inside
    ``sch.linkage``.
    """
    n_leaves = len(sub_df)
    if n_leaves == 0:
        raise ValueError("sub_df must contain at least one row to plot")

    feature_matrix = np.array(sub_df["Feature Vector"].tolist(), dtype=float)

    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(2, n_leaves)
    dendro_ax = fig.add_subplot(gs[0, :])

    if n_leaves > 1:
        linkage_matrix = sch.linkage(
            feature_matrix, method="weighted", metric="cosine", optimal_ordering=True
        )
        leaf_order = sch.dendrogram(
            linkage_matrix,
            orientation="top",
            show_leaf_counts=True,
            leaf_rotation=90.0,
            leaf_font_size=12.0,
            show_contracted=True,
            ax=dendro_ax,
            no_labels=True,
        )["leaves"]
    else:
        # Hierarchical clustering needs at least two observations; keep the
        # tree axis (the colorbar is attached to it) but leave it empty.
        leaf_order = [0]
        dendro_ax.set_xticks([])
        dendro_ax.set_yticks([])

    cmap = mpl.cm.coolwarm
    norm = mpl.colors.Normalize(vmin=min_zscore, vmax=max_zscore)

    fig.colorbar(
        mpl.cm.ScalarMappable(norm=norm, cmap=cmap),
        ax=dendro_ax,
        orientation="vertical",
        label="Z-score",
        use_gridspec=True,
        location="left",
        pad=0.0,
        aspect=10,
    )

    fig.suptitle(title, fontsize=20)

    first_ax = None
    for column, leaf in enumerate(leaf_order):
        # Every heatmap column after the first shares the first one's y axis.
        ax = fig.add_subplot(gs[1, column], sharey=first_ax)
        ax.imshow(feature_matrix[leaf].reshape(-1, 1), cmap=cmap, norm=norm)
        if first_ax is not None:
            plt.setp(ax.get_yticklabels(), visible=False)

        ax.tick_params(
            axis="x", which="both", bottom=False, top=False, labelbottom=False
        )
        # Only the tick marks are hidden here; the first column's y labels
        # stay visible so the feature names can be read.
        ax.tick_params(axis="y", which="both", left=False, right=False)
        ax.set_xlabel(sub_df.index[leaf], fontsize=fontsize, rotation=90)

        if first_ax is None:
            ax.set_yticks(range(len(feature_labels)))
            ax.set_yticklabels(
                feature_labels,
                fontsize=fontsize,
            )
            first_ax = ax

#         imshow_ax.tick_params(axis='x',which='both',bottom=False,top=False,labelbottom=False)
#         imshow_ax.tick_params(axis='y',which='both',left=False,right=False,labelbottom=False)

#         imshow_ax.set_xlabel(sub_df.index[leaf], fontsize=fontsize)

In [None]:
# Run compute_and_plot_dendrogram on gene_cluster_df with the coolwarm
# colormap; the returned children_labels feed assign_dendro_clusts below.
children_labels = compute_and_plot_dendrogram(
    gene_cluster_df,
    feature_labels,
    suppress_thr,
    min_zscore,
    max_zscore,
    cmap=mpl.cm.coolwarm,
)
# Writes the current figure at 300 dpi.
# NOTE(review): presumably ./Dendrograms must already exist — confirm.
plt.savefig("./Dendrograms/Global_Dendrogram.png", dpi=300)

In [None]:
# Replace gene_cluster_df with the output of assign_dendro_clusts.
# NOTE(review): presumably this adds the "Dendrogram Clusters" column that
# the Cluster Analysis cells read — confirm against the helper.
gene_cluster_df = assign_dendro_clusts(gene_cluster_df, children_labels)

In [None]:
# Scatter the first two embedding columns, colored by affinity cluster.
plt.scatter(
    *X_embedded[:, :2].T,
    s=3,
    alpha=1,
    c=gene_cluster_df["Affinity Clusts"],
)

#### Major System Analysis

In [None]:
# Rows whose "Gene" value contains the substring "fts".
fts_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "fts" in gene)
]

In [None]:
# Draw the fts subset with plot_subset (defined elsewhere in this notebook).
# The figure is only displayed: the savefig call below is disabled.
plot_subset(fts_subset)
plt.tight_layout()
# plt.savefig("./Gene_Groups/fts.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "rpl".
rpl_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "rpl" in gene)
]

In [None]:
# Draw the rpl subset on a 30x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(rpl_subset, figsize=(30, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/rpl.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "rpm".
rpm_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "rpm" in gene)
]

In [None]:
# Draw the rpm subset on a 30x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(rpm_subset, figsize=(30, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/rpm.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "rps".
rps_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "rps" in gene)
]

In [None]:
# Draw the rps subset on a 30x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(rps_subset, figsize=(30, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/rps.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "rr".
rr_subset = gene_cluster_df[gene_cluster_df["Gene"].map(lambda gene: "rr" in gene)]

In [None]:
# Draw the rr subset on a 30x10 figure. This cell neither calls
# tight_layout nor saves the figure.
plot_subset(rr_subset, figsize=(30, 10))

In [None]:
# Rows whose "Gene" value contains the substring "tff".
tff_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "tff" in gene)
]

In [None]:
# Draw the tff subset on a 30x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(tff_subset, figsize=(30, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/tff.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "rpo".
rpo_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "rpo" in gene)
]

In [None]:
# Draw the rpo subset on a 10x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(rpo_subset, figsize=(10, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/rpo.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "min".
min_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "min" in gene)
]

In [None]:
# Draw the min subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(min_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/min.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "dna".
dna_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "dna" in gene)
]

In [None]:
# Draw the dna subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(dna_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/dna.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "fol".
fol_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "fol" in gene)
]

In [None]:
# Draw the fol subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(fol_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/fol.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "muk".
muk_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "muk" in gene)
]

In [None]:
# Draw the muk subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(muk_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/muk.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "mre".
mre_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "mre" in gene)
]

In [None]:
# Draw the mre subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(mre_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/mre.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "mur".
mur_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "mur" in gene)
]

In [None]:
# Draw the mur subset on a 12x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(mur_subset, figsize=(12, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/mur.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "nus".
nus_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "nus" in gene)
]

In [None]:
# Draw the nus subset on a 12x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(nus_subset, figsize=(12, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/nus.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "sec".
sec_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "sec" in gene)
]

In [None]:
# Draw the sec subset on a 12x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(sec_subset, figsize=(12, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/sec.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "bam".
bam_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "bam" in gene)
]

In [None]:
# Draw the bam subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(bam_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/bam.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "hol".
hol_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "hol" in gene)
]

In [None]:
# Draw the hol subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(hol_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/hol.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "hda".
hda_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "hda" in gene)
]

In [None]:
# Draw the hda subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box.
plot_subset(hda_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/hda.png", dpi=200, bbox_inches="tight")

In [None]:
# Rows whose "Gene" value contains the substring "rodZ".
rodZ_subset = gene_cluster_df[
    gene_cluster_df["Gene"].map(lambda gene: "rodZ" in gene)
]

In [None]:
# Draw the rodZ subset on a 6x10 figure and save the current figure as a
# 200 dpi PNG cropped to a tight bounding box (file name is lowercase).
plot_subset(rodZ_subset, figsize=(6, 10))
plt.tight_layout()
plt.savefig("./Gene_Groups/rodz.png", dpi=200, bbox_inches="tight")

#### Cluster Analysis

In [None]:
# Count the rows per dendrogram cluster and bucket the cluster labels by size.
clusters, cluster_counts = np.unique(
    gene_cluster_df["Dendrogram Clusters"], return_counts=True
)
big_clusters = clusters[cluster_counts > 40]
# The "small" bucket (<= 40 rows) also contains every singleton cluster.
small_clusters = clusters[cluster_counts <= 40]
singleton_clusters = clusters[cluster_counts == 1]
print(singleton_clusters, small_clusters, big_clusters, sep="\n")

In [None]:
# Pull out the clusters (or groups of clusters) that get their own figure.
cluster_6to8 = gene_cluster_df.loc[
    gene_cluster_df["Dendrogram Clusters"].isin([6, 7, 8])
]
cluster_9to10 = gene_cluster_df.loc[
    gene_cluster_df["Dendrogram Clusters"].isin([9, 10])
]
cluster_1 = gene_cluster_df.loc[gene_cluster_df["Dendrogram Clusters"] == 1]
cluster_13 = gene_cluster_df.loc[gene_cluster_df["Dendrogram Clusters"] == 13]
cluster_14 = gene_cluster_df.loc[gene_cluster_df["Dendrogram Clusters"] == 14]

In [None]:
# Small clusters other than 6-10, which are plotted as merged groups.
remaining_small_clusters = list(set(small_clusters) - {6, 7, 8, 9, 10})

In [None]:
# Display the cluster labels that the per-cluster loop below iterates over.
remaining_small_clusters

In [None]:
# One dendrogram per remaining small cluster; figure width and font size
# scale with the number of rows in the cluster.
for i in remaining_small_clusters:
    cluster_df = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"] == i]
    make_subset_dendrogram(
        cluster_df,
        f"Cluster {i} Dendrogram",
        figsize=(int(len(cluster_df) * 1.75), 12),
        fontsize=16 + int(len(cluster_df) * 0.75),
    )
    plt.savefig(f"./Dendrograms/Cluster_{i}.png", dpi=200)

# Clusters 6-8 are drawn together in a single figure.
make_subset_dendrogram(
    cluster_6to8,
    "Cluster 6 to 8 Dendrogram",
    fontsize=16 + int(len(cluster_6to8) * 0.75),
    figsize=(int(len(cluster_6to8) * 1.75), 10),
)
plt.savefig("./Dendrograms/Cluster_6to8.png", dpi=200)

# Clusters 9-10 are drawn together in a single figure.
make_subset_dendrogram(
    cluster_9to10,
    "Cluster 9 to 10 Dendrogram",
    fontsize=16 + int(len(cluster_9to10) * 0.75),
    figsize=(int(len(cluster_9to10) * 1.75), 10),
)
plt.savefig("./Dendrograms/Cluster_9to10.png", dpi=200)

In [None]:
# Cluster 1 gets its own figure of height 30; width and font size scale
# with the number of rows. The current figure is then saved at 200 dpi.
make_subset_dendrogram(
    cluster_1,
    "Cluster 1 Dendrogram",
    figsize=(int(len(cluster_1) * 1.75), 30),
    fontsize=16 + int(len(cluster_1) * 0.75),
)
plt.savefig("./Dendrograms/Cluster_1.png", dpi=200)

In [None]:
# Cluster 13 gets its own figure of height 30; width and font size scale
# with the number of rows. The current figure is then saved at 200 dpi.
make_subset_dendrogram(
    cluster_13,
    "Cluster 13 Dendrogram",
    figsize=(int(len(cluster_13) * 1.75), 30),
    fontsize=16 + int(len(cluster_13) * 0.75),
)
plt.savefig("./Dendrograms/Cluster_13.png", dpi=200)

In [None]:
# Cluster 14 gets its own figure of height 30; width and font size scale
# with the number of rows. The current figure is then saved at 200 dpi.
make_subset_dendrogram(
    cluster_14,
    "Cluster 14 Dendrogram",
    figsize=(int(len(cluster_14) * 1.75), 30),
    fontsize=16 + int(len(cluster_14) * 0.75),
)
plt.savefig("./Dendrograms/Cluster_14.png", dpi=200)

In [None]:
# Write the clustered table to a CSV in the working directory.
# NOTE(review): "Feature Vector" holds array-like entries, which CSV will
# presumably store as their string form — confirm this round-trips as needed.
gene_cluster_df.to_csv("2021-07-31_Steady_State_Analysis.csv")

### Single Cluster Inspection

In [None]:
# Column names passed as feature_labels to get_sgrnadf_from_df, which pools
# each of them per sgRNA.
params_to_transform = [
    "Lb list",
    "Ld list",
    "delL list",
    "Mean Area Increment list",
    "Mean Length Increment list",
    "Mean Width list",
    "Mean mCherry Intensity list",
    "Delta t list",
]


def get_sgrnadf_from_df(df, feature_labels, time_label="final cell timepoints list"):
    """Collapse ``df`` into one row per "sgRNA" value.

    Parameters
    ----------
    df : pandas.DataFrame
        Needs the columns "sgRNA", "phenotype trenchid", "Gene", "TargetID",
        "N Mismatch" and "Category", plus ``time_label`` and every entry of
        ``feature_labels``; the latter two kinds hold one sequence per row.
    feature_labels : iterable of str
        Columns whose per-row sequences are pooled within each sgRNA.
    time_label : str
        Column of per-row timepoint sequences, pooled the same way.

    Returns
    -------
    pandas.DataFrame
        Indexed by sgRNA. "phenotype trenchid" is the list of trench ids in
        the group, pooled columns are flat NumPy arrays, "Gene", "TargetID",
        "N Mismatch" and "Category" come from the group's first row, and
        "N Observations" is the number of rows in the group.
    """
    by_sgrna = df.groupby("sgRNA")

    def pooled(column):
        # Join the per-row sequences of `column` into one array per group.
        return by_sgrna.apply(
            lambda group: np.array(
                [value for sequence in group[column].tolist() for value in sequence]
            )
        )

    def first_value(column):
        # Value of `column` in the first row of each group.
        return by_sgrna.apply(lambda group: group[column].iloc[0])

    trenchids = by_sgrna.apply(lambda group: group["phenotype trenchid"].tolist())
    sgrnadf = trenchids.to_frame().rename(columns={0: "phenotype trenchid"})

    for feature_label in feature_labels:
        sgrnadf[feature_label] = pooled(feature_label)
    sgrnadf[time_label] = pooled(time_label)

    for column in ("Gene", "TargetID", "N Mismatch"):
        sgrnadf[column] = first_value(column)
    sgrnadf["N Observations"] = by_sgrna.apply(
        lambda group: len(group["phenotype trenchid"].tolist())
    )
    sgrnadf["Category"] = first_value("Category")

    return sgrnadf


def get_timepoint_values(
    df,
    label,
    min_timepoint,
    max_timepoint,
    time_label="final cell timepoints list",
    flatten_vals=True,
):
    """Select the ``label`` values whose timepoint lies in a closed window.

    For each row, the sequence in ``label`` is filtered by the matching
    sequence in ``time_label``, keeping entries whose timepoint satisfies
    ``min_timepoint <= t <= max_timepoint``.

    Returns a flat list of the kept values across all rows when
    ``flatten_vals`` is true, otherwise the per-row result of ``df.apply``.
    """

    def values_in_window(row):
        timepoints = np.array(row[time_label])
        in_window = (timepoints >= min_timepoint) & (timepoints <= max_timepoint)
        return np.array(row[label])[in_window]

    windowed = df.apply(values_in_window, axis=1)
    if not flatten_vals:
        return windowed

    flat_values = []
    for row_values in windowed.tolist():
        flat_values.extend(row_values)
    return flat_values

In [None]:
# One row per sgRNA, with each params_to_transform column pooled across the
# rows of final_output_df_pd that share that sgRNA.
sgrnadf_nontrasformed_vals = get_sgrnadf_from_df(
    final_output_df_pd, params_to_transform
)

In [None]:
# Keep only the sgRNA rows whose gene is exactly "ftsN".
selected_df = sgrnadf_nontrasformed_vals.loc[
    sgrnadf_nontrasformed_vals["Gene"] == "ftsN"
]

In [None]:
# Scatter "delL list" against time for one entry of selected_df.
# NOTE(review): selected_df is indexed by sgRNA (it comes from a groupby),
# so `[idx]` may be a label lookup rather than the row at position 14 — confirm.
idx = 14
t = selected_df["final cell timepoints list"][idx]
y = selected_df["delL list"][idx]
# The y-range is pinned to 0-8; points outside it are not shown.
plt.ylim(0, 8)
plt.scatter(t, y, s=3)

In [None]:
# Scatter "Mean Area Increment list" against time for one entry of selected_df.
# NOTE(review): selected_df is indexed by sgRNA (it comes from a groupby),
# so `[idx]` may be a label lookup rather than the row at position 14 — confirm.
idx = 14
t = selected_df["final cell timepoints list"][idx]
y = selected_df["Mean Area Increment list"][idx]
# The y-range is pinned to 0-1; points outside it are not shown.
plt.ylim(0, 1)
plt.scatter(t, y, s=3)

In [None]:
# Scatter "Delta t list" against time for one entry of selected_df.
# NOTE(review): selected_df is indexed by sgRNA (it comes from a groupby),
# so `[idx]` may be a label lookup rather than the row at position 14 — confirm.
idx = 14
t = selected_df["final cell timepoints list"][idx]
y = selected_df["Delta t list"][idx]
# The y-range is pinned to 0-20; points outside it are not shown.
plt.ylim(0, 20)
plt.scatter(t, y, s=3)

In [None]:
# For every selected sgRNA row, correlate "Lb list" with "delL list" inside
# the 90-144 timepoint window and scatter them colored by timepoint.
min_timepoint = 90
max_timepoint = 144
for i in range(len(selected_df)):
    # One-row slices keep a DataFrame, which get_timepoint_values expects.
    Lb, delL, t = (
        get_timepoint_values(
            selected_df[i : i + 1], column, min_timepoint, max_timepoint
        )
        for column in ("Lb list", "delL list", "final cell timepoints list")
    )
    r = sp.stats.pearsonr(Lb, delL)
    print(r[0])
    plt.scatter(Lb, delL, s=3, c=t)
    plt.xlim(1, 8)
    plt.ylim(1, 8)
    plt.show()

### Gene Browser

In [None]:
# One row per sgRNA (the first row of each group), plus the list of every
# trench id observed for that sgRNA, trimmed to the browser's columns.
df = final_output_df_pd.groupby("sgRNA").apply(lambda group: group.iloc[0])
df["phenotype trenchids"] = final_output_df_pd.groupby("sgRNA").apply(
    lambda group: group["phenotype trenchid"].tolist()
)
df = df.loc[
    :,
    [
        "Gene",
        "Target Sequence",
        "phenotype trenchids",
        "N Mismatch",
        "N Target Sites",
        "Category",
        "Strand",
    ],
]

In [None]:
# Both kymograph views read the same Growth_Division output, so the path is
# defined once instead of being repeated in each call.
growth_division_path = "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Growth_Division"
kymo_xarr = tr.kymo_xarr(growth_division_path)
# Same data, loaded with unwrap=False.
wrapped_kymo_xarr = tr.kymo_xarr(growth_division_path, unwrap=False)

In [None]:
# tr.linked_gene_table returns four objects: the table layout, displayed in
# the next cell, and three selectors passed on to
# tr.linked_kymograph_for_gene_table. "phenotype trenchids" holds a list of
# trench ids per row, hence trenchids_as_list=True.
(
    gene_table_layout,
    select_gene,
    select_trenchid,
    select_unpacked_trenchid,
) = tr.linked_gene_table(
    df, trenchids_as_list=True, trenchid_column="phenotype trenchids"
)

In [None]:
# Display the gene table layout returned by tr.linked_gene_table.
gene_table_layout

In [None]:
# Build the kymograph display from both kymograph arrays, the gene table and
# the selectors returned by tr.linked_gene_table; also returns a save button.
output_display, save_button = tr.linked_kymograph_for_gene_table(
    kymo_xarr,
    wrapped_kymo_xarr,
    df,
    select_gene,
    select_trenchid,
    select_unpacked_trenchid=select_unpacked_trenchid,
    trenchid_column="phenotype trenchids",
    y_scale=3,
    x_window_size=300,
)

In [None]:
# Display the kymograph view built in the previous cell.
output_display

In [None]:
# Display the save button returned alongside the kymograph view.
save_button  # TODO: add an option for whether or not to normalize the signal