## Steady-state Analysis of lDE20 (with lineage DataFrame ready)

- Note that there are fluctuations in the illumination intensity which may be resulting in pathological behavior from the reporter

- Consider either normalizing this out or fixing the underlying problem

In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da
import dask
import warnings
import copy
import random
from sklearn.metrics.pairwise import (
    euclidean_distances,
    manhattan_distances,
    cosine_distances,
)

from sklearn.metrics import silhouette_score
import scipy.stats
from sklearn.linear_model import LinearRegression
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering

from matplotlib import pyplot as plt
import ast


import pylab
import scipy.cluster.hierarchy as sch

import matplotlib.gridspec as gridspec
import matplotlib as mpl

import holoviews as hv

# Select Bokeh as the HoloViews plotting extension.
hv.extension("bokeh")
# Seed both the stdlib and NumPy random number generators with the same fixed value.
random.seed(42)
np.random.seed(42)

# Report each distinct UserWarning only once.
warnings.filterwarnings(action="once", category=UserWarning)

In [None]:
def get_timepoint_values(
    df,
    label,
    min_timepoint,
    max_timepoint,
    time_label="final cell timepoints list",
    flatten_vals=True,
):
    """Select, row by row, the values of ``label`` inside a time window.

    Each row of ``df`` is expected to hold two parallel list-likes: the
    values under ``label`` and their timepoints under ``time_label``. A value
    is kept when ``min_timepoint <= timepoint <= max_timepoint``.

    Returns a flat list pooling the kept values of all rows when
    ``flatten_vals`` is true, otherwise the per-row Series of arrays.
    """

    def _values_in_window(row):
        timepoints = np.array(row[time_label])
        in_window = (timepoints >= min_timepoint) & (timepoints <= max_timepoint)
        return np.array(row[label])[in_window]

    windowed = df.apply(_values_in_window, axis=1)
    if not flatten_vals:
        return windowed

    pooled = []
    for row_vals in windowed.tolist():
        pooled.extend(row_vals)
    return pooled


def get_feature_stats(df, feature_label, min_timepoint, max_timepoint):
    """Return ``(median, iqr)`` of a feature pooled over all rows of ``df``.

    Only values whose timepoint lies in ``[min_timepoint, max_timepoint]``
    (as selected by ``get_timepoint_values``) enter the statistics.
    """
    pooled_vals = get_timepoint_values(
        df, feature_label, min_timepoint, max_timepoint
    )
    return np.median(pooled_vals), sp.stats.iqr(pooled_vals)


def get_feature_median_bytrench(df, feature_label, min_timepoint, max_timepoint):
    """Return the per-row (trench) NaN-ignoring median of a feature in a time window.

    Parameters
    ----------
    df : DataFrame whose rows hold list-like feature values and timepoints.
    feature_label : column holding the feature values.
    min_timepoint, max_timepoint : inclusive bounds of the time window.

    Returns
    -------
    Series indexed like ``df`` with one median per row.
    """
    # Use the frame that was passed in; the previous version silently read
    # the notebook global `final_output_df_pd_filtered` and ignored `df`.
    masked_label_series = get_timepoint_values(
        df,
        feature_label,
        min_timepoint,
        max_timepoint,
        flatten_vals=False,
    )
    trench_median_series = masked_label_series.apply(lambda x: np.nanmedian(x))
    return trench_median_series


def compute_score(
    df,
    feature_label,
    trench_median_series,
    feature_median,
    feature_iqr,
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Convert per-row feature values into robust, baseline-normalized scores.

    For every row the score is
    ``1.35 * (feature_median / feature_iqr) * (values / row_median - 1)``.

    Parameters
    ----------
    df : DataFrame whose rows hold list-like feature values and timepoints.
    feature_label : column holding the feature values.
    trench_median_series : per-row baseline median, indexed like ``df``.
    feature_median, feature_iqr : population-level median and IQR.
    time_label : column holding the timepoints parallel to the values.
    timepoint_range : optional ``(min, max)`` inclusive window; when given,
        only values whose timepoint lies in the window are scored.

    Returns
    -------
    Series of arrays indexed like ``df``.
    """
    scaling_factor = 1.35 * (feature_median / feature_iqr)

    if timepoint_range is None:
        feature_arrays = df[feature_label].apply(lambda x: np.array(x))
    else:
        # The mask needs both the values and the timepoints of a row, so the
        # apply has to run over rows of the frame (the previous version ran
        # over the feature column alone and then indexed each value list with
        # `time_label`, which raises TypeError).
        def _values_in_range(row):
            timepoints = np.array(row[time_label])
            in_range = (timepoints >= timepoint_range[0]) & (
                timepoints <= timepoint_range[1]
            )
            return np.array(row[feature_label])[in_range]

        feature_arrays = df.apply(_values_in_range, axis=1)

    scores = (feature_arrays / trench_median_series) - 1.0
    return scaling_factor * scores


def get_feature_scores(
    df,
    feature_label,
    init_timepoint_range=(0, 20),
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Score one feature against its baseline from the initial time window.

    The population median/IQR and the per-row medians are both taken over
    ``init_timepoint_range``; they are then handed to ``compute_score``
    together with ``time_label`` and ``timepoint_range``.
    """
    init_start = init_timepoint_range[0]
    init_stop = init_timepoint_range[1]

    population_median, population_iqr = get_feature_stats(
        df, feature_label, init_start, init_stop
    )
    baseline_by_trench = get_feature_median_bytrench(
        df, feature_label, init_start, init_stop
    )
    return compute_score(
        df,
        feature_label,
        baseline_by_trench,
        population_median,
        population_iqr,
        time_label=time_label,
        timepoint_range=timepoint_range,
    )


def get_all_feature_scores(
    df,
    feature_labels,
    init_timepoint_range=(0, 20),
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Add a ``"<feature>: score"`` column to ``df`` for every feature label.

    ``df`` is modified in place and also returned. Each label is printed
    before it is processed.
    """
    scoring_kwargs = {
        "init_timepoint_range": init_timepoint_range,
        "time_label": time_label,
        "timepoint_range": timepoint_range,
    }
    for label in feature_labels:
        print(label)
        df[label + ": score"] = get_feature_scores(df, label, **scoring_kwargs)
    return df


def get_sgrnadf_from_scoredf(
    scoredf, feature_labels, time_label="final cell timepoints list"
):
    """Collapse a per-trench score table into one row per sgRNA.

    For each sgRNA group the result holds the list of trench ids, the
    concatenated score arrays of every feature, the concatenated timepoints,
    the first ``Gene``/``TargetID``/``N Mismatch``/``Category`` value of the
    group, and the number of trenches (``N Observations``).
    """
    by_sgrna = scoredf.groupby("sgRNA")

    def _pooled(column):
        # Concatenate the per-trench list-likes of `column` into one array.
        return by_sgrna.apply(
            lambda grp: np.array(
                [val for trench_vals in grp[column].tolist() for val in trench_vals]
            )
        )

    def _first(column):
        # First value of `column` within each group.
        return by_sgrna.apply(lambda grp: grp[column].iloc[0])

    trench_ids = by_sgrna.apply(lambda grp: grp["phenotype trenchid"].tolist())
    sgrnadf = trench_ids.to_frame().rename(columns={0: "phenotype trenchid"})

    for feature_label in feature_labels:
        score_column = feature_label + ": score"
        sgrnadf[score_column] = _pooled(score_column)

    sgrnadf[time_label] = _pooled(time_label)
    sgrnadf["Gene"] = _first("Gene")
    sgrnadf["TargetID"] = _first("TargetID")
    sgrnadf["N Mismatch"] = _first("N Mismatch")
    sgrnadf["N Observations"] = by_sgrna.apply(
        lambda grp: len(grp["phenotype trenchid"].tolist())
    )
    sgrnadf["Category"] = _first("Category")

    return sgrnadf


# No longer using this
# def filter_strong_KOs(df,sampling_thr = 4, n_strongest=2):

#     for i in range(sampling_thr,0,-1):
#         sampling_mask = df["N Observations"]>=sampling_thr
#         mismatch_series = df[sampling_mask]["N Mismatch"]

#         for n in range(n_strongest,0,-1):
#             if len(mismatch_series)>=n:
#                 keep_indices = np.argsort(mismatch_series)[:n]
#                 out_df = df[sampling_mask].iloc[keep_indices]

#                 return out_df


def normalize_timeseries(feature_vector_series, lmbda=0.5):
    """Rescale each timeseries so its standard deviation is power-compressed.

    ``feature_vector_series`` is converted to an array and axes 1 and 2 are
    swapped; the standard deviation ``sigma`` is then taken along axis 1 of
    the swapped array. Each series is divided by ``sigma / sigma_prime``
    where ``sigma_prime = ((sigma + 1) ** lmbda - 1) / lmbda`` for
    ``lmbda > 0`` and ``log(sigma + 1)`` for ``lmbda == 0``, so that the
    rescaled series has standard deviation ``sigma_prime``.

    Parameters
    ----------
    feature_vector_series : Series of equally shaped 2-D arrays.
    lmbda : non-negative power-transform parameter.

    Returns
    -------
    ndarray with the swapped-axes layout described above.

    Raises
    ------
    ValueError
        If ``lmbda`` is negative.
    """
    if lmbda < 0.0:
        raise ValueError("lmbda cannot be negative")

    timeseries_arr = np.swapaxes(np.array(feature_vector_series.tolist()), 1, 2)
    sigma = np.std(timeseries_arr, axis=1)
    if lmbda > 0.0:
        sigma_prime = ((sigma + 1) ** lmbda - 1) / lmbda  ##yeo-johnson
    else:
        sigma_prime = np.log(sigma + 1)

    # sigma / sigma_prime tends to 1 as sigma -> 0, so a constant series
    # (sigma == 0) is left unscaled instead of becoming NaN through 0 / 0.
    normalizer = np.ones_like(sigma_prime, dtype=float)
    np.divide(sigma, sigma_prime, out=normalizer, where=(sigma_prime != 0))

    normalized_timeseries = timeseries_arr / normalizer[:, np.newaxis, :]
    return normalized_timeseries

### Initial Data Processing

Here, I am going to try to replicate (to some extent) the corrections from "Genomewide phenotypic analysis of growth, cell morphogenesis, and cell cycle events in Escherichia coli"

#### Start Dask

In [None]:
# Base path of the run; the dask working directory below is created under it.
headpath = (
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Barcodes"
)

In [None]:
# Build a trenchripper dask controller and start it.
# NOTE(review): `local=False` presumably requests a batch-scheduler cluster
# rather than a local one — confirm against trenchripper's dask_controller.
dask_controller = tr.trcluster.dask_controller(
    walltime="04:00:00",
    local=False,
    n_workers=20,
    memory="4GB",
    working_directory=headpath + "/dask",
)
dask_controller.startdask()

In [None]:
# Show the dashboard of the dask controller started above.
dask_controller.displaydashboard()

#### Import Dataframe

In [None]:
# Load the pickled lineage table.
final_output_df_pd = pd.read_pickle(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/2021-07-26_lDE20_Lineage_Analysis.pkl"
)
# Drop rows that have no "final cell timepoints list" entry.
final_output_df_pd = final_output_df_pd[
    ~final_output_df_pd["final cell timepoints list"].isna()
]

#### Filter for "Normal" Sizes at Start

1) Fit a gaussian model to each of the specified feature params during the first t timepoints of the experiment (using a subsample for speed) 
2) Compute a normalized probability trenchwise for these features under the gaussian model, during the first t timepoints of the experiment
3) Eliminate trenches that are under some p percentile value of this probability for each feature
4) Display histograms for each property as well as the resulting threshold

Note that these features should be the only features examined in the resulting analysis. For the notebook, I am looking at:
- Birth length (Lb)
- Division length (Ld)
- Mean Area Increment
- Mean Length Increment
- Mean Width
- Cell cycle duration (Delta t)
- Mean mCherry Intensity

In [None]:
# Timepoints strictly below this cutoff count as "early".
early_timepoint_cutoff = 30
# Fraction of rows sampled when fitting each feature's Gaussian.
gaussian_subsample = 0.2
# Percentile of the per-feature probability used as the rejection threshold.
percentile_threshold = 10

# Features screened by the filter.
filter_params = [
    "Lb list",
    "Ld list",
    "Mean Area Increment list",
    "Mean Length Increment list",
    "Mean Width list",
    "Mean mCherry Intensity list",
    "Delta t list",
]

final_output_df_pd_dask = dd.from_pandas(final_output_df_pd, npartitions=200).persist()
dask.distributed.wait(final_output_df_pd_dask)
# Per-row boolean mask of early timepoints. Entries whose type is not exactly
# `int` are replaced by 10000000 so they never pass the cutoff.
# NOTE(review): `type(item) is int` also rejects NumPy integer scalars —
# confirm the lists really hold Python ints.
final_output_df_pd_dask["Early Timepoint Mask"] = final_output_df_pd_dask[
    "cell timepoints list"
].apply(
    lambda x: np.array([item if (type(item) is int) else 10000000 for item in x])
    < early_timepoint_cutoff,
    meta=(None, "object"),
)

for filter_param in filter_params:
    # Early values of this feature per row; rows whose entry is not a list
    # contribute an empty array.
    early_param_series = final_output_df_pd_dask.apply(
        lambda x: np.array(x[filter_param])[x["Early Timepoint Mask"]]
        if type(x[filter_param]) is list
        else np.array([]),
        axis=1,
        meta=(None, "object"),
    )
    # Pool the early values of a random subsample of rows for the fit.
    all_param_values = [
        val
        for item in early_param_series.sample(frac=gaussian_subsample)
        .compute()
        .tolist()
        for val in item
    ]
    # Fit a normal distribution and freeze it with the fitted loc/scale.
    gaussian_fit = sp.stats.norm.fit(all_param_values)
    gaussian_fit = sp.stats.norm(loc=gaussian_fit[0], scale=gaussian_fit[1])

    # exp(mean log-density) = geometric mean of the densities of a row's
    # early values under the fitted Gaussian.
    # NOTE(review): the right-hand side is a persisted dask Series assigned
    # into a pandas DataFrame; verify the values line up with the pandas
    # index (an explicit `.compute()` may be intended).
    final_output_df_pd[filter_param + ": Probability"] = early_param_series.apply(
        lambda x: np.exp(np.sum(gaussian_fit.logpdf(x)) / len(x)), meta=float
    ).persist()

plt.figure(figsize=(22, 16))
query_list = []
for i, filter_param in enumerate(filter_params):
    # Threshold = requested percentile of this feature's probability (NaNs ignored).
    prob_threshold = np.nanpercentile(
        final_output_df_pd[filter_param + ": Probability"].tolist(),
        percentile_threshold,
    )
    # Keep only rows strictly above the threshold.
    query = "`" + filter_param + ": Probability` > " + str(prob_threshold)
    query_list.append(query)

    min_v, max_v = np.min(final_output_df_pd[filter_param + ": Probability"]), np.max(
        final_output_df_pd[filter_param + ": Probability"]
    )

    # One panel per feature: rejected rows first, kept rows second, on shared bins.
    plt.subplot(3, 5, i + 1)
    plt.title(filter_param)
    plt.hist(
        final_output_df_pd[
            final_output_df_pd[filter_param + ": Probability"] < prob_threshold
        ][filter_param + ": Probability"].tolist(),
        bins=50,
        range=(min_v, max_v),
    )
    plt.hist(
        final_output_df_pd[
            final_output_df_pd[filter_param + ": Probability"] >= prob_threshold
        ][filter_param + ": Probability"].tolist(),
        bins=50,
        range=(min_v, max_v),
    )
plt.show()

# A row must pass the threshold of every feature to be kept.
compiled_query = " and ".join(query_list)
final_output_df_pd_filtered = final_output_df_pd.query(compiled_query)

In [None]:
# Fraction of rows that survived the filter.
len(final_output_df_pd_filtered) / len(final_output_df_pd)

### Normalize Properties

1) Yeo-Johnson transform the data as before (this time I am omitting the label for simplicity).
2) Convert transformed values to time-dependent s-scores using the following formula:

$$ z(i,k,t) = 1.35 \times \frac{median_{t\in \tau}(F_{i,t})}{iqr_{t\in \tau}(F_{i,t})}\Bigg(\frac{F_{i,k,t}}{median_{t\in \tau}(F_{i,k,t})} - 1\Bigg) $$

where $F_{i,k,t}$ are the feature values for feature i, trench k at time t. $\tau$ are the initial pre-induction timepoints. 

Essentially this is a z-score that uses the more outlier-robust median and interquartile range to define differences from normal behavior. The 1.35 factor scales the values such that the scores represent the number of standard deviations from the mean for a normal distribution (for which the IQR is about 1.35 standard deviations). Finally, the values are normalized trenchwise by their initial behavior through the $median_{t\in \tau}(F_{i,k,t})$ factor.

In [None]:
# Features that are Yeo-Johnson transformed and then scored.
params_to_transform = [
    "Lb list",
    "Ld list",
    "delL list",
    "Mean Area Increment list",
    "Mean Length Increment list",
    "Mean Width list",
    "Mean mCherry Intensity list",
    "Delta t list",
]
# Fraction of rows sampled when estimating each Yeo-Johnson lambda.
yeo_subsample = 0.1

final_output_df_pd_filtered_dask = dd.from_pandas(
    final_output_df_pd_filtered, npartitions=100
).persist()
dask.distributed.wait(final_output_df_pd_filtered_dask)

for i, param in enumerate(params_to_transform):
    # Pool the values of a random subsample of rows.
    all_param_values = [
        float(val)
        for item in final_output_df_pd_filtered_dask[param]
        .sample(frac=yeo_subsample)
        .compute()
        .tolist()
        for val in item
    ]
    # Estimate lambda on the pooled subsample, then transform every row with
    # that fixed lambda. The transformed arrays overwrite the original column.
    l_norm = sp.stats.yeojohnson_normmax(all_param_values)
    final_output_df_pd_filtered_dask[param] = (
        final_output_df_pd_filtered_dask[param]
        .apply(
            lambda x: sp.stats.yeojohnson(np.array(x).astype(float), lmbda=l_norm),
            meta="object",
        )
        .persist()
    )
# Bring the transformed table back into pandas.
final_output_df_pd_filtered = final_output_df_pd_filtered_dask.compute()

# Per-trench scores, then one row per sgRNA with pooled scores.
scoredf = get_all_feature_scores(final_output_df_pd_filtered, params_to_transform)
sgrnadf = get_sgrnadf_from_scoredf(scoredf, params_to_transform)

### sgRNA Effect Size Filtering (within Gene groups)

1) Threshold sgRNAs to include by number of observations
2) Use LOWESS to smooth out score timeseries into 20 point timeseries
    - changing everything after this
3) For each timepoint, measure the euclidean norm of the feature vector and take the maximum over all time as a measure of effect size
4) Threshold sgRNAs for strong effects by applying a threshold to the euclidean norm that will be displayed with histogram
5) Display a histogram for the sgRNA number per gene

In [None]:
from scipy.stats import iqr
from statsmodels.nonparametric.smoothers_lowess import lowess
import sklearn as skl
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance


def timeseries_lowess_reg(df, t_label, y_label, min_tpt, max_tpt, bins, frac=0.33):
    """LOWESS-smooth every row's timeseries onto a common, evenly spaced grid.

    Parameters
    ----------
    df : dask DataFrame (the row-wise ``apply`` is called with ``meta``).
    t_label, y_label : columns holding each row's timepoints and values.
    min_tpt, max_tpt, bins : the grid is ``np.linspace(min_tpt, max_tpt, bins)``.
    frac : ``frac`` argument forwarded to statsmodels' ``lowess``.

    Returns
    -------
    dask Series with one array of ``bins`` fitted values per row.
    """
    # The evaluation grid is identical for all rows, so build it once.
    intervals = np.linspace(min_tpt, max_tpt, num=bins, dtype=float)

    def lowess_reg(x_arr, y_arr):
        # With `xvals` given, lowess returns the fitted values at those points.
        return lowess(y_arr, x_arr, frac=frac, xvals=intervals, it=1)

    # Each row yields an array, so declare an object-dtype result
    # (the previous `meta=float` described a scalar result).
    lowess_result = df.apply(
        lambda x: lowess_reg(x[t_label], x[y_label]),
        axis=1,
        meta=(None, "object"),
    )

    return lowess_result


def get_all_lowess_regs(
    df,
    y_label_list,
    min_tpt,
    max_tpt,
    bins,
    frac=0.33,
    t_label="final cell timepoints list",
    iqr_bins=8,
    iqr_window=3,
):
    """Return a deep copy of ``df`` with one LOWESS trace column per label.

    For every label in ``y_label_list`` a persisted
    ``"LOWESS Trace: <label>"`` column is added, computed by
    ``timeseries_lowess_reg`` from the original ``df``. ``iqr_bins`` and
    ``iqr_window`` are accepted but only referenced by the disabled
    binned-IQR code kept below.
    """
    traced_df = copy.deepcopy(df)

    for y_label in y_label_list:
        trace = timeseries_lowess_reg(
            df, t_label, y_label, min_tpt, max_tpt, bins, frac=frac
        )
        traced_df["LOWESS Trace: " + y_label] = trace.persist()
    #         out_df["Binned IQR: " + y_label] = out_df.apply(lambda x: compute_binned_iqr(x[t_label], x[y_label],min_tpt,max_tpt,iqr_bins,iqr_window), axis=1, meta="object").persist()
    #         interp_series = out_df["Binned IQR: " + y_label].apply(lambda x: interp1d_with_nan(np.linspace(min_tpt,max_tpt,num=iqr_bins),x), meta="object").persist()
    #         out_df["Binned IQR Interpolation: " + y_label] = interp_series.apply(lambda x: x(np.linspace(min_tpt,max_tpt,num=bins)), meta="object").persist()
    #         out_df.apply(lambda x: x["Binned IQR Interpolation: " + y_label](np.linspace(min_tpt,max_tpt,num=bins)), axis=1, meta="object").persist()

    return traced_df


# def compute_binned_std(time_arr, val_arr, min_tpt, max_tpt, bins):
#     intervals = np.linspace(min_tpt,max_tpt,num=bins,dtype=float)
#     lower_bounds = intervals[:-1]
#     upper_bounds = intervals[1:]
#     interval_assign = np.where(np.logical_and(np.greater.outer(time_arr,lower_bounds),np.less_equal.outer(time_arr,upper_bounds)))[1]
#     std_devs = [np.nanstd(val_arr[interval_assign==i]) for i in range(len(intervals)-1)]
#     return std_devs


def cumulative_std(val_arr):
    """Compute windowed interquartile ranges over ``val_arr``.

    Despite the name, the statistic computed is ``iqr`` (NaNs omitted), not a
    standard deviation.

    NOTE(review): ``iqr_window_radius`` is not a parameter and is not defined
    anywhere in the visible part of this file, so calling this function
    presumably raises NameError unless the name exists as a notebook global.
    NOTE(review): ``interval_assign`` is built from the positions where
    ``intervals[:-1] > intervals[1:]`` (outer comparison); its length does
    not generally equal ``len(val_arr)``, so the boolean mask applied to
    ``val_arr`` looks inconsistent — confirm the intended windowing.
    """
    # Integer positions 0 .. len(val_arr) - 1.
    intervals = np.array(range(len(val_arr)), dtype=int)
    # Column indices j of all pairs (i, j) with intervals[i] > intervals[j + 1].
    interval_assign = np.where(np.greater.outer(intervals[:-1], intervals[1:]))[1]
    iqrs = np.array(
        [
            iqr(
                val_arr[
                    (interval_assign <= i + iqr_window_radius)
                    * (interval_assign >= (i - iqr_window_radius))
                ],
                nan_policy="omit",
            )
            for i in range(len(intervals) - 1)
        ]
    )
    return iqrs


# def interp1d_with_nan(timepoint_arr,timeseries_arr):
#     '''
#     interpolate to fill nan values #https://newbedev.com/interpolate-nan-values-in-a-numpy-array
#     '''
#     good_vals = np.where(np.isfinite(timeseries_arr))[0]
#     filtered_timepoint_arr,filtered_timeseries_arr = timepoint_arr[good_vals],timeseries_arr[good_vals]
#     interp = sp.interpolate.interp1d(filtered_timepoint_arr, filtered_timeseries_arr, kind='cubic')
#     return interp

## Attempt to find Breakpoints

In [None]:
# Minimum number of observations an sgRNA needs to be kept.
N_Observations_thr = 10

# Keep sgRNAs with at least that many observations.
sgrnadf_wellsampled = sgrnadf[sgrnadf["N Observations"] >= N_Observations_thr]

In [None]:
# LOWESS evaluation grid: `bins` evenly spaced points on [min_tpt, max_tpt].
min_tpt = 0
max_tpt = 143
bins = 60
# Smoothing span expressed as four grid bins.
frac = (1 / bins) * 4
iqr_bins = 8  # optimized on minC
iqr_window = 3

sgrnadf_wellsampled_dask = dd.from_pandas(
    sgrnadf_wellsampled, npartitions=100
).persist()
dask.distributed.wait(sgrnadf_wellsampled_dask)

# Smooth every score column by name. The previous positional slice
# `columns[1:8]` covered only 7 columns, but `params_to_transform` has 8
# entries, so the last score column never received a LOWESS trace even
# though later cells read one trace per entry of `params_to_transform`.
lowess_trace_df = get_all_lowess_regs(
    sgrnadf_wellsampled_dask,
    [param + ": score" for param in params_to_transform],
    min_tpt,
    max_tpt,
    bins,
    frac=frac,
    iqr_bins=iqr_bins,
    iqr_window=iqr_window,
).compute()

In [None]:
# NOTE(review): this name is first assigned in a later cell of this file, so
# this cell presumably relied on out-of-order notebook execution.
lowess_trace_df_nan_filtered

In [None]:
# NOTE(review): `params` is first assigned in a later cell of this file.
params

In [None]:
# lowess_params = ['LOWESS Trace: ' + param + ': score' for param in params_to_transform]
# feature_vector_series = lowess_trace_df.apply(lambda x: np.array(x[lowess_params].tolist()), axis=1)
# lowess_trace_df["Feature Vector"] = feature_vector_series
# lowess_trace_df_nan_filtered = lowess_trace_df[~lowess_trace_df["Feature Vector"].apply(lambda x: np.any(np.isnan(x)))]

# Column names of the smoothed traces and of the raw pooled scores.
lowess_params = ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
params = [param + ": score" for param in params_to_transform]
# Per sgRNA: stack the raw pooled score arrays of all features into one array.
feature_vector_sampled_series = lowess_trace_df.apply(
    lambda x: np.array(x[params].tolist()), axis=1
)
lowess_trace_df["Sampled Feature Vector"] = feature_vector_sampled_series
# Per sgRNA: stack the LOWESS traces of all features into one array.
feature_vector_series = lowess_trace_df.apply(
    lambda x: np.array(x[lowess_params].tolist()), axis=1
)
lowess_trace_df["Feature Vector"] = feature_vector_series
# lowess_trace_df_nan_filtered = lowess_trace_df[~lowess_trace_df["Sampled Feature Vector"].apply(lambda x: np.any(np.isnan(x)))]
# Drop sgRNAs whose stacked LOWESS traces contain any NaN.
lowess_trace_df_nan_filtered = lowess_trace_df[
    ~lowess_trace_df["Feature Vector"].apply(lambda x: np.any(np.isnan(x)))
]

In [None]:
# Pick one sgRNA to inspect.
# NOTE(review): `[idx]` on a Series looks up by index label when the index is
# integer-typed; use `.iloc[idx]` if a row position is meant — confirm.
idx = 984
timeseries_vector = lowess_trace_df_nan_filtered["Feature Vector"][idx]
times = lowess_trace_df_nan_filtered["final cell timepoints list"][idx]

In [None]:
# Raw "Lb list: score" samples (scatter) against the first row of the stacked
# LOWESS feature vector (orange line) for the selected sgRNA.
print(lowess_trace_df_nan_filtered["N Observations"][idx])
plt.scatter(times, lowess_trace_df_nan_filtered["Lb list: score"][idx], s=2)
plt.plot(
    np.linspace(min_tpt, max_tpt, bins),
    timeseries_vector[0],
    c="tab:orange",
    linewidth=3,
)
plt.ylim(-3.0, 2.0)
plt.show()

In [None]:
import numpy as np
import matplotlib.pylab as plt
import ruptures as rpt

# # creation of data
# n = 500  # number of samples
# n_bkps, sigma = 3, 5  # number of change points, noise standard deviation
# signal, bkps = rpt.pw_constant(n, 1, n_bkps, noise_std=sigma)

In [None]:
import numpy as np
import matplotlib.pylab as plt
import ruptures as rpt

# # creation of data
# n = 500  # number of samples
# n_bkps, sigma = 3, 5  # number of change points, noise standard deviation
# signal, bkps = rpt.pw_constant(n, 1, n_bkps, noise_std=sigma)
# Penalty passed to `predict`.
penalty = 2
# change point detection
model = "rbf"  # "l1", "rbf", "linear", "normal", "ar",...
# algo = rpt.Binseg(model=model,min_size=5,jump=1).fit(timeseries_vector[:,5:-5].T)
# my_bkps = algo.predict(epsilon=3 * timeseries_vector[:,5:-5].shape[1] * (sigma ** 2))
# my_bkps = algo.predict(pen=np.log(timeseries_vector[:,5:-5].shape[1]) * timeseries_vector[:,5:-5].shape[0] * sigma ** 2)
# Fit on the traces with 5 points trimmed from each end; the transpose puts
# time on the first axis.
algo = rpt.Binseg(model=model, min_size=5, jump=1).fit(timeseries_vector[:, 5:-5].T)
my_bkps = algo.predict(pen=penalty)

# show results
# Only the first feature row is plotted; the same breakpoints are passed for
# both breakpoint arguments.
rpt.show.display(timeseries_vector[0, 5:-5], my_bkps, my_bkps, figsize=(10, 6))
plt.show()

In [None]:
model = "rbf"

# Repeat the Binseg detection for idx 0..999 and collect the breakpoints.
# `penalty` comes from an earlier cell.
# NOTE(review): `[idx]` may be a label lookup on the filtered frame and can
# raise KeyError for labels that were filtered out — confirm.
my_bkps_list = []
for idx in range(1000):
    timeseries_vector = lowess_trace_df_nan_filtered["Feature Vector"][idx]
    times = lowess_trace_df_nan_filtered["final cell timepoints list"][idx]
    algo = rpt.Binseg(model=model, min_size=5, jump=1).fit(timeseries_vector[:, 5:-5].T)
    my_bkps = algo.predict(pen=penalty)
    # Drop the last entry (presumably the end-of-signal index that ruptures
    # appends — confirm).
    my_bkps = my_bkps[:-1]
    my_bkps_list.append(my_bkps)

In [None]:
# Breakpoints of the most recently processed trace.
print(my_bkps)

In [None]:
# Histogram of all collected breakpoints, pooled over traces.
plt.hist([val for item in my_bkps_list for val in item], bins=20)

In [None]:
# A POSSIBLY LOWESS-free approach

In [None]:
# NOTE(review): `sorted_timeseries_vector` is first assigned in a later cell
# of this file.
sorted_timeseries_vector[0]

In [None]:
# Select one sgRNA and order its raw (unsmoothed) samples by time.
idx = 546
timeseries_vector = lowess_trace_df_nan_filtered["Sampled Feature Vector"][idx]
lowess_timeseries_vector = lowess_trace_df_nan_filtered["Feature Vector"][idx]
times = lowess_trace_df_nan_filtered["final cell timepoints list"][idx]
# Reorder the columns (samples) of the feature array by increasing timepoint.
sorted_timeseries_vector = timeseries_vector[:, np.argsort(times)]
sorted_times = times[np.argsort(times)]

In [None]:
# Raw samples of the first feature (scatter) against its LOWESS trace
# (orange line) for the selected sgRNA.
print(lowess_trace_df_nan_filtered["N Observations"][idx])
plt.scatter(times, timeseries_vector[0], s=2)
plt.plot(
    np.linspace(min_tpt, max_tpt, bins),
    lowess_timeseries_vector[0],
    c="tab:orange",
    linewidth=3,
)
plt.ylim(-4.0, 6.0)
plt.show()

In [None]:
# NOTE(review): `lowess_len` is first assigned in a later cell of this file.
lowess_len

In [None]:
# NOTE(review): `scaled_breakpoints` is first assigned in a later cell of this file.
len(scaled_breakpoints)

In [None]:
# Penalty passed to `predict`.
penalty = 200
# change point detection
# model = "linear"  # "l1", "rbf", "linear", "normal", "ar",...
# algo = rpt.Binseg(model=model,min_size=5,jump=1).fit(timeseries_vector[:,5:-5].T)
# my_bkps = algo.predict(epsilon=3 * timeseries_vector[:,5:-5].shape[1] * (sigma ** 2))
# my_bkps = algo.predict(pen=np.log(timeseries_vector[:,5:-5].shape[1]) * timeseries_vector[:,5:-5].shape[0] * sigma ** 2)
# Detect change points directly on the time-sorted raw samples.
algo = rpt.KernelCPD(kernel="linear", min_size=50).fit(sorted_timeseries_vector.T)
my_bkps = algo.predict(pen=penalty)
# Convert breakpoint sample indices into timepoints, then rescale the
# timepoints to the length of the LOWESS trace for plotting.
scaled_breakpoints = sorted_times[np.array(my_bkps) - 1]
lowess_len = len(lowess_timeseries_vector[0])
scaled_breakpoints = scaled_breakpoints * (lowess_len / max_tpt)

# show results
rpt.show.display(sorted_timeseries_vector[0], my_bkps, my_bkps, figsize=(10, 6))
plt.show()

# Same breakpoints drawn over the LOWESS trace of the first feature.
rpt.show.display(
    (lowess_timeseries_vector[0]),
    list(scaled_breakpoints),
    list(scaled_breakpoints),
    figsize=(10, 6),
)
plt.show()

In [None]:
# Repeat the KernelCPD detection for idx 0..299. For each trace, store the
# breakpoint timepoints and the latest breakpoint (0 when there is none).
# `penalty` comes from an earlier cell.
my_bkps_list = []
max_bkps_list = []
for idx in range(300):
    timeseries_vector = lowess_trace_df_nan_filtered["Sampled Feature Vector"][idx]
    lowess_timeseries_vector = lowess_trace_df_nan_filtered["Feature Vector"][idx]
    times = lowess_trace_df_nan_filtered["final cell timepoints list"][idx]
    sorted_timeseries_vector = timeseries_vector[:, np.argsort(times)]
    sorted_times = times[np.argsort(times)]

    algo = rpt.KernelCPD(kernel="linear", min_size=10).fit(sorted_timeseries_vector.T)
    my_bkps = algo.predict(pen=penalty)
    #     my_bkps = my_bkps[:-1]
    # Map breakpoint sample indices to timepoints, then drop the last entry.
    scaled_breakpoints = sorted_times[np.array(my_bkps) - 1]
    scaled_breakpoints = scaled_breakpoints[:-1]
    if len(scaled_breakpoints) > 0:
        max_bkps_list.append(np.max(scaled_breakpoints))
    else:
        max_bkps_list.append(0)
    my_bkps_list.append(scaled_breakpoints)

In [None]:
# Histogram of all breakpoint timepoints, pooled over traces.
plt.hist([val for item in my_bkps_list for val in item], bins=10)
plt.xlim(0, 143)

In [None]:
# Histogram of the latest breakpoint per trace (0 means no breakpoint).
plt.hist(max_bkps_list, bins=15)
plt.xlim(0, 143)

In [None]:
## NOW NEED A PRINCIPLED EVAL

In [None]:
# Breakpoints of the most recently processed trace.
my_bkps

In [None]:
# Rebuild the stacked LOWESS feature vectors and the NaN-free subset.
lowess_params = ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
feature_vector_series = lowess_trace_df.apply(
    lambda x: np.array(x[lowess_params].tolist()), axis=1
)
lowess_trace_df["Feature Vector"] = feature_vector_series
# Drop sgRNAs whose stacked traces contain any NaN.
lowess_trace_df_nan_filtered = lowess_trace_df[
    ~lowess_trace_df["Feature Vector"].apply(lambda x: np.any(np.isnan(x)))
]

In [None]:
# Stack all per-sgRNA feature vectors into one array; later cells index it as
# [row, feature, timepoint].
feature_vect = np.array(feature_vector_series.tolist())
# Reverse along the last (time) axis so that index 0 is the final timepoint.
# The previous `feature_vect[::-1]` reversed the row order instead, so the
# "last 5 timepoints" taken afterwards were really the first five.
reversed_feature_vect = feature_vect[:, :, ::-1]

In [None]:
# First five entries along the last axis of the reversed array.
last_5_timepoints = reversed_feature_vect[:, :, :5]
# Centre those five points on their own mean, per row and feature.
mean_last_5_timepoints = np.mean(last_5_timepoints, axis=2)
centered_last_5_timepoints = (
    last_5_timepoints - mean_last_5_timepoints[:, :, np.newaxis]
)
# Largest signed (not absolute) deviation from that mean.
max_deviation = np.max(centered_last_5_timepoints, axis=2)

In [None]:
# Distribution of the maximum deviation for the first feature.
plt.hist(max_deviation[:, 0].flatten(), bins=50)

In [None]:
# Reverse along the last (time) axis so that index 0 is the final timepoint.
# The previous `feature_vect[::-1]` reversed the row order, not time.
reversed_feature_vect = feature_vect[:, :, ::-1]
# First difference along time, median-filtered with a window of 5 along time only.
derivative_vect = reversed_feature_vect[:, :, 1:] - reversed_feature_vect[:, :, :-1]
smoothed_derivative_vect = sp.signal.medfilt(derivative_vect, kernel_size=(1, 1, 5))
abs_smoothed_derivative_vect = np.abs(smoothed_derivative_vect)

In [None]:
# Absolute smoothed derivative of the first feature, one line per row (up to 2000 rows).
plt.plot(abs_smoothed_derivative_vect[0:2000, 0].T)
plt.show()

In [None]:
# Distribution of the absolute smoothed derivative of the first feature.
plt.hist(abs_smoothed_derivative_vect[:, 0].flatten(), bins=50)
plt.show()

In [None]:
# Cumulative variance of each LOWESS trace, accumulated backwards from the
# end of the trace. (The next cell in this file repeats the same code.)
lowess_params = ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
mean_vector_df = lowess_trace_df[lowess_params]
# Explode every trace column so each row holds one point of one sgRNA's trace.
mean_vector_df = pd.concat(
    [mean_vector_df[lowess_param].explode() for lowess_param in lowess_params], axis=1
)
# Position within the trace, counted from the start and from the end.
mean_vector_df["LOWESS Timepoint"] = mean_vector_df.groupby("sgRNA").cumcount()
mean_vector_df["Inverted LOWESS Timepoint"] = (bins - 1) - mean_vector_df.groupby(
    "sgRNA"
).cumcount()
# Sort so that, within each sgRNA, rows run from the last point backwards.
mean_vector_df = (
    mean_vector_df.reset_index(drop=False)
    .set_index(["sgRNA", "Inverted LOWESS Timepoint"], drop=True)
    .sort_index()
)
# Expanding variance within each sgRNA.
cumulative_var_df = mean_vector_df.groupby("sgRNA").apply(
    lambda x: x.expanding(1).var()
)
# Collect each sgRNA's variances into a list, skipping the first entry
# (presumably NaN, since it covers a single point — confirm).
var_from_end_df = pd.concat(
    [
        cumulative_var_df[lowess_param].groupby("sgRNA").apply(lambda x: x.tolist()[1:])
        for lowess_param in lowess_params
    ],
    axis=1,
)

In [None]:
# Cumulative variance of each LOWESS trace, accumulated backwards from the
# end of the trace. (This cell repeats the code of the previous cell.)
lowess_params = ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
mean_vector_df = lowess_trace_df[lowess_params]
# Explode every trace column so each row holds one point of one sgRNA's trace.
mean_vector_df = pd.concat(
    [mean_vector_df[lowess_param].explode() for lowess_param in lowess_params], axis=1
)
# Position within the trace, counted from the start and from the end.
mean_vector_df["LOWESS Timepoint"] = mean_vector_df.groupby("sgRNA").cumcount()
mean_vector_df["Inverted LOWESS Timepoint"] = (bins - 1) - mean_vector_df.groupby(
    "sgRNA"
).cumcount()
# Sort so that, within each sgRNA, rows run from the last point backwards.
mean_vector_df = (
    mean_vector_df.reset_index(drop=False)
    .set_index(["sgRNA", "Inverted LOWESS Timepoint"], drop=True)
    .sort_index()
)
# Expanding variance within each sgRNA.
cumulative_var_df = mean_vector_df.groupby("sgRNA").apply(
    lambda x: x.expanding(1).var()
)
# Collect each sgRNA's variances into a list, skipping the first entry
# (presumably NaN, since it covers a single point — confirm).
var_from_end_df = pd.concat(
    [
        cumulative_var_df[lowess_param].groupby("sgRNA").apply(lambda x: x.tolist()[1:])
        for lowess_param in lowess_params
    ],
    axis=1,
)

In [None]:
# Stack the per-sgRNA cumulative-variance lists of the Lb trace into one array.
var_over_time_arr = np.array(var_from_end_df["LOWESS Trace: Lb list: score"].tolist())

In [None]:
# Compare the distribution of cumulative variance at two window lengths.
# NOTE(review): because the first list entry was dropped upstream, column k
# presumably covers the last k + 2 LOWESS points; the original trailing
# labels ("Last timepoint" / "First timepoint") did not match columns 2 and 18.
plt.hist(
    var_over_time_arr[:, 2],
    bins=50,
    histtype="step",
    linewidth=3,
    density=False,
    range=(0.05, 0.5),
)  ## column 2: short window at the end of the trace
plt.hist(
    var_over_time_arr[:, 18],
    bins=50,
    histtype="step",
    linewidth=3,
    density=False,
    range=(0.05, 0.5),
)  ## column 18: longer window reaching further back
plt.show()

In [None]:
# Treat column 4 as the positive class and column 18 as the negative class,
# and use the negated variance as the score (lower variance -> more positive).
probas_pred = -np.concatenate([var_over_time_arr[:, 4], var_over_time_arr[:, 18]])
y_true = np.array(
    [1] * len(var_over_time_arr[:, 4]) + [0] * len(var_over_time_arr[:, 18])
)

precision, recall, thresholds = skl.metrics.precision_recall_curve(y_true, probas_pred)
# F1 is undefined where precision + recall == 0; score those points as 0 so
# that argmax cannot land on a NaN.
pr_sum = precision + recall
F1 = np.divide(
    2 * (precision * recall),
    pr_sum,
    out=np.zeros_like(pr_sum, dtype=float),
    where=pr_sum > 0,
)
# The final (precision=1, recall=0) point has no associated threshold, so it
# is excluded from the search.
max_F1_idx = np.argmax(F1[:-1])
max_F1_prec, max_F1_recall, max_F1_thr = (
    precision[max_F1_idx],
    recall[max_F1_idx],
    -thresholds[max_F1_idx],  # undo the sign flip applied to the scores
)

In [None]:
# Precision-recall curve of the variance-based classifier.
plt.plot(recall, precision)
plt.xlabel("Recall")
plt.ylabel("Precision")
plt.show()

In [None]:
# Precision, recall and variance threshold at the best F1.
max_F1_prec, max_F1_recall, max_F1_thr

In [None]:
# Variance threshold at the best F1.
max_F1_thr

In [None]:
# NOTE(review): `del_var` and `del_var_init` are not defined anywhere in the
# visible part of this file; the `plt.plot()` calls take no data.
plt.hist(del_var, bins=50, histtype="step", linewidth=3, range=(0.1, 0.5))
plt.plot()
plt.hist(del_var_init, bins=50, histtype="step", linewidth=3, range=(0.1, 0.5))
plt.plot()

In [None]:
# Distribution of cumulative variance at three window lengths.
# NOTE(review): `var_over_time_arr_control` is not defined anywhere in the
# visible part of this file. If it is built like `var_over_time_arr`, column k
# presumably covers the last k + 2 LOWESS points, which does not match the
# original "Last/5th to last/First timepoint" labels — confirm.
plt.hist(
    var_over_time_arr_control[:, 0],
    bins=50,
    histtype="step",
    linewidth=3,
    density=False,
    range=(0.1, 0.3),
)  ## column 0: shortest window at the end of the trace
plt.hist(
    var_over_time_arr_control[:, 4],
    bins=50,
    histtype="step",
    linewidth=3,
    density=False,
    range=(0.1, 0.3),
)  ## column 4: intermediate window
plt.hist(
    var_over_time_arr_control[:, 18],
    bins=50,
    histtype="step",
    linewidth=3,
    density=False,
    range=(0.1, 0.3),
)  ## column 18: longest of the three windows
plt.show()

In [None]:
# 0.15 seems like a reasonable threshold

In [None]:
# NOTE(review): no "Mean Vector" column is created anywhere in the visible
# part of this file.
lowess_trace_df["Mean Vector"][0].shape

In [None]:
# Toy reproduction of the indexing used in `cumulative_std`.
val_arr = np.array([0, 1, 2])
intervals = np.array(range(len(val_arr)), dtype=int)
# For increasing `intervals` every comparison is False, so this is empty.
interval_assign = np.where(np.greater.outer(intervals[:-1], intervals[1:]))[1]

In [None]:
# Inspect the toy positions.
intervals

In [None]:
# Inspect the pairwise comparison matrix behind `interval_assign`.
np.greater.outer(intervals[:-1], intervals[1:])

In [None]:
# NOTE(review): incomplete expression (presumably a leftover from
# tab-completion); pandas has no attribute `ex`, so this raises AttributeError.
pd.ex

## Piecewise Linear Breakpoints

In [None]:
def get_timepoint_values(
    df,
    label,
    min_timepoint,
    max_timepoint,
    time_label="final cell timepoints list",
    flatten_vals=True,
):
    """Select, row by row, the values of ``label`` inside a time window.

    Each row of ``df`` is expected to hold two parallel list-likes: the
    values under ``label`` and their timepoints under ``time_label``. A value
    is kept when ``min_timepoint <= timepoint <= max_timepoint``.

    Returns a flat list pooling the kept values of all rows when
    ``flatten_vals`` is true, otherwise the per-row Series of arrays.
    """

    def _values_in_window(row):
        timepoints = np.array(row[time_label])
        in_window = (timepoints >= min_timepoint) & (timepoints <= max_timepoint)
        return np.array(row[label])[in_window]

    windowed = df.apply(_values_in_window, axis=1)
    if not flatten_vals:
        return windowed

    pooled = []
    for row_vals in windowed.tolist():
        pooled.extend(row_vals)
    return pooled


# def get_feature_stats(df, feature_label, min_timepoint, max_timepoint):
#     feature_vals = get_timepoint_values(df, feature_label, min_timepoint, max_timepoint)
#     feature_median = np.median(feature_vals)
#     feature_iqr = sp.stats.iqr(feature_vals)
#     return feature_median,feature_iqr


def get_feature_median_bytrench(df, feature_label, min_timepoint, max_timepoint):
    """Return the per-row (trench) NaN-ignoring median of a feature in a time window.

    Parameters
    ----------
    df : DataFrame whose rows hold list-like feature values and timepoints.
    feature_label : column holding the feature values.
    min_timepoint, max_timepoint : inclusive bounds of the time window.

    Returns
    -------
    Series indexed like ``df`` with one median per row.
    """
    # Use the frame that was passed in; the previous version silently read
    # the notebook global `final_output_df_pd_filtered` and ignored `df`.
    masked_label_series = get_timepoint_values(
        df,
        feature_label,
        min_timepoint,
        max_timepoint,
        flatten_vals=False,
    )
    trench_median_series = masked_label_series.apply(lambda x: np.nanmedian(x))
    return trench_median_series


def compute_foldchange(
    df,
    feature_label,
    trench_median_series,
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Subtract each row's baseline median from its feature values.

    Parameters
    ----------
    df : DataFrame whose rows hold list-like feature values and timepoints.
    feature_label : column holding the feature values.
    trench_median_series : per-row baseline median, indexed like ``df``.
    time_label : column holding the timepoints parallel to the values.
    timepoint_range : optional ``(min, max)`` inclusive window; when given,
        only values whose timepoint lies in the window are returned.

    Returns
    -------
    Series of arrays (``values - row_median``) indexed like ``df``.
    """
    if timepoint_range is None:
        feature_arrays = df[feature_label].apply(lambda x: np.array(x))
    else:
        # The mask needs both the values and the timepoints of a row, so the
        # apply has to run over rows of the frame (the previous version ran
        # over the feature column alone and then indexed each value list with
        # `time_label`, which raises TypeError).
        def _values_in_range(row):
            timepoints = np.array(row[time_label])
            in_range = (timepoints >= timepoint_range[0]) & (
                timepoints <= timepoint_range[1]
            )
            return np.array(row[feature_label])[in_range]

        feature_arrays = df.apply(_values_in_range, axis=1)

    foldchange = feature_arrays - trench_median_series
    return foldchange


def get_feature_foldchange(
    df,
    feature_label,
    init_timepoint_range=(0, 20),
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Compute one feature's change relative to its per-row initial median.

    The per-row medians are taken over ``init_timepoint_range`` and handed to
    ``compute_foldchange`` together with ``time_label`` and
    ``timepoint_range``.
    """
    init_start = init_timepoint_range[0]
    init_stop = init_timepoint_range[1]
    baseline_by_trench = get_feature_median_bytrench(
        df, feature_label, init_start, init_stop
    )
    return compute_foldchange(
        df,
        feature_label,
        baseline_by_trench,
        time_label=time_label,
        timepoint_range=timepoint_range,
    )


def get_all_feature_foldchange(
    df,
    feature_labels,
    init_timepoint_range=(0, 20),
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Add a ``"<feature>: foldchange"`` column to ``df`` for every feature label.

    ``df`` is modified in place and also returned. Each label is printed
    before it is processed.
    """
    foldchange_kwargs = {
        "init_timepoint_range": init_timepoint_range,
        "time_label": time_label,
        "timepoint_range": timepoint_range,
    }
    for label in feature_labels:
        print(label)
        df[label + ": foldchange"] = get_feature_foldchange(
            df, label, **foldchange_kwargs
        )
    return df


def get_sgrnadf_from_folddf(
    folddf, feature_labels, time_label="final cell timepoints list"
):
    """Collapse a per-trench score table into one row per sgRNA.

    For every sgRNA the list-valued score and time columns of its rows are
    concatenated, the first value of each metadata column is kept, and the
    number of contributing rows is recorded as ``"N Observations"``.

    Returns:
        DataFrame indexed by sgRNA.
    """
    grouped = folddf.groupby("sgRNA")

    def _concatenated(column):
        # Flatten the per-row lists of one column into a single array per group.
        return grouped.apply(
            lambda group: np.array(
                [val for item in group[column].tolist() for val in item]
            )
        )

    def _first(column):
        # Take the first row's value of one column for every group.
        return grouped.apply(lambda group: group[column].iloc[0])

    trench_ids = grouped.apply(lambda group: group["phenotype trenchid"].tolist())
    sgrnadf = trench_ids.to_frame().rename(columns={0: "phenotype trenchid"})

    for feature_label in feature_labels:
        score_column = feature_label + ": foldchange"
        sgrnadf[score_column] = _concatenated(score_column)

    sgrnadf[time_label] = _concatenated(time_label)
    for metadata_column in ("Gene", "TargetID", "N Mismatch"):
        sgrnadf[metadata_column] = _first(metadata_column)
    sgrnadf["N Observations"] = grouped.apply(
        lambda group: len(group["phenotype trenchid"].tolist())
    )
    sgrnadf["Category"] = _first("Category")

    return sgrnadf

In [None]:
# List-valued feature columns that get a baseline-subtracted
# "<name>: foldchange" companion column.
params_to_transform = [
    "Lb list",
    "Ld list",
    "delL list",
    "Mean Area Increment list",
    "Mean Length Increment list",
    "Mean Width list",
    "Mean mCherry Intensity list",
    "Delta t list",
]
# Disabled: Yeo-Johnson transform of every feature, fitted on a subsample.
# yeo_subsample = 0.1

# final_output_df_pd_filtered_dask = dd.from_pandas(final_output_df_pd_filtered,npartitions=100).persist()
# dask.distributed.wait(final_output_df_pd_filtered_dask)

# for i,param in enumerate(params_to_transform):
#     all_param_values = [float(val) for item in final_output_df_pd_filtered_dask[param].sample(frac=yeo_subsample).compute().tolist() for val in item]
#     l_norm = sp.stats.yeojohnson_normmax(all_param_values)
#     final_output_df_pd_filtered_dask[param] = final_output_df_pd_filtered_dask[param].apply(lambda x: sp.stats.yeojohnson(np.array(x).astype(float),lmbda = l_norm), meta='object').persist()
# final_output_df_pd_filtered = final_output_df_pd_filtered_dask.compute()

# NOTE: not a fold change right now -- compute_foldchange subtracts the
# baseline median instead of dividing by it.
scoredf = get_all_feature_foldchange(final_output_df_pd_filtered, params_to_transform)
# Aggregate the per-trench scores into one row per sgRNA.
sgrnadf = get_sgrnadf_from_folddf(scoredf, params_to_transform)

In [None]:
# Keep only sgRNAs with more than four contributing rows ("N Observations").
sgrnadf_wellsampled = sgrnadf[sgrnadf["N Observations"] > 4]

In [None]:
# Settings forwarded to get_all_lowess_regs below.
min_tpt = 0
max_tpt = 143
bins = 20
frac = (1 / bins) * 4  # equals 4 / bins; passed on as the `frac` argument
iqr_bins = 8  # optimized on minC
iqr_window = 3

# Partition the table with dask and wait until it is held by the workers.
sgrnadf_wellsampled_dask = dd.from_pandas(
    sgrnadf_wellsampled, npartitions=100
).persist()
dask.distributed.wait(sgrnadf_wellsampled_dask)

# NOTE(review): columns[1:8] is positional. With the eight features in
# params_to_transform it selects only the first seven "...: foldchange"
# columns and leaves out "Delta t list: foldchange" -- confirm this is intended.
lowess_trace_df = get_all_lowess_regs(
    sgrnadf_wellsampled_dask,
    sgrnadf_wellsampled.columns[1:8],
    min_tpt,
    max_tpt,
    bins,
    frac=frac,
    iqr_bins=iqr_bins,
    iqr_window=iqr_window,
).compute()

In [None]:
# Quick look at one "Lb list" LOWESS trace (entry 5 of the column).
plt.plot(lowess_trace_df["LOWESS Trace: Lb list: foldchange"][5])

In [None]:
import numpy as np
import matplotlib.pylab as plt
import ruptures as rpt

# Change-point detection demo on a single LOWESS trace using ruptures.
penalty = 0.0  # NOTE(review): not used by the active code below
model = "linear"  # "l1", "rbf", "linear", "normal", "ar",...

# Column vector (n_samples, 1) built from entry 50 of the "Lb list" trace column.
timeseries_vector = lowess_trace_df["LOWESS Trace: Lb list: foldchange"][50].reshape(
    -1, 1
)
# algo = rpt.Binseg(model=model,min_size=5,jump=1).fit(timeseries_vector[:,5:-5].T)
# my_bkps = algo.predict(epsilon=3 * timeseries_vector[:,5:-5].shape[1] * (sigma ** 2))
# my_bkps = algo.predict(pen=np.log(timeseries_vector[:,5:-5].shape[1]) * timeseries_vector[:,5:-5].shape[0] * sigma ** 2)
# Binary segmentation, asking for exactly one breakpoint.
algo = rpt.Binseg(model=model, min_size=5, jump=1).fit(timeseries_vector)
my_bkps = algo.predict(n_bkps=1)

# show results
rpt.show.display(timeseries_vector, my_bkps, my_bkps, figsize=(10, 6))
plt.show()

In [None]:
### Arbitrary threshold

# NOTE(review): not referenced by the other cells visible here.
steady_state_thr = 7

In [None]:
# Display the first row of the LOWESS trace table.
lowess_trace_df[0:1]

In [None]:
# "Lb list" scores of the first row whose time point lies in [100, 143].
val_arr = np.array(
    get_timepoint_values(lowess_trace_df[0:1], "Lb list: foldchange", 100, 143)
)
# Matching time points as a single-feature column, the shape
# LinearRegression.fit expects for X.
time_arr = np.array(
    get_timepoint_values(lowess_trace_df[0:1], "final cell timepoints list", 100, 143)
).reshape(-1, 1)

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Least-squares line of score against time for the window extracted above.
reg = LinearRegression().fit(time_arr, val_arr)

In [None]:
# Display the fitted slope.
reg.coef_

In [None]:
def get_slope(df, label, time_label, min_timepoint, max_timepoint):
    """Fit a straight line of ``label`` against time and return its slope.

    Values from all rows of ``df`` whose time point lies in
    ``[min_timepoint, max_timepoint]`` are pooled; pairs where either the
    value or the time is NaN are dropped before fitting.

    Args:
        df: DataFrame with list-valued ``label`` and ``time_label`` columns.
        label: Column holding the values to regress.
        time_label: Column holding the time point of every value.
        min_timepoint: Inclusive lower bound of the window.
        max_timepoint: Inclusive upper bound of the window.

    Returns:
        The slope of the least-squares line.
    """
    # Forward ``time_label`` so the window mask is built from the requested
    # time column; previously the helper's default column was always used.
    val_arr = np.array(
        get_timepoint_values(
            df, label, min_timepoint, max_timepoint, time_label=time_label
        )
    )
    time_arr = np.array(
        get_timepoint_values(
            df, time_label, min_timepoint, max_timepoint, time_label=time_label
        )
    )
    good_vals = (~np.isnan(val_arr)) & (~np.isnan(time_arr))
    time_arr = time_arr[good_vals].reshape(-1, 1)
    val_arr = val_arr[good_vals]
    reg = LinearRegression().fit(time_arr, val_arr)
    return reg.coef_[0]

In [None]:
# Slope of the "Lb list" score over time points 100-143, per sgRNA, for the
# first 3000 rows of the trace table.
regs = (
    lowess_trace_df[:3000]
    .groupby("sgRNA")
    .apply(
        lambda x: get_slope(
            x, "Lb list: foldchange", "final cell timepoints list", 100, 143
        )
    )
)

In [None]:
# Keep rows whose late-time slope magnitude is below 0.015.
# NOTE(review): the boolean mask is indexed by sgRNA (groupby result), so this
# relies on lowess_trace_df having a matching sgRNA index -- confirm.
equilibrated_subset = lowess_trace_df[:3000][abs(regs) < 0.015]

In [None]:
# Distribution of absolute slopes, restricted to the 0-0.1 range.
plt.hist(abs(regs), bins=50, range=(0, 0.1))

In [None]:
# Plot entry 5 of the "Lb list" LOWESS trace column.
plt.plot(lowess_trace_df["LOWESS Trace: Lb list: foldchange"][5])

In [None]:
def get_pearson_r(X):
    """Return the Pearson correlation between ``X[0]`` and ``X[1]``.

    Only the correlation coefficient is returned; the p-value computed by
    ``scipy.stats.pearsonr`` is discarded.
    """
    first_sample, second_sample = X[0], X[1]
    result = sp.stats.pearsonr(first_sample, second_sample)
    return result[0]


def get_corrcoeff(df, label_x, label_y, min_timepoint, max_timepoint):
    """Pearson correlation between two list-valued columns inside a time window.

    Values of ``label_x`` and ``label_y`` from all rows of ``df`` are pooled
    for time points in ``[min_timepoint, max_timepoint]``; pairs containing a
    NaN are removed before the correlation is computed.
    """
    x_vals = np.array(get_timepoint_values(df, label_x, min_timepoint, max_timepoint))
    y_vals = np.array(get_timepoint_values(df, label_y, min_timepoint, max_timepoint))
    # Keep only positions where both series hold a number.
    finite_pairs = (~np.isnan(x_vals)) * (~np.isnan(y_vals))
    paired = np.stack([x_vals[finite_pairs], y_vals[finite_pairs]])
    return get_pearson_r(paired)

In [None]:
# Per-sgRNA Pearson correlation between the "Lb list" and "delL list" scores
# over time points 100-143, on (at most) the first 5000 equilibrated rows.
corrs = (
    equilibrated_subset[:5000]
    .groupby("sgRNA")
    .apply(
        lambda x: get_corrcoeff(
            x, "Lb list: foldchange", "delL list: foldchange", 100, 143
        )
    )
)

In [None]:
# Distribution of the per-sgRNA correlations.
plt.hist(corrs, bins=100)

In [None]:
# sgRNAs whose correlation is below -0.4.
corrs[corrs < -0.4].index.tolist()

In [None]:
# Count how often each gene appears among the sgRNAs with correlation < -0.4.
np.unique(
    lowess_trace_df.loc[corrs[corrs < -0.4].index.tolist()]["Gene"], return_counts=True
)

In [None]:
# Rows of the trace table for the sgRNAs with correlation < -0.4.
df_of_interest = lowess_trace_df.loc[corrs[corrs < -0.4].index.tolist()]

In [None]:
# One scatter plot per row for the first ten rows of df_of_interest:
# "Lb list" score against "delL list" score over time points 100-143.
for i in range(10):
    x = np.array(
        get_timepoint_values(df_of_interest[i : i + 1], "Lb list: foldchange", 100, 143)
    )
    y = np.array(
        get_timepoint_values(
            df_of_interest[i : i + 1], "delL list: foldchange", 100, 143
        )
    )
    plt.scatter(x, y)
    plt.show()

In [None]:
# Re-plot whatever x and y currently hold (left over from the preceding loop).
plt.scatter(x, y)

## Fitting Sigmoid

In [None]:
def get_timepoint_values(
    df,
    label,
    min_timepoint,
    max_timepoint,
    time_label="final cell timepoints list",
    flatten_vals=True,
):
    """Select the values of a list-valued column that fall in a time window.

    For every row, the entries of ``label`` whose matching entry in
    ``time_label`` lies in ``[min_timepoint, max_timepoint]`` are kept.

    Returns:
        A flat list pooling all rows when ``flatten_vals`` is true, otherwise
        a Series with one array per row.
    """

    def _values_in_window(row):
        times = np.array(row[time_label])
        in_window = (times >= min_timepoint) * (times <= max_timepoint)
        return np.array(row[label])[in_window]

    masked_label_series = df.apply(_values_in_window, axis=1)
    if not flatten_vals:
        return masked_label_series

    flattened_vals = []
    for item in masked_label_series.tolist():
        flattened_vals.extend(item)
    return flattened_vals


# def get_feature_stats(df, feature_label, min_timepoint, max_timepoint):
#     feature_vals = get_timepoint_values(df, feature_label, min_timepoint, max_timepoint)
#     feature_median = np.median(feature_vals)
#     feature_iqr = sp.stats.iqr(feature_vals)
#     return feature_median,feature_iqr


def get_feature_median_bytrench(df, feature_label, min_timepoint, max_timepoint):
    """Return the NaN-aware median of a feature per row inside a time window.

    Args:
        df: DataFrame whose rows hold list-like values in ``feature_label``
            and in the default time column used by ``get_timepoint_values``.
        feature_label: Name of the column to summarise.
        min_timepoint: Inclusive lower bound of the window.
        max_timepoint: Inclusive upper bound of the window.

    Returns:
        Series indexed like ``df`` with one median per row.
    """
    # Use the DataFrame that was passed in; the previous version silently
    # read the notebook-global ``final_output_df_pd_filtered`` instead.
    masked_label_series = get_timepoint_values(
        df,
        feature_label,
        min_timepoint,
        max_timepoint,
        flatten_vals=False,
    )
    trench_median_series = masked_label_series.apply(lambda x: np.nanmedian(x))
    return trench_median_series


def compute_foldchange(
    df,
    feature_label,
    trench_median_series,
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Subtract a per-row baseline from every value of a list-valued feature.

    Despite the name, the result is a difference (value - baseline), not a
    ratio.

    Args:
        df: DataFrame whose ``feature_label`` and ``time_label`` columns hold
            list-like values, one list per row.
        feature_label: Column holding the feature values.
        trench_median_series: Baseline per row, aligned on ``df``'s index.
        time_label: Column holding the time point of every feature value.
        timepoint_range: Optional ``(min, max)`` inclusive window; when given,
            only values whose time point falls inside it are kept.

    Returns:
        Series of arrays indexed like ``df``.
    """
    if timepoint_range is None:
        values = df[feature_label].apply(lambda x: np.array(x))
    else:
        lower, upper = timepoint_range[0], timepoint_range[1]

        def _values_in_range(row):
            # The mask must come from the row's time column; the previous
            # version indexed the feature list itself with ``time_label``.
            times = np.array(row[time_label])
            keep = (times >= lower) & (times <= upper)
            return np.array(row[feature_label])[keep]

        values = df.apply(_values_in_range, axis=1)
    return values - trench_median_series


def get_feature_foldchange(
    df,
    feature_label,
    init_timepoint_range=(0, 20),
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Score one feature against its own early-time baseline.

    The baseline is the per-row median of ``feature_label`` inside
    ``init_timepoint_range``; it is then subtracted from the feature values
    by ``compute_foldchange``.

    Returns:
        Series of baseline-subtracted arrays, one per row of ``df``.
    """
    baseline_start = init_timepoint_range[0]
    baseline_end = init_timepoint_range[1]
    baseline = get_feature_median_bytrench(
        df, feature_label, baseline_start, baseline_end
    )
    return compute_foldchange(
        df,
        feature_label,
        baseline,
        time_label=time_label,
        timepoint_range=timepoint_range,
    )


def get_all_feature_foldchange(
    df,
    feature_labels,
    init_timepoint_range=(0, 20),
    time_label="final cell timepoints list",
    timepoint_range=None,
):
    """Add a ``"<feature>: foldchange"`` column to ``df`` for every feature.

    ``df`` is modified in place and also returned. Each feature name is
    printed as it is processed.
    """
    shared_kwargs = {
        "init_timepoint_range": init_timepoint_range,
        "time_label": time_label,
        "timepoint_range": timepoint_range,
    }
    for feature_label in feature_labels:
        print(feature_label)
        output_column = feature_label + ": foldchange"
        df[output_column] = get_feature_foldchange(df, feature_label, **shared_kwargs)

    return df


def get_sgrnadf_from_folddf(
    folddf, feature_labels, time_label="final cell timepoints list"
):
    """Collapse a per-trench score table into one row per sgRNA.

    For every sgRNA the list-valued score and time columns of its rows are
    concatenated, the first value of each metadata column is kept, and the
    number of contributing rows is recorded as ``"N Observations"``.

    Returns:
        DataFrame indexed by sgRNA.
    """
    grouped = folddf.groupby("sgRNA")

    def _concatenated(column):
        # Flatten the per-row lists of one column into a single array per group.
        return grouped.apply(
            lambda group: np.array(
                [val for item in group[column].tolist() for val in item]
            )
        )

    def _first(column):
        # Take the first row's value of one column for every group.
        return grouped.apply(lambda group: group[column].iloc[0])

    trench_ids = grouped.apply(lambda group: group["phenotype trenchid"].tolist())
    sgrnadf = trench_ids.to_frame().rename(columns={0: "phenotype trenchid"})

    for feature_label in feature_labels:
        score_column = feature_label + ": foldchange"
        sgrnadf[score_column] = _concatenated(score_column)

    sgrnadf[time_label] = _concatenated(time_label)
    for metadata_column in ("Gene", "TargetID", "N Mismatch"):
        sgrnadf[metadata_column] = _first(metadata_column)
    sgrnadf["N Observations"] = grouped.apply(
        lambda group: len(group["phenotype trenchid"].tolist())
    )
    sgrnadf["Category"] = _first("Category")

    return sgrnadf

In [None]:
# List-valued feature columns that get a baseline-subtracted
# "<name>: foldchange" companion column.
params_to_transform = [
    "Lb list",
    "Ld list",
    "delL list",
    "Mean Area Increment list",
    "Mean Length Increment list",
    "Mean Width list",
    "Mean mCherry Intensity list",
    "Delta t list",
]
# Disabled: Yeo-Johnson transform of every feature, fitted on a subsample.
# yeo_subsample = 0.1

# final_output_df_pd_filtered_dask = dd.from_pandas(final_output_df_pd_filtered,npartitions=100).persist()
# dask.distributed.wait(final_output_df_pd_filtered_dask)

# for i,param in enumerate(params_to_transform):
#     all_param_values = [float(val) for item in final_output_df_pd_filtered_dask[param].sample(frac=yeo_subsample).compute().tolist() for val in item]
#     l_norm = sp.stats.yeojohnson_normmax(all_param_values)
#     final_output_df_pd_filtered_dask[param] = final_output_df_pd_filtered_dask[param].apply(lambda x: sp.stats.yeojohnson(np.array(x).astype(float),lmbda = l_norm), meta='object').persist()
# final_output_df_pd_filtered = final_output_df_pd_filtered_dask.compute()

# NOTE: not a fold change right now -- compute_foldchange subtracts the
# baseline median instead of dividing by it.
scoredf = get_all_feature_foldchange(final_output_df_pd_filtered, params_to_transform)
# Aggregate the per-trench scores into one row per sgRNA.
sgrnadf = get_sgrnadf_from_folddf(scoredf, params_to_transform)

In [None]:
from scipy.optimize import curve_fit


def logifunc(x, A, x0, k, off):
    """Evaluate a four-parameter logistic curve.

    Computes ``A / (1 + exp(-k * (x - x0))) + off``: ``A`` is the amplitude,
    ``x0`` the midpoint, ``k`` the steepness and ``off`` a constant offset.
    """
    exponent = -k * (x - x0)
    denominator = 1 + np.exp(exponent)
    return A / denominator + off


def gompfunc(x, a, b, c, off):
    """Evaluate a Gompertz curve with a constant offset.

    Computes ``a * exp(-exp(b - c * x)) + off``.
    """
    inner = np.exp(b - (c * x))
    growth = a * np.exp(-inner)
    return growth + off

In [None]:
# Time range and number of evaluation points used to draw the fitted curves.
min_tpt = 0
max_tpt = 143
bins = 60

In [None]:
# Fit a Gompertz curve to the "Lb list" scores of one sgRNA entry.
idx = 2

y = sgrnadf["Lb list: foldchange"][idx]
x = sgrnadf["final cell timepoints list"][idx]
x_data = np.linspace(min_tpt, max_tpt, num=bins)
# NOTE(review): these points are the raw data, yet the legend label says
# "Logistic function" (and the curve fitted below is gompfunc).
plt.scatter(x, y, label="Logistic function")

popt, pcov = curve_fit(
    gompfunc,
    x,
    y,
    p0=[0.0, 1.0, 1.0, 0.0],
)
# x at which exp(-exp(b - c*x)) equals 1/2, i.e. half of the amplitude a.
midpoint = (popt[1] - np.log(np.log(2))) / popt[2]
plt.plot(x_data, gompfunc(x_data, *popt), "r-", label="Fitted function")
plt.legend()
print(midpoint)

In [None]:
# Display the fitted parameters from the most recent curve_fit call.
popt

In [None]:
# Fit a logistic curve to the "Lb list" scores of one sgRNA entry.
idx = 5

y = sgrnadf["Lb list: foldchange"][idx]
x = sgrnadf["final cell timepoints list"][idx]
x_data = np.linspace(min_tpt, max_tpt, num=bins)

# p0 is the initial guess in logifunc's parameter order (A, x0, k, off).
popt, pcov = curve_fit(logifunc, x, y, p0=[50, 185, 0.1, -222])
# For the logistic curve the midpoint is the fitted x0.
midpoint = popt[1]
# NOTE(review): these points are the raw data, yet the label says
# "Logistic function".
plt.scatter(x, y, label="Logistic function")
plt.plot(x_data, logifunc(x_data, *popt), "r-", label="Fitted function")
plt.legend()
print(midpoint)

In [None]:
# Plot the pooled "Lb list" scores of entry 0 in stored order.
plt.plot(sgrnadf["Lb list: foldchange"][0])

In [None]:
# Plot the pooled "Lb list" scores of entry 1 in stored order.
plt.plot(sgrnadf["Lb list: foldchange"][1])

In [None]:
# Plot the pooled "Lb list" scores of entry 2 in stored order.
plt.plot(sgrnadf["Lb list: foldchange"][2])

## Clustering on Mean Behavior Only

In [None]:
# Names of the per-feature LOWESS trace columns to stack.
# NOTE(review): these use a ": score" suffix, whereas other cells read
# "LOWESS Trace: Lb list: foldchange" -- confirm which naming
# lowess_trace_df actually carries.
lowess_params = ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
# One array per row, stacking that row's traces in lowess_params order.
feature_vector_series = lowess_trace_df.apply(
    lambda x: np.array(x[lowess_params].tolist()), axis=1
)
lowess_trace_df["Feature Vector"] = feature_vector_series
# Drop rows whose stacked feature vector contains any NaN.
lowess_trace_df_nan_filtered = lowess_trace_df[
    ~lowess_trace_df["Feature Vector"].apply(lambda x: np.any(np.isnan(x)))
]

In [None]:
# Rows whose integrated norm reaches this value count as strong effects.
strong_effect_threshold = 35

zero_vector = np.zeros(
    (1, lowess_trace_df_nan_filtered["Feature Vector"].iloc[0].shape[0])
)
feature_arr = np.array(lowess_trace_df_nan_filtered["Feature Vector"].tolist())
# Axis 1 of feature_arr is treated as the vector dimension: after the swap and
# reshape every row of the flattened array is one such vector.
flattened_feature_arr = np.swapaxes(feature_arr, 1, 2).reshape(-1, feature_arr.shape[1])
# Euclidean norm of every vector (distance to the origin), reshaped back to
# one row per table row and one column per position along axis 2.
dist_arr = euclidean_distances(flattened_feature_arr, zero_vector).reshape(
    feature_arr.shape[0], feature_arr.shape[2]
)
# Integrate the norm along the last axis with Simpson's rule.
lowess_trace_df_nan_filtered["Integrated Euclidean Norm"] = sp.integrate.simpson(
    dist_arr
)
# lowess_trace_df["Max Euclidean Norm"] = np.max(dist_arr,axis=1)

# Fixed: the DataFrame name on this line was garbled
# ("lowess_tlowess_trace_df_nan_filteredrace_df") and raised NameError.
sgrnadf_strong_effect = lowess_trace_df_nan_filtered[
    lowess_trace_df_nan_filtered["Integrated Euclidean Norm"] >= strong_effect_threshold
]
# Histogram range: minimum up to the 99th percentile.
min_v, max_v = (
    np.min(lowess_trace_df_nan_filtered["Integrated Euclidean Norm"]),
    np.percentile(lowess_trace_df_nan_filtered["Integrated Euclidean Norm"], 99),
)

plt.figure(figsize=(8, 8))
plt.title("Integrated Euclidean Norm")
# Below-threshold rows first, then at-or-above-threshold rows, on the same
# axes and with identical binning.
plt.hist(
    lowess_trace_df_nan_filtered[
        lowess_trace_df_nan_filtered["Integrated Euclidean Norm"]
        < strong_effect_threshold
    ]["Integrated Euclidean Norm"].tolist(),
    bins=50,
    range=(min_v, max_v),
)
plt.hist(
    lowess_trace_df_nan_filtered[
        lowess_trace_df_nan_filtered["Integrated Euclidean Norm"]
        >= strong_effect_threshold
    ]["Integrated Euclidean Norm"].tolist(),
    bins=50,
    range=(min_v, max_v),
)
plt.show()

# How many strong-effect sgRNAs each gene has.
unique_genes, gene_counts = np.unique(sgrnadf_strong_effect["Gene"], return_counts=True)
plt.title("sgRNAs per Gene")
plt.xticks(range(0, 20, 2), labels=range(0, 20, 2))
# Half-integer bin edges centre each bar on an integer count.
plt.hist(gene_counts, bins=np.arange(20) - 0.5)
plt.show()

### Pick Representative Effect per TargetID
Note this may need to be revisited later to resolve transients that are only resolvable at intermediate KO

1) For each target, pick the sgRNA that has the strongest phenotype (highest integrated euclidean norm)
2) Additionally identify any targets with titration information by saving a dataframe with targetIDs that possess at least N sgRNAs
    - this is in a preliminary form; transfer to a full notebook later

In [None]:
# For every TargetID keep the single row with the largest
# "Integrated Euclidean Norm", then index the result by sgRNA again.
most_rep_example_series = (
    sgrnadf_strong_effect.reset_index(drop=False)
    .groupby("TargetID")
    .apply(lambda x: x.iloc[np.argmax(x["Integrated Euclidean Norm"])])
    .reset_index(drop=True)
    .set_index("sgRNA", drop=True)
)

# normalize_timeseries is defined elsewhere in the notebook; its output has
# axes 1 and 2 swapped here before being stored per row.
normalized_timeseries = np.swapaxes(
    normalize_timeseries(most_rep_example_series["Feature Vector"], lmbda=0.5), 1, 2
)
# Store one normalised array per row.
most_rep_example_series["Normalized Feature Vector"] = [
    normalized_timeseries[i] for i in range(normalized_timeseries.shape[0])
]

### Effect Distance Metrics

Now, I want to evaluate the performance of different distance metrics on the data with respect to separating it maximally while also preserving similarity within replicates

- DTW (can be done with cosine similarity) 
- cosine similarity (same as pearson for z-scores)
- cross correlation

Seems like soft-DTW is a pretty good option. Going forward with that for now.

<!-- In the end cosine similarity was chosen as it produced superior silhouette scores for sets of targets from genes with different phenotypes. -->

In [None]:
# Rows for a hand-picked set of genes, used to compare distance metrics.
sgrnadf_examples_for_distance_metric = most_rep_example_series[
    most_rep_example_series["Gene"].isin(["ftsN", "rplA", "mreB", "tufB", "tff"])
]

In [None]:
from tslearn.metrics import (
    dtw,
    cdist_dtw,
    dtw_path_from_metric,
    cdist_soft_dtw,
    cdist_soft_dtw_normalized,
)
import tslearn
from tslearn.clustering import TimeSeriesKMeans

In [None]:
# Stack the normalised feature vectors into one 3-D array and swap axes 1 and 2
# (presumably into tslearn's (n_series, n_timepoints, n_features) layout --
# confirm against normalize_timeseries).
timeseries_arr = np.swapaxes(
    np.array(
        sgrnadf_examples_for_distance_metric["Normalized Feature Vector"].tolist()
    ),
    1,
    2,
)

In [None]:
# Silhouette score of the gene labels under soft-DTW for a sweep of gamma values.
for gamma in [0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]:

    print(
        "Soft-DTW Gamma="
        + str(gamma)
        + ": "
        + str(
            tslearn.clustering.silhouette_score(
                timeseries_arr,
                sgrnadf_examples_for_distance_metric["Gene"].tolist(),
                metric="softdtw",
                gamma=gamma,
            )
        )
    )

# Pairwise DTW distance with a cosine local metric, restricted to a
# Sakoe-Chiba band of radius 3. Only the upper triangle is computed and then
# mirrored; the diagonal stays zero.
dist_mat = np.zeros((timeseries_arr.shape[0], timeseries_arr.shape[0]))
for i in range(timeseries_arr.shape[0]):
    for j in range(i + 1, timeseries_arr.shape[0]):
        # Element [1] of the returned tuple is the distance; [0] is the path.
        dist = dtw_path_from_metric(
            timeseries_arr[i],
            timeseries_arr[j],
            metric="cosine",
            global_constraint="sakoe_chiba",
            sakoe_chiba_radius=3,
        )[1]
        dist_mat[i, j] = dist
        dist_mat[j, i] = dist
print(
    "Cosine-DTW: "
    + str(
        tslearn.clustering.silhouette_score(
            dist_mat,
            sgrnadf_examples_for_distance_metric["Gene"].tolist(),
            metric="precomputed",
        )
    )
)

# Same computation with a Euclidean local metric.
dist_mat = np.zeros((timeseries_arr.shape[0], timeseries_arr.shape[0]))
for i in range(timeseries_arr.shape[0]):
    for j in range(i + 1, timeseries_arr.shape[0]):
        dist = dtw_path_from_metric(
            timeseries_arr[i],
            timeseries_arr[j],
            metric="euclidean",
            global_constraint="sakoe_chiba",
            sakoe_chiba_radius=3,
        )[1]
        dist_mat[i, j] = dist
        dist_mat[j, i] = dist
print(
    "Euclidean-DTW: "
    + str(
        tslearn.clustering.silhouette_score(
            dist_mat,
            sgrnadf_examples_for_distance_metric["Gene"].tolist(),
            metric="precomputed",
        )
    )
)

In [None]:
# All-pairs soft-DTW matrix for the example series (default gamma).
soft_dtw_dist_arr = tslearn.metrics.cdist_soft_dtw(timeseries_arr)

In [None]:
# Distribution of all entries of the soft-DTW matrix.
plt.hist(soft_dtw_dist_arr.flatten(), bins=100)
plt.show()

### Detecting different effects against single genes

1) Plot a histogram of minimum soft-DTW similarity within groups of TargetIDs against the same genes (for genes with more than one targetID)
2) Use affinity propagation to select the number of phenotype clusters to use per gene (preference still needs to be dialed in, not sure how to optimize on this)
3) Among each cluster, represent the final effect as the strongest effect (integrated euc norm) of the members of the cluster

~~3) Among each cluster, represent the final effect as the median of the members of the cluster~~


In [None]:
def get_normed_softdtw(feature_vector_series):
    """Return the pairwise normalised soft-DTW matrix, scaled by series size.

    Args:
        feature_vector_series: Series of equally shaped 2-D arrays.

    Returns:
        Square array of ``cdist_soft_dtw_normalized`` values divided by the
        number of elements in one feature vector.
    """
    stacked = np.array(feature_vector_series.tolist())
    dist_mat = cdist_soft_dtw_normalized(np.swapaxes(stacked, 1, 2))
    # Use positional access: the Series is indexed by labels (e.g. gene or
    # sgRNA), so ``series[0]`` only worked through pandas' deprecated
    # integer-key positional fallback.
    first_vector = feature_vector_series.iloc[0]
    timeseries_len = first_vector.shape[0] * first_vector.shape[1]
    return dist_mat / timeseries_len


def get_upper_right_vals(a):
    """Return ``a`` with everything except the strict upper triangle set to NaN.

    The input is converted to float (so integer matrices work) and is not
    modified. Entries above the diagonal are kept as they are, including
    genuine zeros; the previous version turned every zero into NaN, which
    also discarded true zero distances, and used ``np.NaN``, which no longer
    exists in NumPy 2.

    Args:
        a: 2-D array-like.

    Returns:
        Float array of the same shape as ``a``.
    """
    values = np.asarray(a, dtype=float)
    above_diagonal = np.triu(np.ones(values.shape, dtype=bool), k=1)
    return np.where(above_diagonal, values, np.nan)


def get_sgRNA_clusters(df, preference=0.6):
    """Cluster the sgRNAs of every gene by soft-DTW similarity.

    Genes with a single row get cluster 0. Genes with several rows are
    clustered with affinity propagation on the negated normalised soft-DTW
    matrix of their "Normalized Feature Vector" entries.

    Args:
        df: DataFrame with "Gene", "Normalized Feature Vector" and "TargetID"
            columns; its index is reset into a column (expected to be named
            "sgRNA").
        preference: Affinity-propagation preference.

    Returns:
        Gene-indexed, sorted DataFrame with columns "sgRNA",
        "Normalized Feature Vector", "TargetID" and an integer
        "sgRNA Cluster".
    """
    gene_indexed_df = (
        df.reset_index(drop=False)
        .set_index("Gene")[["sgRNA", "Normalized Feature Vector", "TargetID"]]
        .sort_index()
    )
    # Default label for every row; single-sgRNA genes keep it.
    gene_indexed_df["sgRNA Cluster"] = 0
    # Visit each gene once. Iterating the raw index repeated the clustering
    # for every row belonging to the same gene.
    for gene in gene_indexed_df.index.unique():
        gene_df = gene_indexed_df.loc[[gene]]
        if len(gene_df) <= 1:
            continue
        soft_dtw_dist = get_normed_softdtw(gene_df["Normalized Feature Vector"])
        # Affinity propagation expects similarities, hence the negation.
        af_labels = (
            AffinityPropagation(
                affinity="precomputed", preference=preference, random_state=42
            )
            .fit_predict(-soft_dtw_dist)
            .astype(int)
        )
        gene_indexed_df.loc[gene, "sgRNA Cluster"] = af_labels
    gene_indexed_df["sgRNA Cluster"] = gene_indexed_df["sgRNA Cluster"].astype(int)
    return gene_indexed_df

In [None]:
# Minimum number of rows a gene needs to be included in the within-gene
# distance statistics.
n_sgrna_replicate_thr = 2
# Multiplier applied to the median within-gene similarity to obtain the
# affinity-propagation preference.
pref_factor = 3.0

gene_list, counts_list = np.unique(most_rep_example_series["Gene"], return_counts=True)
genes_with_many_replicate_sgRNAs = gene_list[counts_list >= n_sgrna_replicate_thr]
sgrnadf_many_copies_per_gene = most_rep_example_series[
    most_rep_example_series["Gene"].isin(genes_with_many_replicate_sgRNAs)
]

# Largest pairwise distance between the rows of each gene.
max_distance_within_gene = sgrnadf_many_copies_per_gene.groupby("Gene").apply(
    lambda x: np.nanmax(
        get_upper_right_vals(get_normed_softdtw(x["Normalized Feature Vector"]))
    )
)
plt.title("Maximum soft-DTW Distance per Gene")
plt.hist(max_distance_within_gene, bins=50)
plt.show()

# Pool every within-gene pairwise distance (NaN marks non-pair entries).
dist_within_gene = sgrnadf_many_copies_per_gene.groupby("Gene").apply(
    lambda x: get_upper_right_vals(
        get_normed_softdtw(x["Normalized Feature Vector"])
    ).flatten()
)
dist_within_gene = [val for item in dist_within_gene.tolist() for val in item]
# Similarity is the negated distance, matching what is fed to
# affinity propagation.
median_similarity = -np.nanmedian(dist_within_gene)

gene_df = get_sgRNA_clusters(
    most_rep_example_series, preference=pref_factor * median_similarity
)

# Bring the cluster ids back onto the sgRNA-indexed table and build a
# "<Gene>-<cluster id>" label.
most_rep_example_series["sgRNA Cluster"] = gene_df.set_index("sgRNA")["sgRNA Cluster"]
most_rep_example_series["sgRNA Cluster Label"] = most_rep_example_series.apply(
    lambda x: str(x["Gene"]) + "-" + str(x["sgRNA Cluster"]), axis=1
)

# NOTE(review): the "LOWESS Trace: ...: score" column names used below differ
# from the "...: foldchange" naming read in other cells -- confirm they exist.
gene_cluster_df = most_rep_example_series[
    [
        "sgRNA Cluster Label",
        "Normalized Feature Vector",
        "Gene",
        "Integrated Euclidean Norm",
    ]
    + ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
].reset_index(drop=True)
gene_cluster_groupby = gene_cluster_df.groupby("sgRNA Cluster Label")
# median_feature_series = gene_cluster_groupby.apply(lambda x: np.median(np.stack(x["Feature Vector"]).astype(float), axis=0)).to_frame().rename(columns={0:"Feature Vector"})
# Represent each cluster by the member with the largest integrated norm.
feature_series = (
    gene_cluster_groupby.apply(
        lambda x: x.iloc[np.argmax(x["Integrated Euclidean Norm"])][
            "Normalized Feature Vector"
        ]
    )
    .to_frame()
    .rename(columns={0: "Normalized Feature Vector"})
)

# Gene and trace columns are taken from the first row of each cluster, which
# is not necessarily the member chosen for the feature vector above.
gene_cluster_df = gene_cluster_groupby.apply(
    lambda x: x.iloc[0][
        ["Gene"]
        + ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
    ]
)
gene_cluster_df = gene_cluster_df.join(feature_series)

### Clustering: TSNE and Affinity Propagation

In [None]:
# Pairwise scaled soft-DTW matrix between all cluster representatives.
X_dist = get_normed_softdtw(gene_cluster_df["Normalized Feature Vector"])

In [None]:
# 2-D t-SNE embedding from the precomputed matrix. The matrix is shifted so
# its minimum is zero (presumably because a precomputed metric must be
# non-negative -- confirm).
X_embedded = TSNE(
    n_components=2, perplexity=5.0, early_exaggeration=50.0, metric="precomputed"
).fit_transform(X_dist - np.min(X_dist))
gene_cluster_df["TSNE Coords"] = [X_embedded[i] for i in range(X_embedded.shape[0])]

# Affinity propagation on the negated distances (i.e. similarities).
# NOTE(review): no random_state is passed here, unlike in get_sgRNA_clusters.
af_labels = (
    AffinityPropagation(affinity="precomputed", preference=-0.5)
    .fit_predict(-X_dist)
    .astype(int)
)
gene_cluster_df["Affinity Clusts"] = af_labels

# Embedding coloured by affinity-propagation cluster.
plt.scatter(
    X_embedded[:, 0],
    X_embedded[:, 1],
    s=3,
    alpha=1,
    c=gene_cluster_df["Affinity Clusts"],
)

### Hierarchical Clustering

In [None]:
# Display names used as y-axis labels, one per row of a feature vector.
# NOTE(review): seven names are listed while params_to_transform has eight
# entries -- confirm they line up with the feature vector rows.
feature_labels = [
    "Birth Length",
    "Division Length",
    "Area Growth Rate",
    "Length Growth Rate",
    "Average Width",
    "mCherry Intensity",
    "Cell Cycle Duration",
]

# Index labels of the cluster table, in row order.
hierarchical_labels = gene_cluster_df.index.tolist()


def get_leaf_children(tree, leaf_id):
    """Return the ids of all original observations under one tree node.

    Args:
        tree: Sequence of cluster nodes indexable by node id (e.g. the node
            list returned by ``scipy.cluster.hierarchy.to_tree(..., rd=True)``).
        leaf_id: Id of the node to expand.

    Returns:
        List of leaf ids in pre-order.
    """
    return tree[leaf_id].pre_order(lambda node: node.id)


def assign_dendro_clusts(df, children_labels):
    """Return a copy of ``df`` with an integer "Dendrogram Clusters" column.

    Args:
        df: DataFrame to label; it is not modified.
        children_labels: Sequence whose ``i``-th entry lists the positional
            row indices that belong to cluster ``i``.

    Returns:
        Deep copy of ``df`` with the new column.

    Raises:
        ValueError: If some row is not covered by any cluster.
    """
    df_out = copy.deepcopy(df)
    # Build the labels in a plain array and assign once. The previous chained
    # ``df[col].iloc[...] = value`` assignment is not guaranteed to write
    # through to the frame (it does nothing under pandas copy-on-write).
    cluster_ids = np.full(len(df_out), -1, dtype=int)
    for clust_i, indices in enumerate(children_labels):
        cluster_ids[np.asarray(indices, dtype=int)] = clust_i
    if (cluster_ids < 0).any():
        raise ValueError("children_labels does not cover every row of df")
    df_out["Dendrogram Clusters"] = cluster_ids
    return df_out


# Number of dendrogram leaves to show (passed as `p` with
# truncate_mode="lastp") and hence the number of trace columns drawn.
suppress_thr = 15
# Colour-scale limits; also bound as default arguments of the plotting helpers.
min_zscore = -2
max_zscore = 2


def compute_and_plot_dendrogram(
    df,
    X_dist,
    feature_labels,
    suppress_thr,
    min_zscore,
    max_zscore,
    cmap=mpl.cm.coolwarm,
):
    """Plot a truncated dendrogram with the mean traces of every shown leaf.

    The top row holds the dendrogram built from ``X_dist`` (weighted linkage,
    truncated to ``suppress_thr`` leaves). Below each leaf, one small panel
    per feature shows the mean "Normalized Feature Vector" of the rows that
    fall under that leaf.

    Args:
        df: DataFrame with a "Normalized Feature Vector" column whose rows
            are ordered like ``X_dist``.
        X_dist: Square, symmetric distance matrix.
        feature_labels: Display name per feature row, used as y-labels on the
            first column of panels.
        suppress_thr: Number of leaves kept in the truncated dendrogram.
        min_zscore: Currently unused; kept for interface compatibility.
        max_zscore: Currently unused; kept for interface compatibility.
        cmap: Currently unused; kept for interface compatibility.

    Returns:
        List with, for every displayed leaf, the positional row indices of
        ``df`` that belong to it.
    """
    fig = plt.figure(constrained_layout=True, figsize=(20, 10))
    gs = fig.add_gridspec(2, suppress_thr)
    dendro_ax = fig.add_subplot(gs[0, :])

    # squareform converts the square matrix to the condensed form linkage needs.
    Y = sch.linkage(
        sp.spatial.distance.squareform(X_dist), method="weighted", optimal_ordering=True
    )
    # Node list indexed by node id, used to expand truncated leaves.
    cluster_tree = sch.to_tree(Y, rd=True)[1]

    Z = sch.dendrogram(
        Y,
        orientation="top",
        show_leaf_counts=True,
        leaf_rotation=90.0,
        leaf_font_size=12.0,
        truncate_mode="lastp",
        show_contracted=True,
        p=suppress_thr,
        ax=dendro_ax,
        no_labels=True,
    )
    children_labels = [get_leaf_children(cluster_tree, leaf) for leaf in Z["leaves"]]

    for i, children in enumerate(children_labels):
        children_arr = np.array(
            df.iloc[children]["Normalized Feature Vector"].tolist(), dtype=float
        )
        mean_vector = np.mean(children_arr, axis=0)  # feature,timepoint
        inner_gs = gs[1, i].subgridspec(mean_vector.shape[0], 1, wspace=0, hspace=0)
        inner_grid_sub = inner_gs.subplots(sharex=True)
        # Only the first column carries y ticks and feature names.
        is_first_column = i == 0
        for c, ax in np.ndenumerate(inner_grid_sub):
            ax.plot(mean_vector[c])
            ax.set_ylim(-6, 12)
            if is_first_column:
                ax.set(xticks=[], yticks=[-4, 0.0, 10])
                ax.set_ylabel(
                    feature_labels[c[0]],
                    rotation=0,
                    labelpad=30,
                    fontsize=18,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])
        # ``ax`` is the last (bottom) panel of this column.
        ax.set_xlabel(str(i), fontsize=18)
    plt.tight_layout()

    return children_labels


def plot_subset(
    df_subset,
    min_zscore=min_zscore,
    max_zscore=max_zscore,
    feature_labels=feature_labels,
    figsize=(10, 10),
    wspace=0.0,
):
    """Plot the normalised feature traces of every row in a subset.

    Rows are ordered by their "Dendrogram Clusters" value. Each row gets one
    column of stacked panels (one per feature), labelled underneath with its
    "sgRNA Cluster Label" and cluster id.

    Args:
        df_subset: DataFrame with "Dendrogram Clusters",
            "sgRNA Cluster Label" and "Normalized Feature Vector"; the
            label may live in the index, which is reset into a column.
        min_zscore: Currently unused; kept for interface compatibility.
        max_zscore: Currently unused; kept for interface compatibility.
        feature_labels: Display name per feature row, used as y-labels on the
            first column.
        figsize: Figure size passed to matplotlib.
        wspace: Horizontal spacing between the columns.
    """
    df_clusts = (
        df_subset.sort_index()
        .reset_index(drop=False)
        .set_index("Dendrogram Clusters")[
            ["sgRNA Cluster Label", "Normalized Feature Vector"]
        ]
        .sort_index()
    )

    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, len(df_clusts), wspace=wspace)

    for i in range(len(df_clusts)):
        clust_arr = np.array(
            df_clusts["Normalized Feature Vector"].iloc[i].tolist(), dtype=float
        )
        inner_gs = gs[0, i].subgridspec(clust_arr.shape[0], 1, wspace=0, hspace=0)
        inner_grid_sub = inner_gs.subplots(sharex=True)
        # Only the first column carries y ticks and feature names.
        is_first_column = i == 0
        for c, ax in np.ndenumerate(inner_grid_sub):
            ax.plot(clust_arr[c])
            ax.set_ylim(-6, 12)
            if is_first_column:
                ax.set(xticks=[], yticks=[-4, 0.0, 10.0])
                ax.set_ylabel(
                    feature_labels[c[0]],
                    rotation=0,
                    labelpad=30,
                    fontsize=18,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # ``ax`` is the last (bottom) panel of this column.
        ax.set_xlabel(
            df_clusts["sgRNA Cluster Label"].iloc[i]
            + "\n Cluster "
            + str(df_clusts.index[i]),
            fontsize=14,
        )

    plt.tight_layout()


def make_subset_dendrogram(
    sub_df,
    title,
    feature_labels=feature_labels,
    min_zscore=min_zscore,
    max_zscore=max_zscore,
    figsize=(10, 10),
    fontsize=18,
    linewidth=5,
):
    """Draw a dendrogram of the rows of ``sub_df`` above their feature traces.

    Rows are clustered hierarchically ("weighted" linkage with optimal leaf
    ordering) on the pairwise distance matrix returned by
    ``get_normed_softdtw``. The dendrogram fills the top row of the figure;
    the bottom row holds one column per leaf, in dendrogram order, each made
    of vertically stacked line plots (one per entry along the first axis of
    that row's "Normalized Feature Vector").

    Args:
        sub_df: DataFrame with a "Normalized Feature Vector" column. The
            index value of each row is used as the x label of its column.
        title: Figure suptitle.
        feature_labels: Y-axis labels, looked up by position along the first
            axis of the feature vector. Only drawn on the first column.
        min_zscore: Unused; kept so existing callers keep working.
        max_zscore: Unused; kept so existing callers keep working.
        figsize: Figure size passed to ``plt.figure``.
        fontsize: Font size of the title and the axis labels.
        linewidth: Line width of every trace.
    """
    X_dist = get_normed_softdtw(sub_df["Normalized Feature Vector"])

    # Compute and plot dendrogram.
    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(2, len(sub_df))
    dendro_ax = fig.add_subplot(gs[0, :])

    # linkage expects the condensed (upper-triangular) form of the matrix.
    Y = sch.linkage(
        sp.spatial.distance.squareform(X_dist), method="weighted", optimal_ordering=True
    )

    Z = sch.dendrogram(
        Y,
        orientation="top",
        show_leaf_counts=True,
        leaf_rotation=90.0,
        leaf_font_size=12.0,
        show_contracted=True,
        ax=dendro_ax,
        no_labels=True,
    )

    fig.suptitle(title, fontsize=fontsize)

    for i, leaf in enumerate(Z["leaves"]):
        leaf_arr = np.array(
            sub_df.iloc[leaf]["Normalized Feature Vector"].tolist(), dtype=float
        )
        # Only the left-most column carries y ticks and feature names.
        is_first_column = i == 0

        inner_gs = gs[1, i].subgridspec(leaf_arr.shape[0], 1, wspace=0, hspace=0)
        inner_grid_sub = inner_gs.subplots(sharex=True)
        for c, ax in np.ndenumerate(inner_grid_sub):
            ax.plot(leaf_arr[c], linewidth=linewidth)
            ax.set_ylim(-6, 12)
            if is_first_column:
                ax.set(xticks=[], yticks=[-4, 0.0, 8])
                ax.set_ylabel(
                    feature_labels[c[0]],
                    rotation=0,
                    labelpad=30,
                    fontsize=fontsize,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # `ax` is the last (bottom) axes of the column after the loop above.
        ax.set_xlabel(sub_df.index[leaf], fontsize=fontsize, rotation=90)

    # NOTE(review): the figure is created with constrained_layout=True and
    # tight_layout is then applied on top; kept so rendering does not change.
    plt.tight_layout()


#         imshow_ax.tick_params(axis='x',which='both',bottom=False,top=False,labelbottom=False)
#         imshow_ax.tick_params(axis='y',which='both',left=False,right=False,labelbottom=False)

#         imshow_ax.set_xlabel(sub_df.index[leaf], fontsize=fontsize)

In [None]:
# compute_and_plot_dendrogram(df,X_dist,feature_labels,suppress_thr,min_zscore,max_zscore,cmap=mpl.cm.coolwarm)

In [None]:
# Plot the global dendrogram truncated to `suppress_thr` leaves and keep, for
# each displayed leaf, the row positions of `gene_cluster_df` grouped under it.
children_labels = compute_and_plot_dendrogram(
    gene_cluster_df,
    X_dist,
    feature_labels,
    suppress_thr,
    min_zscore,
    max_zscore,
    cmap=mpl.cm.coolwarm,
)
# plt.savefig("./Dendrograms/Global_Dendrogram.png", dpi=300)

In [None]:
# Add a "Dendrogram Clusters" column holding each row's truncated-dendrogram leaf number.
gene_cluster_df = assign_dendro_clusts(gene_cluster_df, children_labels)

#### Major System Analysis

In [None]:
# Rows whose gene name contains the substring "fts".
fts_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "fts" in gene_name)
]

In [None]:
plot_subset(fts_subset, figsize=(20, 8))
plt.tight_layout()
# plt.savefig("./Gene_Groups/fts.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "rpl".
rpl_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "rpl" in gene_name)
]

In [None]:
plot_subset(rpl_subset, figsize=(30, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rpl.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "rpm".
rpm_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "rpm" in gene_name)
]

In [None]:
plot_subset(rpm_subset, figsize=(30, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rpm.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "rps".
rps_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "rps" in gene_name)
]

In [None]:
plot_subset(rps_subset, figsize=(30, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rps.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "rr".
rr_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "rr" in gene_name)
]

In [None]:
plot_subset(rr_subset, figsize=(15, 10))

In [None]:
# Rows whose gene name contains the substring "tff".
tff_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "tff" in gene_name)
]

In [None]:
plot_subset(tff_subset, figsize=(8, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/tff.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "rpo".
rpo_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "rpo" in gene_name)
]

In [None]:
plot_subset(rpo_subset, figsize=(10, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rpo.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "min".
min_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "min" in gene_name)
]

In [None]:
plot_subset(min_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/min.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "dna".
dna_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "dna" in gene_name)
]

In [None]:
plot_subset(dna_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/dna.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "fol".
fol_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "fol" in gene_name)
]

In [None]:
plot_subset(fol_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/fol.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "muk".
muk_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "muk" in gene_name)
]

In [None]:
plot_subset(muk_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/muk.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "mre".
mre_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "mre" in gene_name)
]

In [None]:
plot_subset(mre_subset, figsize=(10, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/mre.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "mur".
mur_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "mur" in gene_name)
]

In [None]:
plot_subset(mur_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/mur.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "nus".
nus_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "nus" in gene_name)
]

In [None]:
plot_subset(nus_subset, figsize=(8, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/nus.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "sec".
sec_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "sec" in gene_name)
]

In [None]:
plot_subset(sec_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/sec.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "bam".
bam_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "bam" in gene_name)
]

In [None]:
plot_subset(bam_subset, figsize=(6, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/bam.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "hol".
hol_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "hol" in gene_name)
]

In [None]:
plot_subset(hol_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/hol.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "hda".
hda_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "hda" in gene_name)
]

In [None]:
plot_subset(hda_subset, figsize=(6, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/hda.png",dpi=200,bbox_inches="tight")

In [None]:
# Rows whose gene name contains the substring "rodZ".
rodZ_subset = gene_cluster_df[
    gene_cluster_df["Gene"].apply(lambda gene_name: "rodZ" in gene_name)
]

In [None]:
plot_subset(rodZ_subset, figsize=(6, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rodz.png",dpi=200,bbox_inches="tight")

#### Cluster Analysis

In [None]:
# Bucket the dendrogram clusters by member count:
# fewer than 3 rows, 3 to 40 rows, and more than 40 rows.
clusters, cluster_counts = np.unique(
    gene_cluster_df["Dendrogram Clusters"], return_counts=True
)
singleton_clusters = clusters[cluster_counts < 3]
small_clusters = clusters[(cluster_counts >= 3) & (cluster_counts <= 40)]
big_clusters = clusters[cluster_counts > 40]
for size_group in (singleton_clusters, small_clusters, big_clusters):
    print(size_group)

In [None]:
# Hand-picked clusters (and merged cluster groups) that get their own
# dendrogram figures in the cells below.
cluster_3to4 = gene_cluster_df.loc[
    gene_cluster_df["Dendrogram Clusters"].isin([3, 4])
]
cluster_9to11 = gene_cluster_df.loc[
    gene_cluster_df["Dendrogram Clusters"].isin([9, 10, 11])
]
cluster_2 = gene_cluster_df.loc[gene_cluster_df["Dendrogram Clusters"] == 2]
cluster_12 = gene_cluster_df.loc[gene_cluster_df["Dendrogram Clusters"] == 12]
cluster_13 = gene_cluster_df.loc[gene_cluster_df["Dendrogram Clusters"] == 13]
cluster_14 = gene_cluster_df.loc[gene_cluster_df["Dendrogram Clusters"] == 14]

In [None]:
# Small clusters that are not part of the merged 3-4 and 9-11 groups.
remaining_small_clusters = list(set(small_clusters).difference({3, 4, 9, 10, 11}))

In [None]:
remaining_small_clusters

In [None]:
# One dendrogram per remaining small cluster. Figure size, font size and line
# width all grow linearly with the number of rows in the cluster.
for i in remaining_small_clusters:
    cluster_df = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"] == i]
    n_rows = len(cluster_df)
    make_subset_dendrogram(
        cluster_df,
        "Cluster %s Dendrogram" % i,
        figsize=(int(5.0 * n_rows - 9.0), int(2.0 * n_rows + 3)),
        fontsize=int(4.0 * n_rows) + 4,
        linewidth=int(1.0 * n_rows) + 1,
    )
    plt.show()
#     plt.savefig("./Dendrograms/Cluster_" + str(i) + ".png",dpi=200)

# Merged group of clusters 3 and 4.
make_subset_dendrogram(
    cluster_3to4,
    "Cluster 3 to 4 Dendrogram",
    figsize=(int(5.0 * len(cluster_3to4) - 9.0), int(2.0 * len(cluster_3to4) + 3)),
    fontsize=int(4.0 * len(cluster_3to4)) + 4,
    linewidth=int(1.0 * len(cluster_3to4)) + 1,
)
plt.show()
# plt.savefig("./Dendrograms/Cluster_3to4.png",dpi=200)

# Merged group of clusters 9, 10 and 11.
make_subset_dendrogram(
    cluster_9to11,
    "Cluster 9 to 11 Dendrogram",
    figsize=(
        int(5.0 * len(cluster_9to11) - 9.0),
        int(2.0 * len(cluster_9to11) + 3),
    ),
    fontsize=int(4.0 * len(cluster_9to11)) + 4,
    linewidth=int(1.0 * len(cluster_9to11)) + 1,
)
plt.show()
# plt.savefig("./Dendrograms/Cluster_9to11.png",dpi=200)

In [None]:
# Dendrogram of cluster 2; sizes scale with the number of rows.
make_subset_dendrogram(
    cluster_2,
    "Cluster 2 Dendrogram",
    figsize=(int(5.0 * len(cluster_2) - 9.0), int(2.0 * len(cluster_2) + 3)),
    fontsize=int(4.0 * len(cluster_2)) + 4,
    linewidth=int(1.0 * len(cluster_2)) + 1,
)
plt.show()

In [None]:
# Dendrogram of cluster 12; sizes scale with the number of rows.
make_subset_dendrogram(
    cluster_12,
    "Cluster 12 Dendrogram",
    figsize=(int(5.0 * len(cluster_12) - 9.0), int(2.0 * len(cluster_12) + 3)),
    fontsize=int(4.0 * len(cluster_12)) + 4,
    linewidth=int(1.0 * len(cluster_12)) + 1,
)
plt.show()

In [None]:
# Dendrogram of cluster 13; sizes scale with the number of rows.
make_subset_dendrogram(
    cluster_13,
    "Cluster 13 Dendrogram",
    figsize=(int(5.0 * len(cluster_13) - 9.0), int(2.0 * len(cluster_13) + 3)),
    fontsize=int(4.0 * len(cluster_13)) + 4,
    linewidth=int(1.0 * len(cluster_13)) + 1,
)
plt.show()

In [None]:
# Dendrogram of cluster 14; sizes scale with the number of rows.
make_subset_dendrogram(
    cluster_14,
    "Cluster 14 Dendrogram",
    figsize=(int(5.0 * len(cluster_14) - 9.0), int(2.0 * len(cluster_14) + 3)),
    fontsize=int(4.0 * len(cluster_14)) + 4,
    linewidth=int(1.0 * len(cluster_14)) + 1,
)
plt.show()

In [None]:
# gene_cluster_df.to_csv("2021-07-31_Steady_State_Analysis.csv")

## Clustering on Mean and Variance Behavior

In [None]:
# Column names of the LOWESS score traces, one per transformed parameter.
lowess_params = [f"LOWESS Trace: {param}: score" for param in params_to_transform]

# The LOWESS columns followed by the matching binned-IQR score columns.
lowess_and_iqr_params = lowess_params + [
    f"Binned IQR Interpolation: {param}: score" for param in params_to_transform
]

In [None]:
# Per-row feature arrays: one stacking the LOWESS and IQR score columns,
# one stacking the LOWESS score columns only.
lowess_trace_df["Feature Vector"] = lowess_trace_df.apply(
    lambda row: np.array(row[lowess_and_iqr_params].tolist()), axis=1
)
lowess_trace_df["Feature Vector OnlyLOWESS"] = lowess_trace_df.apply(
    lambda row: np.array(row[lowess_params].tolist()), axis=1
)
# Keep only the rows whose combined feature array contains no NaN.
lowess_trace_df_nan_filtered = lowess_trace_df[
    lowess_trace_df["Feature Vector"].apply(lambda vec: not np.isnan(vec).any())
]

In [None]:
# Rows whose integrated norm reaches this value are treated as "strong effect".
strong_effect_threshold = 35

# Origin with one entry per element along the first axis of a feature array.
zero_vector = np.zeros(
    (1, lowess_trace_df["Feature Vector OnlyLOWESS"].iloc[0].shape[0])
)
# 3-D stack of all LOWESS-only feature arrays
# (presumably sgRNA x feature x timepoint — TODO confirm).
feature_arr = np.array(lowess_trace_df["Feature Vector OnlyLOWESS"].tolist())
# Move axis 1 last and flatten so every row is one vector of length feature_arr.shape[1].
flattened_feature_arr = np.swapaxes(feature_arr, 1, 2).reshape(-1, feature_arr.shape[1])
# Distance to the origin is the Euclidean norm of each vector; reshape back to
# one row per stacked array and one column per position along the original axis 2.
dist_arr = euclidean_distances(flattened_feature_arr, zero_vector).reshape(
    feature_arr.shape[0], feature_arr.shape[2]
)
# Simpson integration of each row of norms (no axis given, so scipy's default applies).
lowess_trace_df["Integrated Euclidean Norm"] = sp.integrate.simpson(dist_arr)
# lowess_trace_df["Max Euclidean Norm"] = np.max(dist_arr,axis=1)

sgrnadf_strong_effect = lowess_trace_df[
    lowess_trace_df["Integrated Euclidean Norm"] >= strong_effect_threshold
]
# Shared histogram range: minimum up to the 99th percentile of the norm.
min_v, max_v = np.min(lowess_trace_df["Integrated Euclidean Norm"]), np.percentile(
    lowess_trace_df["Integrated Euclidean Norm"], 99
)

# Two histograms on the same axes: rows below the threshold, then rows at or above it.
plt.figure(figsize=(8, 8))
plt.title("Integrated Euclidean Norm")
plt.hist(
    lowess_trace_df[
        lowess_trace_df["Integrated Euclidean Norm"] < strong_effect_threshold
    ]["Integrated Euclidean Norm"].tolist(),
    bins=50,
    range=(min_v, max_v),
)
plt.hist(
    lowess_trace_df[
        lowess_trace_df["Integrated Euclidean Norm"] >= strong_effect_threshold
    ]["Integrated Euclidean Norm"].tolist(),
    bins=50,
    range=(min_v, max_v),
)
plt.show()

# How many strong-effect rows each gene has; bin edges at half-integers so
# every integer count gets its own bar.
unique_genes, gene_counts = np.unique(sgrnadf_strong_effect["Gene"], return_counts=True)
plt.title("sgRNAs per Gene")
plt.xticks(range(0, 20, 2), labels=range(0, 20, 2))
plt.hist(gene_counts, bins=np.arange(20) - 0.5)
plt.show()

### Pick Representative Effect per TargetID
Note this may need to be revisited later to resolve transients that are only resolvable at intermediate KO

1) For each target, pick the sgRNA that has the strongest phenotype (highest integrated euclidean norm)
2) Additionally, identify any targets with titration information by saving a dataframe of TargetIDs that possess at least N sgRNAs
    - this is in a preliminary form; transfer to a full notebook later

In [None]:
# For every TargetID keep only the sgRNA row with the largest integrated norm.
# Rows are selected by their `idxmax` label rather than through
# `groupby(...).apply(...)`, so the "TargetID" column and the column dtypes are
# preserved no matter how the installed pandas passes grouping columns to
# `apply` (that behaviour is deprecated as of pandas 2.2).
most_rep_example_series = (
    sgrnadf_strong_effect.reset_index(drop=False)
    .pipe(
        lambda flat_df: flat_df.loc[
            flat_df.groupby("TargetID")["Integrated Euclidean Norm"].idxmax()
        ]
    )
    .reset_index(drop=True)
    .set_index("sgRNA", drop=True)
)

# Normalise the selected feature vectors, swap the last two axes of the result,
# and store one array per row.
normalized_timeseries = np.swapaxes(
    normalize_timeseries(most_rep_example_series["Feature Vector"], lmbda=0.5), 1, 2
)
most_rep_example_series["Normalized Feature Vector"] = list(normalized_timeseries)

### Effect Distance Metrics

Now, I want to evaluate how well different distance metrics separate the data maximally while also preserving similarity within replicates:

- DTW (can be done with cosine similarity) 
- cosine similarity (same as pearson for z-scores)
- cross correlation

Seems like soft-DTW is a pretty good option. Going forward with that for now.

<!-- In the end cosine similarity was chosen as it produced superior silhouette scores for sets of targets from genes with different phenotypes. -->

In [None]:
# Representative rows of a few hand-picked genes, used to compare distance metrics.
sgrnadf_examples_for_distance_metric = most_rep_example_series.loc[
    most_rep_example_series["Gene"].isin(["ftsN", "rplA", "mreB", "tufB", "tff"])
]

In [None]:
from tslearn.metrics import (
    dtw,
    cdist_dtw,
    dtw_path_from_metric,
    cdist_soft_dtw,
    cdist_soft_dtw_normalized,
)
import tslearn
from tslearn.clustering import TimeSeriesKMeans

In [None]:
# Stack the example vectors into one array and swap its last two axes before
# handing it to tslearn.
timeseries_arr = np.array(
    sgrnadf_examples_for_distance_metric["Normalized Feature Vector"].tolist()
).swapaxes(1, 2)

In [None]:
# Silhouette score of the gene labels under soft-DTW for a sweep of gamma values.
for gamma in [0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]:
    softdtw_silhouette = tslearn.clustering.silhouette_score(
        timeseries_arr,
        sgrnadf_examples_for_distance_metric["Gene"].tolist(),
        metric="softdtw",
        gamma=gamma,
    )
    print("Soft-DTW Gamma=%s: %s" % (gamma, softdtw_silhouette))

# Same score for DTW with a Sakoe-Chiba band of radius 3, once per point-wise
# metric. The pairwise matrix is symmetric, so only the upper triangle is
# computed and then mirrored.
for dtw_label, pointwise_metric in (("Cosine", "cosine"), ("Euclidean", "euclidean")):
    dist_mat = np.zeros((timeseries_arr.shape[0], timeseries_arr.shape[0]))
    for i in range(timeseries_arr.shape[0]):
        for j in range(i + 1, timeseries_arr.shape[0]):
            dist = dtw_path_from_metric(
                timeseries_arr[i],
                timeseries_arr[j],
                metric=pointwise_metric,
                global_constraint="sakoe_chiba",
                sakoe_chiba_radius=3,
            )[1]
            dist_mat[i, j] = dist
            dist_mat[j, i] = dist
    dtw_silhouette = tslearn.clustering.silhouette_score(
        dist_mat,
        sgrnadf_examples_for_distance_metric["Gene"].tolist(),
        metric="precomputed",
    )
    print(dtw_label + "-DTW: " + str(dtw_silhouette))

In [None]:
# Pairwise soft-DTW matrix between all example time series (no gamma passed, so tslearn's default is used).
soft_dtw_dist_arr = tslearn.metrics.cdist_soft_dtw(timeseries_arr)

In [None]:
# Distribution of every entry of the pairwise soft-DTW matrix.
plt.hist(soft_dtw_dist_arr.ravel(), bins=100)
plt.show()

### Detecting different effects against single genes

1) Plot a histogram of minimum soft-DTW similarity within groups of TargetIDs against the same genes (for genes with more than one targetID)
2) Use affinity propagation to select the number of phenotype clusters to use per gene (preference still needs to be dialed in, not sure how to optimize on this)
3) Among each cluster, represent the final effect as the strongest effect (integrated euc norm) of the members of the cluster

~~3) Among each cluster, represent the final effect as the median of the members of the cluster~~


In [None]:
def get_normed_softdtw(feature_vector_series):
    """Return the pairwise normalized soft-DTW matrix, scaled by series size.

    The per-row arrays are stacked, axes 1 and 2 of the stack are swapped
    (presumably to tslearn's (series, time, feature) layout — TODO confirm)
    and passed to ``cdist_soft_dtw_normalized``. The result is divided by the
    number of elements in one per-row array.

    Args:
        feature_vector_series: pandas Series whose values are equally shaped
            2-D arrays.

    Returns:
        Square array with one row and one column per entry of the Series.
    """
    stacked = np.array(feature_vector_series.tolist())
    dist_mat = cdist_soft_dtw_normalized(np.swapaxes(stacked, 1, 2))
    # Size of a single per-row array. Taken from the stacked shape instead of
    # `feature_vector_series[0]`, which is a label lookup and breaks (or warns)
    # when the Series index has no label 0.
    timeseries_len = stacked.shape[1] * stacked.shape[2]
    return dist_mat / timeseries_len


def get_upper_right_vals(a):
    """Return a float copy of ``a`` with only the strict upper triangle kept.

    Entries on and below the main diagonal are replaced by NaN so that
    NaN-aware reductions (``np.nanmax``, ``np.nanmedian``) only see each
    unordered pair once. Entries above the diagonal are kept as they are,
    including genuine zeros. The input is not modified.

    Args:
        a: Square 2-D array-like of pairwise values.

    Returns:
        Array of the same shape as ``a``.
    """
    a = np.asarray(a)
    # Mask by position rather than by value, so a real distance of 0.0 above
    # the diagonal is not mistaken for a masked-out entry.
    strict_upper = np.triu(np.ones(a.shape, dtype=bool), k=1)
    return np.where(strict_upper, a, np.nan)


def get_sgRNA_clusters(df, preference=0.6):
    """Cluster the rows of each gene by affinity propagation on soft-DTW distance.

    Args:
        df: DataFrame with "Gene", "Normalized Feature Vector" and "TargetID"
            columns; after ``reset_index`` it must also expose an "sgRNA"
            column (i.e. "sgRNA" is either the index name or a column).
        preference: ``preference`` passed to ``AffinityPropagation``. The
            affinity matrix is the negated distance matrix from
            ``get_normed_softdtw``.

    Returns:
        DataFrame indexed by "Gene" (sorted) with columns "sgRNA",
        "Normalized Feature Vector", "TargetID" and an integer
        "sgRNA Cluster" label. Genes with a single row get label 0.
    """
    gene_indexed_df = (
        df.reset_index(drop=False)
        .set_index("Gene")[["sgRNA", "Normalized Feature Vector", "TargetID"]]
        .sort_index()
    )
    # Every row starts in cluster 0; only multi-row genes are overwritten below.
    gene_indexed_df["sgRNA Cluster"] = 0

    # Visit each gene once. The index repeats a gene for every one of its
    # rows, so iterating the raw index would refit the same model repeatedly.
    for gene in gene_indexed_df.index.unique():
        gene_df = gene_indexed_df.loc[[gene]]
        if len(gene_df) <= 1:
            continue
        soft_dtw_dist = get_normed_softdtw(gene_df["Normalized Feature Vector"])
        af_labels = (
            AffinityPropagation(
                affinity="precomputed", preference=preference, random_state=42
            )
            .fit_predict(-soft_dtw_dist)
            .astype(int)
        )
        gene_indexed_df.loc[gene, "sgRNA Cluster"] = af_labels

    gene_indexed_df["sgRNA Cluster"] = gene_indexed_df["sgRNA Cluster"].astype(int)
    return gene_indexed_df

In [None]:
# Minimum number of rows a gene needs to be used for the within-gene distance statistics.
n_sgrna_replicate_thr = 2
# Multiplier applied to the median within-gene similarity to get the
# AffinityPropagation preference.
pref_factor = 3.0

gene_list, counts_list = np.unique(most_rep_example_series["Gene"], return_counts=True)
genes_with_many_replicate_sgRNAs = gene_list[counts_list >= n_sgrna_replicate_thr]
sgrnadf_many_copies_per_gene = most_rep_example_series[
    most_rep_example_series["Gene"].isin(genes_with_many_replicate_sgRNAs)
]

# Largest pairwise distance inside each gene (NaN-masked lower triangle is ignored).
max_distance_within_gene = sgrnadf_many_copies_per_gene.groupby("Gene").apply(
    lambda x: np.nanmax(
        get_upper_right_vals(get_normed_softdtw(x["Normalized Feature Vector"]))
    )
)
plt.title("Maximum soft-DTW Distance per Gene")
plt.hist(max_distance_within_gene, bins=50)
plt.show()

# All within-gene pairwise distances pooled into one flat list; the masked
# entries stay in the list as NaN and are skipped by nanmedian.
dist_within_gene = sgrnadf_many_copies_per_gene.groupby("Gene").apply(
    lambda x: get_upper_right_vals(
        get_normed_softdtw(x["Normalized Feature Vector"])
    ).flatten()
)
dist_within_gene = [val for item in dist_within_gene.tolist() for val in item]
# Negated because get_sgRNA_clusters feeds AffinityPropagation the negated distances.
median_similarity = -np.nanmedian(dist_within_gene)

gene_df = get_sgRNA_clusters(
    most_rep_example_series, preference=pref_factor * median_similarity
)

# Assignment aligns on the sgRNA index shared by both frames.
most_rep_example_series["sgRNA Cluster"] = gene_df.set_index("sgRNA")["sgRNA Cluster"]
most_rep_example_series["sgRNA Cluster Label"] = most_rep_example_series.apply(
    lambda x: str(x["Gene"]) + "-" + str(x["sgRNA Cluster"]), axis=1
)

gene_cluster_df = most_rep_example_series[
    [
        "sgRNA Cluster Label",
        "Normalized Feature Vector",
        "Gene",
        "Integrated Euclidean Norm",
    ]
    + ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
].reset_index(drop=True)
gene_cluster_groupby = gene_cluster_df.groupby("sgRNA Cluster Label")
# median_feature_series = gene_cluster_groupby.apply(lambda x: np.median(np.stack(x["Feature Vector"]).astype(float), axis=0)).to_frame().rename(columns={0:"Feature Vector"})
# Each cluster label is represented by the feature vector of its member with
# the largest integrated norm.
feature_series = (
    gene_cluster_groupby.apply(
        lambda x: x.iloc[np.argmax(x["Integrated Euclidean Norm"])][
            "Normalized Feature Vector"
        ]
    )
    .to_frame()
    .rename(columns={0: "Normalized Feature Vector"})
)

# Gene name and LOWESS score columns are taken from the first row of each
# cluster label, which need not be the row chosen for `feature_series` above.
gene_cluster_df = gene_cluster_groupby.apply(
    lambda x: x.iloc[0][
        ["Gene"]
        + ["LOWESS Trace: " + param + ": score" for param in params_to_transform]
    ]
)
gene_cluster_df = gene_cluster_df.join(feature_series)

### Clustering: TSNE and Affinity Propagation

In [None]:
# Pairwise scaled soft-DTW matrix between the representative vectors of all cluster labels.
X_dist = get_normed_softdtw(gene_cluster_df["Normalized Feature Vector"])

In [None]:
# 2-D t-SNE embedding of the precomputed distances. The matrix is shifted by
# its minimum before fitting, so no entry is negative. `init="random"` is
# given explicitly: it was the default in older scikit-learn releases, and the
# newer "pca" default is rejected together with metric="precomputed".
X_embedded = TSNE(
    n_components=2,
    perplexity=5.0,
    early_exaggeration=50.0,
    metric="precomputed",
    init="random",
).fit_transform(X_dist - np.min(X_dist))
gene_cluster_df["TSNE Coords"] = list(X_embedded)

# Affinity propagation on the negated distances (similarity = -distance).
af_labels = (
    AffinityPropagation(affinity="precomputed", preference=-0.5)
    .fit_predict(-X_dist)
    .astype(int)
)
gene_cluster_df["Affinity Clusts"] = af_labels

# Embedding coloured by affinity-propagation cluster.
plt.scatter(
    X_embedded[:, 0],
    X_embedded[:, 1],
    s=3,
    alpha=1,
    c=gene_cluster_df["Affinity Clusts"],
)

### Hierarchical Clustering

In [None]:
# Display names of the measured quantities; the plotting helpers look them up
# by position along the first axis of a feature vector.
feature_labels = [
    "Birth Length",
    "Division Length",
    "Area Growth Rate",
    "Length Growth Rate",
    "Average Width",
    "mCherry Intensity",
    "Cell Cycle Duration",
]

# Append one ": IQR" label per quantity, in the same order.
feature_labels += [f"{label}: IQR" for label in feature_labels]

hierarchical_labels = gene_cluster_df.index.tolist()


def get_leaf_children(tree, leaf_id):
    """Return the ids collected by a pre-order walk below ``tree[leaf_id]``.

    Args:
        tree: Sequence of cluster nodes indexable by node id (e.g. the node
            list returned by ``scipy.cluster.hierarchy.to_tree(..., rd=True)``).
        leaf_id: Id of the node whose descendants are wanted.

    Returns:
        List of ``node.id`` values produced by ``pre_order``.
    """
    return tree[leaf_id].pre_order(lambda node: node.id)


def assign_dendro_clusts(df, children_labels):
    """Return a copy of ``df`` with an integer "Dendrogram Clusters" column.

    Args:
        df: DataFrame whose rows were clustered.
        children_labels: Sequence whose ``k``-th entry lists the positional
            row indices (as used by ``iloc``) belonging to cluster ``k``.

    Returns:
        Copy of ``df`` in which every row listed in ``children_labels[k]``
        has "Dendrogram Clusters" equal to ``k``. ``df`` is left untouched.

    Raises:
        ValueError: If some row of ``df`` appears in no entry of
            ``children_labels``.
    """
    df_out = df.copy(deep=True)

    # Labels are built in a plain array and assigned in one step. Writing
    # through `df_out[col].iloc[...]` is chained assignment, which does not
    # modify the frame when pandas Copy-on-Write is active.
    cluster_ids = np.full(len(df_out), -1, dtype=int)
    for clust_i, indices in enumerate(children_labels):
        cluster_ids[np.asarray(indices, dtype=np.intp)] = clust_i

    if (cluster_ids < 0).any():
        raise ValueError(
            "children_labels does not cover every row of df; "
            "cannot assign integer cluster labels."
        )

    df_out["Dendrogram Clusters"] = cluster_ids
    return df_out


# Number of leaves kept when the global dendrogram is truncated; also the
# number of trace columns laid out beneath it.
suppress_thr = 15
# Z-score bounds handed to the plotting helpers as colour-normalisation limits.
min_zscore = -2
max_zscore = 2


def compute_and_plot_dendrogram(
    df,
    X_dist,
    feature_labels,
    suppress_thr,
    min_zscore,
    max_zscore,
    cmap=mpl.cm.coolwarm,
):
    """Plot a truncated dendrogram with the mean traces of every shown leaf.

    The rows of ``df`` are clustered hierarchically ("weighted" linkage with
    optimal leaf ordering) from the square distance matrix ``X_dist``. The
    dendrogram is truncated to its last ``suppress_thr`` merged clusters and
    drawn in the top row of the figure. Beneath each displayed leaf, the
    element-wise mean of the "Normalized Feature Vector" arrays of all rows
    under that leaf is drawn as vertically stacked line plots.

    Args:
        df: DataFrame with a "Normalized Feature Vector" column, row-aligned
            with ``X_dist``.
        X_dist: Square pairwise distance matrix.
        feature_labels: Y-axis labels, looked up by position along the first
            axis of the mean vector. Only drawn on the first column.
        suppress_thr: Number of dendrogram leaves to display (``p`` of
            ``truncate_mode="lastp"``) and number of grid columns.
        min_zscore: Unused; kept so existing callers keep working.
        max_zscore: Unused; kept so existing callers keep working.
        cmap: Unused; kept so existing callers keep working.

    Returns:
        List with one entry per displayed leaf, in dendrogram order, each a
        list of the positional row indices of ``df`` under that leaf.
    """
    # Compute and plot dendrogram.
    fig = plt.figure(constrained_layout=True, figsize=(20, 10))
    gs = fig.add_gridspec(2, suppress_thr)
    dendro_ax = fig.add_subplot(gs[0, :])

    # linkage expects the condensed (upper-triangular) form of the matrix.
    Y = sch.linkage(
        sp.spatial.distance.squareform(X_dist), method="weighted", optimal_ordering=True
    )
    # Node list indexed by node id, used to expand truncated leaves into rows.
    cluster_tree = sch.to_tree(Y, rd=True)[1]

    Z = sch.dendrogram(
        Y,
        orientation="top",
        show_leaf_counts=True,
        leaf_rotation=90.0,
        leaf_font_size=12.0,
        truncate_mode="lastp",
        show_contracted=True,
        p=suppress_thr,
        ax=dendro_ax,
        no_labels=True,
    )
    children_labels = [get_leaf_children(cluster_tree, leaf) for leaf in Z["leaves"]]

    for i, children in enumerate(children_labels):
        children_arr = np.array(
            df.iloc[children]["Normalized Feature Vector"].tolist(), dtype=float
        )
        mean_vector = np.mean(children_arr, axis=0)  # feature,timepoint
        # Only the left-most column carries y ticks and feature names.
        is_first_column = i == 0

        inner_gs = gs[1, i].subgridspec(mean_vector.shape[0], 1, wspace=0, hspace=0)
        inner_grid_sub = inner_gs.subplots(sharex=True)
        for c, ax in np.ndenumerate(inner_grid_sub):
            ax.plot(mean_vector[c])
            ax.set_ylim(-6, 12)
            if is_first_column:
                ax.set(xticks=[], yticks=[-4, 0.0, 10])
                ax.set_ylabel(
                    feature_labels[c[0]],
                    rotation=0,
                    labelpad=30,
                    fontsize=18,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # `ax` is the last (bottom) axes of the column; label it with the
        # leaf's position, which is the cluster number used downstream.
        ax.set_xlabel(str(i), fontsize=18)

    # NOTE(review): the figure is created with constrained_layout=True and
    # tight_layout is then applied on top; kept so rendering does not change.
    plt.tight_layout()

    return children_labels


def plot_subset(
    df_subset,
    min_zscore=min_zscore,
    max_zscore=max_zscore,
    feature_labels=feature_labels,
    figsize=(10, 10),
    wspace=0.0,
):
    """Plot the feature traces of every row of ``df_subset`` side by side.

    Rows are ordered by their "Dendrogram Clusters" value. Each row becomes
    one column of vertically stacked line plots (one per entry along the
    first axis of its "Normalized Feature Vector"), labelled underneath with
    its "sgRNA Cluster Label" and dendrogram cluster number.

    Args:
        df_subset: DataFrame with "Dendrogram Clusters" and
            "Normalized Feature Vector" columns; after ``reset_index`` it
            must also expose an "sgRNA Cluster Label" column.
        min_zscore: Unused; kept so existing callers keep working.
        max_zscore: Unused; kept so existing callers keep working.
        feature_labels: Y-axis labels, looked up by position along the first
            axis of the feature vector. Only drawn on the first column.
        figsize: Figure size passed to ``plt.figure``.
        wspace: Horizontal spacing between the per-row columns.

    Raises:
        ValueError: If ``df_subset`` has no rows.
    """
    # A grid with zero columns cannot be created; fail with a clear message
    # before an empty figure is opened.
    if len(df_subset) == 0:
        raise ValueError("plot_subset requires at least one row; got an empty subset.")

    df_clusts = (
        df_subset.sort_index()
        .reset_index(drop=False)
        .set_index("Dendrogram Clusters")[
            ["sgRNA Cluster Label", "Normalized Feature Vector"]
        ]
        .sort_index()
    )

    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(1, len(df_clusts), wspace=wspace)

    for i in range(len(df_clusts)):
        clust_arr = np.array(
            df_clusts["Normalized Feature Vector"].iloc[i].tolist(), dtype=float
        )
        # Only the left-most column carries y ticks and feature names.
        is_first_column = i == 0

        inner_gs = gs[0, i].subgridspec(clust_arr.shape[0], 1, wspace=0, hspace=0)
        inner_grid_sub = inner_gs.subplots(sharex=True)
        for c, ax in np.ndenumerate(inner_grid_sub):
            ax.plot(clust_arr[c])
            ax.set_ylim(-6, 12)
            if is_first_column:
                ax.set(xticks=[], yticks=[-4, 0.0, 10.0])
                ax.set_ylabel(
                    feature_labels[c[0]],
                    rotation=0,
                    labelpad=30,
                    fontsize=18,
                    ha="right",
                )
            else:
                ax.set(xticks=[], yticks=[])

        # `ax` is the last (bottom) axes of the column after the loop above.
        ax.set_xlabel(
            df_clusts["sgRNA Cluster Label"].iloc[i]
            + "\n Cluster "
            + str(df_clusts.index[i]),
            fontsize=14,
        )

    # NOTE(review): the figure is created with constrained_layout=True and
    # tight_layout is then applied on top; kept so rendering does not change.
    plt.tight_layout()


def make_subset_dendrogram(
    sub_df,
    title,
    feature_labels=feature_labels,
    min_zscore=min_zscore,
    max_zscore=max_zscore,
    figsize=(10, 10),
    fontsize=18,
    linewidth=5,
):
    """Draw a dendrogram of ``sub_df`` with per-leaf feature traces underneath.

    The rows of ``sub_df`` are hierarchically clustered ("weighted" linkage
    with optimal leaf ordering) on the pairwise distances returned by
    ``get_normed_softdtw`` for the "Normalized Feature Vector" column.  The
    top grid row of the figure holds the dendrogram; the bottom grid row holds
    one column per leaf, in dendrogram order, made of vertically stacked line
    plots (one per row of that leaf's feature array).

    Parameters
    ----------
    sub_df : pandas.DataFrame
        Needs a "Normalized Feature Vector" column; its index values are used
        as the x-labels under each leaf column.
        NOTE(review): each cell is presumably a 2-D array of shape
        (n_features, n_timepoints) -- confirm against the code that builds it.
    title : str
        Figure suptitle.
    feature_labels : sequence of str
        Row labels; entry ``k`` labels the k-th trace of the left-most leaf.
    min_zscore, max_zscore :
        Accepted for backward compatibility; not used by the current drawing
        code (they only fed a colour normalisation that was never applied).
    figsize : tuple
        Passed to ``plt.figure``.
    fontsize : int
        Font size for the title, feature labels and leaf labels.
    linewidth : int
        Line width of the feature traces.

    Returns
    -------
    None.  The figure is left as the current pyplot figure.
    """
    X_dist = get_normed_softdtw(sub_df["Normalized Feature Vector"])

    # NOTE(review): the figure is created with constrained_layout but
    # plt.tight_layout() is called at the end; matplotlib treats the two as
    # competing layout managers -- decide which one is intended.
    fig = plt.figure(constrained_layout=True, figsize=figsize)
    gs = fig.add_gridspec(2, len(sub_df))
    dendro_ax = fig.add_subplot(gs[0, :])

    # linkage expects the condensed (1-D) form of the square distance matrix.
    Y = sch.linkage(
        sp.spatial.distance.squareform(X_dist), method="weighted", optimal_ordering=True
    )

    Z = sch.dendrogram(
        Y,
        orientation="top",
        show_leaf_counts=True,
        leaf_rotation=90.0,
        leaf_font_size=12.0,
        show_contracted=True,
        ax=dendro_ax,
        no_labels=True,
    )

    fig.suptitle(title, fontsize=fontsize)

    # Z["leaves"] lists row positions of sub_df in left-to-right dendrogram
    # order, so column i of the bottom grid row sits under the i-th leaf.
    for i, leaf in enumerate(Z["leaves"]):
        leaf_arr = np.array(
            sub_df.iloc[leaf]["Normalized Feature Vector"].tolist(), dtype=float
        )
        is_first_column = i == 0

        inner_gs = gs[1, i].subgridspec(leaf_arr.shape[0], 1, wspace=0, hspace=0)
        # squeeze=False always yields a 2-D array of Axes, so a feature array
        # with a single row is handled like any other.
        trace_axes = inner_gs.subplots(sharex=True, squeeze=False)[:, 0]

        for row, ax in enumerate(trace_axes):
            ax.plot(leaf_arr[row], linewidth=linewidth)
            ax.set_ylim(-6, 12)
            # Only the left-most column carries y ticks and feature labels.
            ax.set(xticks=[], yticks=[-4, 0.0, 8] if is_first_column else [])
            if is_first_column:
                ax.set_ylabel(
                    feature_labels[row],
                    rotation=0,
                    labelpad=30,
                    fontsize=fontsize,
                    ha="right",
                )

        # The leaf's index label goes under the bottom trace of its column.
        trace_axes[-1].set_xlabel(sub_df.index[leaf], fontsize=fontsize, rotation=90)

    plt.tight_layout()


#         imshow_ax.tick_params(axis='x',which='both',bottom=False,top=False,labelbottom=False)
#         imshow_ax.tick_params(axis='y',which='both',left=False,right=False,labelbottom=False)

#         imshow_ax.set_xlabel(sub_df.index[leaf], fontsize=fontsize)

In [None]:
# compute_and_plot_dendrogram(df,X_dist,feature_labels,suppress_thr,min_zscore,max_zscore,cmap=mpl.cm.coolwarm)

In [None]:
# Draw the dendrogram for gene_cluster_df and keep the value returned by
# compute_and_plot_dendrogram; the following cell passes it to
# assign_dendro_clusts.
# NOTE(review): X_dist, feature_labels, suppress_thr, min_zscore and
# max_zscore are defined in earlier cells -- presumably X_dist is the pairwise
# distance matrix for gene_cluster_df; confirm before re-running out of order.
children_labels = compute_and_plot_dendrogram(
    gene_cluster_df,
    X_dist,
    feature_labels,
    suppress_thr,
    min_zscore,
    max_zscore,
    cmap=mpl.cm.coolwarm,
)
# plt.savefig("./Dendrograms/Global_Dendrogram.png", dpi=300)

In [None]:
# Rebind gene_cluster_df to the result of assign_dendro_clusts, which receives
# the children_labels produced by the dendrogram cell.
# NOTE(review): later cells read a "Dendrogram Clusters" column -- presumably
# this call is what adds it; confirm in assign_dendro_clusts.
gene_cluster_df = assign_dendro_clusts(gene_cluster_df, children_labels)

#### Major System Analysis

In [None]:
fts_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "fts" in x["Gene"], axis=1)
]

In [None]:
plot_subset(fts_subset, figsize=(20, 8))
plt.tight_layout()
# plt.savefig("./Gene_Groups/fts.png",dpi=200,bbox_inches="tight")

In [None]:
rpl_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "rpl" in x["Gene"], axis=1)
]

In [None]:
plot_subset(rpl_subset, figsize=(30, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rpl.png",dpi=200,bbox_inches="tight")

In [None]:
rpm_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "rpm" in x["Gene"], axis=1)
]

In [None]:
plot_subset(rpm_subset, figsize=(30, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rpm.png",dpi=200,bbox_inches="tight")

In [None]:
rps_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "rps" in x["Gene"], axis=1)
]

In [None]:
plot_subset(rps_subset, figsize=(30, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rps.png",dpi=200,bbox_inches="tight")

In [None]:
rr_subset = gene_cluster_df[gene_cluster_df.apply(lambda x: "rr" in x["Gene"], axis=1)]

In [None]:
plot_subset(rr_subset, figsize=(15, 10))

In [None]:
tff_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "tff" in x["Gene"], axis=1)
]

In [None]:
plot_subset(tff_subset, figsize=(8, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/tff.png",dpi=200,bbox_inches="tight")

In [None]:
rpo_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "rpo" in x["Gene"], axis=1)
]

In [None]:
plot_subset(rpo_subset, figsize=(10, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rpo.png",dpi=200,bbox_inches="tight")

In [None]:
min_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "min" in x["Gene"], axis=1)
]

In [None]:
plot_subset(min_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/min.png",dpi=200,bbox_inches="tight")

In [None]:
dna_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "dna" in x["Gene"], axis=1)
]

In [None]:
plot_subset(dna_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/dna.png",dpi=200,bbox_inches="tight")

In [None]:
fol_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "fol" in x["Gene"], axis=1)
]

In [None]:
plot_subset(fol_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/fol.png",dpi=200,bbox_inches="tight")

In [None]:
muk_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "muk" in x["Gene"], axis=1)
]

In [None]:
plot_subset(muk_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/muk.png",dpi=200,bbox_inches="tight")

In [None]:
mre_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "mre" in x["Gene"], axis=1)
]

In [None]:
plot_subset(mre_subset, figsize=(10, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/mre.png",dpi=200,bbox_inches="tight")

In [None]:
mur_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "mur" in x["Gene"], axis=1)
]

In [None]:
plot_subset(mur_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/mur.png",dpi=200,bbox_inches="tight")

In [None]:
nus_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "nus" in x["Gene"], axis=1)
]

In [None]:
plot_subset(nus_subset, figsize=(8, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/nus.png",dpi=200,bbox_inches="tight")

In [None]:
sec_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "sec" in x["Gene"], axis=1)
]

In [None]:
plot_subset(sec_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/sec.png",dpi=200,bbox_inches="tight")

In [None]:
bam_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "bam" in x["Gene"], axis=1)
]

In [None]:
plot_subset(bam_subset, figsize=(6, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/bam.png",dpi=200,bbox_inches="tight")

In [None]:
hol_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "hol" in x["Gene"], axis=1)
]

In [None]:
plot_subset(hol_subset, figsize=(12, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/hol.png",dpi=200,bbox_inches="tight")

In [None]:
hda_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "hda" in x["Gene"], axis=1)
]

In [None]:
plot_subset(hda_subset, figsize=(6, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/hda.png",dpi=200,bbox_inches="tight")

In [None]:
rodZ_subset = gene_cluster_df[
    gene_cluster_df.apply(lambda x: "rodZ" in x["Gene"], axis=1)
]

In [None]:
plot_subset(rodZ_subset, figsize=(6, 10))
plt.tight_layout()
# plt.savefig("./Gene_Groups/rodz.png",dpi=200,bbox_inches="tight")

#### Cluster Analysis

In [None]:
cluster_7 = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"].isin([7])]

In [None]:
make_subset_dendrogram(
    cluster_7,
    "Cluster 7 Dendrogram",
    figsize=(int((len(cluster_7) * 5.0) - 9.0), int((len(cluster_7) * 2.0) + 3)),
    fontsize=4 + int(len(cluster_7) * 4.0),
    linewidth=1 + int(len(cluster_7) * 1.0),
)
plt.show()

In [None]:
clusters, cluster_counts = np.unique(
    gene_cluster_df["Dendrogram Clusters"], return_counts=True
)
singleton_clusters = clusters[cluster_counts < 3]
small_clusters = clusters[(cluster_counts <= 40) & (cluster_counts >= 3)]
big_clusters = clusters[cluster_counts > 40]
print(singleton_clusters)
print(small_clusters)
print(big_clusters)

In [None]:
cluster_3to4 = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"].isin([3, 4])]
cluster_9to11 = gene_cluster_df[
    gene_cluster_df["Dendrogram Clusters"].isin([9, 10, 11])
]
cluster_2 = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"].isin([2])]
cluster_12 = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"].isin([12])]
cluster_13 = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"].isin([13])]
cluster_14 = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"].isin([14])]

In [None]:
remaining_small_clusters = list(set(small_clusters) - set([3, 4, 9, 10, 11]))

In [None]:
remaining_small_clusters

In [None]:
for i in remaining_small_clusters:
    cluster_df = gene_cluster_df[gene_cluster_df["Dendrogram Clusters"].isin([i])]
    make_subset_dendrogram(
        cluster_df,
        "Cluster " + str(i) + " Dendrogram",
        figsize=(int((len(cluster_df) * 5.0) - 9.0), int((len(cluster_df) * 2.0) + 3)),
        fontsize=4 + int(len(cluster_df) * 4.0),
        linewidth=1 + int(len(cluster_df) * 1.0),
    )
    plt.show()
#     plt.savefig("./Dendrograms/Cluster_" + str(i) + ".png",dpi=200)

make_subset_dendrogram(
    cluster_3to4,
    "Cluster 3 to 4 Dendrogram",
    figsize=(int((len(cluster_3to4) * 5.0) - 9.0), int((len(cluster_3to4) * 2.0) + 3)),
    fontsize=4 + int(len(cluster_3to4) * 4.0),
    linewidth=1 + int(len(cluster_3to4) * 1.0),
)
plt.show()
# plt.savefig("./Dendrograms/Cluster_6to8.png",dpi=200)

make_subset_dendrogram(
    cluster_9to11,
    "Cluster 9 to 11 Dendrogram",
    figsize=(
        int((len(cluster_9to11) * 5.0) - 9.0),
        int((len(cluster_9to11) * 2.0) + 3),
    ),
    fontsize=4 + int(len(cluster_9to11) * 4.0),
    linewidth=1 + int(len(cluster_9to11) * 1.0),
)
plt.show()
# plt.savefig("./Dendrograms/Cluster_9to10.png",dpi=200)

In [None]:
make_subset_dendrogram(
    cluster_2,
    "Cluster 2 Dendrogram",
    figsize=(int((len(cluster_2) * 5.0) - 9.0), int((len(cluster_2) * 2.0) + 3)),
    fontsize=4 + int(len(cluster_2) * 4.0),
    linewidth=1 + int(len(cluster_2) * 1.0),
)
plt.show()

In [None]:
make_subset_dendrogram(
    cluster_12,
    "Cluster 12 Dendrogram",
    figsize=(int((len(cluster_12) * 5.0) - 9.0), int((len(cluster_12) * 2.0) + 3)),
    fontsize=4 + int(len(cluster_12) * 4.0),
    linewidth=1 + int(len(cluster_12) * 1.0),
)
plt.show()

In [None]:
make_subset_dendrogram(
    cluster_13,
    "Cluster 13 Dendrogram",
    figsize=(int((len(cluster_13) * 5.0) - 9.0), int((len(cluster_13) * 2.0) + 3)),
    fontsize=4 + int(len(cluster_13) * 4.0),
    linewidth=1 + int(len(cluster_13) * 1.0),
)
plt.show()

In [None]:
make_subset_dendrogram(
    cluster_14,
    "Cluster 14 Dendrogram",
    figsize=(int((len(cluster_14) * 5.0) - 9.0), int((len(cluster_14) * 2.0) + 3)),
    fontsize=4 + int(len(cluster_14) * 4.0),
    linewidth=1 + int(len(cluster_14) * 1.0),
)
plt.show()

In [None]:
# gene_cluster_df.to_csv("2021-07-31_Steady_State_Analysis.csv")

### Gene Browser

In [None]:
df = final_output_df_pd.groupby("sgRNA").apply(lambda x: x.iloc[0])
df["phenotype trenchids"] = final_output_df_pd.groupby("sgRNA").apply(
    lambda x: x["phenotype trenchid"].tolist()
)
df = df[
    [
        "Gene",
        "Target Sequence",
        "phenotype trenchids",
        "N Mismatch",
        "N Target Sites",
        "Category",
        "Strand",
    ]
]

In [None]:
kymo_xarr = tr.kymo_xarr(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Growth_Division"
)
wrapped_kymo_xarr = tr.kymo_xarr(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Growth_Division",
    unwrap=False,
)

In [None]:
(
    gene_table_layout,
    select_gene,
    select_trenchid,
    select_unpacked_trenchid,
) = tr.linked_gene_table(
    df, trenchids_as_list=True, trenchid_column="phenotype trenchids"
)

In [None]:
gene_table_layout

In [None]:
output_display, save_button = tr.linked_kymograph_for_gene_table(
    kymo_xarr,
    wrapped_kymo_xarr,
    df,
    select_gene,
    select_trenchid,
    select_unpacked_trenchid=select_unpacked_trenchid,
    trenchid_column="phenotype trenchids",
    y_scale=3,
    x_window_size=300,
)

In [None]:
output_display

In [None]:
save_button  ## NEED OPTION WHETHER OR NOT TO NORM SIGNAL