## Feature processing before main analysis

- Note that there are fluctuations in the illumination intensity which may be resulting in pathological behavior from the reporter

- This has been normalized out in the upstream processing, but try to fix long term

- Also consider a flat field correction for the final experiment

In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da
import dask
import warnings
import copy
import random
from sklearn.metrics.pairwise import (
    euclidean_distances,
    manhattan_distances,
    cosine_distances,
)

from sklearn.metrics import silhouette_score
import scipy.stats
from sklearn.linear_model import LinearRegression
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering

from matplotlib import pyplot as plt
import ast


import pylab
import scipy.cluster.hierarchy as sch

import matplotlib.gridspec as gridspec
import matplotlib as mpl

import holoviews as hv

# Select bokeh as the holoviews plotting backend.
hv.extension("bokeh")
# Seed the stdlib and NumPy global RNGs with a fixed value.
random.seed(42)
np.random.seed(42)

# Report each distinct UserWarning once instead of every time it is raised.
warnings.filterwarnings(action="once", category=UserWarning)

In [None]:
def get_sgrnadf_from_scoredf(
    scoredf, feature_labels, time_label="final cell timepoints list"
):
    """Collapse a per-trench score table into one row per sgRNA.

    Parameters
    ----------
    scoredf : pandas.DataFrame
        Table with an "sgRNA" column plus "phenotype trenchid", "Gene",
        "TargetID", "N Mismatch", "Category", ``time_label`` and one
        ``"<feature>: score"`` column per entry of ``feature_labels``. The
        score and time columns must hold an iterable in every row.
    feature_labels : iterable of str
        Feature names whose ``"<feature>: score"`` columns are pooled.
    time_label : str
        Column holding the per-row timepoint sequences.

    Returns
    -------
    pandas.DataFrame
        Indexed by sgRNA. Holds the list of trench ids, the pooled score and
        time arrays, the first Gene/TargetID/N Mismatch/Category value seen
        for the sgRNA, and "N Observations" (rows per sgRNA).
    """
    by_sgrna = scoredf.groupby("sgRNA")

    def _pooled(column):
        # Chain every row's sequence in `column` into one flat array per sgRNA.
        return by_sgrna.apply(
            lambda grp: np.array(
                [value for seq in grp[column].tolist() for value in seq]
            )
        )

    def _first_value(column):
        # Take the value from the first row of each sgRNA group.
        return by_sgrna.apply(lambda grp: grp[column].iloc[0])

    trench_lists = by_sgrna.apply(lambda grp: grp["phenotype trenchid"].tolist())
    # The unnamed Series becomes column 0 in to_frame(); give it its real name.
    sgrnadf = trench_lists.to_frame().rename(columns={0: "phenotype trenchid"})

    for feature_label in feature_labels:
        score_column = feature_label + ": score"
        sgrnadf[score_column] = _pooled(score_column)

    sgrnadf[time_label] = _pooled(time_label)
    for column in ("Gene", "TargetID", "N Mismatch"):
        sgrnadf[column] = _first_value(column)
    sgrnadf["N Observations"] = by_sgrna.apply(
        lambda grp: len(grp["phenotype trenchid"])
    )
    sgrnadf["Category"] = _first_value("Category")

    return sgrnadf


def normalize_timeseries(feature_vector_series, lmbda=0.5):
    """Rescale each trace so its standard deviation is power-compressed.

    The input is stacked into a 3-D array and axes 1 and 2 are swapped. The
    standard deviation ``sigma`` is then taken along the new axis 1 (the last
    axis of each input element) and mapped through a Yeo-Johnson style
    transform of ``sigma + 1``. Each trace is divided by
    ``sigma / sigma_prime``, so its spread becomes ``sigma_prime``.

    Parameters
    ----------
    feature_vector_series : object with ``.tolist()``
        Collection of equally shaped 2-D arrays (e.g. a pandas Series of
        arrays).
        NOTE(review): presumably each element is (feature, time) - confirm
        against the caller.
    lmbda : float
        Transform exponent; ``0`` selects the logarithmic form.

    Returns
    -------
    numpy.ndarray
        Array of shape ``(n, elem.shape[1], elem.shape[0])`` with the rescaled
        traces.

    Raises
    ------
    ValueError
        If ``lmbda`` is negative (or NaN).
    """
    timeseries_arr = np.swapaxes(np.array(feature_vector_series.tolist()), 1, 2)
    sigma = np.std(timeseries_arr, axis=1)
    if lmbda > 0.0:
        sigma_prime = ((sigma + 1) ** lmbda - 1) / lmbda  ##yeo-johnson
    elif lmbda == 0.0:
        sigma_prime = np.log(sigma + 1)
    else:
        raise ValueError("lmbda cannot be negative")
    # A constant trace has sigma == sigma_prime == 0, which used to give a
    # 0/0 = NaN normalizer and turn the whole trace into NaN. The ratio
    # sigma / sigma_prime tends to 1 as sigma -> 0, so use 1 there and leave
    # constant traces unchanged.
    normalizer = np.divide(
        sigma,
        sigma_prime,
        out=np.ones_like(sigma, dtype=float),
        where=sigma_prime != 0,
    )
    normalized_timeseries = timeseries_arr / normalizer[:, np.newaxis, :]
    return normalized_timeseries

### Initial Data Processing

Here, I am going to try to replicate (to some extent) the corrections from "Genomewide phenotypic analysis of growth, cell morphogenesis, and cell cycle events in Escherichia coli"

#### Start Dask

In [None]:
# Alternative configuration kept for reference (differs only in n_workers_min=20):
# dask_controller = tr.trcluster.dask_controller(
#     walltime="02:00:00",
#     local=False,
#     n_workers=100,
#     n_workers_min=20,
#     memory="16GB",
#     working_directory="/home/de64/scratch/de64/dask",
# )
# dask_controller.startdask()

# small testing deployment
# NOTE(review): this requests n_workers=100 with n_workers_min=100, which is not
# smaller than the commented-out configuration above - the label may be stale.

# Build the trenchripper dask controller and start the cluster; every later cell
# uses `dask_controller.daskclient`.
dask_controller = tr.trcluster.dask_controller(
    walltime="02:00:00",
    local=False,
    n_workers=100,
    n_workers_min=100,
    memory="16GB",
    working_directory="/home/de64/scratch/de64/dask",
)
dask_controller.startdask()

In [None]:
# Bare expression: shows the client object as the cell output.
dask_controller.daskclient

#### Import Dataframes

In [None]:
# The three input tables below go through the same pipeline:
#   read parquet -> drop rows with NaN in the columns analysed later ->
#   re-index on "phenotype trenchid" -> repartition to 100 -> persist on the cluster.
# NOTE(review): set_index(..., sorted=True) presumes the stored rows are already
# ordered by "phenotype trenchid" - confirm for these files.

# Table 1: read from the "Lineage_Cell_Cycle" folder.
final_output_df_lineage = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-18_lDE20_Final_5/2022-02-15_lDE20_Lineage_Cell_Cycle/",
    engine="pyarrow",
)
# final_output_df_lineage = final_output_df_lineage.loc[:final_output_df_lineage.divisions[4]-1] #getting a small subset
final_output_df_lineage = final_output_df_lineage.dropna(
    subset=["Final timepoints", "Division: major_axis_length"]
)
final_output_df_lineage = (
    final_output_df_lineage.reset_index()
    .set_index("phenotype trenchid", sorted=True)
    .repartition(npartitions=100)
    .persist()
)

# Table 2: read from the "Lineage_Observations" folder.
final_output_df_lineage_timepoints = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-18_lDE20_Final_5/2022-02-15_lDE20_Lineage_Observations/",
    engine="pyarrow",
)
# final_output_df_lineage_timepoints = final_output_df_lineage_timepoints.loc[:final_output_df_lineage_timepoints.divisions[4]-1] #getting a small subset
final_output_df_lineage_timepoints = final_output_df_lineage_timepoints.dropna(
    subset=["major_axis_length", "minor_axis_length", "mCherry mean_intensity"]
)
final_output_df_lineage_timepoints = (
    final_output_df_lineage_timepoints.reset_index()
    .set_index("phenotype trenchid", sorted=True)
    .repartition(npartitions=100)
    .persist()
)

# Table 3: read from the "Lineage_Growth_Observations" folder.
final_output_df_lineage_growth = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-18_lDE20_Final_5/2022-02-15_lDE20_Lineage_Growth_Observations/",
    engine="pyarrow",
)
# NOTE(review): the disabled subset line below refers to
# `final_output_df_lineage_delta_timepoints`, which is not defined in this cell.
# final_output_df_lineage_delta_timepoints = final_output_df_lineage_delta_timepoints.loc[:final_output_df_lineage_delta_timepoints.divisions[4]-1] #getting a small subset
final_output_df_lineage_growth = final_output_df_lineage_growth.dropna(
    subset=["Growth Rate: Volume"]
)
final_output_df_lineage_growth = (
    final_output_df_lineage_growth.reset_index()
    .set_index("phenotype trenchid", sorted=True)
    .repartition(npartitions=100)
    .persist()
)

In [None]:
# Bare expression: shows the growth table's dask summary as the cell output.
final_output_df_lineage_growth


#### Filter for "Normal" Sizes at Start

1) Fit a gaussian model to each of the specified feature params during the first t timepoints of the experiment (using a subsample for speed) 
2) Compute a normalized probability trenchwise for these features under the gaussian model, during the first t timepoints of the experiment
3) Eliminate trenches that are under some p percentile value of this probability for each feature
4) Display histograms for each property as well as the resulting threshold

Note that these features should be the only features examined in the resulting analysis. For the notebook, I am looking at:
- Birth length (Lb)
- Division length (Ld)
- Mean Area Increment
- Mean Length Increment
- Mean Width
- Cell cycle duration (Delta t)
- Mean mCherry Intensity

In [None]:
def _select_early_rows(df, time_column, early_time_cutoff):
    """Keep, within each trench, only rows whose `time_column` is below the cutoff."""
    return (
        df.groupby("phenotype trenchid", sort=False)
        .apply(
            lambda x: x[x[time_column] < early_time_cutoff].reset_index(drop=True)
        )
        .persist()
    )


def _add_trench_probabilities(df, early_df, filter_params, subsample_rate):
    """Add a "<param>: Probability" column to `df` (in place) for each parameter."""
    for filter_param in filter_params:
        early_param_series = early_df[filter_param]
        # Fit the Gaussian on a random subsample only.
        sampled_values = (
            early_param_series.sample(frac=subsample_rate).compute().tolist()
        )
        loc, scale = sp.stats.norm.fit(sampled_values)
        gaussian_fit = sp.stats.norm(loc=loc, scale=scale)

        # Drop the inner index level created by the per-trench apply so that
        # only "phenotype trenchid" remains, then regroup by trench.
        early_param_series = dd.from_pandas(
            early_param_series.compute().droplevel(1), npartitions=50
        )
        # exp(mean log-pdf) = geometric mean of the per-observation densities.
        # The result is persisted immediately, while `gaussian_fit` still
        # refers to this parameter's fit.
        trench_probability = early_param_series.groupby("phenotype trenchid").apply(
            lambda x: np.exp(np.sum(gaussian_fit.logpdf(x)) / len(x)), meta=float
        )

        df[filter_param + ": Probability"] = trench_probability.persist()


def _first_row_per_trench(df):
    """Return a pandas frame holding the first row of every trench in `df`."""
    return df.groupby("phenotype trenchid").apply(lambda x: x.iloc[0]).compute()


def _threshold_and_plot(
    onetrench_df,
    filter_params,
    percentile_threshold,
    plot_values_names,
    plot_idx,
    n_rows,
    n_cols,
):
    """Histogram each probability column and return the matching filter queries.

    Returns ``(query_list, next_plot_idx)``.
    """
    query_list = []
    for filter_param in filter_params:
        prob_column = filter_param + ": Probability"
        probabilities = onetrench_df[prob_column]

        prob_threshold = np.nanpercentile(
            probabilities.tolist(), percentile_threshold
        )
        query_list.append("`" + prob_column + "` > " + str(prob_threshold))

        # Clip the low tail of the histogram range at the 5th percentile.
        min_v, max_v = np.nanpercentile(probabilities, 5), np.max(probabilities)

        # Fall back to the column name when no display name was supplied.
        if plot_idx < len(plot_values_names):
            title = plot_values_names[plot_idx]
        else:
            title = filter_param

        plt.subplot(n_rows, n_cols, plot_idx + 1)
        plt.title(title, fontsize=22)
        plt.xlabel("Unnormalized Likelihood", fontsize=18)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)
        # First histogram: trenches below the threshold; second: the rest.
        plt.hist(
            probabilities[probabilities < prob_threshold].tolist(),
            bins=50,
            range=(min_v, max_v),
        )
        plt.hist(
            probabilities[probabilities >= prob_threshold].tolist(),
            bins=50,
            range=(min_v, max_v),
        )
        plot_idx += 1

    return query_list, plot_idx


def _surviving_trenches(onetrench_df, query_list):
    """Return the set of trench ids in `onetrench_df` passing every query."""
    compiled_query = " and ".join(query_list)
    # An empty query string would make DataFrame.query raise; no parameters
    # means no filtering for this table.
    filtered = onetrench_df.query(compiled_query) if compiled_query else onetrench_df
    return set(filtered.index.tolist())


def remove_early_outliers(
    final_output_df_lineage,
    final_output_df_lineage_timepoints,
    final_output_df_lineage_growth,
    early_time_cutoff=7200,
    gaussian_subsample_rates=(0.2, 0.1, 0.1),
    percentile_threshold=10,
    cell_cycle_params=("Division: major_axis_length",),
    timepoint_params=(
        "major_axis_length",
        "minor_axis_length",
        "mCherry mean_intensity",
    ),
    growth_params=("Growth Rate: Volume",),
    plot_values_names=(
        "Division Length",
        "Length",
        "Width",
        "mCherry Intensity",
        "Growth Rate",
    ),
):
    """Drop trenches whose early-time behaviour is unlikely under a Gaussian fit.

    For every parameter, a Gaussian is fitted to a random subsample of the
    values recorded before ``early_time_cutoff``. Each trench then gets the
    geometric-mean density of its own early values under that fit. Trenches
    at or below the ``percentile_threshold`` percentile of that score, for any
    parameter, are removed from all three tables.

    Side effects:
      * a "<param>: Probability" column is added in place to each of the three
        input dask dataframes;
      * a new 22x16 matplotlib figure is created and filled with one pair of
        histograms per parameter (it is neither shown nor saved here).

    Parameters
    ----------
    final_output_df_lineage, final_output_df_lineage_timepoints, final_output_df_lineage_growth
        Dask dataframes indexed by "phenotype trenchid". Their time columns are
        "Final time (s)", "Observation time (s)" and "Measurement time (s)"
        respectively.
    early_time_cutoff : number
        Rows with a time strictly below this value count as early.
    gaussian_subsample_rates : sequence of 3 floats
        Sampling fraction used for the Gaussian fit, one per table in the
        order above.
    percentile_threshold : number
        Percentile of the per-trench score used as the cutoff.
    cell_cycle_params, timepoint_params, growth_params : iterable of str
        Columns to score in each of the three tables.
    plot_values_names : sequence of str
        Subplot titles in plotting order; missing entries fall back to the
        column name.

    Returns
    -------
    tuple
        The three input tables restricted to the surviving trenches, each
        persisted.
    """
    cell_cycle_params = list(cell_cycle_params)
    timepoint_params = list(timepoint_params)
    growth_params = list(growth_params)

    # Stage 1: isolate the early rows of every table.
    early_tpt_df = _select_early_rows(
        final_output_df_lineage, "Final time (s)", early_time_cutoff
    )
    early_tpt_df_timepoints = _select_early_rows(
        final_output_df_lineage_timepoints, "Observation time (s)", early_time_cutoff
    )
    early_tpt_df_growth = _select_early_rows(
        final_output_df_lineage_growth, "Measurement time (s)", early_time_cutoff
    )

    # Stage 2: score every trench for every parameter (adds columns in place).
    _add_trench_probabilities(
        final_output_df_lineage,
        early_tpt_df,
        cell_cycle_params,
        gaussian_subsample_rates[0],
    )
    _add_trench_probabilities(
        final_output_df_lineage_timepoints,
        early_tpt_df_timepoints,
        timepoint_params,
        gaussian_subsample_rates[1],
    )
    _add_trench_probabilities(
        final_output_df_lineage_growth,
        early_tpt_df_growth,
        growth_params,
        gaussian_subsample_rates[2],
    )

    # Stage 3: the score is constant within a trench, so one row per trench is
    # enough for thresholding.
    final_output_df_onetrench = _first_row_per_trench(final_output_df_lineage)
    final_output_df_timepoints_onetrench = _first_row_per_trench(
        final_output_df_lineage_timepoints
    )
    final_output_df_growth_onetrench = _first_row_per_trench(
        final_output_df_lineage_growth
    )

    # Stage 4: threshold and plot. The grid is 2x3 for up to six parameters
    # and gains rows when more are requested.
    n_cols = 3
    n_plots = len(cell_cycle_params) + len(timepoint_params) + len(growth_params)
    n_rows = max(2, -(-n_plots // n_cols))

    plt.figure(figsize=(22, 16))

    lineage_query_list, plot_idx = _threshold_and_plot(
        final_output_df_onetrench,
        cell_cycle_params,
        percentile_threshold,
        plot_values_names,
        0,
        n_rows,
        n_cols,
    )
    lineage_timepoint_query_list, plot_idx = _threshold_and_plot(
        final_output_df_timepoints_onetrench,
        timepoint_params,
        percentile_threshold,
        plot_values_names,
        plot_idx,
        n_rows,
        n_cols,
    )
    lineage_growth_query_list, plot_idx = _threshold_and_plot(
        final_output_df_growth_onetrench,
        growth_params,
        percentile_threshold,
        plot_values_names,
        plot_idx,
        n_rows,
        n_cols,
    )

    # Stage 5: keep only trenches that pass in all three tables.
    all_idx_list = sorted(
        _surviving_trenches(final_output_df_onetrench, lineage_query_list)
        & _surviving_trenches(
            final_output_df_timepoints_onetrench, lineage_timepoint_query_list
        )
        & _surviving_trenches(
            final_output_df_growth_onetrench, lineage_growth_query_list
        )
    )

    final_output_df_filtered = final_output_df_lineage.loc[all_idx_list].persist()
    final_output_df_timepoints_filtered = final_output_df_lineage_timepoints.loc[
        all_idx_list
    ].persist()
    final_output_df_growth_filtered = final_output_df_lineage_growth.loc[
        all_idx_list
    ].persist()

    return (
        final_output_df_filtered,
        final_output_df_timepoints_filtered,
        final_output_df_growth_filtered,
    )

In [None]:
# Filter all three tables down to the trenches that pass the early-time
# probability threshold for every scored parameter.
(
    final_output_df_filtered,
    final_output_df_timepoints_filtered,
    final_output_df_growth_filtered,
) = remove_early_outliers(
    final_output_df_lineage,
    final_output_df_lineage_timepoints,
    final_output_df_lineage_growth,
    early_time_cutoff=7200,
    gaussian_subsample_rates=[0.2, 0.1, 0.1],
    percentile_threshold=10,
)

# Save the current matplotlib figure (the histograms drawn by the call above).
plt.savefig("Prob_threshold_Replicate_3.png", dpi=500)

In [None]:
# Fraction of rows kept in each table after filtering.
# NOTE(review): these are bare expressions; in a notebook cell presumably only the
# last one is displayed - wrap them in print() if all three are wanted.
len(final_output_df_filtered) / len(final_output_df_lineage)
len(final_output_df_timepoints_filtered) / len(final_output_df_lineage_timepoints)
len(final_output_df_growth_filtered) / len(final_output_df_lineage_growth)

In [None]:
# Cancel the persisted, unfiltered tables on the cluster; only the filtered
# versions are used from here on.
dask_controller.daskclient.cancel(final_output_df_lineage)
dask_controller.daskclient.cancel(final_output_df_lineage_timepoints)
dask_controller.daskclient.cancel(final_output_df_lineage_growth)

In [None]:
def get_timepoint_values(
    df, label, min_time, max_time, time_label="time (s)", flatten_vals=True
):
    """Select the values of `label` whose time lies in [min_time, max_time].

    Parameters
    ----------
    df : dask dataframe
        Must contain `label` and `time_label` (and "phenotype trenchid" as a
        groupable key when ``flatten_vals`` is False).
    label : str
        Column whose values are selected.
    min_time, max_time : number
        Inclusive bounds of the time window.
    time_label : str
        Column holding the time associated with each value.
    flatten_vals : bool
        If True, apply the window row by row, compute the result and return a
        single concatenated NumPy array. If False, apply it per trench and
        return the persisted dask series of arrays.
    """

    def _values_in_window(rows):
        # `rows` is a single row (flattened mode) or one trench's frame.
        values = np.array(rows[label])
        times = np.array(rows[time_label])
        in_window = (times >= min_time) & (times <= max_time)
        return values[in_window]

    if not flatten_vals:
        per_trench = df.groupby("phenotype trenchid").apply(
            _values_in_window, meta="object"
        )
        return per_trench.persist()

    per_row = df.apply(_values_in_window, axis=1, meta="object")
    return np.concatenate(per_row.compute().tolist())


def get_feature_stats(df, feature_label, min_time, max_time, time_label="time (s)"):
    """Return (median, interquartile range) of a feature inside a time window.

    All values of `feature_label` with `time_label` in [min_time, max_time] are
    pooled across the whole table before the statistics are taken.
    """
    windowed_vals = get_timepoint_values(
        df, feature_label, min_time, max_time, time_label=time_label
    )
    return np.median(windowed_vals), sp.stats.iqr(windowed_vals)


def compute_score(df, feature_label, feature_median, feature_iqr):
    """Return the robust z-score 1.35 * (value - median) / iqr for one column."""
    centered = df[feature_label] - feature_median
    return 1.35 * (centered / feature_iqr)


def get_feature_scores(
    df, feature_label, init_time_range=(0, 7200), time_label="time (s)"
):
    """Score a feature against its own baseline window.

    The median and IQR are taken from the values whose `time_label` lies in
    `init_time_range` (inclusive), then used to z-score the full column.
    """
    window_start = init_time_range[0]
    window_end = init_time_range[1]
    baseline_median, baseline_iqr = get_feature_stats(
        df, feature_label, window_start, window_end, time_label=time_label
    )
    return compute_score(df, feature_label, baseline_median, baseline_iqr)


def get_all_feature_scores(
    df, feature_labels, init_time_range=(0, 7200), time_label="time (s)"
):
    """Add a persisted "<feature>: z score" column to `df` for each feature.

    `df` is modified in place and also returned. Each feature name is printed
    as it is processed.
    """
    for label in feature_labels:
        print(label)
        z_scores = get_feature_scores(
            df, label, init_time_range=init_time_range, time_label=time_label
        )
        df[label + ": z score"] = z_scores.persist()

    return df

### Normalize Properties (maybe go back to the per trench normalization?)

1) Yeo-Johnson transform the data to get a more normal-like distribution.
2) Convert transformed values to time-dependent z-scores using the following formula:

$$ z(i,k,t) = 1.35 \times \frac{F_{i,k,t} - median_{t\in \tau}(F_{i,t})}{iqr_{t\in \tau}(F_{i,t})} $$

where $F_{i,k,t}$ are the feature values for feature i, trench k at time t. $\tau$ are the initial pre-induction timepoints. 

Essentially this is a z-score using the more outlier-robust median and interquartile range to define the differences from normal behavior. The 1.35 factor scales the values such that z-scores represent the number of standard deviations from the mean for a normal distribution. Finally, the values are normalized by initial behaviors trenchwise by the $median_{t\in \tau}(F_{i,k,t})$ factor.

In [None]:
def apply_yj_transform(
    final_output_df_filtered,
    yeo_subsample=0.05,
    early_time_cutoff=7200,
    params_to_transform=("Division: major_axis_length",),
    time_label="time (s)",
):
    """Yeo-Johnson transform the given columns and z-score the results.

    For each parameter the transform exponent is estimated with
    ``scipy.stats.yeojohnson_normmax`` on a random subsample (NaNs removed).
    The transform is then applied element-wise to the full column and stored
    as "<param>: Yeo-Johnson". Every transformed column is then scored by
    ``get_all_feature_scores`` using the window ``(0, early_time_cutoff)``.

    Parameters
    ----------
    final_output_df_filtered : dask dataframe
        Table to transform; new columns are added to it in place.
    yeo_subsample : float
        Fraction of rows used to estimate the exponent.
    early_time_cutoff : number
        Upper bound of the baseline window used for z-scoring.
    params_to_transform : iterable of str
        Columns to transform.
    time_label : str
        Time column used to select the baseline window.

    Returns
    -------
    tuple
        ``(final_output_df_filtered, trenchiddf)``, where ``trenchiddf`` is
        the same table re-indexed on "phenotype trenchid".
    """
    # Materialise once: the names are iterated twice below, which would
    # silently skip the scoring step for a one-shot iterable.
    params_to_transform = list(params_to_transform)

    subsample_df = final_output_df_filtered.sample(frac=yeo_subsample).persist()

    for param in params_to_transform:
        sampled_values = np.array(
            subsample_df[param].astype(float).compute().tolist()
        )
        sampled_values = sampled_values[~np.isnan(sampled_values)]
        l_norm = sp.stats.yeojohnson_normmax(sampled_values)

        transformed_label = param + ": Yeo-Johnson"
        final_output_df_filtered[transformed_label] = (
            final_output_df_filtered[param]
            .apply(
                # Bind this parameter's exponent as a default so the function
                # does not depend on the loop variable's later values.
                lambda x, lmbda=l_norm: sp.stats.yeojohnson(x, lmbda=lmbda),
                meta=(transformed_label, float),
            )
            .persist()
        )

    final_output_df_filtered = get_all_feature_scores(
        final_output_df_filtered,
        [param + ": Yeo-Johnson" for param in params_to_transform],
        init_time_range=(0, early_time_cutoff),
        time_label=time_label,
    )
    trenchiddf = final_output_df_filtered.reset_index().set_index(
        "phenotype trenchid", drop=True
    )

    return final_output_df_filtered, trenchiddf

In [None]:
# Transform and z-score the cell-cycle table's division length.
# NOTE(review): time_label="time (s)" differs from the "Final time (s)" column used
# for this table in remove_early_outliers - confirm the column exists.
final_output_df_filtered, trenchiddf = apply_yj_transform(
    final_output_df_filtered,
    yeo_subsample=0.05,
    early_time_cutoff=7200,
    params_to_transform=["Division: major_axis_length"],
    time_label="time (s)",
)
trenchiddf = trenchiddf.persist()

In [None]:
# Transform and z-score the per-observation table (length, width, mCherry
# intensity), estimating the exponent on a 1% subsample.
final_output_df_timepoints_filtered, trenchiddf_timepoints = apply_yj_transform(
    final_output_df_timepoints_filtered,
    yeo_subsample=0.01,
    early_time_cutoff=7200,
    params_to_transform=[
        "major_axis_length",
        "minor_axis_length",
        "mCherry mean_intensity",
    ],
    time_label="Observation time (s)",
)
trenchiddf_timepoints = trenchiddf_timepoints.persist()

In [None]:
# Transform and z-score the growth table's volume growth rate, estimating the
# exponent on a 1% subsample.
final_output_df_growth_filtered, trenchiddf_growth = apply_yj_transform(
    final_output_df_growth_filtered,
    yeo_subsample=0.01,
    early_time_cutoff=7200,
    params_to_transform=["Growth Rate: Volume"],
    time_label="Measurement time (s)",
)
trenchiddf_growth = trenchiddf_growth.persist()

#### Trying to interpolate trenchwise instead of sgRNAwise

In [None]:
from statsmodels.nonparametric import kernel_regression
from scipy.stats import iqr
from statsmodels.nonparametric.smoothers_lowess import lowess
import sklearn as skl
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance


def timeseries_kernel_reg(
    df,
    y_label,
    min_tpt,
    max_tpt,
    kernel_bins,
    kernel_bandwidth,
    time_label="Cell Cycle",
):
    """Smooth one column per trench onto a regular time grid.

    For every "phenotype trenchid" group, a local-constant Gaussian kernel
    regression (statsmodels ``KernelReg``) of `y_label` against time is
    evaluated at `kernel_bins` evenly spaced points between `min_tpt` and
    `max_tpt`, using the fixed bandwidth `kernel_bandwidth`.

    With ``time_label == "Cell Cycle"`` the time of each row is the midpoint
    of the cycle, ``"Final time (s)" - "Delta time (s)" / 2``. Otherwise the
    column named by `time_label` is used directly.

    Returns
    -------
    dask series
        One array of fitted values (length `kernel_bins`) per trench.
    """

    def _cycle_midpoints(group):
        return group["Final time (s)"].values - (group["Delta time (s)"].values / 2)

    def _observed_times(group):
        return group[time_label].values

    if time_label == "Cell Cycle":
        # if this is the cell cycle df, do the following
        times_of = _cycle_midpoints
    else:
        times_of = _observed_times

    def _fitted_trace(group):
        x_arr = times_of(group)
        y_arr = group[y_label].values
        grid = np.linspace(min_tpt, max_tpt, num=kernel_bins, dtype=float)
        model = kernel_regression.KernelReg(
            y_arr,
            x_arr,
            "c",
            reg_type="lc",
            bw=np.array([kernel_bandwidth]),
            ckertype="gaussian",
        )
        # fit() returns a tuple; only its first element (the fitted values) is kept.
        return model.fit(grid)[0]

    return df.groupby("phenotype trenchid").apply(
        _fitted_trace, meta=(y_label, object)
    )


def get_all_kernel_regs(
    df,
    y_label_list,
    min_tpt=0,
    max_tpt=36000,
    kernel_bins=20,
    kernel_bandwidth=2700,
    time_label="Cell Cycle",
):
    """Attach a persisted "Kernel Trace: <label>" column for each label.

    Works on a shallow copy of `df` and returns it; each new column comes from
    ``timeseries_kernel_reg`` with the given grid and bandwidth settings.
    """
    traced_df = copy.copy(df)

    for column in y_label_list:
        trace = timeseries_kernel_reg(
            traced_df,
            column,
            min_tpt,
            max_tpt,
            kernel_bins,
            kernel_bandwidth,
            time_label=time_label,
        )
        traced_df["Kernel Trace: " + column] = trace.persist()

    return traced_df

In [None]:
# Columns of the cell-cycle table to turn into kernel traces: the z-scored
# Yeo-Johnson column(s) plus the raw columns listed below.
cell_cycle_params_to_transform = ["Division: major_axis_length"]
cell_cycle_score_params = [
    param + ": Yeo-Johnson: z score" for param in cell_cycle_params_to_transform
]
cell_cycle_other_params = [
    "Birth: area",
    "Division: area",
    "Delta: area",
    "Birth: major_axis_length",
    "Division: major_axis_length",
    "Delta: major_axis_length",
    "Birth: minor_axis_length",
    "Division: minor_axis_length",
    "Delta: minor_axis_length",
    "Birth: Volume",
    "Division: Volume",
    "Delta: Volume",
    "Birth: Surface Area",
    "Division: Surface Area",
    "Delta: Surface Area",
    "Final timepoints",
    "Delta Timepoints",
    "Final time (s)",
    "Delta time (s)",
]

cell_cycle_params_to_trace = cell_cycle_score_params + cell_cycle_other_params

# making an observation grid to project vals onto
# (20 grid points spanning 0-36000 s, Gaussian bandwidth 2700 s; time taken at the
# cell-cycle midpoint because time_label="Cell Cycle")

trenchiddf = get_all_kernel_regs(
    trenchiddf,
    cell_cycle_params_to_trace,
    min_tpt=0,
    max_tpt=36000,
    kernel_bins=20,
    kernel_bandwidth=2700,
    time_label="Cell Cycle",
)

In [None]:
# Columns of the per-observation table to turn into kernel traces: the z-scored
# Yeo-Johnson columns plus the raw columns listed below.
cell_cycle_timepoints_params_to_transform = [
    "major_axis_length",
    "minor_axis_length",
    "mCherry mean_intensity",
]
cell_cycle_timepoints_score_params = [
    param + ": Yeo-Johnson: z score"
    for param in cell_cycle_timepoints_params_to_transform
]
cell_cycle_timepoints_other_params = [
    "area",
    "major_axis_length",
    "minor_axis_length",
    "Volume",
    "Surface Area",
    "mCherry mean_intensity",
]

cell_cycle_timepoints_params_to_trace = (
    cell_cycle_timepoints_score_params + cell_cycle_timepoints_other_params
)

# making an observation grid to project vals onto
# (same grid and bandwidth as the cell-cycle traces; time read from "Observation time (s)")

trenchiddf_timepoints = get_all_kernel_regs(
    trenchiddf_timepoints,
    cell_cycle_timepoints_params_to_trace,
    min_tpt=0,
    max_tpt=36000,
    kernel_bins=20,
    kernel_bandwidth=2700,
    time_label="Observation time (s)",
)

In [None]:
# Columns of the growth table to turn into kernel traces: the z-scored
# Yeo-Johnson column plus the raw growth-rate columns.
growth_params_to_transform = ["Growth Rate: Volume"]
growth_score_params = [
    param + ": Yeo-Johnson: z score" for param in growth_params_to_transform
]
growth_other_params = [
    "Growth Rate: Volume",
    "Growth Rate: major_axis_length",
    "Growth Rate: Surface Area",
]

growth_params_to_trace = growth_score_params + growth_other_params

# making an observation grid to project vals onto
# (same grid and bandwidth as the cell-cycle traces; time read from "Measurement time (s)")

trenchiddf_growth = get_all_kernel_regs(
    trenchiddf_growth,
    growth_params_to_trace,
    min_tpt=0,
    max_tpt=36000,
    kernel_bins=20,
    kernel_bandwidth=2700,
    time_label="Measurement time (s)",
)

In [None]:
# can move this around to compute earlier, or write to disk
# Names of every kernel-trace column produced in the three cells above.
all_traced_params = (
    cell_cycle_params_to_trace
    + cell_cycle_timepoints_params_to_trace
    + growth_params_to_trace
)
all_traced_params = ["Kernel Trace: " + item for item in all_traced_params]

# Collapse each table to the first row of every trench.
trenchiddf_out = trenchiddf.groupby("phenotype trenchid").apply(lambda x: x.iloc[0])
trenchiddf_timepoints_out = trenchiddf_timepoints.groupby("phenotype trenchid").apply(
    lambda x: x.iloc[0]
)
trenchiddf_growth_out = trenchiddf_growth.groupby("phenotype trenchid").apply(
    lambda x: x.iloc[0]
)

# Cell-cycle table: keep the metadata columns listed here plus its kernel traces.
traced_col_names = [
    col_name
    for col_name in trenchiddf_out.columns.tolist()
    if col_name in all_traced_params
]
trenchiddf_out = trenchiddf_out[
    [
        "Global CellID",
        "File Parquet Index",
        "fov",
        "row",
        "trench",
        "initial timepoints",
        "File Index",
        "File Trench Index",
        "CellID",
        "Trench Score",
        "Mother CellID",
        "Daughter CellID 1",
        "Daughter CellID 2",
        "Sister CellID",
        "Centroid X",
        "Centroid Y",
        "Kymograph File Parquet Index",
        "Kymograph FOV Parquet Index",
        "FOV Parquet Index",
    ]
    + traced_col_names
]

# Observation table: keep only its kernel-trace columns.
traced_col_names = [
    col_name
    for col_name in trenchiddf_timepoints_out.columns.tolist()
    if col_name in all_traced_params
]
trenchiddf_timepoints_out = trenchiddf_timepoints_out[traced_col_names]

# Growth table: keep only its kernel-trace columns.
traced_col_names = [
    col_name
    for col_name in trenchiddf_growth_out.columns.tolist()
    if col_name in all_traced_params
]
trenchiddf_growth_out = trenchiddf_growth_out[traced_col_names]

In [None]:
# temp until split up a bit at earlier stages to save memory
# Join the three per-trench tables column-wise and pull the result into local
# memory as a pandas dataframe.
trenchiddf_merged_out = dd.concat(
    [trenchiddf_out, trenchiddf_timepoints_out, trenchiddf_growth_out], axis=1
)
trenchiddf_merged_out = trenchiddf_merged_out.compute()

# Cancel the persisted filtered tables on the cluster now that the merged
# result is held locally.
dask_controller.daskclient.cancel(final_output_df_filtered)
dask_controller.daskclient.cancel(final_output_df_timepoints_filtered)
dask_controller.daskclient.cancel(final_output_df_growth_filtered)

dask_controller.reset_worker_memory()

In [None]:
# Send the merged table back to dask in 100 partitions.
# NOTE(review): despite the `_pandas` suffix this name holds a dask dataframe.
trenchiddf_merged_out_pandas = dd.from_pandas(trenchiddf_merged_out, npartitions=100)

In [None]:
# Pin the merged table in cluster memory.
trenchiddf_merged_out_pandas = trenchiddf_merged_out_pandas.persist()

In [None]:
# Load the barcode table and keep the first row of every trench.
final_output_df_barcodes = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-18_lDE20_Final_5/2022-02-15_lDE20_Final_Barcodes_df/",
    engine="pyarrow",
)
final_output_df_barcodes = (
    final_output_df_barcodes.set_index("phenotype trenchid", sorted=True)
    .groupby("phenotype trenchid", sort=False)
    .apply(lambda x: x.iloc[0])
    .persist()
)
# NOTE(review): the index is switched to "oDEPool7_id" here and switched straight
# back to "phenotype trenchid" in the next statement - the first re-index looks
# redundant; confirm before removing.
final_output_df_barcodes = final_output_df_barcodes.reset_index().set_index(
    "oDEPool7_id", drop=True
)
final_output_df_barcodes = (
    final_output_df_barcodes.reset_index().set_index("phenotype trenchid").persist()
)

# Attach the barcode/sgRNA annotation to every trench; the inner join on the shared
# trench-id index drops trenches missing from either table.
trenchiddf_merged_out_pandas = trenchiddf_merged_out_pandas.merge(
    final_output_df_barcodes[
        [
            "oDEPool7_id",
            "Barcode",
            "sgRNA",
            "Closest Hamming Distance",
            "EcoWG1_id",
            "Gene",
            "N Mismatch",
            "Category",
            "TargetID",
            "barcodeid",
        ]
    ],
    how="inner",
    left_index=True,
    right_index=True,
).persist()

# Count the distinct trenches observed for each oDEPool7_id and merge that count
# back in as "N Observations".
n_obs = (
    trenchiddf_merged_out_pandas.groupby("oDEPool7_id", sort=False)
    .apply(lambda x: len(x.index.unique()), meta=int)
    .compute()
)
n_obs = pd.DataFrame(n_obs).rename({0: "N Observations"}, axis=1).sort_index()
trenchiddf_merged_out_pandas = trenchiddf_merged_out_pandas.merge(
    n_obs, on="oDEPool7_id", how="inner", right_index=True
)

In [None]:
# Kernel-trace columns of the z-scored (Yeo-Johnson) parameters from all three tables.
all_transformed_params = (
    cell_cycle_params_to_transform
    + cell_cycle_timepoints_params_to_transform
    + growth_params_to_transform
)
kernel_score_params = [
    "Kernel Trace: " + param + ": Yeo-Johnson: z score"
    for param in all_transformed_params
]

# Stack each trench's score traces into a single array stored in "Feature Vector".
feature_vector_series = trenchiddf_merged_out_pandas.apply(
    lambda x: np.array(x[kernel_score_params].tolist()), axis=1
)
trenchiddf_merged_out_pandas["Feature Vector"] = feature_vector_series
# Keep only trenches whose feature vector contains no NaN, and bring the result
# into local memory as a pandas dataframe.
trenchiddf_merged_nan_filtered = trenchiddf_merged_out_pandas[
    ~trenchiddf_merged_out_pandas["Feature Vector"].apply(
        lambda x: np.any(np.isnan(x)), meta=("No NaN", bool)
    )
].compute()

In [None]:
# strong_effect_threshold = 15

# zero_vector = np.zeros((1,trenchiddf_nan_filtered["Feature Vector"].iloc[0].shape[0]))
# feature_arr = np.array(trenchiddf_nan_filtered["Feature Vector"].tolist())
# feature_arr_abs = np.abs(feature_arr)
# trenchiddf_nan_filtered["Integrated Feature Vector"] = [item for item in sp.integrate.simpson(feature_arr_abs,axis=2)]
# trenchiddf_nan_filtered["Integrated Feature Max"] = trenchiddf_nan_filtered["Integrated Feature Vector"].apply(lambda x: np.max(x))
# trenchiddf_nan_filtered["Integrated Euclidean Norm"] = np.linalg.norm(np.array(trenchiddf_nan_filtered["Integrated Feature Vector"].tolist()), axis=1)

# sgrnadf_strong_effect = trenchiddf_nan_filtered[trenchiddf_nan_filtered["Integrated Feature Max"]>=strong_effect_threshold]
# min_v,max_v = np.min(trenchiddf_nan_filtered["Integrated Feature Max"]),np.percentile(trenchiddf_nan_filtered["Integrated Feature Max"],99)

# plt.figure(figsize=(8,8))
# plt.title("Integrated Feature Max")
# plt.hist(trenchiddf_nan_filtered[trenchiddf_nan_filtered["Integrated Feature Max"]<strong_effect_threshold]["Integrated Feature Max"].tolist(),bins=50,range=(min_v,max_v))
# plt.hist(trenchiddf_nan_filtered[trenchiddf_nan_filtered["Integrated Feature Max"]>=strong_effect_threshold]["Integrated Feature Max"].tolist(),bins=50,range=(min_v,max_v))
# plt.show()

# unique_genes, gene_counts = np.unique(sgrnadf_strong_effect["Gene"][sgrnadf_strong_effect["Gene"].apply(lambda x: type(x)==str)].tolist(), return_counts=True)
# plt.title("sgRNAs per Gene")
# plt.xticks(range(0,20,2),labels=range(0,20,2))
# plt.hist(gene_counts,bins=np.arange(20)-0.5)
# plt.show()

### Pick Representative Effect per TargetID
Note this may need to be revisited later to resolve transients that are only resolvable at intermediate KO

1) For each target, pick the sgRNA that has the strongest phenotype (highest integrated euclidean norm)
2) Additionally identify any targets with titration information by saving a dataframe with targetIDs that possess at least N sgRNAs
    - this is in a preliminary form; transfer to a full notebook later

In [None]:
# Write the NaN-filtered per-trench table to disk as a pickle.
trenchiddf_merged_nan_filtered.to_pickle(
    "/home/de64/scratch/de64/sync_folder/2022-01-18_lDE20_Final_5/2022-03-12_gene_cluster_df_no_filter.pkl"
)

In [None]:
# Shut the dask deployment down now that the results are saved.
dask_controller.shutdown()