## Feature processing before main analysis

- Note that there are fluctuations in the illumination intensity which may be resulting in pathological behavior from the reporter

- This has been normalized out in the upstream processing, but try to fix long term

- Also consider a flat field correction for the final experiment

In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da
import dask
import warnings
import copy
import random
from sklearn.metrics.pairwise import (
    euclidean_distances,
    manhattan_distances,
    cosine_distances,
)

from sklearn.metrics import silhouette_score
import scipy.stats
from sklearn.linear_model import LinearRegression
from sklearn.cluster import AffinityPropagation
from sklearn.manifold import TSNE
from sklearn.cluster import AgglomerativeClustering

from matplotlib import pyplot as plt
import ast


import pylab
import scipy.cluster.hierarchy as sch

import matplotlib.gridspec as gridspec
import matplotlib as mpl

import holoviews as hv

# Select the bokeh backend for holoviews.
hv.extension("bokeh")
# Seed both the stdlib and the NumPy global random generators with a fixed value.
random.seed(42)
np.random.seed(42)

# Register a filter with action "once" for UserWarning.
warnings.filterwarnings(action="once", category=UserWarning)

In [None]:
def get_sgrnadf_from_scoredf(
    scoredf, feature_labels, time_label="final cell timepoints list"
):
    """Collapse a per-trench score table into one row per sgRNA.

    Parameters
    ----------
    scoredf : pandas.DataFrame
        Table with an "sgRNA" column, a "phenotype trenchid" column, one
        ``"<feature>: score"`` column per entry of ``feature_labels``, the
        ``time_label`` column, and the "Gene", "TargetID", "N Mismatch" and
        "Category" columns. Score and time cells must be iterables.
    feature_labels : iterable of str
        Feature names whose ``": score"`` columns are pooled per sgRNA.
    time_label : str
        Name of the column holding the per-row timepoint sequences.

    Returns
    -------
    pandas.DataFrame
        Indexed by sgRNA. Holds the list of trench ids, the flattened score
        and time arrays, the first Gene/TargetID/N Mismatch/Category value of
        each group, and the number of rows per group ("N Observations").
    """
    by_sgrna = scoredf.groupby("sgRNA")

    def _pooled(column):
        # Concatenate every per-row sequence of `column` into one flat array.
        def _flatten(group):
            flat = []
            for sequence in group[column].tolist():
                flat.extend(sequence)
            return np.array(flat)

        return by_sgrna.apply(_flatten)

    def _first(column):
        # Take the first value of `column` within each sgRNA group.
        return by_sgrna.apply(lambda group: group[column].iloc[0])

    trench_lists = by_sgrna.apply(lambda group: group["phenotype trenchid"].tolist())
    sgrnadf = trench_lists.to_frame().rename(columns={0: "phenotype trenchid"})

    for feature_label in feature_labels:
        score_column = feature_label + ": score"
        sgrnadf[score_column] = _pooled(score_column)

    sgrnadf[time_label] = _pooled(time_label)
    sgrnadf["Gene"] = _first("Gene")
    sgrnadf["TargetID"] = _first("TargetID")
    sgrnadf["N Mismatch"] = _first("N Mismatch")
    sgrnadf["N Observations"] = by_sgrna.apply(
        lambda group: len(group["phenotype trenchid"].tolist())
    )
    sgrnadf["Category"] = _first("Category")

    return sgrnadf


def normalize_timeseries(feature_vector_series, lmbda=0.5):
    """Rescale traces so their standard deviation is Yeo-Johnson compressed.

    The series is stacked into a 3-D array and axes 1 and 2 are swapped. The
    standard deviation ``sigma`` is then taken along the new axis 1, and every
    trace is divided by ``sigma / sigma'`` where ``sigma'`` is the Yeo-Johnson
    transform of ``sigma`` (``log(sigma + 1)`` when ``lmbda == 0``).

    Parameters
    ----------
    feature_vector_series : pandas.Series
        Series whose elements are equally shaped 2-D arrays
        (presumably ``(features, timepoints)`` -- TODO confirm with callers).
    lmbda : float
        Non-negative Yeo-Johnson exponent.

    Returns
    -------
    numpy.ndarray
        Array with axes 1 and 2 swapped relative to the stacked input.

    Raises
    ------
    ValueError
        If ``lmbda`` is neither positive nor zero.
    """
    timeseries_arr = np.swapaxes(np.array(feature_vector_series.tolist()), 1, 2)
    sigma = np.std(timeseries_arr, axis=1)
    if lmbda > 0.0:
        sigma_prime = ((sigma + 1) ** lmbda - 1) / lmbda  ##yeo-johnson
    elif lmbda == 0.0:
        sigma_prime = np.log1p(sigma)
    else:
        raise ValueError("lmbda cannot be negative")

    # A constant trace has sigma == sigma_prime == 0, and 0/0 would turn the
    # whole trace into NaN. The ratio tends to 1 as sigma -> 0, so such traces
    # are left unscaled instead.
    normalizer = np.ones_like(sigma)
    np.divide(sigma, sigma_prime, out=normalizer, where=sigma_prime != 0)

    normalized_timeseries = timeseries_arr / normalizer[:, np.newaxis, :]
    return normalized_timeseries

### Initial Data Processing

Here, I am going to try to replicate (to some extent) the corrections from "Genomewide phenotypic analysis of growth, cell morphogenesis, and cell cycle events in Escherichia coli"

#### Start Dask

In [None]:
# Build the trenchripper Dask controller and start the cluster.
# NOTE(review): the exact meaning of walltime / n_workers / n_workers_min /
# memory is defined by tr.trcluster.dask_controller, which is not visible
# here -- confirm there before changing these values.
dask_controller = tr.trcluster.dask_controller(
    walltime="02:00:00",
    local=False,
    n_workers=100,
    n_workers_min=20,
    memory="16GB",
    working_directory="/home/de64/scratch/de64/dask",
)
dask_controller.startdask()

In [None]:
# Show the dashboard for the cluster started above (behavior defined by the controller).
dask_controller.displaydashboard()

#### Import Dataframe

In [None]:
# Load the lineage table from parquet as a Dask dataframe.
final_output_df_lineage = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/2021-12-01_lDE20_Final_Lineage_df/",
    engine="pyarrow",
)
# Drop rows that are missing a value in any of the listed columns.
final_output_df_lineage = final_output_df_lineage.dropna(
    subset=[
        "Final timepoints",
        "Mean Exponential Growth Rate: area",
        "Birth: minor_axis_length",
        "Birth: Surface Area",
    ]
)
# Re-index by trench id, split into 100 partitions and persist on the cluster.
# NOTE(review): sorted=True tells Dask the column is already sorted -- confirm
# that the parquet data really is ordered by "phenotype trenchid".
final_output_df_lineage = (
    final_output_df_lineage.reset_index()
    .set_index("phenotype trenchid", sorted=True)
    .repartition(npartitions=100)
    .persist()
)


#### Filter for "Normal" Sizes at Start

1) Fit a gaussian model to each of the specified feature params during the first t timepoints of the experiment (using a subsample for speed) 
2) Compute a normalized probability trenchwise for these features under the gaussian model, during the first t timepoints of the experiment
3) Eliminate trenches that are under some p percentile value of this probability for each feature
4) Display histograms for each property as well as the resulting threshold

Note that these features should be the only features examined in the resulting analysis. For the notebook, I am looking at:
- Birth length (Lb)
- Division length (Ld)
- Mean Area Increment
- Mean Length Increment
- Mean Width
- Cell cycle duration (Delta t)
- Mean mCherry Intensity

In [None]:
def remove_early_outliers(
    final_output_df_lineage,
    early_timepoint_cutoff=12,
    gaussian_subsample=0.2,
    percentile_threshold=10,
    filter_params=[
        "Mean Linear Growth Rate: Volume",
        "Mean Exponential Growth Rate: Volume",
        "Division: major_axis_length",
        "Mean: minor_axis_length",
        "Mean: mCherry Intensity",
        "Delta time (s)",
    ],
    plot_values_names=[
        "Volume Growth Rate (linear)",
        "Volume Growth Rate (ratio)",
        "Division Length",
        "Minor Axis Length",
        "Mean mCherry Intensity",
        "Interdivision Time",
    ],
):
    """Drop trenches whose early feature values are unlikely under a Gaussian fit.

    For every entry of ``filter_params`` a normal distribution is fitted to a
    random subsample of the values observed before ``early_timepoint_cutoff``.
    Each trench is then assigned the geometric mean of the fitted density over
    its early values. Trenches at or below the ``percentile_threshold``
    percentile of that score for any parameter are removed. One histogram per
    parameter (rejected vs. kept trenches) is drawn on a new matplotlib figure.

    Side effect: a ``"<param>: Probability"`` column is added to
    ``final_output_df_lineage`` for every filter parameter.

    Parameters
    ----------
    final_output_df_lineage : dask.dataframe.DataFrame
        Lineage table with "phenotype trenchid" available to ``groupby`` and
        ``.loc`` (presumably as the index -- TODO confirm), a "Final timepoints"
        column and every column named in ``filter_params``.
    early_timepoint_cutoff : number
        Rows with "Final timepoints" strictly below this value count as early.
    gaussian_subsample : float
        Fraction of early rows sampled for the Gaussian fit.
    percentile_threshold : number
        Percentile (0-100) of the per-trench score used as the cut-off.
    filter_params : list of str
        Columns to filter on. The default list is never mutated here.
    plot_values_names : list of str
        Subplot titles, matched to ``filter_params`` by position. Where an
        entry is missing, the parameter name itself is used as the title.

    Returns
    -------
    dask.dataframe.DataFrame
        Persisted subset of ``final_output_df_lineage`` restricted to the
        trenches that pass every threshold.
    """
    final_output_df_trench_groupby = final_output_df_lineage.groupby(
        "phenotype trenchid", sort=False
    )
    early_tpt_df = final_output_df_trench_groupby.apply(
        lambda x: x[x["Final timepoints"] < early_timepoint_cutoff].reset_index(
            drop=True
        )
    ).persist()
    for filter_param in filter_params:
        early_param_series = early_tpt_df[filter_param]
        all_param_values = (
            early_param_series.sample(frac=gaussian_subsample).compute().tolist()
        )
        gaussian_fit = sp.stats.norm.fit(all_param_values)
        gaussian_fit = sp.stats.norm(loc=gaussian_fit[0], scale=gaussian_fit[1])

        # Drop the second index level so the series is indexed by trench id only.
        early_param_series = dd.from_pandas(
            early_param_series.compute().droplevel(1), npartitions=50
        )
        # exp(mean log-density) is the geometric mean of the fitted density.
        # The distribution is bound as a default argument so the lazily
        # evaluated lambda cannot pick up a later iteration's fit.
        trench_probability = early_param_series.groupby("phenotype trenchid").apply(
            lambda x, dist=gaussian_fit: np.exp(np.sum(dist.logpdf(x)) / len(x)),
            meta=float,
        )

        final_output_df_lineage[
            filter_param + ": Probability"
        ] = trench_probability.persist()

    # One representative row per trench is enough to read the per-trench scores.
    final_output_df_onetrench = (
        final_output_df_lineage.groupby("phenotype trenchid")
        .apply(lambda x: x.iloc[0])
        .compute()
    )

    # Size the subplot grid from the number of parameters. The default six
    # parameters still give the original 2x3 layout on a 22x16 figure.
    n_cols = 3
    n_rows = max(2, int(np.ceil(len(filter_params) / n_cols)))
    plt.figure(figsize=(22, 8 * n_rows))
    query_list = []
    for i, filter_param in enumerate(filter_params):
        prob_column = filter_param + ": Probability"
        trench_probs = final_output_df_onetrench[prob_column]

        prob_threshold = np.nanpercentile(trench_probs.tolist(), percentile_threshold)
        query_list.append("`" + prob_column + "` > " + str(prob_threshold))

        min_v, max_v = np.min(trench_probs), np.max(trench_probs)
        # Fall back to the raw parameter name when no display name was supplied.
        title = plot_values_names[i] if i < len(plot_values_names) else filter_param

        plt.subplot(n_rows, n_cols, i + 1)
        plt.title(title, fontsize=22)
        plt.xlabel("Unnormalized Likelihood", fontsize=18)
        plt.xticks(fontsize=18)
        plt.yticks(fontsize=18)
        # Rejected trenches first, then kept trenches, on a shared bin range.
        plt.hist(
            trench_probs[trench_probs < prob_threshold].tolist(),
            bins=50,
            range=(min_v, max_v),
        )
        plt.hist(
            trench_probs[trench_probs >= prob_threshold].tolist(),
            bins=50,
            range=(min_v, max_v),
        )

    compiled_query = " and ".join(query_list)
    final_output_df_onetrench_filtered = final_output_df_onetrench.query(compiled_query)
    final_output_df_filtered = final_output_df_lineage.loc[
        final_output_df_onetrench_filtered.index.tolist()
    ].persist()

    return final_output_df_filtered

In [None]:
# Filter out early outlier trenches, then save the histogram figure that the
# call draws on the current matplotlib figure.
final_output_df_filtered = remove_early_outliers(
    final_output_df_lineage,
    early_timepoint_cutoff=12,
    gaussian_subsample=0.2,
    percentile_threshold=10,
)
plt.savefig("Prob_threshold_Replicate_1.png", dpi=500)

In [None]:
# Fraction of rows that survive the early-outlier filter.
len(final_output_df_filtered) / len(final_output_df_lineage)

In [None]:
def get_timepoint_values(
    df,
    label,
    min_timepoint,
    max_timepoint,
    time_label="Final timepoints",
    flatten_vals=True,
):
    """Select the values of ``label`` whose timepoint lies in a closed window.

    Parameters
    ----------
    df : dask.dataframe.DataFrame
        Table holding the ``label`` and ``time_label`` columns.
    label : str
        Column whose values are extracted.
    min_timepoint, max_timepoint : number
        Inclusive bounds applied to ``time_label``.
    time_label : str
        Column holding the timepoints.
    flatten_vals : bool
        If true, the selection is applied row by row, computed, and
        concatenated into one NumPy array. Otherwise the selection is applied
        per "phenotype trenchid" group and a persisted Dask series is returned.

    Returns
    -------
    numpy.ndarray or dask series
        See ``flatten_vals``.
    """

    def _values_in_window(record):
        # `record` is a single row (flattened mode) or one trench's rows.
        timepoints = np.array(record[time_label])
        in_window = (timepoints >= min_timepoint) * (timepoints <= max_timepoint)
        return np.array(record[label])[in_window]

    if not flatten_vals:
        per_trench = df.groupby("phenotype trenchid").apply(
            _values_in_window, meta="object"
        )
        return per_trench.persist()

    per_row = df.apply(_values_in_window, axis=1, meta="object")
    return np.concatenate(per_row.compute().tolist())


def get_feature_stats(
    df, feature_label, min_timepoint, max_timepoint, time_label="Final timepoints"
):
    """Return the (median, interquartile range) of a feature inside a time window.

    The values are collected with ``get_timepoint_values`` in its default
    flattened mode, so they are pooled over all rows of ``df``.
    """
    window_vals = get_timepoint_values(
        df, feature_label, min_timepoint, max_timepoint, time_label=time_label
    )
    return np.median(window_vals), sp.stats.iqr(window_vals)


def compute_score(df, feature_label, feature_median, feature_iqr):
    """Return robust z-scores: 1.35 * (value - median) / IQR for ``df[feature_label]``."""
    centered = df[feature_label] - feature_median
    return 1.35 * (centered / feature_iqr)


def get_feature_scores(
    df, feature_label, init_timepoint_range=(0, 20), time_label="Final timepoints"
):
    """Score one feature against its median/IQR over the initial time window.

    ``init_timepoint_range`` supplies the inclusive (start, end) timepoints
    used to estimate the reference median and interquartile range.
    """
    window_start = init_timepoint_range[0]
    window_end = init_timepoint_range[1]
    baseline_median, baseline_iqr = get_feature_stats(
        df, feature_label, window_start, window_end, time_label=time_label
    )
    return compute_score(df, feature_label, baseline_median, baseline_iqr)


def get_all_feature_scores(
    df, feature_labels, init_timepoint_range=(0, 20), time_label="Final timepoints"
):
    """Add a persisted ``"<feature>: z score"`` column to ``df`` for each feature.

    Each label is printed as it is processed. ``df`` is modified in place and
    also returned.
    """
    for label in feature_labels:
        print(label)
        label_scores = get_feature_scores(
            df,
            label,
            init_timepoint_range=init_timepoint_range,
            time_label=time_label,
        )
        df[label + ": z score"] = label_scores.persist()

    return df

### Normalize Properties (maybe go back to the per trench normalization?)

1) Yeo-Johnson transform the data to get a more normal-like distribution.
2) Convert transformed values to time-dependent z-scores using the following formula:

$$ z(i,k,t) = 1.35 \times \frac{F_{i,k,t} - median_{t\in \tau}(F_{i,t})}{iqr_{t\in \tau}(F_{i,t})} $$

where $F_{i,k,t}$ are the feature values for feature i, trench k at time t. $\tau$ are the initial pre-induction timepoints. 

Essentially this is a z-score using the more outlier-robust median and interquartile range to define the differences from normal behavior. The 1.35 factor scales the values such that z-scores represent the number of standard deviations from the mean for a normal distribution. Finally the values are normalized by initial behaviors trenchwise by the $median_{t\in \tau}(F_{i,k,t})$ factor.

In [None]:
def apply_yj_transform(
    final_output_df_filtered,
    yeo_subsample=0.05,
    early_timepoint_cutoff=12,
    params_to_transform=[
        "Mean Linear Growth Rate: Volume",
        "Mean Exponential Growth Rate: Volume",
        "Division: major_axis_length",
        "Mean: minor_axis_length",
        "Mean: mCherry Intensity",
        "Delta time (s)",
    ],
):
    """Yeo-Johnson transform the selected columns and z-score the results.

    For each parameter the Yeo-Johnson lambda is estimated with
    ``scipy.stats.yeojohnson_normmax`` on a random subsample, after NaNs are
    removed. The transform with that lambda is then applied to the full column
    and stored as ``"<param>: Yeo-Johnson"``. The transformed columns are
    scored with ``get_all_feature_scores`` using the window
    ``(0, early_timepoint_cutoff)``.

    Side effect: the new columns are added to ``final_output_df_filtered``.

    Parameters
    ----------
    final_output_df_filtered : dask.dataframe.DataFrame
        Table containing every column in ``params_to_transform``.
    yeo_subsample : float
        Fraction of rows sampled to estimate each lambda.
    early_timepoint_cutoff : number
        Upper bound of the reference window used for z-scoring.
    params_to_transform : list of str
        Columns to transform. The default list is never mutated here.

    Returns
    -------
    tuple
        ``(final_output_df_filtered, trenchiddf)``, where ``trenchiddf`` is the
        same table after ``reset_index().set_index("phenotype trenchid")``.
    """
    subsample_df = final_output_df_filtered.sample(frac=yeo_subsample).persist()

    for param in params_to_transform:
        sampled_values = np.array(subsample_df[param].astype(float).compute().tolist())
        sampled_values = sampled_values[~np.isnan(sampled_values)]
        l_norm = sp.stats.yeojohnson_normmax(sampled_values)
        # Bind this parameter's lambda as a default argument: a plain closure
        # would look up l_norm when the lazy graph runs, not when it is built.
        final_output_df_filtered[param + ": Yeo-Johnson"] = (
            final_output_df_filtered[param]
            .apply(
                lambda x, lmbda=l_norm: sp.stats.yeojohnson(x, lmbda=lmbda),
                meta=float,
            )
            .persist()
        )

    final_output_df_filtered = get_all_feature_scores(
        final_output_df_filtered,
        [param + ": Yeo-Johnson" for param in params_to_transform],
        init_timepoint_range=(0, early_timepoint_cutoff),
    )
    trenchiddf = final_output_df_filtered.reset_index().set_index(
        "phenotype trenchid", drop=True
    )

    return final_output_df_filtered, trenchiddf

In [None]:
final_output_df_filtered, trenchiddf = apply_yj_transform(
    final_output_df_filtered, yeo_subsample=0.05, early_timepoint_cutoff=12
)

#### Trying to interpolate trenchwise instead of sgRNAwise

In [None]:
from statsmodels.nonparametric import kernel_regression
from scipy.stats import iqr
from statsmodels.nonparametric.smoothers_lowess import lowess
import sklearn as skl
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance


def timeseries_kernel_reg(df, y_label, min_tpt, max_tpt, kernel_bins, kernel_bandwidth):
    """Kernel-regress ``y_label`` onto a fixed time grid for every trench.

    For each "phenotype trenchid" group the x values are
    ``"Final time (s)" - "Delta time (s)" / 2``. A local-constant Gaussian
    kernel regression with bandwidth ``kernel_bandwidth`` is evaluated at
    ``kernel_bins`` evenly spaced points between ``min_tpt`` and ``max_tpt``.

    Returns the lazy Dask result of the groupby-apply; each entry holds the
    regression values on the grid.
    """

    def _regress_trench(trench_rows):
        # x values: "Final time (s)" minus half of "Delta time (s)".
        midpoints = trench_rows["Final time (s)"].values - (
            trench_rows["Delta time (s)"].values / 2
        )
        grid = np.linspace(min_tpt, max_tpt, num=kernel_bins, dtype=float)
        model = kernel_regression.KernelReg(
            trench_rows[y_label].values,
            midpoints,
            "c",
            reg_type="lc",
            bw=np.array([kernel_bandwidth]),
            ckertype="gaussian",
        )
        # fit() returns a sequence; only its first element is kept.
        return model.fit(grid)[0]

    return df.groupby("phenotype trenchid").apply(_regress_trench, meta=float)


def get_all_kernel_regs(
    df, y_label_list, min_tpt=0, max_tpt=36000, kernel_bins=20, kernel_bandwidth=2700
):
    """Add a persisted ``"Kernel Trace: <label>"`` column for every label.

    Works on a shallow copy of ``df`` (``copy.copy``) and returns that copy.
    Each trace comes from ``timeseries_kernel_reg`` with the given grid and
    bandwidth settings.
    """
    traced_df = copy.copy(df)

    for y_label in y_label_list:
        trace = timeseries_kernel_reg(
            traced_df, y_label, min_tpt, max_tpt, kernel_bins, kernel_bandwidth
        )
        traced_df["Kernel Trace: " + y_label] = trace.persist()

    return traced_df

In [None]:
# Columns that were Yeo-Johnson transformed; their z-score column names are
# derived from these below.
params_to_transform = [
    "Mean Linear Growth Rate: Volume",
    "Mean Exponential Growth Rate: Volume",
    "Division: major_axis_length",
    "Mean: minor_axis_length",
    "Mean: mCherry Intensity",
    "Delta time (s)",
]
score_params = [param + ": Yeo-Johnson: z score" for param in params_to_transform]
# Untransformed columns that also get a kernel trace.
other_params = [
    "Mean Linear Growth Rate: Volume",
    "Mean Exponential Growth Rate: Volume",
    "Birth: major_axis_length",
    "Division: major_axis_length",
    "Birth: Volume",
    "Division: Volume",
    "Birth: Surface Area",
    "Division: Surface Area",
    "Mean: minor_axis_length",
    "Mean: mCherry Intensity",
    "Delta time (s)",
]

params_to_trace = score_params + other_params

# making an observation grid to project vals onto

# Adds one "Kernel Trace: <param>" column per entry of params_to_trace.
trenchiddf = get_all_kernel_regs(
    trenchiddf,
    params_to_trace,
    min_tpt=0,
    max_tpt=36000,
    kernel_bins=20,
    kernel_bandwidth=2700,
)

In [None]:
# Load the barcode table from parquet as a Dask dataframe.
final_output_df_barcodes = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/2021-12-01_lDE20_Final_Barcodes_df/",
    engine="pyarrow",
)
# Keep only the first row of each trench.
final_output_df_barcodes = (
    final_output_df_barcodes.set_index("phenotype trenchid", sorted=True)
    .groupby("phenotype trenchid", sort=False)
    .apply(lambda x: x.iloc[0])
    .persist()
)
final_output_df_barcodes = final_output_df_barcodes.reset_index().set_index(
    "oDEPool7_id", drop=True
)
# final_output_df_barcodes["N Observations"] = final_output_df_barcodes.groupby("oDEPool7_id",sort=False)["phenotype trenchid"].apply(lambda x: len(x.unique()), meta=int)
# final_output_df_barcodes["N Observations"] = final_output_df_barcodes["N Observations"].astype(int)
# Switch the index back to trench id so the merge below can join on the index.
final_output_df_barcodes = (
    final_output_df_barcodes.reset_index().set_index("phenotype trenchid").persist()
)

# Inner-join the barcode annotations onto the per-trench table by index.
# NOTE(review): "N Observations" is selected here although the lines that
# would compute it above are commented out, so the parquet data must already
# provide that column -- confirm.
trenchiddf = trenchiddf.merge(
    final_output_df_barcodes[
        [
            "oDEPool7_id",
            "Barcode",
            "sgRNA",
            "Closest Hamming Distance",
            "EcoWG1_id",
            "Gene",
            "N Mismatch",
            "Category",
            "TargetID",
            "barcodeid",
            "N Observations",
        ]
    ],
    how="inner",
    left_index=True,
    right_index=True,
).persist()
# Reduce to the first row of each trench.
trenchiddf_out = trenchiddf.groupby("phenotype trenchid").apply(lambda x: x.iloc[0])
# Overwrite "N Observations" with the number of unique trench ids per oDEPool7_id.
# NOTE(review): this selects "phenotype trenchid" as a column after the
# groupby-apply above; whether it is still a column (rather than only the
# index) at this point cannot be told from here -- confirm.
trenchiddf_out["N Observations"] = trenchiddf_out.groupby("oDEPool7_id", sort=False)[
    "phenotype trenchid"
].apply(lambda x: len(x.unique()), meta=int)
trenchiddf_out["N Observations"] = trenchiddf_out["N Observations"].astype(int)

In [None]:
# Names of the kernel-trace columns for the raw and the z-scored parameters.
kernel_params = ["Kernel Trace: " + param for param in other_params]
kernel_score_params = [
    "Kernel Trace: " + param + ": Yeo-Johnson: z score" for param in params_to_transform
]

# Stack each row's z-score kernel traces into a single array.
feature_vector_series = trenchiddf_out.apply(
    lambda x: np.array(x[kernel_score_params].tolist()), axis=1
)
trenchiddf_out["Feature Vector"] = feature_vector_series
# Keep only rows whose feature vector contains no NaN, then materialize the
# result with compute().
trenchiddf_nan_filtered = trenchiddf_out[
    ~trenchiddf_out["Feature Vector"].apply(lambda x: np.any(np.isnan(x)), meta=bool)
].compute()

In [None]:
# strong_effect_threshold = 15

# zero_vector = np.zeros((1,trenchiddf_nan_filtered["Feature Vector"].iloc[0].shape[0]))
# feature_arr = np.array(trenchiddf_nan_filtered["Feature Vector"].tolist())
# feature_arr_abs = np.abs(feature_arr)
# trenchiddf_nan_filtered["Integrated Feature Vector"] = [item for item in sp.integrate.simpson(feature_arr_abs,axis=2)]
# trenchiddf_nan_filtered["Integrated Feature Max"] = trenchiddf_nan_filtered["Integrated Feature Vector"].apply(lambda x: np.max(x))
# trenchiddf_nan_filtered["Integrated Euclidean Norm"] = np.linalg.norm(np.array(trenchiddf_nan_filtered["Integrated Feature Vector"].tolist()), axis=1)

# sgrnadf_strong_effect = trenchiddf_nan_filtered[trenchiddf_nan_filtered["Integrated Feature Max"]>=strong_effect_threshold]
# min_v,max_v = np.min(trenchiddf_nan_filtered["Integrated Feature Max"]),np.percentile(trenchiddf_nan_filtered["Integrated Feature Max"],99)

# plt.figure(figsize=(8,8))
# plt.title("Integrated Feature Max")
# plt.hist(trenchiddf_nan_filtered[trenchiddf_nan_filtered["Integrated Feature Max"]<strong_effect_threshold]["Integrated Feature Max"].tolist(),bins=50,range=(min_v,max_v))
# plt.hist(trenchiddf_nan_filtered[trenchiddf_nan_filtered["Integrated Feature Max"]>=strong_effect_threshold]["Integrated Feature Max"].tolist(),bins=50,range=(min_v,max_v))
# plt.show()

# unique_genes, gene_counts = np.unique(sgrnadf_strong_effect["Gene"][sgrnadf_strong_effect["Gene"].apply(lambda x: type(x)==str)].tolist(), return_counts=True)
# plt.title("sgRNAs per Gene")
# plt.xticks(range(0,20,2),labels=range(0,20,2))
# plt.hist(gene_counts,bins=np.arange(20)-0.5)
# plt.show()

### Pick Representative Effect per TargetID
Note this may need to be revisited later to resolve transients that are only resolvable at intermediate KO

1) For each target, pick the sgRNA that has the strongest phenotype (highest integrated euclidean norm)
2) Additionally identify any targets with titration information by saving a dataframe with targetIDs that possess at least N sgRNAs
    - this is in a preliminary form; transfer to a full notebook later

In [None]:
# Write the NaN-filtered per-trench table to disk as a pickle.
trenchiddf_nan_filtered.to_pickle(
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/2022-02-10_gene_cluster_df_no_filter.pkl"
)

In [None]:
# Shut the Dask controller down now that the results are saved.
dask_controller.shutdown()