## Feature processing before main analysis

- Note that there are fluctuations in the illumination intensity which may be resulting in pathological behavior from the reporter

- This has been normalized out in the upstream processing, but try to fix long term

- Also consider a flat field correction for the final experiment

In [None]:
import ast
import copy
import random
import warnings

import dask
import dask.array as da
import dask.dataframe as dd
import holoviews as hv
import matplotlib as mpl
import matplotlib.gridspec as gridspec
import numpy as np
import pandas as pd
import pylab
import scipy as sp
import scipy.cluster.hierarchy as sch
import scipy.stats
import seaborn as sns
import sklearn as skl
from matplotlib import pyplot as plt
from sklearn.cluster import AffinityPropagation, AgglomerativeClustering
from sklearn.linear_model import LinearRegression
from sklearn.manifold import TSNE
from sklearn.metrics import silhouette_score
from sklearn.metrics.pairwise import (
    cosine_distances,
    euclidean_distances,
    manhattan_distances,
)

import paulssonlab.deaton.trenchripper.trenchripper as tr

# Select the Bokeh backend for HoloViews.
hv.extension("bokeh")
# Seed the stdlib and NumPy global random number generators.
random.seed(42)
np.random.seed(42)

# Report each distinct UserWarning only once.
warnings.filterwarnings(action="once", category=UserWarning)

In [None]:
def get_sgrnadf_from_scoredf(
    scoredf, feature_labels, time_label="final cell timepoints list"
):
    """Collapse a per-trench score table into one row per sgRNA.

    Args:
        scoredf: DataFrame with an "sgRNA" column, a "phenotype trenchid"
            column, one "<feature>: score" column per entry of
            ``feature_labels``, a ``time_label`` column, and the
            "Gene", "TargetID", "N Mismatch" and "Category" columns.
            The score and time columns hold one sequence per row.
        feature_labels: Feature names whose "<feature>: score" columns
            are concatenated within each sgRNA.
        time_label: Name of the column holding per-row timepoint sequences.

    Returns:
        DataFrame indexed by sgRNA with the list of trench ids, the
        flattened score/time arrays, the first Gene/TargetID/N Mismatch/
        Category value of each group, and the number of rows per group
        ("N Observations").
    """
    by_sgrna = scoredf.groupby("sgRNA")

    def _flattened(column):
        # Concatenate the per-row sequences of `column` within each sgRNA.
        return by_sgrna.apply(
            lambda grp: np.array([v for seq in grp[column].tolist() for v in seq])
        )

    def _first_value(column):
        # Take the first row's value of `column` within each sgRNA.
        return by_sgrna.apply(lambda grp: grp[column].iloc[0])

    trenchid_lists = by_sgrna.apply(lambda grp: grp["phenotype trenchid"].tolist())
    sgrnadf = trenchid_lists.to_frame().rename(columns={0: "phenotype trenchid"})

    for feature_label in feature_labels:
        score_column = feature_label + ": score"
        sgrnadf[score_column] = _flattened(score_column)

    sgrnadf[time_label] = _flattened(time_label)
    for column in ("Gene", "TargetID", "N Mismatch"):
        sgrnadf[column] = _first_value(column)
    sgrnadf["N Observations"] = by_sgrna.apply(
        lambda grp: len(grp["phenotype trenchid"].tolist())
    )
    sgrnadf["Category"] = _first_value("Category")

    return sgrnadf


def normalize_timeseries(feature_vector_series, lmbda=0.5):
    """Rescale each series so its standard deviation is power-compressed.

    The input is stacked into one array and axes 1 and 2 are swapped, so the
    standard deviation is taken along axis 2 of the stacked input. Each
    series is then divided by ``sigma / sigma_prime``, where ``sigma_prime``
    is ``((sigma + 1) ** lmbda - 1) / lmbda`` for ``lmbda > 0`` and
    ``log(sigma + 1)`` for ``lmbda == 0`` (the Yeo-Johnson form for
    non-negative values). After rescaling, each series has standard
    deviation ``sigma_prime``.

    Args:
        feature_vector_series: Object with ``.tolist()`` yielding equally
            shaped 2-D arrays (e.g. a pandas Series of arrays).
        lmbda: Non-negative power parameter.

    Returns:
        Array of the stacked input with axes 1 and 2 swapped, rescaled.
        Series with zero standard deviation are returned unchanged instead
        of being turned into NaN by a 0/0 normalizer.

    Raises:
        ValueError: If ``lmbda`` is negative (or NaN).
    """
    # Validate before doing any array work; `not >=` also rejects NaN.
    if not lmbda >= 0.0:
        raise ValueError("lmbda cannot be negative")

    timeseries_arr = np.swapaxes(np.array(feature_vector_series.tolist()), 1, 2)
    sigma = np.std(timeseries_arr, axis=1)
    if lmbda > 0.0:
        sigma_prime = ((sigma + 1) ** lmbda - 1) / lmbda  # Yeo-Johnson form
    else:
        sigma_prime = np.log(sigma + 1)

    # sigma == 0 implies sigma_prime == 0; leave such constant series as-is
    # (normalizer 1) rather than dividing by 0/0 = NaN.
    normalizer = np.ones_like(sigma, dtype=float)
    np.divide(sigma, sigma_prime, out=normalizer, where=sigma_prime != 0)
    normalized_timeseries = timeseries_arr / normalizer[:, np.newaxis, :]
    return normalized_timeseries

### Initial Data Processing

Here, I am going to try to replicate (to some extent) the corrections from "Genomewide phenotypic analysis of growth, cell morphogenesis, and cell cycle events in Escherichia coli"

#### Start Dask

In [None]:
# Create the Dask controller through TrenchRipper's `dask_controller` class and
# start it. The argument semantics are defined by that project class, which is
# not visible in this notebook.
dask_controller = tr.trcluster.dask_controller(
    walltime="01:00:00",
    local=False,
    n_workers=100,
    n_workers_min=20,
    memory="16GB",
    working_directory="/home/de64/scratch/de64/dask",
)
dask_controller.startdask()

In [None]:
# Display the controller's dashboard (project helper; exact output not visible here).
dask_controller.displaydashboard()

In [None]:
# Manual utility cell: restart through the controller's Dask client
# (presumably a distributed Client, which would restart the workers — confirm).
dask_controller.daskclient.restart()

In [None]:
# Manual utility cell. NOTE(review): running the notebook top to bottom would
# shut the cluster down here, before the analysis cells below — presumably this
# is meant to be run by hand only.
dask_controller.shutdown()

#### Import Dataframe

In [None]:
# Lazily load the lineage dataframe from Parquet.
final_output_df_lineage = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/2022-02-09_lDE20_Final_Lineage_df/",
    engine="pyarrow",
)
# Drop rows that are missing any of these columns.
final_output_df_lineage = final_output_df_lineage.dropna(
    subset=[
        "Final timepoints",
        "Mean Exponential Growth Rate: area",
        "Birth: minor_axis_length",
        "Birth: Surface Area",
    ]
)
# Re-index by trench id (sorted=True declares the column already sorted),
# rebalance into 100 partitions, and persist the result.
final_output_df_lineage = (
    final_output_df_lineage.reset_index()
    .set_index("phenotype trenchid", sorted=True)
    .repartition(npartitions=100)
    .persist()
)

In [None]:
# Group lineage rows by trench; reused below to select the early timepoints.
final_output_df_trench_groupby = final_output_df_lineage.groupby(
    "phenotype trenchid", sort=False
)


#### Filter for "Normal" Sizes at Start

1) Fit a gaussian model to each of the specified feature params during the first t timepoints of the experiment (using a subsample for speed) 
2) Compute a normalized probability trenchwise for these features under the gaussian model, during the first t timepoints of the experiment
3) Eliminate trenches that are under some p percentile value of this probability for each feature
4) Display histograms for each property as well as the resulting threshold

Note that these features should be the only features examined in the resulting analysis. For the notebook, I am looking at:
- Birth length (Lb)
- Division length (Ld)
- Mean Area Increment
- Mean Length Increment
- Mean Width
- Cell cycle duration (Delta t)
- Mean mCherry Intensity

In [None]:
# Rows with "Final timepoints" below this value form the early window.
early_timepoint_cutoff = 12
# Fraction of early values sampled for each Gaussian fit.
gaussian_subsample = 0.2
# Percentile of the per-trench probability used as the rejection threshold.
percentile_threshold = 10

# Columns that get a Gaussian fit and a per-trench probability filter.
filter_params = [
    "Mean Linear Growth Rate: Volume",
    "Mean Exponential Growth Rate: Volume",
    "Division: major_axis_length",
    "Mean: minor_axis_length",
    "Mean: mCherry Intensity",
    "Delta time (s)",
]

In [None]:
# Per trench, keep only rows from the early window (before the cutoff
# timepoint); the per-group index is reset, so the result carries the trench id
# plus a positional level.
early_tpt_df = final_output_df_trench_groupby.apply(
    lambda x: x[x["Final timepoints"] < early_timepoint_cutoff].reset_index(drop=True)
).persist()

In [None]:
for filter_param in filter_params:
    early_param_series = early_tpt_df[filter_param]
    # Fit a Gaussian to a random subsample of the early values (for speed).
    all_param_values = (
        early_param_series.sample(frac=gaussian_subsample).compute().tolist()
    )
    gaussian_fit = sp.stats.norm.fit(all_param_values)
    gaussian_fit = sp.stats.norm(loc=gaussian_fit[0], scale=gaussian_fit[1])

    # Drop the positional index level so only the trench id remains, then
    # redistribute as a Dask series.
    early_param_series = dd.from_pandas(
        early_param_series.compute().droplevel(1), npartitions=50
    )
    # Per trench: exp(mean log-density), i.e. the geometric mean of the fitted
    # density over that trench's early values. The fitted distribution is bound
    # as a default argument so the lazily evaluated graph cannot pick up a
    # later iteration's fit through late binding.
    trench_probability = early_param_series.groupby("phenotype trenchid").apply(
        lambda x, dist=gaussian_fit: np.exp(np.sum(dist.logpdf(x)) / len(x)),
        meta=float,
    )

    final_output_df_lineage[
        filter_param + ": Probability"
    ] = trench_probability.persist()

# One representative (first) row per trench, pulled into pandas.
final_output_df_onetrench = (
    final_output_df_lineage.groupby("phenotype trenchid")
    .apply(lambda x: x.iloc[0])
    .compute()
)

# Panel titles, in the same order as filter_params.
values_names = [
    "Volume Growth Rate (linear)",
    "Volume Growth Rate (ratio)",
    "Division Length",
    "Minor Axis Length",
    "Mean mCherry Intensity",
    "Interdivision Time",
]
plt.figure(figsize=(22, 16))
query_list = []
for i, filter_param in enumerate(filter_params):
    prob_label = filter_param + ": Probability"
    trench_probs = final_output_df_onetrench[prob_label]

    # Threshold at the chosen percentile (NaNs ignored) and record the
    # matching row filter for this feature.
    prob_threshold = np.nanpercentile(trench_probs.tolist(), percentile_threshold)
    query = "`" + prob_label + "` > " + str(prob_threshold)
    query_list.append(query)

    min_v, max_v = np.min(trench_probs), np.max(trench_probs)

    plt.subplot(2, 3, i + 1)
    plt.title(values_names[i], fontsize=22)
    plt.xlabel("Unnormalized Likelihood", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    # Rejected trenches first, then retained ones, on a shared bin range.
    plt.hist(
        trench_probs[trench_probs < prob_threshold].tolist(),
        bins=50,
        range=(min_v, max_v),
    )
    plt.hist(
        trench_probs[trench_probs >= prob_threshold].tolist(),
        bins=50,
        range=(min_v, max_v),
    )
plt.savefig("Prob_threshold.png", dpi=500)

# A trench is kept only if it passes the threshold for every feature.
compiled_query = " and ".join(query_list)
final_output_df_onetrench_filtered = final_output_df_onetrench.query(compiled_query)
passing_trenchids = final_output_df_onetrench_filtered.index.tolist()
final_output_df_filtered = final_output_df_lineage.loc[passing_trenchids].persist()

In [None]:
# Fraction of lineage rows that survive the probability filter.
len(final_output_df_filtered) / len(final_output_df_lineage)

In [None]:
def get_timepoint_values(
    df,
    label,
    min_timepoint,
    max_timepoint,
    time_label="Final timepoints",
    flatten_vals=True,
):
    """Select values of ``label`` whose timepoint lies in a closed window.

    Args:
        df: Dask dataframe (the calls use ``meta=``, ``compute`` and
            ``persist``).
        label: Column whose values are returned.
        min_timepoint: Inclusive lower bound on ``time_label``.
        max_timepoint: Inclusive upper bound on ``time_label``.
        time_label: Column holding the timepoints.
        flatten_vals: If True, apply the window row by row, compute, and
            return one concatenated NumPy array. If False, apply the window
            per "phenotype trenchid" group and return the persisted (lazy)
            series of per-trench arrays.

    Returns:
        A 1-D NumPy array (``flatten_vals=True``) or a persisted Dask series
        of arrays (``flatten_vals=False``).
    """

    def _values_in_window(x):
        # Works for a single row (scalars become 0-d arrays) and for a group.
        times = np.array(x[time_label])
        in_window = (times >= min_timepoint) & (times <= max_timepoint)
        return np.array(x[label])[in_window]

    if not flatten_vals:
        return (
            df.groupby("phenotype trenchid")
            .apply(_values_in_window, meta="object")
            .persist()
        )

    per_row_vals = df.apply(_values_in_window, axis=1, meta="object")
    return np.concatenate(per_row_vals.compute().tolist())


def get_feature_stats(
    df, feature_label, min_timepoint, max_timepoint, time_label="Final timepoints"
):
    """Return the median and interquartile range of a feature in a time window.

    Values are gathered with ``get_timepoint_values`` (flattened over all
    rows). NaN values are ignored: a single missing value would otherwise turn
    both statistics, and every score derived from them, into NaN. For NaN-free
    input the result is the same as the plain median and IQR.

    Args:
        df: Dataframe passed through to ``get_timepoint_values``.
        feature_label: Column to summarize.
        min_timepoint: Inclusive lower bound of the window.
        max_timepoint: Inclusive upper bound of the window.
        time_label: Column holding the timepoints.

    Returns:
        Tuple ``(feature_median, feature_iqr)``.
    """
    feature_vals = get_timepoint_values(
        df, feature_label, min_timepoint, max_timepoint, time_label=time_label
    )
    feature_median = np.nanmedian(feature_vals)
    feature_iqr = sp.stats.iqr(feature_vals, nan_policy="omit")
    return feature_median, feature_iqr


def compute_score(df, feature_label, feature_median, feature_iqr):
    """Return robust z-scores for one feature column.

    Computes ``1.35 * (value - feature_median) / feature_iqr`` for every
    value of ``df[feature_label]``.
    """
    deviation = df[feature_label] - feature_median
    iqr_units = deviation / feature_iqr
    return 1.35 * iqr_units


def get_feature_scores(
    df, feature_label, init_timepoint_range=(0, 20), time_label="Final timepoints"
):
    """Score a feature against its median/IQR within an initial time window.

    Args:
        df: Dataframe holding ``feature_label`` and ``time_label``.
        feature_label: Column to score.
        init_timepoint_range: ``(min, max)`` inclusive window used to estimate
            the reference median and IQR.
        time_label: Column holding the timepoints.

    Returns:
        The scores produced by ``compute_score`` for every row of ``df``.
    """
    window_start = init_timepoint_range[0]
    window_end = init_timepoint_range[1]
    reference_stats = get_feature_stats(
        df, feature_label, window_start, window_end, time_label=time_label
    )
    return compute_score(df, feature_label, *reference_stats)


def get_all_feature_scores(
    df, feature_labels, init_timepoint_range=(0, 20), time_label="Final timepoints"
):
    """Add a "<feature>: z score" column to ``df`` for every feature.

    Each feature name is printed as it is processed. The scores are persisted
    before being assigned, and ``df`` itself is modified and returned.
    """
    scoring_kwargs = {
        "init_timepoint_range": init_timepoint_range,
        "time_label": time_label,
    }
    for feature_label in feature_labels:
        print(feature_label)
        z_scores = get_feature_scores(df, feature_label, **scoring_kwargs)
        df[feature_label + ": z score"] = z_scores.persist()

    return df

### Normalize Properties (maybe go back to the per trench normalization?)

1) Yeo-Johnson transform the data to get a more normal-like distribution.
2) Convert transformed values to time-dependent z-scores using the following formula:

$$ z(i,k,t) = 1.35 \times \frac{F_{i,k,t} - median_{t\in \tau}(F_{i,t})}{iqr_{t\in \tau}(F_{i,t})} $$

where $F_{i,k,t}$ are the feature values for feature i, trench k at time t. $\tau$ are the initial pre-induction timepoints. 

Essentially this is a z-score using the more outlier-robust median and interquartile range to define the differences from normal behavior. The 1.35 factor scales the values such that z-scores represent the number of standard deviations from the mean for a normal distribution. Finally the values are normalized by initial behaviors trenchwise by the $median_{t\in \tau}(F_{i,k,t})$ factor.

In [None]:
# Fraction of the filtered rows sampled for the Yeo-Johnson example below.
yeo_subsample = 0.1

subsample_df = final_output_df_filtered.sample(frac=yeo_subsample).persist()

In [None]:
# Display the subsampled dataframe.
subsample_df

In [None]:
# Subsampled rows whose "Final timepoints" value is above 40.
late_phenotypes = subsample_df[subsample_df["Final timepoints"] > 40]

In [None]:
# Estimate the Yeo-Johnson lambda from the NaN-free mCherry values of the
# whole subsample.
all_param_values = (
    subsample_df["Mean: mCherry Intensity"].astype(float).compute().tolist()
)
all_param_values = np.array(all_param_values)
all_param_values = all_param_values[~np.isnan(all_param_values)]
l_norm = sp.stats.yeojohnson_normmax(all_param_values)

# Re-use the name for the late-timepoint values; these (not the values above)
# are what gets transformed here and plotted in the next cell.
all_param_values = (
    late_phenotypes["Mean: mCherry Intensity"].astype(float).compute().tolist()
)
all_param_values = np.array(all_param_values)
all_param_values = all_param_values[~np.isnan(all_param_values)]

yj_transformed = sp.stats.yeojohnson(all_param_values, lmbda=l_norm)

In [None]:
plt.figure(figsize=(22, 16))

# Raw values on the top row, median-centered transformed values on the bottom
# row; the right-hand panels use a log-scaled frequency axis.
centered_yj = yj_transformed - np.median(yj_transformed)
yj_hist_range = (-0.002, 0.01)
hist_panels = [
    ("Mean mCherry Intensity", all_param_values, {}),
    ("Mean mCherry Intensity (log frequency)", all_param_values, {"log": True}),
    (
        "Centered Yeo-Johnson Transformed mCherry",
        centered_yj,
        {"range": yj_hist_range},
    ),
    (
        "Centered Yeo-Johnson Transformed mCherry (log frequency)",
        centered_yj,
        {"range": yj_hist_range, "log": True},
    ),
]
for panel_idx, (panel_title, panel_values, hist_kwargs) in enumerate(
    hist_panels, start=1
):
    plt.subplot(2, 2, panel_idx)
    plt.title(panel_title, fontsize=22)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.hist(panel_values, bins=100, **hist_kwargs)

plt.savefig("YJ_transform.png", dpi=500)

In [None]:
# Columns that are Yeo-Johnson transformed and then z-scored.
params_to_transform = [
    "Mean Linear Growth Rate: Volume",
    "Mean Exponential Growth Rate: Volume",
    "Division: major_axis_length",
    "Mean: minor_axis_length",
    "Mean: mCherry Intensity",
    "Delta time (s)",
]

# Fraction of the filtered rows used to estimate each lambda.
yeo_subsample = 0.05

subsample_df = final_output_df_filtered.sample(frac=yeo_subsample).persist()

for param in params_to_transform:
    # Estimate lambda on the NaN-free subsample of this column.
    all_param_values = subsample_df[param].astype(float).compute().tolist()
    all_param_values = np.array(all_param_values)
    all_param_values = all_param_values[~np.isnan(all_param_values)]
    l_norm = sp.stats.yeojohnson_normmax(all_param_values)
    # Bind this column's lambda as a default argument so the lazily evaluated
    # graph cannot pick up a later iteration's value through late binding.
    final_output_df_filtered[param + ": Yeo-Johnson"] = (
        final_output_df_filtered[param]
        .apply(
            lambda x, lmbda=l_norm: sp.stats.yeojohnson(x, lmbda=lmbda), meta=float
        )
        .persist()
    )

# Z-score each transformed column against the early (pre-cutoff) window.
final_output_df_filtered = get_all_feature_scores(
    final_output_df_filtered,
    [param + ": Yeo-Johnson" for param in params_to_transform],
    init_timepoint_range=(0, early_timepoint_cutoff),
)
trenchiddf = final_output_df_filtered.reset_index().set_index(
    "phenotype trenchid", drop=True
)

### sgRNA Effect Size Filtering (within Gene groups)

1) Threshold sgRNAs to include by number of observations
2) Use Kernel smoothing to smooth out both score and raw timeseries into 20 points
3) For each timepoint, measure the euclidean norm of the feature vector and integrate it over all time as a measure of effect size
4) Threshold sgRNAs for strong effects by applying a threshold to the euclidean norm that will be displayed with histogram
5) Display a histogram for the sgRNA number per gene

In [None]:
# from statsmodels.nonparametric import kernel_regression
# from scipy.stats import iqr
# from statsmodels.nonparametric.smoothers_lowess import lowess
# import sklearn as skl
# from tslearn.clustering import TimeSeriesKMeans
# from tslearn.preprocessing import TimeSeriesScalerMeanVariance


# def timeseries_kernel_reg(df, y_label, min_tpt, max_tpt, kernel_bins, kernel_bandwidth):
#     def kernel_reg(x_arr,y_arr,start=min_tpt,end=max_tpt,kernel_bins=kernel_bins,kernel_bandwidth=kernel_bandwidth):
#         intervals = np.linspace(start, end, num=kernel_bins, dtype=float)
#         w = kernel_regression.KernelReg(y_arr,x_arr,"c",reg_type="lc",bw=np.array([kernel_bandwidth]),ckertype="gaussian").fit(intervals)[0]
#         reg_x, reg_y = (intervals, w)
#         return reg_x, reg_y

#     kernel_result = df.groupby("sgRNAid").apply(lambda x: kernel_reg((x["final timepoints"].values + x["initial timepoints"].values) / 2,x[y_label].values,)[1],meta=float,)
#     return kernel_result


# def get_all_kernel_regs(df, y_label_list, min_tpt, max_tpt, kernel_bins=20, kernel_bandwidth=10):
#     out_df = copy.copy(df)

#     for y_label in y_label_list:
#         kernel_result = timeseries_kernel_reg(out_df, y_label, min_tpt, max_tpt, kernel_bins, kernel_bandwidth)
#         out_df["Kernel Trace: " + y_label] = kernel_result.persist()

#     return out_df

#### Trying to interpolate trenchwise instead of sgRNAwise

In [None]:
import sklearn as skl
from scipy.stats import iqr
from statsmodels.nonparametric import kernel_regression
from statsmodels.nonparametric.smoothers_lowess import lowess
from tslearn.clustering import TimeSeriesKMeans
from tslearn.preprocessing import TimeSeriesScalerMeanVariance


def timeseries_kernel_reg(df, y_label, min_tpt, max_tpt, kernel_bins, kernel_bandwidth):
    """Kernel-smooth one column per trench onto a fixed time grid.

    For each "phenotype trenchid" group, a local-constant (``reg_type="lc"``)
    Gaussian kernel regression of ``y_label`` against
    ``"Final time (s)" - "Delta time (s)" / 2`` is evaluated at
    ``kernel_bins`` evenly spaced points from ``min_tpt`` to ``max_tpt``.

    Args:
        df: Dask dataframe with the "phenotype trenchid", "Final time (s)",
            "Delta time (s)" and ``y_label`` columns.
        y_label: Column to smooth.
        min_tpt: First grid point.
        max_tpt: Last grid point.
        kernel_bins: Number of grid points.
        kernel_bandwidth: Fixed kernel bandwidth.

    Returns:
        Lazy per-trench result of the groupby-apply; each entry is the array
        of fitted values on the grid.
    """

    def _smooth_trench(trench_rows):
        # Regress against the midpoint of each interval (final time minus
        # half of its duration).
        midpoints = trench_rows["Final time (s)"].values - (
            trench_rows["Delta time (s)"].values / 2
        )
        observed = trench_rows[y_label].values
        grid = np.linspace(min_tpt, max_tpt, num=kernel_bins, dtype=float)
        model = kernel_regression.KernelReg(
            observed,
            midpoints,
            "c",
            reg_type="lc",
            bw=np.array([kernel_bandwidth]),
            ckertype="gaussian",
        )
        # fit() returns (fitted mean, marginal effects); keep only the mean.
        return model.fit(grid)[0]

    return df.groupby("phenotype trenchid").apply(_smooth_trench, meta=float)


def get_all_kernel_regs(
    df, y_label_list, min_tpt, max_tpt, kernel_bins=20, kernel_bandwidth=5
):
    """Add a "Kernel Trace: <label>" column for every label in ``y_label_list``.

    Works on a shallow copy of ``df`` (``copy.copy``); each trace comes from
    ``timeseries_kernel_reg`` and is persisted before assignment.

    Returns:
        The copied dataframe with the added trace columns.
    """
    traced_df = copy.copy(df)

    for label in y_label_list:
        trace = timeseries_kernel_reg(
            traced_df, label, min_tpt, max_tpt, kernel_bins, kernel_bandwidth
        )
        traced_df["Kernel Trace: " + label] = trace.persist()

    return traced_df

In [None]:
# making an observation grid to project vals onto

# Bounds of the evaluation grid; the regressions below use an x-axis derived
# from "Final time (s)", so these share its units.
min_tpt = 0
max_tpt = 36000

# Number of grid points and the fixed kernel bandwidth.
kernel_bins = 20
kernel_bandwidth = 2700

# Z-score columns produced for the transformed parameters.
score_params = [param + ": Yeo-Johnson: z score" for param in params_to_transform]
# Raw (untransformed) columns that are also smoothed.
other_params = [
    "Mean Linear Growth Rate: Volume",
    "Mean Exponential Growth Rate: Volume",
    "Birth: major_axis_length",
    "Division: major_axis_length",
    "Birth: Volume",
    "Division: Volume",
    "Birth: Surface Area",
    "Division: Surface Area",
    "Mean: minor_axis_length",
    "Mean: mCherry Intensity",
    "Delta time (s)",
]


def temp_kernel_reg(
    x_arr,
    y_arr,
    start=min_tpt,
    end=max_tpt,
    kernel_bins=kernel_bins,
    kernel_bandwidth=kernel_bandwidth,
):
    """Local-constant Gaussian kernel regression of ``y_arr`` on ``x_arr``.

    The defaults are taken from the module-level grid settings at the time
    this function is defined.

    Returns:
        Tuple ``(grid, fitted)``: ``kernel_bins`` evenly spaced points from
        ``start`` to ``end`` and the fitted mean at each of them.
    """
    grid = np.linspace(start, end, num=kernel_bins, dtype=float)
    model = kernel_regression.KernelReg(
        y_arr,
        x_arr,
        "c",
        reg_type="lc",
        bw=np.array([kernel_bandwidth]),
        ckertype="gaussian",
    )
    # fit() returns (fitted mean, marginal effects); keep only the mean.
    fitted = model.fit(grid)[0]
    return grid, fitted


# All distinct trench ids, as a Python list.
trenchid_list = trenchiddf.index.unique().compute().tolist()

In [None]:
# Pick one trench at random (NumPy global RNG) and pull its rows into pandas.
rand_trenchid = np.random.choice(trenchid_list)

rand_trenchid_df = trenchiddf.loc[rand_trenchid].compute()

In [None]:
# x: final time minus half of the interval duration (the interval midpoint).
x_arr = rand_trenchid_df["Final time (s)"].values - (
    rand_trenchid_df["Delta time (s)"].values / 2
)
# y: division length for the same rows (kept as a pandas Series).
y_arr = rand_trenchid_df["Division: major_axis_length"]
reg_x, reg_y = temp_kernel_reg(x_arr, y_arr)

In [None]:
# Scatter of the raw observations with the kernel-regression curve on top.
plt.figure(figsize=(12, 8))

plt.title("Division Length Kernel Regression", fontsize=22)
plt.xticks(fontsize=18)
plt.yticks(fontsize=18)
# NOTE(review): the x values are derived from "Final time (s)", so the axis is
# in seconds rather than timepoint indices — confirm the label.
plt.xlabel("Timepoint", fontsize=18)
plt.ylabel("Division Length", fontsize=18)
plt.scatter(x_arr, y_arr)
plt.plot(reg_x, reg_y)
plt.ylim(3, 9)

plt.savefig("Kernel_Reg_Example.png", dpi=500)

In [None]:
# making an observation grid to project vals onto

# Same grid settings and column lists as in the example cell above, repeated
# so this cell can be run on its own.
min_tpt = 0
max_tpt = 36000

kernel_bins = 20
kernel_bandwidth = 2700

score_params = [param + ": Yeo-Johnson: z score" for param in params_to_transform]
other_params = [
    "Mean Linear Growth Rate: Volume",
    "Mean Exponential Growth Rate: Volume",
    "Birth: major_axis_length",
    "Division: major_axis_length",
    "Birth: Volume",
    "Division: Volume",
    "Birth: Surface Area",
    "Division: Surface Area",
    "Mean: minor_axis_length",
    "Mean: mCherry Intensity",
    "Delta time (s)",
]

# Smooth the raw columns first, then the z-score columns; each call adds
# "Kernel Trace: <label>" columns.
trenchiddf = get_all_kernel_regs(
    trenchiddf,
    other_params,
    min_tpt,
    max_tpt,
    kernel_bins=kernel_bins,
    kernel_bandwidth=kernel_bandwidth,
)
trenchiddf = get_all_kernel_regs(
    trenchiddf,
    score_params,
    min_tpt,
    max_tpt,
    kernel_bins=kernel_bins,
    kernel_bandwidth=kernel_bandwidth,
)

In [None]:
# Lazily load the barcode dataframe from Parquet.
final_output_df_barcodes = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/2022-02-09_lDE20_Final_Barcodes_df/",
    engine="pyarrow",
)
# Index by trench id (declared already sorted) and keep the first row per trench.
final_output_df_barcodes = (
    final_output_df_barcodes.set_index("phenotype trenchid", sorted=True)
    .groupby("phenotype trenchid", sort=False)
    .apply(lambda x: x.iloc[0])
    .persist()
)
# sgRNA_dict = {sgRNA:i for i,sgRNA in enumerate(sorted(final_output_df_barcodes["sgRNA"].unique().compute().tolist()))}

In [None]:
# final_output_df_barcodes["sgRNAid"] = final_output_df_barcodes["sgRNA"].apply(lambda x: sgRNA_dict[x], meta=int)
# final_output_df_barcodes["sgRNAid"] = final_output_df_barcodes["sgRNAid"].astype(int)
# Re-index by oDEPool7_id so per-id statistics can be attached.
final_output_df_barcodes = final_output_df_barcodes.reset_index().set_index(
    "oDEPool7_id", drop=True
)
# "N Observations": number of distinct trenches observed for each oDEPool7_id.
final_output_df_barcodes["N Observations"] = final_output_df_barcodes.groupby(
    "oDEPool7_id", sort=False
)["phenotype trenchid"].apply(lambda x: len(x.unique()), meta=int)
final_output_df_barcodes["N Observations"] = final_output_df_barcodes[
    "N Observations"
].astype(int)
# Switch back to the trench-id index for the merge with the trench dataframe.
final_output_df_barcodes = (
    final_output_df_barcodes.reset_index().set_index("phenotype trenchid").persist()
)

In [None]:
# Display the barcode dataframe.
final_output_df_barcodes

In [None]:
# Attach the oDEPool7_id of each trench; the inner join drops trenches that
# have no barcode row.
trenchiddf = trenchiddf.merge(
    final_output_df_barcodes[["oDEPool7_id"]],
    how="inner",
    left_index=True,
    right_index=True,
).persist()
# One (first) row per trench.
trenchiddf_out = trenchiddf.groupby("phenotype trenchid").apply(lambda x: x.iloc[0])

In [None]:
# Re-index the barcode dataframe by oDEPool7_id for the per-id aggregation below.
final_output_df_barcodes = (
    final_output_df_barcodes.reset_index().set_index("oDEPool7_id").persist()
)

In [None]:
# Names of every kernel-trace column (z-score traces first, then raw traces).
all_kernel_traces = ["Kernel Trace: " + item for item in score_params + other_params]

trenchiddf_out_groupby = (
    trenchiddf_out.reset_index().set_index("oDEPool7_id").groupby("oDEPool7_id")
)


def _mean_trace(traces):
    # Stack the per-trench traces of one oDEPool7_id and average elementwise.
    return np.mean(np.array(traces.tolist()), axis=0)


for kernel_trace in all_kernel_traces:
    mean_kernel = trenchiddf_out_groupby[kernel_trace].apply(_mean_trace).persist()
    final_output_df_barcodes[kernel_trace] = mean_kernel

# Collapse to one (first) row per oDEPool7_id.
final_output_df_barcodes = (
    final_output_df_barcodes.groupby("oDEPool7_id").apply(lambda x: x.iloc[0]).persist()
)

In [None]:
# Number of observed trenches per oDEPool7_id, as a pandas Series.
obs_list = final_output_df_barcodes["N Observations"].compute()

In [None]:
# Distribution of observation counts over the full range.
plt.hist(obs_list, bins=50)
plt.show()

In [None]:
# Same distribution, restricted to counts between 0 and 10.
plt.hist(obs_list, bins=50, range=(0, 10))
plt.show()

In [None]:
# Minimum number of observed trenches for an sgRNA to count as well sampled.
N_Observations_thr = 5

# `sgrnadf` is an alias of the unfiltered per-id table.
sgrnadf = final_output_df_barcodes
sgrnadf_wellsampled = final_output_df_barcodes[
    final_output_df_barcodes["N Observations"] >= N_Observations_thr
].persist()

In [None]:
# # making an observation grid to project vals onto

# min_tpt = 0
# max_tpt = 143

# kernel_bins = 20
# kernel_bandwidth = 10

# score_params = [param + ": Yeo-Johnson: z score" for param in params_to_transform]
# other_params = [
#     "Mean Linear Growth Rate: Volume",
#     "Mean Exponential Growth Rate: Volume",
#     "Birth: major_axis_length",
#     "Division: major_axis_length",
#     "Birth: Volume",
#     "Division: Volume",
#     "Birth: Surface Area",
#     "Division: Surface Area",
#     "Mean: minor_axis_length",
#     "Mean: mCherry Intensity",
#     "Delta t",
# ]

# trace_df_raw = get_all_kernel_regs(
#     sgrnadf_wellsampled,
#     other_params,
#     min_tpt,
#     max_tpt,
#     kernel_bins=kernel_bins,
#     kernel_bandwidth=kernel_bandwidth,
# )
# trace_df_raw = get_all_kernel_regs(
#     trace_df_raw,
#     score_params,
#     min_tpt,
#     max_tpt,
#     kernel_bins=kernel_bins,
#     kernel_bandwidth=kernel_bandwidth,
# )
# trace_df = trace_df_raw.groupby("sgRNAid").apply(lambda x: x.iloc[0]).compute()

In [None]:
# non_group_measurements = ['Global CellID', 'phenotype trenchid', 'File Parquet Index', 'fov',
#        'row', 'trench', 'initial timepoints', 'lane orientation', 'y (local)',
#        'x (local)', 'File Index', 'File Trench Index', 'CellID',
#        'Trench Score', 'Mother CellID', 'Daughter CellID 1',
#        'Daughter CellID 2', 'Sister CellID', 'Centroid X', 'Centroid Y',
#        'FOV Parquet Index', 'mCherry mean_intensity', 'area',
#        'major_axis_length', 'minor_axis_length', 'Volume', 'Surface Area',
#        'Delta: area', 'Birth: area', 'Division: area',
#        'Delta: major_axis_length', 'Birth: major_axis_length',
#        'Division: major_axis_length', 'Delta: minor_axis_length',
#        'Birth: minor_axis_length', 'Division: minor_axis_length',
#        'Delta: Volume', 'Birth: Volume', 'Division: Volume',
#        'Delta: Surface Area', 'Birth: Surface Area', 'Division: Surface Area',
#        'Final timepoints', 'Delta time (s)','Mean: area', 'Mean Linear Growth Rate: area',
#        'Mean Exponential Growth Rate: area', 'Mean: major_axis_length',
#        'Mean Linear Growth Rate: major_axis_length',
#        'Mean Exponential Growth Rate: major_axis_length',
#        'Mean: minor_axis_length', 'Mean Linear Growth Rate: minor_axis_length',
#        'Mean Exponential Growth Rate: minor_axis_length', 'Mean: Volume',
#        'Mean Linear Growth Rate: Volume',
#        'Mean Exponential Growth Rate: Volume', 'Mean: Surface Area',
#        'Mean Linear Growth Rate: Surface Area',
#        'Mean Exponential Growth Rate: Surface Area', 'Mean: mCherry Intensity',
#        'Mean Linear Growth Rate: Volume: Probability',
#        'Mean Exponential Growth Rate: Volume: Probability',
#        'Division: major_axis_length: Probability',
#        'Mean: minor_axis_length: Probability',
#        'Mean: mCherry Intensity: Probability', 'Delta time (s): Probability',
#        'Mean Linear Growth Rate: Volume: Yeo-Johnson',
#        'Mean Exponential Growth Rate: Volume: Yeo-Johnson',
#        'Division: major_axis_length: Yeo-Johnson',
#        'Mean: minor_axis_length: Yeo-Johnson',
#        'Mean: mCherry Intensity: Yeo-Johnson', 'Delta time (s): Yeo-Johnson',
#        'Mean Linear Growth Rate: Volume: Yeo-Johnson: z score',
#        'Mean Exponential Growth Rate: Volume: Yeo-Johnson: z score',
#        'Division: major_axis_length: Yeo-Johnson: z score',
#        'Mean: minor_axis_length: Yeo-Johnson: z score',
#        'Mean: mCherry Intensity: Yeo-Johnson: z score',
#        'Delta time (s): Yeo-Johnson: z score','index', 'trenchid']


# Columns dropped before building the sgRNA-level table.
non_group_measurements = [
    "phenotype trenchid",
    "File Parquet Index",
    "fov",
    "row",
    "trench",
    "lane orientation",
    "y (local)",
    "x (local)",
    "File Index",
    "File Trench Index",
    "index",
    "trenchid",
]

# NOTE(review): this uses `sgrnadf` (all ids) rather than `sgrnadf_wellsampled`,
# so the N Observations threshold defined earlier is not applied here — confirm
# that this is intended. The name `trenchiddf_out` is re-used for this
# per-oDEPool7_id pandas table.
trenchiddf_out = sgrnadf.drop(columns=non_group_measurements).compute()
# List of distinct trench ids observed for each oDEPool7_id.
trenchiddf_out["phenotype trenchids"] = (
    trenchiddf.reset_index()
    .set_index("oDEPool7_id")
    .groupby("oDEPool7_id", sort=False)
    .apply(lambda x: x["phenotype trenchid"].unique().tolist())
    .compute()
)

In [None]:
kernel_params = ["Kernel Trace: " + param for param in other_params]
kernel_score_params = [
    "Kernel Trace: " + param + ": Yeo-Johnson: z score" for param in params_to_transform
]

# Stack the z-score kernel traces of each row into one array
# (one entry per scored parameter).
feature_vector_series = trenchiddf_out.apply(
    lambda x: np.array(x[kernel_score_params].tolist()), axis=1
)
trenchiddf_out["Feature Vector"] = feature_vector_series
# Drop rows whose feature vector contains any NaN. Take an explicit copy:
# new columns are assigned to this frame afterwards, and assigning into a
# boolean-mask slice triggers SettingWithCopyWarning.
trenchiddf_nan_filtered = trenchiddf_out[
    ~trenchiddf_out["Feature Vector"].apply(lambda x: np.any(np.isnan(x)))
].copy()

In [None]:
# `sp.integrate` is used below, but only `scipy`, `scipy.stats` and
# `scipy.cluster.hierarchy` are imported at the top of the notebook; import
# the submodule explicitly so the attribute access does not depend on the
# SciPy version or on another package having imported it.
import scipy.integrate

# Rows whose largest integrated |z-score| trace reaches this value count as
# strong effects.
strong_effect_threshold = 15

zero_vector = np.zeros((1, trenchiddf_nan_filtered["Feature Vector"].iloc[0].shape[0]))
feature_arr = np.array(trenchiddf_nan_filtered["Feature Vector"].tolist())
feature_arr_abs = np.abs(feature_arr)
# Simpson integration of each |trace| along the last (grid) axis gives one
# value per scored parameter and row.
trenchiddf_nan_filtered["Integrated Feature Vector"] = list(
    sp.integrate.simpson(feature_arr_abs, axis=2)
)
trenchiddf_nan_filtered["Integrated Feature Max"] = trenchiddf_nan_filtered[
    "Integrated Feature Vector"
].apply(lambda x: np.max(x))
trenchiddf_nan_filtered["Integrated Euclidean Norm"] = np.linalg.norm(
    np.array(trenchiddf_nan_filtered["Integrated Feature Vector"].tolist()), axis=1
)

sgrnadf_strong_effect = trenchiddf_nan_filtered[
    trenchiddf_nan_filtered["Integrated Feature Max"] >= strong_effect_threshold
]
# Histogram range: minimum up to the 99th percentile, so outliers do not
# stretch the axis.
min_v, max_v = np.min(trenchiddf_nan_filtered["Integrated Feature Max"]), np.percentile(
    trenchiddf_nan_filtered["Integrated Feature Max"], 99
)

plt.figure(figsize=(8, 8))
plt.title("Integrated Feature Max")
plt.hist(
    trenchiddf_nan_filtered[
        trenchiddf_nan_filtered["Integrated Feature Max"] < strong_effect_threshold
    ]["Integrated Feature Max"].tolist(),
    bins=50,
    range=(min_v, max_v),
)
plt.hist(
    trenchiddf_nan_filtered[
        trenchiddf_nan_filtered["Integrated Feature Max"] >= strong_effect_threshold
    ]["Integrated Feature Max"].tolist(),
    bins=50,
    range=(min_v, max_v),
)
plt.show()

# Count strong-effect sgRNAs per gene, skipping non-string entries
# (e.g. missing gene names).
unique_genes, gene_counts = np.unique(
    sgrnadf_strong_effect["Gene"][
        sgrnadf_strong_effect["Gene"].apply(lambda x: isinstance(x, str))
    ].tolist(),
    return_counts=True,
)
plt.title("sgRNAs per Gene")
plt.xticks(range(0, 20, 2), labels=range(0, 20, 2))
plt.hist(gene_counts, bins=np.arange(20) - 0.5)
plt.show()

### Pick Representative Effect per TargetID
Note this may need to be revisited later to resolve transients that are only resolvable at intermediate KO

1) For each target, pick the sgRNA that has the strongest phenotype (highest integrated euclidean norm)
2) Additionally identify any targets with titration information by saving a dataframe with targetIDs that possess at least N sgRNAs
    - this is in a preliminary form; transfer to a full notebook later

In [None]:
# Save the NaN-filtered table, before the strong-effect threshold is applied.
trenchiddf_nan_filtered.to_pickle(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/2022-02-10_gene_cluster_df_no_filter.pkl"
)

In [None]:
# Save the strong-effect subset.
sgrnadf_strong_effect.to_pickle(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/2022-02-10_gene_cluster_df.pkl"
)

In [None]:
# For each TargetID keep the row with the largest "Integrated Euclidean Norm",
# then index the result by sgRNA.
most_rep_example_series = (
    sgrnadf_strong_effect.reset_index(drop=False)
    .groupby("TargetID")
    .apply(lambda x: x.iloc[np.argmax(x["Integrated Euclidean Norm"])])
    .reset_index(drop=True)
    .set_index("sgRNA", drop=True)
)

## THIS IS FOR A LOG TRANSFORMATION, try to do this earlier when it makes more sense....
# normalized_timeseries = np.swapaxes(normalize_timeseries(most_rep_example_series["Feature Vector"], lmbda=0.5),1,2)
# most_rep_example_series["Normalized Feature Vector"] = [normalized_timeseries[i] for i in range(normalized_timeseries.shape[0])]

In [None]:
# The cells below work on in-memory pandas/NumPy objects, so the Dask cluster
# is shut down here.
dask_controller.shutdown()

### Effect Distance Metrics

Now, I want to evaluate the performance of different distance metrics on the data with respect to separating it maximally while also preserving similarity within replicates

- DTW (can be done with cosine similarity) 
- cosine similarity (same as pearson for z-scores)
- cross correlation

Seems like soft-DTW is a pretty good option. Going forward with that for now.

<!-- In the end cosine similarity was chosen as it produced superior silhouette scores for sets of targets from genes with different phenotypes. -->

In [None]:
# Representative sgRNAs from a handful of genes, used as a labelled test set
# for comparing distance metrics.
# NOTE(review): "tff" is one of the listed genes — confirm the spelling matches
# the "Gene" column.
sgrnadf_examples_for_distance_metric = most_rep_example_series[
    most_rep_example_series["Gene"].isin(["ftsN", "rplA", "mreB", "tufB", "tff"])
]

In [None]:
import tslearn
from tslearn.clustering import TimeSeriesKMeans
from tslearn.metrics import (
    cdist_dtw,
    cdist_soft_dtw,
    cdist_soft_dtw_normalized,
    dtw,
    dtw_path_from_metric,
)

In [None]:
# Stack the per-sgRNA feature vectors into a single array and swap axes 1 and 2
# to obtain the layout handed to the tslearn routines below.
_stacked_feature_vectors = np.array(
    sgrnadf_examples_for_distance_metric["Feature Vector"].tolist()
)
timeseries_arr = _stacked_feature_vectors.swapaxes(1, 2)

In [None]:
# Compare candidate distance metrics by the silhouette score each one yields
# when the example sgRNAs are labelled by their target gene.
_example_gene_labels = sgrnadf_examples_for_distance_metric["Gene"].tolist()


def _sakoe_chiba_dtw_matrix(series_arr, metric, radius=3):
    """Return the symmetric matrix of pairwise DTW costs between series.

    Each pair of entries along the first axis of ``series_arr`` is aligned with
    ``dtw_path_from_metric`` under a Sakoe-Chiba band of ``radius``, using
    ``metric`` as the local cost. The diagonal is left at zero.
    """
    n_series = series_arr.shape[0]
    costs = np.zeros((n_series, n_series))
    # Only the strict upper triangle is computed; the result is mirrored.
    for row, col in zip(*np.triu_indices(n_series, k=1)):
        cost = dtw_path_from_metric(
            series_arr[row],
            series_arr[col],
            metric=metric,
            global_constraint="sakoe_chiba",
            sakoe_chiba_radius=radius,
        )[1]
        costs[row, col] = cost
        costs[col, row] = cost
    return costs


# Soft-DTW across a sweep of smoothing parameters.
for gamma in [0.0, 0.01, 0.1, 1.0, 10.0, 100.0, 1000.0]:
    _softdtw_score = tslearn.clustering.silhouette_score(
        timeseries_arr,
        _example_gene_labels,
        metric="softdtw",
        gamma=gamma,
    )
    print(f"Soft-DTW Gamma={gamma}: {_softdtw_score}")

# Band-constrained DTW with two different local costs, scored from the
# precomputed distance matrix. `dist_mat` ends up holding the Euclidean one.
for _metric_title, _local_metric in (("Cosine", "cosine"), ("Euclidean", "euclidean")):
    dist_mat = _sakoe_chiba_dtw_matrix(timeseries_arr, _local_metric)
    _dtw_score = tslearn.clustering.silhouette_score(
        dist_mat,
        _example_gene_labels,
        metric="precomputed",
    )
    print(f"{_metric_title}-DTW: {_dtw_score}")

In [None]:
# All-pairs soft-DTW matrix for the example time series (default gamma).
soft_dtw_dist_arr = cdist_soft_dtw(timeseries_arr)

In [None]:
# Histogram of every entry of the soft-DTW matrix (the full matrix is used, so
# each pair appears twice and the diagonal is included).
_all_soft_dtw_values = soft_dtw_dist_arr.ravel()
plt.hist(_all_soft_dtw_values, bins=100)
plt.show()

### Detecting different effects against single genes

1) Plot a histogram of minimum soft-DTW similarity within groups of TargetIDs against the same genes (for genes with more than one targetID)
2) Use affinity propagation to select the number of phenotype clusters to use per gene (preference still needs to be dialed in, not sure how to optimize on this)
3) Among each cluster, represent the final effect as the strongest effect (integrated euc norm) of the members of the cluster

~~3) Among each cluster, represent the final effect as the median of the members of the cluster~~


In [None]:
def get_normed_softdtw(feature_vector_series):
    """Return the pairwise normalized soft-DTW matrix, scaled by series size.

    Args:
        feature_vector_series: pandas Series whose elements are equally shaped
            2-D arrays, one per row. The arrays are stacked and their two axes
            are swapped before being passed to ``cdist_soft_dtw_normalized``.

    Returns:
        Square ndarray with one row/column per element of the input, holding
        the normalized soft-DTW value of each pair divided by the number of
        entries in a single feature vector.
    """
    stacked = np.array(feature_vector_series.tolist())
    dist_mat = cdist_soft_dtw_normalized(np.swapaxes(stacked, 1, 2))
    # Take the per-series size from the stacked array rather than from
    # `feature_vector_series[0]`: integer lookup on a Series is label-based (or
    # a deprecated positional fallback) and breaks for indexes without label 0.
    timeseries_len = stacked.shape[1] * stacked.shape[2]
    return dist_mat / timeseries_len


def get_upper_right_vals(a):
    """Return ``a`` with everything outside the strict upper triangle set to NaN.

    Args:
        a: 2-D array-like (e.g. a symmetric distance matrix).

    Returns:
        A new floating-point array of the same shape. Entries strictly above
        the main diagonal are copied from ``a``; the diagonal and the lower
        triangle are NaN. The input is not modified.
    """
    a = np.asarray(a)
    # Select by position instead of "np.triu then replace zeros": a genuine
    # zero above the diagonal must stay 0 rather than be mistaken for padding.
    strict_upper = np.triu(np.ones(a.shape, dtype=bool), k=1)
    # np.where promotes integer input to float so NaN can be stored.
    return np.where(strict_upper, a, np.nan)


def get_sgRNA_clusters(df, preference=0.6):
    """Assign every sgRNA a cluster id within its target gene.

    For each gene with more than one row, the normalized soft-DTW matrix of the
    rows' feature vectors is negated (turning distances into similarities) and
    clustered with affinity propagation. Genes with a single row get cluster 0.

    Args:
        df: DataFrame with "Gene", "Feature Vector" and "TargetID" columns and
            an "sgRNA" column available after ``reset_index`` (i.e. as the
            index or as a regular column).
        preference: ``preference`` argument forwarded to
            ``AffinityPropagation``.

    Returns:
        DataFrame indexed by "Gene" (sorted) with columns "sgRNA",
        "Feature Vector", "TargetID" and an integer "sgRNA Cluster".
    """
    gene_indexed_df = (
        df.reset_index(drop=False)
        .set_index("Gene")[["sgRNA", "Feature Vector", "TargetID"]]
        .sort_index()
    )
    # A scalar broadcasts to every row; single-sgRNA genes simply keep this 0.
    gene_indexed_df["sgRNA Cluster"] = 0

    # Visit each gene once: iterating the raw index would re-run the
    # clustering for every duplicate row of the same gene.
    for gene in gene_indexed_df.index.unique():
        gene_feature_vectors = gene_indexed_df.loc[[gene], "Feature Vector"]
        if len(gene_feature_vectors) < 2:
            continue
        # Hand over a 0..n-1 indexed Series so any integer lookup inside the
        # distance helper is unambiguous (the Gene index repeats one label).
        soft_dtw_dist = get_normed_softdtw(gene_feature_vectors.reset_index(drop=True))
        af_labels = (
            AffinityPropagation(
                affinity="precomputed", preference=preference, random_state=42
            )
            .fit_predict(-soft_dtw_dist)
            .astype(int)
        )
        gene_indexed_df.loc[gene, "sgRNA Cluster"] = af_labels

    gene_indexed_df["sgRNA Cluster"] = gene_indexed_df["sgRNA Cluster"].astype(int)
    return gene_indexed_df

In [None]:
# A gene needs at least this many sgRNAs to contribute within-gene distances.
n_sgrna_replicate_thr = 2
# Multiplier applied to the median within-gene similarity to obtain the
# affinity-propagation preference.
pref_factor = 3.0

gene_list, counts_list = np.unique(most_rep_example_series["Gene"], return_counts=True)
genes_with_many_replicate_sgRNAs = gene_list[counts_list >= n_sgrna_replicate_thr]
sgrnadf_many_copies_per_gene = most_rep_example_series[
    most_rep_example_series["Gene"].isin(genes_with_many_replicate_sgRNAs)
]

# Pairwise within-gene distances (flattened, NaN outside the strict upper
# triangle). Computed once per gene and reused for both the per-gene maximum
# and the pooled median, instead of running soft-DTW twice per gene.
pairwise_dist_per_gene = sgrnadf_many_copies_per_gene.groupby("Gene").apply(
    lambda x: get_upper_right_vals(get_normed_softdtw(x["Feature Vector"])).flatten()
)

max_distance_within_gene = pd.Series(
    [np.nanmax(gene_dists) for gene_dists in pairwise_dist_per_gene],
    index=pairwise_dist_per_gene.index,
    dtype=float,
)
plt.title("Maximum soft-DTW Distance per Gene")
plt.hist(max_distance_within_gene, bins=50)
plt.show()

# Pool every within-gene distance; similarity is the negated distance.
dist_within_gene = [
    val for gene_dists in pairwise_dist_per_gene.tolist() for val in gene_dists
]
median_similarity = -np.nanmedian(dist_within_gene)

gene_df = get_sgRNA_clusters(
    most_rep_example_series, preference=pref_factor * median_similarity
)

# Bring the cluster ids back onto the sgRNA-indexed frame (aligned on sgRNA)
# and build a "<Gene>-<cluster>" label for grouping.
most_rep_example_series["sgRNA Cluster"] = gene_df.set_index("sgRNA")["sgRNA Cluster"]
most_rep_example_series["sgRNA Cluster Label"] = most_rep_example_series.apply(
    lambda x: str(x["Gene"]) + "-" + str(x["sgRNA Cluster"]), axis=1
)

gene_cluster_df = most_rep_example_series[
    ["sgRNA Cluster Label", "Feature Vector", "Gene", "Integrated Euclidean Norm"]
    + kernel_params
].reset_index(drop=True)
gene_cluster_groupby = gene_cluster_df.groupby("sgRNA Cluster Label")

# Represent each cluster by the feature vector of its member with the largest
# integrated Euclidean norm (the median-based alternative was dropped).
feature_series = (
    gene_cluster_groupby.apply(
        lambda x: x.iloc[np.argmax(x["Integrated Euclidean Norm"])]["Feature Vector"]
    )
    .to_frame()
    .rename(columns={0: "Feature Vector"})
)

# Gene and kernel parameters are taken from the first row of each cluster.
gene_cluster_df = gene_cluster_groupby.apply(
    lambda x: x.iloc[0][["Gene"] + kernel_params]
)
gene_cluster_df = gene_cluster_df.join(feature_series)

In [None]:
# Save the per-cluster dataframe (`gene_cluster_df`) to disk as a pickle.
# NOTE(review): this writes into the "2021-06-14_lDE20_biofloat_fullrun_1"
# folder, whereas the other pickles in this notebook go to
# "2022-01-20_lDE20_Final_6" — confirm this is not a stale path that would
# overwrite results from an earlier experiment.
gene_cluster_df.to_pickle(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/2021-08-16_gene_cluster_df.pkl"
)