In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da

from matplotlib.ticker import FuncFormatter

import scipy.stats
from sklearn.linear_model import LinearRegression

from matplotlib import pyplot as plt
import holoviews as hv

# Activate the bokeh backend for HoloViews.
hv.extension("bokeh")

In [None]:
# Path to this experiment's `barcodes` folder.
# NOTE(review): not referenced by any cell visible in this section — confirm it is still needed.
headpath = "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/barcodes"

In [None]:
# Cluster settings collected in one place so they are easy to tune, then
# handed to the trenchripper dask controller and the cluster is started.
dask_cluster_settings = dict(
    walltime="04:00:00",
    local=False,
    n_workers=50,
    death_timeout=5.0,
    memory="8GB",
    working_directory="/home/de64/scratch/de64/temp/dask",
)
dask_controller = tr.trcluster.dask_controller(**dask_cluster_settings)
dask_controller.startdask()

In [None]:
# Show the dashboard of the cluster started above so task progress can be monitored.
dask_controller.displaydashboard()

In [None]:
# Left disabled on purpose so Run All does not tear the cluster down;
# uncomment and run manually when the analysis is finished.
# dask_controller.shutdown()

#### Import Dataframe

In [None]:
# Lazily open the analysis parquet dataset.
analysis_parquet_path = (
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/2021-11-17_lDE15_Analysis"
)
final_output_df = dd.read_parquet(analysis_parquet_path, engine="pyarrow")
# final_output_df = final_output_df.dropna(subset=["final timepoints",'Mean Exponential Growth Rate: area','Birth: minor_axis_length',"Birth: Surface Area"])

# Move the current index back into a column, re-index on the trench id, and
# keep the result on the cluster so the cells below do not recompute it.
final_output_df = final_output_df.reset_index()
final_output_df = final_output_df.set_index("phenotype trenchid", sorted=False)
final_output_df = final_output_df.persist()


#### Filter for "Normal" mCherry Intensity and Area

**TODO:** Go back later to make sure I have the area quantification.

1) Fit a Gaussian model to each of the specified feature params during the first t timepoints of the experiment (using a subsample for speed)
2) Compute a normalized probability trenchwise for these features under the Gaussian model, during the first t timepoints of the experiment
3) Eliminate trenches that are under some p percentile value of this probability for each feature
4) Display histograms for each property as well as the resulting threshold

In [None]:
# Fraction of rows drawn with `.sample(frac=...)` when fitting the Gaussian to each feature.
gaussian_subsample = 0.2
# Percentile of the per-trench probability used as the cut-off when filtering trenches.
percentile_threshold = 10

# Columns of `final_output_df` that the Gaussian filter is applied to.
filter_params = ["RFP-Penta mean_intensity_wo_bkd"]

In [None]:
# Score every trench by how typical its values are under one Gaussian fitted
# to a random subsample of all observations of each filter feature.
#
# The subsample is drawn with a fixed seed: without it the fitted Gaussian,
# the percentile cut-off derived from it and the final set of trenches kept
# would differ on every Restart & Run All.
gaussian_subsample_seed = 0

for filter_param in filter_params:
    param_series = final_output_df[filter_param]
    all_param_values = (
        param_series.sample(
            frac=gaussian_subsample, random_state=gaussian_subsample_seed
        )
        .compute()
        .tolist()
    )

    # Maximum-likelihood (loc, scale), frozen into a distribution object.
    fit_loc, fit_scale = sp.stats.norm.fit(all_param_values)
    gaussian_fit = sp.stats.norm(loc=fit_loc, scale=fit_scale)

    # Per-trench score = exp(mean log-pdf), i.e. the geometric mean of the
    # fitted density over all rows belonging to that trench.
    trench_probability = param_series.groupby("phenotype trenchid").apply(
        lambda x: np.exp(np.sum(gaussian_fit.logpdf(x)) / len(x)), meta=float
    )

    # Attach the score as a new column, aligned on the trench-id index.
    final_output_df[filter_param + ": Probability"] = trench_probability.persist()

# Collapse to a single row per trench (the first row of each group) and pull
# the result into local memory as a pandas object.
rows_by_trench = final_output_df.groupby("phenotype trenchid")
final_output_df_onetrench = rows_by_trench.apply(
    lambda trench_rows: trench_rows.iloc[0]
).compute()

# For every filter feature: derive the probability cut-off at
# `percentile_threshold`, record the matching pandas query, and draw a
# two-colour histogram of rejected vs. kept trenches.
#
# `values_names` supplies the subplot titles and is indexed in parallel with
# `filter_params`, so the two lists must stay the same length.
values_names = ["RFP-Penta mean_intensity_wo_bkd"]
plt.figure(figsize=(22, 16))
query_list = []
for i, filter_param in enumerate(filter_params):
    prob_column = filter_param + ": Probability"
    trench_probs = final_output_df_onetrench[prob_column]

    # NaN probabilities are ignored when locating the percentile.
    prob_threshold = np.nanpercentile(trench_probs.tolist(), percentile_threshold)

    # Trenches are kept only when strictly above the cut-off.
    query = "`" + prob_column + "` > " + str(prob_threshold)
    query_list.append(query)

    min_v, max_v = np.min(trench_probs), np.max(trench_probs)

    plt.subplot(2, 3, i + 1)
    plt.title(values_names[i], fontsize=22)
    plt.xlabel("Unnormalized Likelihood", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)

    # The two histograms use the same boundary as the query above
    # (rejected: <= cut-off, kept: > cut-off) so the figure shows exactly
    # what the filter removes, including values equal to the cut-off.
    plt.hist(
        trench_probs[trench_probs <= prob_threshold].tolist(),
        bins=50,
        range=(min_v, max_v),
    )
    plt.hist(
        trench_probs[trench_probs > prob_threshold].tolist(),
        bins=50,
        range=(min_v, max_v),
    )
plt.savefig("Prob_threshold.png", dpi=500)

# A trench must pass the cut-off of every filter feature to be kept.
compiled_query = " and ".join(query_list)
final_output_df_onetrench_filtered = final_output_df_onetrench.query(compiled_query)

# Select all rows of the surviving trenches from the full table and keep the
# result on the cluster.
kept_trenchids = final_output_df_onetrench_filtered.index.tolist()
final_output_df_filtered = final_output_df.loc[kept_trenchids].persist()

In [None]:
# Re-index the filtered table on "File Parquet Index", rebalance it into 500
# partitions, and write it out, replacing any existing dataset at that path.
filtered_parquet_path = "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/2021-11-17_lDE15_Analysis_Filtered"

final_output_df_filtered = (
    final_output_df_filtered.reset_index()
    .set_index("File Parquet Index", sorted=False)
    .repartition(npartitions=500)
)
final_output_df_filtered.to_parquet(
    filtered_parquet_path, engine="pyarrow", overwrite=True
)