## Mapping Barcodes and Cleaning Data

In [None]:
import dask.array as da
import dask.dataframe as dd
import holoviews as hv
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import seaborn as sns
import sklearn as skl
from matplotlib import pyplot as plt
from matplotlib.ticker import FuncFormatter
from sklearn.linear_model import LinearRegression

import paulssonlab.deaton.trenchripper.trenchripper as tr

# Load the Bokeh plotting backend for HoloViews.
hv.extension("bokeh")

In [None]:
# Path to this experiment's "barcodes" folder.
headpath = "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/barcodes"

In [None]:
# Configure the trenchripper dask controller with the settings below and
# start it.
# NOTE(review): local=False presumably requests a job-queue (non-local)
# cluster -- confirm against tr.trcluster.dask_controller.
dask_controller = tr.trcluster.dask_controller(
    walltime="04:00:00",
    local=False,
    n_workers=100,
    death_timeout=5.0,
    memory="8GB",
    working_directory="/home/de64/scratch/de64/temp/dask",
)
dask_controller.startdask()

In [None]:
# Show the dashboard of the dask cluster started above.
dask_controller.displaydashboard()

In [None]:
# Shut the dask cluster down. This cell sits before the analysis cells, so
# run it manually only once the work below is finished.
dask_controller.shutdown()

#### Import Barcode Dataframe

In [None]:
# Read the "barcodes" table (with its attached metadata) from the experiment's
# metadata HDF5 file into pandas.
meta_handle = tr.pandas_hdf5_handler(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/barcodes/metadata.hdf5"
)
pandas_barcode_df = meta_handle.read_df("barcodes", read_metadata=True)
# Wrap it as a sorted, 1000-partition dask dataframe and keep it persisted.
barcode_df = dd.from_pandas(pandas_barcode_df, npartitions=1000, sort=True)
barcode_df = barcode_df.persist()

In [None]:
# Number of rows in the barcode table (one per called barcode, presumably).
ttl_called = len(barcode_df.index)
# Totals stored in the metadata attached to the barcode table.
ttl_trenches = pandas_barcode_df.metadata["Total Trenches"]
ttl_trenches_w_cells = pandas_barcode_df.metadata["Total Trenches With Cells"]
# Despite the names, these are fractions (called / total), not percentages.
percent_called = ttl_called / ttl_trenches
percent_called_w_cells = ttl_called / ttl_trenches_w_cells

In [None]:
# Report the call statistics, one value per line, in this order: called rows,
# total trenches, trenches with cells, fraction called, fraction called among
# trenches with cells.
print(
    ttl_called,
    ttl_trenches,
    ttl_trenches_w_cells,
    percent_called,
    percent_called_w_cells,
    sep="\n",
)

### Import Analysis 

In [None]:
# Lazily open the GFP analysis parquet dataset as a dask dataframe.
analysis_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/GFP/analysis"
)
# trenchid of the last row. The scalar is pulled out with .iloc[0] because
# calling int() directly on a one-element Series is deprecated in pandas.
last_trenchid = int(analysis_df.tail(1)["trenchid"].iloc[0])

In [None]:
import skimage as sk


def hrm_find_mode(series, max_iter=1000, min_binsize=50):
    """Estimate the mode of ``series`` with an iterative half-range search.

    Each iteration splits the current values at the midpoint of their range
    and keeps the half that holds more values (a tie keeps the lower half).
    From the second iteration onwards the search stops as soon as fewer than
    ``min_binsize`` values remain or the midpoint no longer changes, and the
    mean of the remaining values is returned.

    Args:
        series: 1-D numeric values that support boolean-mask indexing
            (for example a pandas Series or a NumPy array).
        max_iter: Maximum number of halving iterations.
        min_binsize: Stop once fewer than this many values remain.

    Returns:
        The mean of the values remaining when the search stops. If
        ``max_iter`` is exhausted before a stop condition is met, the mean of
        the values remaining at that point is returned (previously the
        function fell off the end and returned ``None``).
    """
    working_series = series
    last_midpoint = None
    for i in range(max_iter):
        range_max, range_min = np.max(working_series), np.min(working_series)
        midpoint = (range_max + range_min) / 2
        above_middle = working_series[working_series > midpoint]
        below_middle = working_series[working_series <= midpoint]

        # Keep the more populated half; ties go to the lower half.
        if len(above_middle) > len(below_middle):
            working_series = above_middle
        else:
            working_series = below_middle

        # The stop conditions are only evaluated from the second pass on, so
        # at least one halving always happens.
        if i > 0 and (
            len(working_series) < min_binsize or last_midpoint == midpoint
        ):
            return np.mean(working_series)

        last_midpoint = midpoint

    # Iteration budget exhausted: return the best estimate instead of None.
    return np.mean(working_series)


def bootstrap_hrm(series, n_bootstraps=100, max_n_per_bootstrap=100):
    """Average the half-range mode over repeated random subsamples.

    Args:
        series: Values to subsample; must provide ``.sample(n=...)``.
        n_bootstraps: Number of subsamples to draw.
        max_n_per_bootstrap: Upper bound on the size of each subsample; the
            whole series is used when it is shorter than this.

    Returns:
        The mean of the ``hrm_find_mode`` estimates over all subsamples.
    """
    sample_size = min(len(series), max_n_per_bootstrap)
    mode_estimates = [
        hrm_find_mode(series.sample(n=sample_size)) for _ in range(n_bootstraps)
    ]
    return np.mean(mode_estimates)


def get_GFPpos_modes(
    GFP_series, series_groupby, frac=0.01, n_bootstraps=100, max_n_per_bootstrap=100
):
    """Compute per-group bootstrapped modes using only above-threshold values.

    A triangle threshold is computed from a random fraction of ``GFP_series``;
    within each group of ``series_groupby`` only values above that threshold
    are passed to ``bootstrap_hrm``.

    Args:
        GFP_series: Dask series sampled (``frac``) to compute the threshold.
        series_groupby: Dask series-groupby whose groups are reduced.
        frac: Fraction of ``GFP_series`` sampled for the threshold.
        n_bootstraps: Forwarded to ``bootstrap_hrm``.
        max_n_per_bootstrap: Forwarded to ``bootstrap_hrm``.

    Returns:
        The computed per-group modes, sorted by group key.
    """
    sampled_gfp = GFP_series.sample(frac=frac).compute()
    gfp_threshold = sk.filters.threshold_triangle(sampled_gfp)

    def _mode_above_threshold(group):
        # Restrict the group to values above the triangle threshold.
        return bootstrap_hrm(
            group[group > gfp_threshold],
            n_bootstraps=n_bootstraps,
            max_n_per_bootstrap=max_n_per_bootstrap,
        )

    per_group_modes = series_groupby.apply(_mode_above_threshold)
    return per_group_modes.compute().sort_index()

### Variables over FOV

In [None]:
# Display the (lazy) analysis dataframe for inspection.
analysis_df

In [None]:
# Drop the Objectid == 0 rows; local_background_subtract (below) reads those
# rows as the background measurement.
analysis_df_nobkd = analysis_df[analysis_df["Objectid"] != 0]

In [None]:
# Intensity columns that receive the FOV and time corrections below.
values_to_rescale = ["RFP-Penta mean_intensity", "GFP-Penta mean_intensity"]

In [None]:
# Estimate a per-FOV scaling for each intensity column (per-FOV mode divided
# by the largest per-FOV mode), plot it, and store the rescaled values in
# "<label>: FOV Corrected" columns of analysis_df.
fig = plt.figure(figsize=(30, 20))
# NOTE(review): titles say "Median" but the statistic plotted is the
# bootstrapped half-range mode -- confirm the intended wording.
values_names = ["Median mCherry Intensity", "Median GFPmut2 Intensity"]

for i, label in enumerate(values_to_rescale):
    fov_series_groupby = analysis_df_nobkd.groupby("fov")[label]
    if label == "RFP-Penta mean_intensity":
        # All objects contribute to the mCherry mode.
        fov_mode_series = (
            fov_series_groupby.apply(
                lambda x: bootstrap_hrm(x, max_n_per_bootstrap=100)
            )
            .compute()
            .sort_index()
        )
    elif label == "GFP-Penta mean_intensity":
        # Only values above a triangle threshold of the GFP signal contribute.
        fov_mode_series = get_GFPpos_modes(
            analysis_df["GFP-Penta mean_intensity"],
            fov_series_groupby,
            max_n_per_bootstrap=100,
        )
    else:
        # Fail loudly: falling through would silently reuse the previous
        # label's modes (or hit a NameError on the first iteration).
        raise ValueError(f"Unexpected label for FOV correction: {label!r}")

    # Normalise so the brightest FOV has scaling 1.0.
    fov_correction_series = fov_mode_series / np.max(fov_mode_series)
    fov_correction_dict = fov_correction_series.to_dict()
    plt.subplot(2, 3, i + 1)
    plt.plot(fov_correction_series)
    plt.title(values_names[i], fontsize=22)
    plt.xlabel("FOV #", fontsize=18)
    plt.ylabel("Scaling", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(0.0, 1.0)
    # Per-row scaling looked up from the row's FOV, then divided out.
    label_scaling = analysis_df["fov"].apply(lambda x: fov_correction_dict[x]).persist()
    analysis_df[label + ": FOV Corrected"] = (
        analysis_df[label] / label_scaling
    ).persist()
plt.savefig("FOV_correction.png", dpi=500)

### Variables over time

In [None]:
# Rebuild the background-free view so it includes the "FOV Corrected" columns
# added to analysis_df above.
analysis_df_nobkd = analysis_df[analysis_df["Objectid"] != 0]
# Column names produced by the FOV-correction step; inputs to the time step.
values_to_rescale_step_2 = [value + ": FOV Corrected" for value in values_to_rescale]

In [None]:
# add real time later when fixed

In [None]:
# Estimate a per-timepoint scaling for each FOV-corrected intensity column
# (per-timepoint mode divided by the largest per-timepoint mode), plot it, and
# store the rescaled values in "<label>: Time Corrected" columns.
fig = plt.figure(figsize=(30, 20))
# NOTE(review): titles say "Median" but the statistic plotted is the
# bootstrapped half-range mode -- confirm the intended wording.
values_names = ["Median mCherry Intensity", "Median GFPmut2 Intensity"]

for i, label in enumerate(values_to_rescale_step_2):
    time_series_groupby = analysis_df_nobkd.groupby("timepoints")[label]
    if label == "RFP-Penta mean_intensity: FOV Corrected":
        # All objects contribute to the mCherry mode.
        time_mode_series = (
            time_series_groupby.apply(
                lambda x: bootstrap_hrm(x, max_n_per_bootstrap=100)
            )
            .compute()
            .sort_index()
        )
    elif label == "GFP-Penta mean_intensity: FOV Corrected":
        # NOTE(review): the triangle threshold is computed from the
        # uncorrected GFP column while the grouped values are FOV-corrected --
        # confirm this mismatch is intended.
        time_mode_series = get_GFPpos_modes(
            analysis_df["GFP-Penta mean_intensity"],
            time_series_groupby,
            max_n_per_bootstrap=100,
        )
    else:
        # Fail loudly: falling through would silently reuse the previous
        # label's modes (or hit a NameError on the first iteration).
        raise ValueError(f"Unexpected label for time correction: {label!r}")

    # Normalise so the brightest timepoint has scaling 1.0.
    time_correction_series = time_mode_series / np.max(time_mode_series)
    time_correction_dict = time_correction_series.to_dict()
    plt.subplot(2, 3, i + 1)
    plt.plot(time_correction_series)
    plt.title(values_names[i], fontsize=22)
    plt.xlabel("Timepoint (3 min steps)", fontsize=18)
    plt.ylabel("Scaling", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(0.0, 1.2)
    # Per-row scaling looked up from the row's timepoint, then divided out.
    label_scaling = analysis_df["timepoints"].apply(lambda x: time_correction_dict[x])
    analysis_df[label + ": Time Corrected"] = (
        analysis_df[label] / label_scaling
    ).persist()
plt.savefig("Time_correction.png", dpi=500)

### Overwrite Variables with Correction

In [None]:
# Replace each raw intensity column with its FOV- and time-corrected version.
for label in values_to_rescale:
    analysis_df[label] = analysis_df[label + ": FOV Corrected: Time Corrected"]

In [None]:
# Keep only the columns listed here plus the (now corrected) intensity
# columns; this drops the intermediate "... Corrected" columns.
analysis_df = analysis_df[
    [
        "File Index",
        "File Trench Index",
        "timepoints",
        "Objectid",
        "centroid_y",
        "centroid_x",
        "area",
        "fov",
        "row",
        "trench",
        "time (s)",
        "lane orientation",
        "y (local)",
        "x (local)",
        "y (global)",
        "x (global)",
        "trenchid",
        "Trenchid Timepoint Index",
    ]
    + values_to_rescale
].persist()

In [None]:
# Display the trimmed analysis dataframe for inspection.
analysis_df

### GFP Quantification Function

In [None]:
def local_background_subtract(series, intensity_key):
    """Subtract the background row's intensity from every row of a group.

    Args:
        series: DataFrame for one group; must contain an ``"Objectid"`` column
            and the ``intensity_key`` column, with at least one row whose
            ``Objectid`` is 0 (the first such row is used as background).
        intensity_key: Name of the intensity column to correct.

    Returns:
        dict mapping each row's index label to its background-subtracted
        intensity (the background row itself maps to 0).
    """
    raw_values = series[intensity_key]
    is_background = series["Objectid"] == 0
    background_level = series.loc[is_background, intensity_key].iloc[0]
    return (raw_values - background_level).to_dict()

In [None]:
# Group rows by "Trenchid Timepoint Index" (presumably one group per trench
# per timepoint) so each group's Objectid == 0 row can serve as its background.
analysis_df_trenchtimepoint_groupby = analysis_df.groupby("Trenchid Timepoint Index")

# GFP: one {row index: background-subtracted value} dict per group, collected
# into a list.
# NOTE(review): meta declares float, but local_background_subtract returns a
# dict per group -- confirm dask tolerates this mismatch.
gfp_intensity_wo_bkd = (
    analysis_df_trenchtimepoint_groupby.apply(
        lambda x: local_background_subtract(x, "GFP-Penta mean_intensity"),
        meta=("GFP-Penta mean_intensity", float),
    )
    .compute()
    .reset_index(drop=True)
    .to_list()
)
# Flatten the per-group dicts into one dict, then into a single-column
# DataFrame indexed by the original row index.
gfp_intensity_wo_bkd = {k: v for d in gfp_intensity_wo_bkd for k, v in d.items()}
gfp_intensity_wo_bkd = pd.DataFrame.from_dict(
    gfp_intensity_wo_bkd, orient="index", columns=["GFP-Penta mean_intensity_wo_bkd"]
)

# mCherry (RFP channel): same procedure as for GFP above.
mchy_intensity_wo_bkd = (
    analysis_df_trenchtimepoint_groupby.apply(
        lambda x: local_background_subtract(x, "RFP-Penta mean_intensity"),
        meta=("RFP-Penta mean_intensity", float),
    )
    .compute()
    .reset_index(drop=True)
    .to_list()
)
mchy_intensity_wo_bkd = {k: v for d in mchy_intensity_wo_bkd for k, v in d.items()}
mchy_intensity_wo_bkd = pd.DataFrame.from_dict(
    mchy_intensity_wo_bkd, orient="index", columns=["RFP-Penta mean_intensity_wo_bkd"]
)

In [None]:
# Attach the background-subtracted intensity columns by index.
analysis_df = analysis_df.join(mchy_intensity_wo_bkd)
analysis_df = analysis_df.join(gfp_intensity_wo_bkd)
# Drop the background rows (Objectid == 0) now that they have been used.
analysis_df_nobkd = analysis_df[analysis_df["Objectid"] != 0].persist()

# Per-object ratio of background-subtracted GFP to background-subtracted
# mCherry. No guard against a zero denominator is applied here.
ratio_series = (
    analysis_df_nobkd["GFP-Penta mean_intensity_wo_bkd"]
    / analysis_df_nobkd["RFP-Penta mean_intensity_wo_bkd"]
)
analysis_df_nobkd["gfp/mchy Ratio"] = ratio_series

#### Get Trench Mapping

In [None]:
# Kymograph metadata locations for the phenotype (GFP) and barcode acquisitions.
phenotype_kymopath = "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/GFP/kymograph/metadata"
barcode_kymopath = "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/barcodes/kymograph/metadata"

# Build the trenchid mapping between the two acquisitions.
# get_barcode_pheno_df (below) indexes it by barcode trenchid and treats the
# values as phenotype trenchids.
trenchid_map = tr.files_to_trenchid_map(phenotype_kymopath, barcode_kymopath)

#### Get GFP Call Error and Recovery Rate

In [None]:
def get_barcode_pheno_df(phenotype_df, barcode_df, trenchid_map):
    """Join phenotype rows with barcode calls for trenches present in both.

    Args:
        phenotype_df: Dask dataframe with a ``"trenchid"`` column. After
            ``reset_index`` it must expose a ``"File Parquet Index"`` column
            (i.e. that is presumably the name of its index).
        barcode_df: Dask dataframe of barcode calls with a ``"trenchid"``
            column. The merged result must contain a ``"Barcode Signal"``
            column, which is dropped (presumably supplied by this frame).
        trenchid_map: dict keyed by barcode trenchid whose values are compared
            against the phenotype trenchids.

    Returns:
        Persisted dask dataframe indexed by ``"File Parquet Index"`` holding
        the inner merge of phenotype rows and barcode rows on the phenotype
        trenchid.
    """
    # phenotype_df must contain a trenchid column and a File Parquet Index

    # Trenchids that actually occur in the phenotype data.
    phenotype_df_idx = phenotype_df["trenchid"].unique().compute().tolist()
    # Barcode rows whose trenchid has an entry in the mapping (as pandas).
    valid_barcode_df = barcode_df[
        barcode_df["trenchid"].isin(trenchid_map.keys())
    ].compute()
    # Translate barcode trenchids into phenotype trenchids.
    barcode_df_mapped_trenchids = valid_barcode_df["trenchid"].apply(
        lambda x: trenchid_map[x]
    )

    # Keep only barcode rows whose mapped trenchid exists in the phenotype data.
    valid_init_df_indices = barcode_df_mapped_trenchids.isin(phenotype_df_idx)
    barcode_df_mapped_trenchids = barcode_df_mapped_trenchids[valid_init_df_indices]
    final_valid_barcode_df_indices = barcode_df_mapped_trenchids.index.to_list()

    # Select those rows from the dask barcode frame and re-index them by the
    # phenotype trenchid they map to.
    called_df = barcode_df.loc[final_valid_barcode_df_indices]
    called_df["phenotype trenchid"] = barcode_df_mapped_trenchids
    called_df = (
        called_df.reset_index()
        .set_index("phenotype trenchid", drop=True, sorted=False)
        .persist()
    )

    # Index the phenotype rows the same way so both sides merge on their index.
    output_df = phenotype_df.rename(columns={"trenchid": "phenotype trenchid"})
    output_df = output_df.reset_index().set_index(
        "phenotype trenchid", drop=True, sorted=False
    )
    output_df = output_df.merge(
        called_df, how="inner", left_index=True, right_index=True
    )
    output_df = output_df.drop(["Barcode Signal"], axis=1)
    # Restore the original per-row index.
    output_df = output_df.reset_index().set_index("File Parquet Index").persist()

    return output_df

In [None]:
# Merge the background-free phenotype rows with their barcode calls.
output_df = get_barcode_pheno_df(analysis_df_nobkd, barcode_df, trenchid_map)
# Drop the notebook's reference to the intermediate dataframe (presumably so
# its persisted data can be released).
del analysis_df_nobkd

In [None]:
# Rebalance into 500 partitions and write the merged table to parquet,
# replacing any existing dataset at that path (overwrite=True).
output_df = output_df.repartition(npartitions=500)
output_df.to_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/2021-11-17_lDE15_Analysis",
    engine="pyarrow",
    overwrite=True,
)

In [None]:
# Reduce to one row per "Trenchid Timepoint Index" by taking the first row of
# each group, then index the result by that key.
single_trench_timepoint_df = (
    output_df.groupby("Trenchid Timepoint Index")
    .apply(lambda x: x.iloc[0])
    .set_index("Trenchid Timepoint Index")
)
# Write the per-trench-timepoint table to parquet, replacing any existing
# dataset at that path (overwrite=True).
single_trench_timepoint_df.to_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/2021-11-17_lDE15_Analysis_Trench-Timepoint",
    engine="pyarrow",
    overwrite=True,
)