## Mapping Barcodes and Cleaning Data

In [None]:
import dask.array as da
import dask.dataframe as dd
import holoviews as hv
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import seaborn as sns
import sklearn as skl
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

import paulssonlab.deaton.trenchripper.trenchripper as tr

# Select Bokeh as the HoloViews plotting backend for this notebook.
hv.extension("bokeh")

In [None]:
# Scheduler tuning applied before the cluster is started:
#   - turn on the distributed active memory manager,
#   - set the scheduler's worker time-to-live to 5 minutes,
#   - raise the allowed-failures limit to 100.
# NOTE(review): exact semantics of each key are defined by the dask
# distributed configuration reference -- confirm against the installed version.
import dask

dask.config.set({"distributed.scheduler.active-memory-manager.start": True})
dask.config.set({"distributed.scheduler.worker-ttl": "5m"})
dask.config.set({"distributed.scheduler.allowed-failures": 100})

In [None]:
# Path to the "Barcodes" folder of the 2022-01-20 run.
# NOTE(review): `headpath` is reassigned in a later cell to a different run.
headpath = "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/Barcodes/"

In [None]:
# Create the trenchripper dask controller and start the cluster.
# NOTE(review): keyword meanings (walltime, worker counts, per-worker memory,
# scratch directory) are inferred from their names -- confirm against
# tr.trcluster.dask_controller.
dask_controller = tr.trcluster.dask_controller(
    walltime="2:00:00",
    local=False,
    n_workers=200,
    n_workers_min=50,
    memory="8GB",
    working_directory="/home/de64/scratch/de64/dask",
)
dask_controller.startdask()

In [None]:
# Show the dask dashboard for the running cluster.
dask_controller.displaydashboard()

In [None]:
# Utility cell: ask the controller to reset worker memory (run manually as needed).
dask_controller.reset_worker_memory()

In [None]:
# dask_controller.shutdown()

In [None]:
# Utility cell: restart the dask client's workers (run manually as needed).
dask_controller.daskclient.restart()

#### Import Lineage Output

### Optimizing Growth Quantification

In [None]:
def filter_df(df, query_list, client=False, repartition=False, persist=False):
    """Filter a dataframe with a list of ``DataFrame.query`` expressions.

    Args:
        df: Dataframe exposing ``.query``. The ``repartition`` and ``persist``
            options additionally require ``npartitions``, ``repartition`` and
            ``persist`` on the frame (i.e. a dask dataframe).
        query_list: Expressions in ``df.query`` format (see the pandas docs);
            they are combined with ``and``.
        client: Falsy to persist with ``out_df.persist()``; otherwise an
            object whose ``daskclient.persist`` is used to persist the result.
        repartition: If true, shrink the partition count roughly in
            proportion to the fraction of rows that survived the filter.
        persist: If true, persist the filtered (and repartitioned) frame.

    Returns:
        The filtered dataframe.
    """

    def _persist(frame):
        # Persist on the supplied cluster client if given, else on the frame.
        if client:
            return client.daskclient.persist(frame)
        return frame.persist()

    out_df = df.query(" and ".join(query_list))
    if persist:
        out_df = _persist(out_df)

    if repartition:
        init_size = len(df)
        final_size = len(out_df)
        if final_size == 0:
            # Nothing survived the filter: avoid dividing by zero and collapse
            # the (empty) result into a single partition.
            n_partitions = 1
        else:
            ratio = init_size // final_size
            n_partitions = (df.npartitions // ratio) + 1
        out_df = out_df.repartition(npartitions=n_partitions)
        if persist:
            out_df = _persist(out_df)

    return out_df


def get_first_cell_timepoint(df):
    """Return, for every "Global CellID", the row with the smallest "timepoints"."""
    # idxmin yields one index label per cell; select those rows in one go.
    earliest_labels = df.groupby(["Global CellID"])["timepoints"].idxmin()
    return df.loc[earliest_labels.tolist()]


def get_last_cell_timepoint(df):
    """Return, for every "Global CellID", the row with the largest "timepoints"."""
    # idxmax yields one index label per cell; select those rows in one go.
    latest_labels = df.groupby(["Global CellID"])["timepoints"].idxmax()
    return df.loc[latest_labels.tolist()]


def get_growth_and_division_stats(
    lineage_df,
    headpath,
    trench_score_thr=-75,
    absolute_time=True,
    delta_t_min=None,
    size_metrics=[
        "area",
        "major_axis_length",
        "minor_axis_length",
        "Volume",
        "Surface Area",
    ],
):
    """Compute per-cell birth/division sizes, timing and mean growth rates.

    Only cells that pass the trench-score threshold and have a mother, a
    sister and two daughters recorded (IDs != -1) are kept. Birth and division
    sizes are interpolated from the cell and its relatives; growth rates are
    averaged over every interval of the cell's life, including the
    interpolated first and last intervals.

    Args:
        lineage_df: pandas dataframe of per-timepoint cell measurements.
            A "time (s)" column is written into it in place.
        headpath: Folder containing "kymograph/metadata" (parquet).
        trench_score_thr: Rows are kept when "Trench Score" is below this.
        absolute_time: If false, "time (s)" is derived from
            "timepoints" * delta_t_min * 60 instead of read from the metadata.
        delta_t_min: Minutes per timepoint; used only when absolute_time is
            false (the default None cannot be multiplied, so a value must be
            given in that case).
        size_metrics: Column names to compute size and growth statistics for.

    Returns:
        Dataframe indexed by "Global CellID" (one row per kept cell, taken at
        its first timepoint) with the added "Birth: ", "Division: ",
        "Delta: ", "Mean: ", growth-rate, timing and mCherry columns.
    """
    kymo_df_path = headpath + "/kymograph/metadata"
    kymo_df = dd.read_parquet(kymo_df_path)
    kymo_idx_list = lineage_df["Kymograph FOV Parquet Index"].tolist()

    if not absolute_time:
        kymo_df["time (s)"] = kymo_df["timepoints"] * delta_t_min * 60.0

    # Pull the acquisition time of each lineage row from the kymograph metadata.
    kymo_time_series = (
        kymo_df["time (s)"].loc[kymo_idx_list].compute(scheduler="threads")
    )
    kymo_time_series.index = lineage_df.index
    lineage_df["time (s)"] = kymo_time_series

    # `reference`: every row passing the trench-score cut (used to look up relatives).
    # `query`: the subset whose mother, sister and both daughters are all known.
    reference = filter_df(lineage_df, ["`Trench Score` < " + str(trench_score_thr)])
    query = filter_df(
        lineage_df,
        [
            "`Mother CellID` != -1",
            "`Daughter CellID 1` != -1",
            "`Daughter CellID 2` != -1",
            "`Sister CellID` != -1",
            "`Trench Score` < " + str(trench_score_thr),
        ],
    )

    # First / last observed row of each queried cell, sorted by Global CellID so
    # that the positional `.values` arithmetic below lines up row for row.
    init_cells = (
        get_first_cell_timepoint(query)
        .reset_index()
        .set_index("Global CellID")
        .sort_index()
    )
    fin_cells = (
        get_last_cell_timepoint(query)
        .reset_index()
        .set_index("Global CellID")
        .sort_index()
    )

    cell_min_tpt_df = (
        get_first_cell_timepoint(reference)
        .reset_index()
        .set_index("Global CellID")
        .sort_index()
    )
    cell_max_tpt_df = (
        get_last_cell_timepoint(reference)
        .reset_index()
        .set_index("Global CellID")
        .sort_index()
    )

    # Relatives, in the same row order as init_cells / fin_cells.
    # NOTE(review): list-based .loc presumably raises KeyError if a relative was
    # removed by the trench-score filter -- confirm for the pandas version in use.
    mother_df = cell_max_tpt_df.loc[init_cells["Mother CellID"].tolist()]
    sister_df = cell_min_tpt_df.loc[init_cells["Sister CellID"].tolist()]
    daughter_1_df = cell_min_tpt_df.loc[fin_cells["Daughter CellID 1"].tolist()]
    daughter_2_df = cell_min_tpt_df.loc[fin_cells["Daughter CellID 2"].tolist()]

    for metric in size_metrics:
        if metric == "minor_axis_length":
            # Width is taken directly from the first/last observation (no interpolation).
            init_cells["Birth: " + metric] = init_cells[metric].values
            init_cells["Division: " + metric] = fin_cells[metric].values
            init_cells["Delta: " + metric] = (
                fin_cells[metric].values - init_cells[metric].values
            )

        else:
            # Mother size at division: geometric mean of the mother's last size
            # and the combined first sizes of the two sisters.
            interp_mother_final_size = (
                (init_cells[metric].values + sister_df[metric].values)
                * mother_df[metric].values
            ) ** (1 / 2)
            # This cell's share of that size.
            sister_frac = init_cells[metric].values / (
                sister_df[metric].values + init_cells[metric].values
            )
            init_cells["Birth: " + metric] = sister_frac * interp_mother_final_size

            # Size at division: geometric mean of the cell's last size and the
            # combined first sizes of its two daughters.
            init_cells["Division: " + metric] = (
                (daughter_1_df[metric].values + daughter_2_df[metric].values)
                * fin_cells[metric].values
            ) ** (1 / 2)

            init_cells["Delta: " + metric] = (
                init_cells["Division: " + metric].values
                - init_cells["Birth: " + metric].values
            )

    init_cells["Final timepoints"] = daughter_1_df[
        "timepoints"
    ].values  # counting a timepoint in which a division occurs as a full timepoint, hacky
    init_cells["Delta Timepoints"] = (
        init_cells["Final timepoints"] - init_cells["timepoints"]
    )

    # if absolute_time:
    interpolated_final_time = (
        fin_cells["time (s)"].values + daughter_1_df["time (s)"].values
    ) / 2  # interpolating under the same assumptions as the size quantification
    interpolated_init_time = (
        init_cells["time (s)"].values + mother_df["time (s)"].values
    ) / 2
    init_cells["Final time (s)"] = interpolated_final_time
    init_cells["Delta time (s)"] = interpolated_final_time - interpolated_init_time

    # Re-index by cell, ordered by timepoint within each cell, so per-cell
    # groupbys below see each cell's observations in time order.
    query = (
        query.reset_index()
        .set_index(["Global CellID", "timepoints"])
        .sort_index()
        .reset_index(level=1)
    )

    # if absolute_time:

    # Per cell: array of time gaps between consecutive observations.
    delta_t_series = query.groupby(level=0, sort=False)["time (s)"].apply(
        lambda x: ((x[1:].values - x[:-1].values))
    )

    # Time between interpolated birth/division and the first/last observation.
    init_time_gap = init_cells["time (s)"].values - interpolated_init_time
    final_time_gap = interpolated_final_time - fin_cells["time (s)"].values

    for size_metric in size_metrics:  # Haven't decided between mean and median
        init_size = query.groupby(level=0, sort=False)[size_metric].apply(
            lambda x: x.iloc[0]
        )
        final_size = query.groupby(level=0, sort=False)[size_metric].apply(
            lambda x: x.iloc[-1]
        )

        # Growth rates over the interpolated first (birth -> first observation)
        # and last (last observation -> division) intervals.
        init_linear_gr = init_size - (init_cells["Birth: " + size_metric].values)
        init_linear_gr = init_linear_gr / init_time_gap
        final_linear_gr = (init_cells["Division: " + size_metric].values) - final_size
        final_linear_gr = final_linear_gr / final_time_gap

        # "Exponential" rate: size change divided by the mean of the two sizes, per unit time.
        init_exp_gr = 2 * (
            (init_size - (init_cells["Birth: " + size_metric].values))
            / (init_size + (init_cells["Birth: " + size_metric].values))
        )
        init_exp_gr = init_exp_gr / init_time_gap
        final_exp_gr = 2 * (
            ((init_cells["Division: " + size_metric].values) - final_size)
            / ((init_cells["Division: " + size_metric].values) + final_size)
        )
        final_exp_gr = final_exp_gr / final_time_gap

        all_linear_gr = query.groupby(level=0, sort=False)[size_metric].apply(
            lambda x: x[1:].values - x[:-1].values
        )  ##needs to interpolate last growth rate
        all_linear_gr = all_linear_gr / delta_t_series
        all_linear_gr = all_linear_gr.apply(lambda x: x.tolist())
        all_linear_gr = all_linear_gr.to_frame()
        all_linear_gr = all_linear_gr.rename(columns={0: "Main List"})
        all_linear_gr["Start"] = init_linear_gr
        all_linear_gr["End"] = final_linear_gr
        # Prepend/append the interpolated end intervals, then average per cell.
        all_linear_gr["Appended"] = all_linear_gr.apply(
            lambda x: [x["Start"]] + x["Main List"] + [x["End"]], axis=1
        )
        mean_linear_gr = all_linear_gr["Appended"].apply(lambda x: np.nanmean(x))
        del all_linear_gr
        mean_linear_gr = mean_linear_gr * 3600  # size unit per hr

        all_exp_gr = query.groupby(level=0, sort=False)[size_metric].apply(
            lambda x: 2
            * ((x[1:].values - x[:-1].values) / (x[1:].values + x[:-1].values))
        )  ##needs to interpolate last growth rate
        all_exp_gr = all_exp_gr / delta_t_series
        all_exp_gr = all_exp_gr.apply(lambda x: x.tolist())
        all_exp_gr = all_exp_gr.to_frame()
        all_exp_gr = all_exp_gr.rename(columns={0: "Main List"})
        all_exp_gr["Start"] = init_exp_gr
        all_exp_gr["End"] = final_exp_gr
        all_exp_gr["Appended"] = all_exp_gr.apply(
            lambda x: [x["Start"]] + x["Main List"] + [x["End"]], axis=1
        )
        mean_exp_gr = all_exp_gr["Appended"].apply(lambda x: np.nanmean(x))
        del all_exp_gr
        mean_exp_gr = mean_exp_gr * 3600  # size unit per hr

        mean_cell_size_metric = query.groupby(level=0, sort=False)[size_metric].apply(
            lambda x: np.nanmean(x.values)
        )

        # These series are indexed by Global CellID, as is init_cells.
        init_cells["Mean: " + size_metric] = mean_cell_size_metric
        init_cells["Mean Linear Growth Rate: " + size_metric] = mean_linear_gr
        init_cells["Mean Exponential Growth Rate: " + size_metric] = mean_exp_gr
    #     else:
    #         for size_metric in size_metrics: # Havn't decided between mean and median
    #             mean_linear_gr = query.groupby(level=0,sort=False)[size_metric].apply(lambda x: np.nanmean(x[1:].values - x[:-1].values))
    #             mean_linear_gr = (mean_linear_gr/delta_t_min)*60 #size unit per hr
    #             mean_exp_gr = query.groupby(level=0,sort=False)[size_metric].apply(lambda x: np.nanmean((2*(x[1:].values - x[:-1].values))/(x[1:].values + x[:-1].values)))
    #             mean_exp_gr = (mean_exp_gr/delta_t_min)*60 #exponential size unit per hr
    #             mean_cell_size_metric = query.groupby(level=0,sort=False)[size_metric].apply(lambda x: np.nanmean(x.values))

    #             init_cells["Mean: " + size_metric] = mean_cell_size_metric
    #             init_cells["Mean Linear Growth Rate: " + size_metric] = mean_linear_gr
    #             init_cells["Mean Exponential Growth Rate: " + size_metric] = mean_exp_gr

    # NOTE: despite the variable name, this is a NaN-ignoring mean, not a median.
    median_mchy_intensity = query.groupby("Global CellID")[
        "mCherry mean_intensity"
    ].apply(lambda x: np.nanmean(x.values))
    init_cells["Mean: mCherry Intensity"] = median_mchy_intensity

    init_cells = init_cells.rename(columns={"timepoints": "initial timepoints"})

    return init_cells


def get_all_growth_and_division_stats(
    lineage_df,
    headpath,
    trench_score_thr=-75,
    absolute_time=True,
    delta_t_min=None,
    size_metrics=[
        "area",
        "major_axis_length",
        "minor_axis_length",
        "Volume",
        "Surface Area",
    ],
):
    """Apply ``get_growth_and_division_stats`` to every partition of a dask frame.

    The first partition is computed eagerly and run through the per-partition
    function; its output is handed to ``dd.map_partitions`` as ``meta`` so the
    lazy result has the right columns.

    Args:
        lineage_df: Dask dataframe of lineage measurements.
        headpath, trench_score_thr, absolute_time, delta_t_min, size_metrics:
            Forwarded unchanged to ``get_growth_and_division_stats``.

    Returns:
        The (lazy) result of ``dd.map_partitions``.
    """
    forwarded = dict(
        trench_score_thr=trench_score_thr,
        absolute_time=absolute_time,
        delta_t_min=delta_t_min,
        size_metrics=size_metrics,
    )

    # Run once on real data to obtain a template of the output.
    first_partition = lineage_df.get_partition(0).compute()
    meta_template = get_growth_and_division_stats(
        first_partition, headpath, **forwarded
    )

    return dd.map_partitions(
        get_growth_and_division_stats,
        lineage_df,
        headpath,
        meta=meta_template,
        **forwarded,
    )

### Import Lineage

In [None]:
# Lazily load the lineage output (parquet dataset) as a dask dataframe.
lineage_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/Growth_Division/lineage/"
)

##temp fix
# Cast both cell-ID columns to int.
lineage_df["CellID"] = lineage_df["CellID"].astype(int)
lineage_df["Global CellID"] = lineage_df["Global CellID"].astype(int)

In [None]:
def hrm_find_mode(series, max_iter=1000, min_binsize=50):
    """Estimate the mode of a sample by repeatedly keeping the denser half-range.

    On each iteration the current range [min, max] is split at its midpoint
    and the half containing more points is kept (ties keep the lower half).
    From the second iteration on, the search stops as soon as fewer than
    ``min_binsize`` points remain or the midpoint stops changing.

    Args:
        series: 1-D numeric data supporting boolean-mask indexing
            (e.g. a pandas Series or numpy array).
        max_iter: Maximum number of halving iterations.
        min_binsize: Stop once the kept half has fewer points than this.

    Returns:
        The mean of the points remaining when the search stops. If
        ``max_iter`` is exhausted first, the mean of the points kept so far is
        returned (previously this case silently returned ``None``).
    """
    working_series = series
    last_midpoint = None
    for i in range(max_iter):
        range_max, range_min = np.max(working_series), np.min(working_series)
        midpoint = (range_max + range_min) / 2
        above_middle = working_series[working_series > midpoint]
        below_middle = working_series[working_series <= midpoint]

        # Keep whichever half of the range holds more points.
        if len(above_middle) > len(below_middle):
            working_series = above_middle
        else:
            working_series = below_middle

        # The first pass always halves; termination is only checked afterwards.
        if i > 0 and (
            len(working_series) < min_binsize or last_midpoint == midpoint
        ):
            return np.mean(working_series)

        last_midpoint = midpoint

    # Iteration budget exhausted: report the best estimate instead of None.
    return np.mean(working_series)


def bootstrap_hrm(series, n_bootstraps=100, max_n_per_bootstrap=100):
    """Average ``hrm_find_mode`` over repeated random subsamples of ``series``.

    Each of the ``n_bootstraps`` rounds draws ``min(len(series),
    max_n_per_bootstrap)`` points with ``series.sample`` and estimates the
    mode of that draw; the mean of all estimates is returned.
    """
    draw_size = min(len(series), max_n_per_bootstrap)
    estimates = [
        hrm_find_mode(series.sample(n=draw_size)) for _ in range(n_bootstraps)
    ]
    return np.mean(estimates)


def get_normal_fovs(fov_series, med_filter_size=5, n_stds=2):
    """Flag FOVs whose value stays close to a running median of its neighbours.

    Args:
        fov_series: 1-D series of one value per FOV, in FOV order.
        med_filter_size: Window size of the median filter (edges mirrored).
        n_stds: Half-width of the accepted band, in standard deviations of a
            Gaussian fitted to the residuals.

    Returns:
        Boolean mask (same type/index as ``fov_series - median``) that is True
        where the residual lies strictly inside (-n_stds*std, +n_stds*std).
    """
    # Import the submodules explicitly: `import scipy as sp` alone does not
    # guarantee that `sp.ndimage` is loaded.
    from scipy import ndimage, stats

    median_series = ndimage.median_filter(
        fov_series, size=(med_filter_size,), mode="mirror"
    )

    residuals = fov_series - median_series

    loc, scale = stats.norm.fit(residuals)
    residual_std = stats.norm(loc=loc, scale=scale).std()

    # NOTE(review): the band is centred on zero, not on the fitted mean `loc`
    # -- confirm this is intended.
    lower, upper = (-n_stds * residual_std, n_stds * residual_std)

    return (residuals > lower) & (residuals < upper)

### Variables over FOV

In [None]:
# Per-FOV flat-field style correction: for each measured variable, estimate
# a per-FOV scaling from early timepoints and divide it out.
values_to_rescale = [
    "mCherry mean_intensity",
    "area",
    "major_axis_length",
    "minor_axis_length",
    "Volume",
    "Surface Area",
]
# Working copy indexed by FOV, held in cluster memory.
fov_sorted_lineage_df = (
    lineage_df[["fov", "timepoints"] + values_to_rescale]
    .reset_index()
    .set_index("fov", sorted=True)
    .persist()
)

fig = plt.figure(figsize=(30, 20))
# Plot titles, in the same order as values_to_rescale.
values_names = [
    "Mean mCherry Intensity",
    "Area",
    "Major Axis Length",
    "Minor Axis Length",
    "Volume",
    "Surface Area",
]
fov_correction_dicts = {}
# Only the first 12 timepoints are used to estimate the per-FOV scaling.
lineage_df_subsample = fov_sorted_lineage_df[fov_sorted_lineage_df["timepoints"] < 12]

for i, label in enumerate(values_to_rescale):
    # Median of the variable within each FOV.
    fov_series_groupby = lineage_df_subsample.groupby("fov", sort=False)[label]
    fov_median_series = (
        fov_series_groupby.apply(lambda x: np.median(x), meta=float)
        .compute()
        .sort_index()
    )

    # Drop FOVs whose median deviates from the running median of neighbours.
    normal_fov_series = get_normal_fovs(fov_median_series)
    fov_median_series = fov_median_series[normal_fov_series]

    # Scale relative to the highest remaining FOV median (so values are <= 1).
    fov_correction_series = fov_median_series / np.max(fov_median_series)
    fov_correction_dicts[label] = fov_correction_series.to_dict()

    plt.subplot(2, 3, i + 1)
    plt.plot(fov_correction_series)
    plt.title(values_names[i], fontsize=22)
    plt.xlabel("FOV #", fontsize=18)
    plt.ylabel("Scaling", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(0.0, 1.0)

# Keep only FOVs that were judged normal for every variable.
fov_list = [set(val.keys()) for key, val in fov_correction_dicts.items()]
filtered_fov_list = list(set.intersection(*fov_list))

# Release the persisted working copy.
dask_controller.daskclient.cancel(fov_sorted_lineage_df)

lineage_df_fov_correction = lineage_df[
    ["fov", "timepoints"] + list(fov_correction_dicts.keys())
]
lineage_df_fov_correction = lineage_df_fov_correction[
    lineage_df_fov_correction["fov"].isin(filtered_fov_list)
].persist()

# Divide each variable by its FOV's scaling factor.
for label, fov_correction_dict in fov_correction_dicts.items():
    fov_correction_series = lineage_df_fov_correction["fov"].apply(
        lambda x: fov_correction_dict[x], meta=float
    )
    lineage_df_fov_correction[label + ": FOV Corrected"] = (
        lineage_df_fov_correction[label] / fov_correction_series
    ).persist()

plt.savefig("FOV_correction.png", dpi=500)

### Variables over time

In [None]:
# Names of the FOV-corrected columns, which the time correction operates on.
values_to_rescale_step_2 = [value + ": FOV Corrected" for value in values_to_rescale]

In [None]:
# Draw roughly `target_samples` random rows into local memory for the
# per-timepoint mode estimation.
target_samples = 100000

ttl_samples = len(lineage_df_fov_correction)
# Cap the fraction at 1.0 (sampling without replacement cannot return more
# rows than exist) and avoid dividing by zero on an empty dataframe.
frac_to_sample = min(1.0, target_samples / ttl_samples) if ttl_samples else 0.0
lineage_df_subsample = lineage_df_fov_correction.sample(frac=frac_to_sample).compute()

In [None]:
# Sanity check: number of rows actually drawn.
len(lineage_df_subsample)

In [None]:
# Per-timepoint correction: estimate the mode of each FOV-corrected variable
# at every timepoint (from the local subsample) and divide it out.
fig = plt.figure(figsize=(30, 20))
# Plot titles, in the same order as values_to_rescale_step_2.
values_names = [
    "Mean mCherry Intensity",
    "Area",
    "Major Axis Length",
    "Minor Axis Length",
    "Volume",
    "Surface Area",
]
for i, label in enumerate(values_to_rescale_step_2):
    time_series_groupby = lineage_df_subsample.groupby("timepoints")[label]
    # Bootstrapped half-range-mode estimate per timepoint.
    time_mode_series = time_series_groupby.apply(
        lambda x: bootstrap_hrm(x)
    ).sort_index()
    # Scale relative to the largest per-timepoint mode (so values are <= 1).
    time_correction_series = time_mode_series / np.max(time_mode_series)
    time_correction_dict = time_correction_series.to_dict()
    plt.subplot(2, 3, i + 1)
    plt.plot(time_correction_series)
    plt.title(values_names[i], fontsize=22)
    # NOTE(review): the label says 3 min steps, while a later cell passes
    # delta_t_min=10 -- confirm which interval is correct.
    plt.xlabel("Timepoint (3 min steps)", fontsize=18)
    plt.ylabel("Scaling", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(0.0, 1.0)
    # Row-wise division by the timepoint's factor; persisted inside the loop,
    # before `label` / `time_correction_dict` are rebound.
    # NOTE(review): a timepoint absent from the subsample would raise KeyError here.
    lineage_df_fov_correction[
        label + ": Time Corrected"
    ] = lineage_df_fov_correction.apply(
        lambda x: x[label] / time_correction_dict[x["timepoints"]], meta=float, axis=1
    ).persist()
plt.savefig("Time_correction.png", dpi=500)

The HSM method [2] iteratively divides the data set into samples of half the size of the original set and keeps the half-sample with the minimum range, where range is defined as the difference between the maximum and the minimum value of the sample. This method terminates when the half-sample contains fewer than three data points. The average of these three or fewer values is taken as the mode. The HRM method [2] is similar but keeps the sub-sample with the densest half-range, where range is defined as the absolute difference between the maximum and the minimum values in a sample. Of these two methods, only the HRM was used in this study because HRM has been shown to have lower bias with increasing contamination and asymmetry [2].

### Overwrite Variables with Correction

In [None]:
def get_aligned_loc_from_index(df, idx_series):
    """Return the rows of ``df`` whose index labels are listed in ``idx_series``."""
    wanted_labels = idx_series.tolist()
    return df.loc[wanted_labels]


def index_loc_lookup(df, idx_series):
    """Partition-wise ``df.loc[idx_series]`` lookup.

    ``get_aligned_loc_from_index`` is mapped over the partitions of ``df`` and
    ``idx_series`` with ``align_dataframes=False``, so partitions are paired
    as they are rather than re-aligned by dask.
    """
    return dd.map_partitions(
        get_aligned_loc_from_index, df, idx_series, align_dataframes=False
    )

In [None]:
# Restrict the full lineage table to the rows that survived the FOV filtering.
corrected_lineage_df = index_loc_lookup(lineage_df, lineage_df_fov_correction.index)

In [None]:
##here
# Overwrite each raw variable with its FOV- and time-corrected counterpart.
# NOTE(review): this assigns columns across two dask dataframes; it presumably
# relies on both having matching indices/partitions -- confirm.
for label in values_to_rescale:
    corrected_lineage_df[label] = lineage_df_fov_correction[
        label + ": FOV Corrected: Time Corrected"
    ]

# Keep the identifying/bookkeeping columns plus the corrected variables.
corrected_lineage_df = corrected_lineage_df[
    [
        "fov",
        "row",
        "trench",
        "trenchid",
        "timepoints",
        "File Index",
        "File Trench Index",
        "CellID",
        "Global CellID",
        "Trench Score",
        "Mother CellID",
        "Daughter CellID 1",
        "Daughter CellID 2",
        "Sister CellID",
        "Centroid X",
        "Centroid Y",
        "Kymograph File Parquet Index",
        "Kymograph FOV Parquet Index",
        "FOV Parquet Index",
    ]
    + values_to_rescale
]

# Write the corrected table to parquet, replacing any existing output.
corrected_lineage_df.to_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/2022-02-09_lDE20_Lineage_Analysis_with_Correction",
    engine="pyarrow",
    overwrite=True,
)

In [None]:
# Ask the controller to reset worker memory before reloading from disk.
dask_controller.reset_worker_memory()

In [None]:
# Reload the corrected lineage table from the parquet written above.
corrected_lineage_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/2022-02-09_lDE20_Lineage_Analysis_with_Correction"
)

In [None]:
# Per-cell growth/division statistics for every partition, kept in cluster memory.
# Time is derived from the timepoint number (absolute_time=False) at
# 10 minutes per timepoint.
growth_div_df = get_all_growth_and_division_stats(
    corrected_lineage_df,
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/Growth_Division",
    absolute_time=False,
    delta_t_min=10,
).persist()

#### Import Barcode Dataframe

In [None]:
# Read the barcode calls (with attached metadata) from HDF5 and distribute
# them as a 500-partition dask dataframe.
meta_handle = tr.pandas_hdf5_handler(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/Barcodes/barcode_output_df.hdf5"
)
pandas_barcode_df = meta_handle.read_df("barcodes", read_metadata=True)
barcode_df = dd.from_pandas(pandas_barcode_df, npartitions=500, sort=True)
barcode_df = barcode_df.persist()

# Call-rate summary. Note the "percent_*" values are fractions (not x100).
ttl_called = len(barcode_df.index)
ttl_trenches = pandas_barcode_df.metadata["Total Trenches"]
ttl_trenches_w_cells = pandas_barcode_df.metadata["Total Trenches With Cells"]
percent_called = ttl_called / ttl_trenches
percent_called_w_cells = ttl_called / ttl_trenches_w_cells

print(ttl_called)
print(ttl_trenches)
print(ttl_trenches_w_cells)
print(percent_called)
print(percent_called_w_cells)

In [None]:
# Display the in-memory barcode table.
pandas_barcode_df

#### Get Trench Mapping

In [None]:
# Kymograph metadata of the phenotype (growth/division) and barcode acquisitions.
phenotype_kymopath = "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/Growth_Division/kymograph/metadata"
barcode_kymopath = "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/Barcodes/kymograph/metadata"

# Used below as a dict keyed by barcode-run trenchid whose values are
# phenotype-run trenchids.
trenchid_map = tr.files_to_trenchid_map(phenotype_kymopath, barcode_kymopath)

#### Get Output Dataframe

In [None]:
# ##phenotype_df must contain trenchids column and a File Parquet Index
# output_df = tr.get_barcode_pheno_df(growth_div_df, barcode_df, trenchid_map)

In [None]:
def get_barcode_pheno_df(
    phenotype_df, barcode_df, trenchid_map, output_index="File Parquet Index"
):
    """Join phenotype rows with their barcode calls via the trench-ID mapping.

    Args:
        phenotype_df: Dask dataframe with a "trenchid" column and a column
            named by ``output_index`` (after ``reset_index``).
        barcode_df: Dask dataframe of barcode calls with "trenchid" and
            "Barcode Signal" columns.
        trenchid_map: Mapping from barcode "trenchid" to phenotype "trenchid".
        output_index: Column to index the result by.

    Returns:
        Dask dataframe: inner join of phenotype and barcode rows on
        "phenotype trenchid", indexed by ``output_index``.
    """
    # Barcode rows whose trench has a phenotype counterpart, mapped to that ID.
    has_mapping = barcode_df["trenchid"].isin(trenchid_map.keys())
    mappable_barcodes = barcode_df[has_mapping].compute()
    pheno_ids = mappable_barcodes["trenchid"].apply(lambda tid: trenchid_map[tid])

    # Keep only mapped IDs that actually occur in the phenotype table.
    observed_pheno_ids = phenotype_df["trenchid"].unique().compute().tolist()
    pheno_ids = pheno_ids[pheno_ids.isin(observed_pheno_ids)]
    pheno_id_list = pheno_ids.tolist()
    kept_barcode_rows = pheno_ids.index.to_list()

    # Barcode side of the join, keyed by phenotype trenchid.
    called_df = barcode_df.loc[kept_barcode_rows]
    called_df["phenotype trenchid"] = pheno_ids
    called_df["phenotype trenchid"] = called_df["phenotype trenchid"].astype(int)
    called_df = (
        called_df.drop(["Barcode Signal"], axis=1)
        .reset_index()
        .set_index("phenotype trenchid", drop=True, sorted=False)
    )

    # Phenotype side of the join, keyed the same way and restricted to called trenches.
    pheno_by_trench = (
        phenotype_df.rename(columns={"trenchid": "phenotype trenchid"})
        .reset_index()
        .set_index("phenotype trenchid", drop=True, sorted=True)
        .loc[pheno_id_list]
    )

    # Match partition boundaries before the index-on-index merge.
    called_df = called_df.repartition(divisions=pheno_by_trench.divisions).persist()
    merged = pheno_by_trench.merge(
        called_df, how="inner", left_index=True, right_index=True
    )

    return merged.reset_index().set_index(output_index)


def get_barcode_pheno_df_split(
    phenotype_df,
    phenotype_kymo_df,
    barcode_df,
    trenchid_map,
    output_index="File Parquet Index",
    output_kymo_index="FOV Parquet Index",
):
    """Map barcodes onto phenotypes, returning cell-level and trench-level tables.

    A more efficient variant that splits the output into two smaller parts
    (every cell, every trench), eliminating some redundant entries. It still
    keeps only cells that made it into ``phenotype_df`` (in most cases the
    lineage-traced cells). Could be made more efficient still with a direct
    output to parquet.

    Args:
        phenotype_df: Dask dataframe with a "trenchid" column and a column
            named by ``output_index`` (after ``reset_index``).
        phenotype_kymo_df: Dask dataframe with a "trenchid" column and a
            column named by ``output_kymo_index`` (after ``reset_index``).
        barcode_df: Dask dataframe of barcode calls with "trenchid" and
            "Barcode Signal" columns.
        trenchid_map: Mapping from barcode "trenchid" to phenotype "trenchid".
        output_index: Index column of the cell-level output.
        output_kymo_index: Index column of the trench-level output.

    Returns:
        Tuple ``(cell_level_df, trench_level_df)``: the phenotype rows of
        called trenches (no barcode columns), and the kymograph rows
        inner-joined with their barcode calls on "phenotype trenchid".
    """
    trenchid_rename = {"trenchid": "phenotype trenchid"}

    # Barcode rows whose trench has a phenotype counterpart, mapped to that ID.
    has_mapping = barcode_df["trenchid"].isin(trenchid_map.keys())
    mappable_barcodes = barcode_df[has_mapping].compute()
    pheno_ids = mappable_barcodes["trenchid"].apply(lambda tid: trenchid_map[tid])

    # Keep only mapped IDs that actually occur in the phenotype table.
    observed_pheno_ids = phenotype_df["trenchid"].unique().compute().tolist()
    pheno_ids = pheno_ids[pheno_ids.isin(observed_pheno_ids)]
    pheno_id_list = pheno_ids.tolist()
    kept_barcode_rows = pheno_ids.index.to_list()

    # Barcode side of the join, keyed by phenotype trenchid.
    called_df = barcode_df.loc[kept_barcode_rows]
    called_df["phenotype trenchid"] = pheno_ids
    called_df["phenotype trenchid"] = called_df["phenotype trenchid"].astype(int)
    called_df = (
        called_df.drop(["Barcode Signal"], axis=1)
        .reset_index()
        .set_index("phenotype trenchid", drop=True, sorted=False)
    )

    # Cell-level output: phenotype rows of called trenches only.
    cell_level_df = (
        phenotype_df.rename(columns=trenchid_rename)
        .reset_index()
        .set_index("phenotype trenchid", drop=True, sorted=True)
        .loc[pheno_id_list]
        .reset_index()
        .set_index(output_index)
    )

    # Trench-level output: kymograph rows joined with their barcode call.
    trench_level_df = (
        phenotype_kymo_df.rename(columns=trenchid_rename)
        .reset_index()
        .set_index("phenotype trenchid", drop=True, sorted=True)
        .loc[pheno_id_list]
    )
    # Match partition boundaries before the index-on-index merge.
    called_df = called_df.repartition(divisions=trench_level_df.divisions)
    trench_level_df = trench_level_df.merge(
        called_df, how="inner", left_index=True, right_index=True
    )
    trench_level_df = trench_level_df.reset_index().set_index(output_kymo_index)

    return cell_level_df, trench_level_df

In [None]:
# Lazily load the phenotype-run kymograph metadata.
kymo_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/Growth_Division/kymograph/metadata"
)

In [None]:
# Build the cell-level table (indexed by Global CellID) and the trench-level
# barcode table, then write both to parquet, replacing existing output.
output_phenotype_df, output_phenotype_kmyo_df = get_barcode_pheno_df_split(
    growth_div_df, kymo_df, barcode_df, trenchid_map, output_index="Global CellID"
)
output_phenotype_df.to_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/2022-02-09_lDE20_Final_Lineage_df/",
    engine="pyarrow",
    overwrite=True,
)
output_phenotype_kmyo_df.to_parquet(
    "/home/de64/scratch/de64/sync_folder/2022-01-20_lDE20_Final_6/2022-02-09_lDE20_Final_Barcodes_df/",
    engine="pyarrow",
    overwrite=True,
)
# output_df = output_df.repartition(npartitions=500).persist()

In [None]:
# Shut the dask cluster down (a later cell notes the overlay viewer needs dask stopped).
dask_controller.shutdown()

In [None]:
# Utility cell: restart the dask client's workers.
# NOTE(review): this follows a shutdown() cell; presumably the two are run
# as alternatives, not in sequence -- confirm.
dask_controller.daskclient.restart()

In [None]:
# Load a previously written trench-level barcode table.
# NOTE(review): this path points at the 2021-11-08_lDE20_Final_3 run, not the
# 2022-01-20 run processed above -- confirm this is intended.
output_phenotype_kmyo_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/2021-12-01_lDE20_Final_Barcodes_df/"
)

In [None]:
# NOTE(review): `fov_timepoint_idx` is not defined anywhere in the visible
# notebook; this cell raises NameError unless it is created elsewhere.
output_phenotype_kmyo_df["FOV-Timepoint Index"] = fov_timepoint_idx

In [None]:
# Re-point `headpath` at the Growth_Division folder of the 2021-11-08 run and
# build the variant overlay viewer from that run's barcode table.
headpath = (
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/Growth_Division"
)
# note: shutdown dask when doing this...fix bug later
overlay_handle = tr.variant_overlay(
    headpath,
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/2021-12-01_lDE20_Final_Barcodes_df/",
    display_values_list=["Gene", "TargetID", "N Mismatch"],
    persist_data=False,
)  ##fix this, was improperly made (only initial cellID timepoints)

In [None]:
# Display the overlay with the value range set to 0..10000.
overlay_handle.view_overlay(vmin=0, vmax=10000)

In [None]:
# Build a per-sgRNA summary table from the trench-level barcode table.
gene_table = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/2021-12-01_lDE20_Final_Barcodes_df/"
)
gene_table = gene_table.reset_index().set_index("phenotype trenchid", sorted=True)
# Collapse to one row per trench (the first row of each group).
gene_table = (
    gene_table.groupby("phenotype trenchid", sort=False)
    .apply(lambda x: x.iloc[0])
    .reset_index()
    .set_index("FOV Parquet Index")
)
# One representative row per sgRNA, plus the list of trenches carrying it.
gene_table_out = gene_table.groupby("sgRNA").apply(lambda x: x.iloc[0])
gene_table_out["phenotype trenchids"] = gene_table.groupby("sgRNA").apply(
    lambda x: x["phenotype trenchid"].tolist()
)
gene_table_out = gene_table_out[
    [
        "Gene",
        "Target Sequence",
        "phenotype trenchids",
        "N Mismatch",
        "N Target Sites",
        "Category",
        "Strand",
    ]
].compute()

In [None]:
# Recompute the per-sgRNA summary from `gene_table`.
# NOTE: this repeats the final steps of the previous cell verbatim.
gene_table_out = gene_table.groupby("sgRNA").apply(lambda x: x.iloc[0])
gene_table_out["phenotype trenchids"] = gene_table.groupby("sgRNA").apply(
    lambda x: x["phenotype trenchid"].tolist()
)
gene_table_out = gene_table_out[
    [
        "Gene",
        "Target Sequence",
        "phenotype trenchids",
        "N Mismatch",
        "N Target Sites",
        "Category",
        "Strand",
    ]
].compute()

In [None]:
# Two kymograph array handles over the same run: the default one and one
# created with unwrap=False.
# NOTE(review): the effect of `unwrap` is defined by tr.kymo_xarr -- confirm.
kymo_xarr = tr.kymo_xarr(
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/Growth_Division/"
)
wrapped_kymo_xarr = tr.kymo_xarr(
    "/home/de64/scratch/de64/sync_folder/2021-11-08_lDE20_Final_3/Growth_Division",
    unwrap=False,
)

In [None]:
# Build the gene table widget, keyed by "Gene", plus the selection objects
# that the kymograph viewer below is linked to.
(
    gene_table_layout,
    select_gene,
    select_trenchid,
    select_unpacked_trenchid,
) = tr.linked_gene_table(
    gene_table_out,
    index_key="Gene",
    trenchids_as_list=True,
    trenchid_column="phenotype trenchids",
)

In [None]:
# Display the gene table widget.
gene_table_layout

In [None]:
# Build the kymograph display driven by the gene/trench selections above,
# together with its save button.
output_display, save_button = tr.linked_kymograph_for_gene_table(
    kymo_xarr,
    wrapped_kymo_xarr,
    gene_table_out,
    select_gene,
    select_trenchid,
    select_unpacked_trenchid=select_unpacked_trenchid,
    trenchid_column="phenotype trenchids",
    y_scale=3,
    x_window_size=300,
)

In [None]:
# Display the linked kymograph viewer.
output_display

In [None]:
# Display the save button for the kymograph viewer.
save_button