## Mapping Barcodes and Cleaning Data

In [None]:
import dask.array as da
import dask.dataframe as dd
import holoviews as hv
import numpy as np
import pandas as pd
import scipy as sp
import scipy.stats
import seaborn as sns
import sklearn as skl
from matplotlib import pyplot as plt
from sklearn.linear_model import LinearRegression

import paulssonlab.deaton.trenchripper.trenchripper as tr

hv.extension("bokeh")

In [None]:
# Folder holding the barcode outputs of this run.
# NOTE(review): later cells spell out their paths in full instead of reusing this variable.
headpath = (
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Barcodes"
)

In [None]:
# Alternative (disabled) cluster configuration requesting 100 workers.
# dask_controller = tr.trcluster.dask_controller(
#     walltime="04:00:00",
#     local=False,
#     n_workers=100,
#     death_timeout=5.,
#     memory="16GB",
#     working_directory="/home/de64/scratch/de64/temp/dask",
# )
# dask_controller.startdask()

# Active configuration: a non-local cluster with 20 workers, 16GB each and a
# 4 hour walltime; the controller is started immediately.
dask_controller = tr.trcluster.dask_controller(
    walltime="04:00:00",
    local=False,
    n_workers=20,
    death_timeout=5.0,
    memory="16GB",
    working_directory="/home/de64/scratch/de64/temp/dask",
)
dask_controller.startdask()

In [None]:
# Show the dashboard of the cluster started above (presumably a link/widget
# rendered by trenchripper -- confirm against tr.trcluster).
dask_controller.displaydashboard()

In [None]:
# Restart the dask client held by the controller.
# NOTE(review): if `daskclient` is a dask.distributed Client, this also drops
# anything previously persisted on the workers -- confirm before re-running mid-notebook.
dask_controller.daskclient.restart()

#### Import Barcode Dataframe

In [None]:
# Open the barcode metadata store of this run.
# NOTE(review): this path repeats the folder already stored in `headpath`.
meta_handle = tr.pandas_hdf5_handler(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Barcodes/metadata.hdf5"
)
# Read the "barcodes" table; the `.metadata` attribute of the result is used in a later cell.
pandas_barcode_df = meta_handle.read_df("barcodes", read_metadata=True)
# Split the in-memory pandas frame into 500 dask partitions and persist them.
barcode_df = dd.from_pandas(pandas_barcode_df, npartitions=500, sort=True)
barcode_df = barcode_df.persist()

In [None]:
# Number of rows in the barcode table (one row per called barcode, presumably -- confirm).
ttl_called = len(barcode_df.index)
# Trench totals stored as metadata alongside the barcode table.
ttl_trenches = pandas_barcode_df.metadata["Total Trenches"]
ttl_trenches_w_cells = pandas_barcode_df.metadata["Total Trenches With Cells"]
# Despite the "percent" names these are plain fractions (no factor of 100 is applied).
percent_called = ttl_called / ttl_trenches
percent_called_w_cells = ttl_called / ttl_trenches_w_cells

In [None]:
# Print the call statistics computed above, in order: called rows, total trenches,
# trenches with cells, called fraction of all trenches, called fraction of
# trenches with cells.
print(ttl_called)
print(ttl_trenches)
print(ttl_trenches_w_cells)
print(percent_called)
print(percent_called_w_cells)

#### Import Lineage Output

In [None]:
def filter_df(df, query_list, client=False, repartition=False):
    """Filter a dataframe with a conjunction of query strings and persist it.

    Args:
        df: Dataframe to filter. It must provide ``query``, ``persist``,
            ``npartitions`` and ``repartition`` (e.g. a dask dataframe).
        query_list: Expressions in ``DataFrame.query`` format (see the pandas
            docs). They are joined with ``" and "``. An empty list applies no
            filter at all.
        client: If falsy, the result is persisted with ``out_df.persist()``.
            Otherwise it must expose ``daskclient.persist``, which is used
            instead.
        repartition: If true, reduce the number of partitions according to how
            many rows survived the filter and persist again.

    Returns:
        The persisted, filtered dataframe.
    """

    def _persist(frame):
        # Single place deciding between cluster-side and default persistence.
        if client:
            return client.daskclient.persist(frame)
        return frame.persist()

    if query_list:
        compiled_query = " and ".join(query_list)
        out_df = df.query(compiled_query)
    else:
        # An empty query string is not a valid expression; treat it as "keep all".
        out_df = df
    out_df = _persist(out_df)

    if repartition:
        init_size = len(df)
        final_size = len(out_df)
        if final_size == 0:
            # Nothing survived the filter: a single (empty) partition is enough
            # and avoids dividing by zero below.
            n_partitions = 1
        else:
            ratio = init_size // final_size
            n_partitions = (df.npartitions // ratio) + 1
        out_df = _persist(out_df.repartition(npartitions=n_partitions))

    return out_df


def get_first_cell_timepoint(df):
    """Return, for every "Global CellID", the row with the smallest "timepoints".

    Rows are selected by index label, so the output keeps the input's columns
    and has one row per cell id present in ``df``.
    """
    timepoints_by_cell = df.groupby(["Global CellID"])["timepoints"]
    first_row_labels = timepoints_by_cell.idxmin().tolist()
    return df.loc[first_row_labels]


def get_last_cell_timepoint(df):
    """Return, for every "Global CellID", the row with the largest "timepoints".

    Rows are selected by index label, so the output keeps the input's columns
    and has one row per cell id present in ``df``.
    """
    timepoints_by_cell = df.groupby(["Global CellID"])["timepoints"]
    last_row_labels = timepoints_by_cell.idxmax().tolist()
    return df.loc[last_row_labels]


def get_first_last_cell_dfs(df, persist=False):
    """Build the per-cell first-timepoint and last-timepoint dataframes.

    The selection runs independently on every partition of ``df``.

    NOTE: this function requires that the input df has partitions aligned
    with trenchids, so that mother and siblings are in the same partition.
    Because each partition is grouped on its own, a cell whose rows are split
    across partitions would yield one first/last row per partition.

    Args:
        df: Dataframe supporting ``map_partitions`` (e.g. a dask dataframe).
        persist: If true, persist both results before returning.

    Returns:
        Tuple ``(init_cells, fin_cells)``.
    """
    first_and_last = [
        df.map_partitions(get_first_cell_timepoint),
        df.map_partitions(get_last_cell_timepoint),
    ]
    if persist:
        first_and_last = [frame.persist() for frame in first_and_last]
    init_cells, fin_cells = first_and_last
    return init_cells, fin_cells


def get_df_from_series_index(df, delayed_series, partition_info=None):
    """Select rows of ``df`` by the labels stored in one entry of ``delayed_series``.

    Hack to avoid automatic partition alignment in map_partitions; it allows
    looking up a mismatched index.

    Args:
        df: Dataframe (one partition) to select rows from.
        delayed_series: Sequence indexed by partition number; the entry for
            this partition must offer ``tolist()`` returning index labels.
        partition_info: Mapping with a "number" key identifying the current
            partition. Although it defaults to ``None``, a mapping is required.

    Returns:
        ``df.loc`` applied to the list of labels, in the order they are listed.
    """
    partition_number = partition_info["number"]
    lookup_labels = delayed_series[partition_number].tolist()
    return df.loc[lookup_labels]


def get_relative_dfs(query_df, reference_df, persist_relatives=False):
    """Collect, for every query cell, the rows describing its relatives.

    Args:
        query_df: Cells of interest.
        reference_df: Cells from which relatives are looked up.
        persist_relatives: If true, persist the four relative dataframes.

    Returns:
        Tuple ``(init_cells, fin_cells, mother_df, sister_df, daughter_1_df,
        daughter_2_df)``:

        * ``init_cells`` / ``fin_cells``: first / last timepoint row of each
          query cell, indexed by "Global CellID" and persisted.
        * ``mother_df``: last-timepoint reference rows looked up by the query
          cells' "Mother CellID".
        * ``sister_df``: first-timepoint reference rows looked up by
          "Sister CellID".
        * ``daughter_1_df`` / ``daughter_2_df``: first-timepoint reference rows
          looked up by "Daughter CellID 1" / "Daughter CellID 2" of the query
          cells' last rows.

        The four relative frames are returned with their index reset.
    """

    def _index_by_cellid(frame):
        # Keep the previous index as a column, then re-key on the global cell id.
        rekeyed = frame.reset_index(drop=False)
        rekeyed = rekeyed.set_index("Global CellID", sorted=False)
        return rekeyed.persist()

    def _lookup(source_df, id_series):
        # The id partitions are passed as delayed objects; the per-partition
        # function picks the one matching its own partition number.
        looked_up = dd.map_partitions(
            get_df_from_series_index,
            source_df,
            id_series.to_delayed(),
            meta=source_df.head()[:0],
        )
        return looked_up.reset_index(drop=False)

    query_first, query_last = get_first_last_cell_dfs(query_df, persist=False)
    ref_first, ref_last = get_first_last_cell_dfs(reference_df, persist=False)

    init_cells = _index_by_cellid(query_first)
    fin_cells = _index_by_cellid(query_last)
    cell_min_tpt_df = _index_by_cellid(ref_first)
    cell_max_tpt_df = _index_by_cellid(ref_last)

    # Mothers are taken at their last timepoint, everyone else at their first.
    mother_df = _lookup(cell_max_tpt_df, init_cells["Mother CellID"])
    sister_df = _lookup(cell_min_tpt_df, init_cells["Sister CellID"])
    daughter_1_df = _lookup(cell_min_tpt_df, fin_cells["Daughter CellID 1"])
    daughter_2_df = _lookup(cell_min_tpt_df, fin_cells["Daughter CellID 2"])

    if persist_relatives:
        mother_df = mother_df.persist()
        sister_df = sister_df.persist()
        daughter_1_df = daughter_1_df.persist()
        daughter_2_df = daughter_2_df.persist()

    return init_cells, fin_cells, mother_df, sister_df, daughter_1_df, daughter_2_df


def get_init_and_final_size(
    query_df,
    reference_df,
    size_metrics=[
        "area",
        "major_axis_length",
        "minor_axis_length",
        "Volume",
        "Surface Area",
    ],
):
    """Estimate birth size, division size and their difference per size metric.

    For every metric except "minor_axis_length":

    * birth size = cell / (sister + cell) * sqrt((cell + sister) * mother),
      using the cell's and sister's first rows and the mother's last row;
    * division size = sqrt((daughter 1 + daughter 2) * cell), using the
      daughters' first rows and the cell's last row.

    "minor_axis_length" uses the cell's own first and last values unchanged.

    Args:
        query_df: Cells of interest.
        reference_df: All cells that may be retrieved (mothers, sisters,
            daughters).
        size_metrics: Column names to process.

    Returns:
        Tuple ``(init_cells_noidx, fin_cells_noidx, adjusted_init_size,
        adjusted_final_size, adjusted_del_size)``; the last three are dicts
        keyed by metric name.

    NOTE(review): the arithmetic combines series coming from different
    dataframes; it presumably relies on their rows lining up after
    ``reset_index`` -- confirm.
    """
    relatives = get_relative_dfs(query_df, reference_df)
    init_cells, fin_cells = relatives[0], relatives[1]
    mother_df, sister_df, daughter_1_df, daughter_2_df = relatives[2:]

    init_cells_noidx = init_cells.reset_index(drop=False).persist()
    fin_cells_noidx = fin_cells.reset_index(drop=False).persist()

    adjusted_init_size, adjusted_final_size, adjusted_del_size = {}, {}, {}

    ### Inefficient, but not sure how to avoid
    for metric in size_metrics:
        birth_raw = init_cells_noidx[metric]
        division_raw = fin_cells_noidx[metric]

        if metric != "minor_axis_length":
            sister_raw = sister_df[metric]
            # Geometric mean of the mother's last size and the summed newborn sizes.
            mother_at_division = ((birth_raw + sister_raw) * mother_df[metric]) ** (
                1 / 2
            )
            birth_share = birth_raw / (sister_raw + birth_raw)
            birth_size = birth_share * mother_at_division
            # Geometric mean of the cell's last size and the summed daughter sizes.
            division_size = (
                (daughter_1_df[metric] + daughter_2_df[metric]) * division_raw
            ) ** (1 / 2)
        else:
            birth_size, division_size = birth_raw, division_raw

        adjusted_init_size[metric] = birth_size
        adjusted_final_size[metric] = division_size
        adjusted_del_size[metric] = division_size - birth_size

    return (
        init_cells_noidx,
        fin_cells_noidx,
        adjusted_init_size,
        adjusted_final_size,
        adjusted_del_size,
    )


# def get_promoter_synthesis_rate(cellid_groupby, intensity_label, size_metric_label):
#     del_intensity_series = cellid_groupby[intensity_label].apply(lambda x: x.values[1:]-x.values[:-1]).to_frame(name="del intensity")
#     mean_intensity_series = cellid_groupby[intensity_label].apply(lambda x: (x.values[1:]+x.values[:-1])/2).to_frame(name="mean intensity")
#     del_size_series = cellid_groupby[size_metric_label].apply(lambda x: x.values[1:]-x.values[:-1]).to_frame(name="del size")
#     mean_size_series = cellid_groupby[size_metric_label].apply(lambda x: (x.values[1:]+x.values[:-1])/2).to_frame(name="mean size")
#     pro_syn_df = dd.concat([del_intensity_series,mean_intensity_series,del_size_series,mean_size_series],axis=1)
#     promoter_activity_series = pro_syn_df.apply(lambda x: np.nanmedian(x["del intensity"] + (x["mean intensity"]*(x["del size"]/x["mean size"]))), axis=1, meta=float)
#     return promoter_activity_series


def get_growth_and_division_stats(
    query_df,
    reference_df,
    delta_t_min=4,
    size_metrics=[
        "area",
        "major_axis_length",
        "minor_axis_length",
        "Volume",
        "Surface Area",
    ],
):
    """Compute per-cell birth/division sizes, growth rates and cycle duration.

    Args:
        query_df: Cells of interest (one row per cell per timepoint).
        reference_df: All cells from which relatives may be retrieved.
        delta_t_min: Used in two places: growth rates are divided by it and
            multiplied by 60 (labelled "per hr" in the code), and cells whose
            "Delta t" is below it are discarded.
            NOTE(review): "Delta t" is a difference of "timepoints" values, so
            the filter compares timepoint counts with this value -- confirm
            that both uses are intended to share one number.
        size_metrics: Size columns to process.

    Returns:
        Dataframe indexed by "Global CellID" built from each query cell's first
        row, with added columns "Delta: <m>", "Birth: <m>", "Division: <m>",
        "Mean: <m>", "Mean Linear Growth Rate: <m>",
        "Mean Exponential Growth Rate: <m>" for every metric, plus
        "final timepoints", "Delta t" and "Mean: mCherry Intensity".
        "timepoints" is renamed to "initial timepoints"; "time (s)" and
        "Trenchid Timepoint Index" are dropped.

    Unlike earlier revisions, ``query_df`` itself is left untouched: the
    temporary sort key is added to a derived frame rather than to the input.
    """
    (
        init_cells_noidx,
        fin_cells_noidx,
        adjusted_init_size,
        adjusted_final_size,
        adjusted_del_size,
    ) = get_init_and_final_size(query_df, reference_df, size_metrics=size_metrics)

    for size_metric in size_metrics:
        init_cells_noidx["Delta: " + size_metric] = adjusted_del_size[
            size_metric
        ].persist()
        init_cells_noidx["Birth: " + size_metric] = adjusted_init_size[
            size_metric
        ].persist()
        init_cells_noidx["Division: " + size_metric] = adjusted_final_size[
            size_metric
        ].persist()

    init_cells_noidx["final timepoints"] = fin_cells_noidx["timepoints"]
    del_t = init_cells_noidx["final timepoints"] - init_cells_noidx["timepoints"]
    init_cells_noidx["Delta t"] = del_t

    init_cells = init_cells_noidx.set_index("Global CellID", sorted=True)

    # Composite integer key (cell id followed by zero-padded timepoint). Sorting
    # on it orders rows by cell and then by time, which lets the second
    # set_index declare the frame as already sorted by "Global CellID".
    # NOTE(review): the 4-digit padding keeps this ordering only while
    # timepoints stay below 10000.
    sort_key = query_df.apply(
        lambda x: int(f'{int(x["Global CellID"]):04}{int(x["timepoints"]):04}'), axis=1
    )
    # assign() returns a new frame, so the caller's query_df is not modified.
    keyed_query_df = query_df.assign(**{"Global CellID-timepoints Index": sort_key})
    query_df_cellid_sorted = (
        keyed_query_df.reset_index(drop=False)
        .set_index("Global CellID-timepoints Index", sorted=False)
        .set_index("Global CellID", sorted=True)
        .persist()
    )

    for size_metric in size_metrics:  # Haven't decided between mean and median
        # Mean frame-to-frame increment of the metric within each cell.
        mean_cell_size_metric_linear_gr = query_df_cellid_sorted.groupby(
            "Global CellID"
        )[size_metric].apply(lambda x: np.nanmean(x[1:].values - x[:-1].values))
        mean_cell_size_metric_linear_gr = (
            mean_cell_size_metric_linear_gr / delta_t_min
        ) * 60  # size unit per hr
        # Mean increment relative to the midpoint of consecutive values.
        mean_cell_size_metric_exp_gr = query_df_cellid_sorted.groupby("Global CellID")[
            size_metric
        ].apply(
            lambda x: np.nanmean(
                (2 * (x[1:].values - x[:-1].values)) / (x[1:].values + x[:-1].values)
            )
        )
        mean_cell_size_metric_exp_gr = (
            mean_cell_size_metric_exp_gr / delta_t_min
        ) * 60  # exponential size unit per hr
        mean_cell_size_metric = query_df_cellid_sorted.groupby("Global CellID")[
            size_metric
        ].apply(lambda x: np.nanmean(x.values))

        init_cells["Mean: " + size_metric] = mean_cell_size_metric.persist()
        init_cells[
            "Mean Linear Growth Rate: " + size_metric
        ] = mean_cell_size_metric_linear_gr.persist()
        init_cells[
            "Mean Exponential Growth Rate: " + size_metric
        ] = mean_cell_size_metric_exp_gr.persist()

    mean_mchy_intensity = query_df_cellid_sorted.groupby("Global CellID")[
        "mCherry mean_intensity"
    ].apply(lambda x: np.nanmean(x.values))
    init_cells["Mean: mCherry Intensity"] = mean_mchy_intensity.persist()

    ## Filtering by cell cycle length to eliminate artifact
    init_cells = init_cells[init_cells["Delta t"] >= delta_t_min]
    init_cells = init_cells.rename(columns={"timepoints": "initial timepoints"})
    init_cells = init_cells.drop(["time (s)", "Trenchid Timepoint Index"], axis=1)

    # TODO: a per-trench aggregation (lists of the per-cell statistics grouped
    # by "trenchid") and a volume-normalised mCherry promoter activity were
    # sketched here but never enabled; only the per-cell table is returned.

    return init_cells

### Import Lineage

In [None]:
# Lazily open the lineage table written by the growth/division pipeline.
lineage_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Growth_Division/lineage"
)

##temp fix
# Force both cell-id columns to integers (presumably they are stored with a
# different dtype in the parquet files -- confirm).
lineage_df["CellID"] = lineage_df["CellID"].astype(int)
lineage_df["Global CellID"] = lineage_df["Global CellID"].astype(int)

In [None]:
def hrm_find_mode(series, max_iter=1000, min_binsize=50):
    """Estimate the mode of ``series`` by repeatedly keeping its denser half-range.

    Each pass splits the current values at the midpoint between their minimum
    and maximum and keeps the side holding more values (ties keep the lower
    side). From the second pass on, the search stops once fewer than
    ``min_binsize`` values remain or the midpoint no longer changes, and the
    mean of the remaining values is returned.

    Args:
        series: Values supporting boolean-mask indexing and ``len`` (e.g. a
            pandas Series or numpy array).
        max_iter: Maximum number of halving passes.
        min_binsize: Stop once the retained subset is smaller than this.

    Returns:
        The mean of the retained values. If ``max_iter`` passes are used up
        without meeting a stopping condition, the mean of the values retained
        so far is returned (previously this case returned ``None``). An empty
        input yields ``nan``.
    """
    working_series = series
    if len(working_series) == 0:
        return np.nan

    last_midpoint = None
    for i in range(max_iter):
        range_max, range_min = np.max(working_series), np.min(working_series)
        midpoint = (range_max + range_min) / 2
        above_middle = working_series[working_series > midpoint]
        below_middle = working_series[working_series <= midpoint]

        # Keep the denser half; ties go to the lower half.
        if len(above_middle) > len(below_middle):
            working_series = above_middle
        else:
            working_series = below_middle

        # The stopping rule is skipped on the first pass, so at least two
        # halvings happen whenever max_iter allows it.
        if i > 0 and (
            len(working_series) < min_binsize or last_midpoint == midpoint
        ):
            return np.mean(working_series)

        last_midpoint = midpoint

    # Iteration budget exhausted: report the best estimate instead of None.
    return np.mean(working_series)


def bootstrap_hrm(series, n_bootstraps=100, n_per_bootstrap=100):
    """Average the half-range mode over repeated random subsamples of ``series``.

    Args:
        series: pandas Series of values.
        n_bootstraps: Number of subsamples to draw.
        n_per_bootstrap: Number of values per subsample.

    Returns:
        The mean of the ``hrm_find_mode`` estimates, or ``nan`` when ``series``
        is empty.
    """
    if len(series) == 0:
        return np.nan

    # Sampling without replacement raises when more rows are requested than
    # exist, so small inputs fall back to sampling with replacement. Inputs
    # with at least n_per_bootstrap rows are sampled exactly as before.
    needs_replacement = len(series) < n_per_bootstrap
    modes = [
        hrm_find_mode(series.sample(n=n_per_bootstrap, replace=needs_replacement))
        for _ in range(n_bootstraps)
    ]
    return np.mean(modes)

### Variables over FOV

In [None]:
# Columns of the lineage table that receive the FOV and time corrections below.
values_to_rescale = [
    "mCherry mean_intensity",
    "area",
    "major_axis_length",
    "minor_axis_length",
    "Volume",
    "Surface Area",
]

In [None]:
# In-memory 1% random sample of the rows with timepoints < 20, used to
# estimate the per-FOV scaling.
lineage_df_subsample = (
    lineage_df[lineage_df["timepoints"] < 20].sample(frac=0.01).compute()
)

In [None]:
# Per-FOV correction: for every variable, take the median per FOV in the
# subsample, normalise by the largest median, plot the scaling, and divide the
# full lineage column by its FOV's scaling factor.
fig = plt.figure(figsize=(30, 20))
# Plot titles, in the same order as values_to_rescale.
values_names = [
    "Mean mCherry Intensity",
    "Area",
    "Major Axis Length",
    "Minor Axis Length",
    "Volume",
    "Surface Area",
]
for i, label in enumerate(values_to_rescale):
    fov_series_groupby = lineage_df_subsample.groupby("fov")[label]
    fov_median_series = fov_series_groupby.apply(lambda x: np.median(x)).sort_index()
    # Scaling factors are at most 1.0: the FOV with the largest median maps to 1.
    fov_correction_series = fov_median_series / np.max(fov_median_series)
    fov_correction_dict = fov_correction_series.to_dict()
    plt.subplot(2, 3, i + 1)
    plt.plot(fov_correction_series)
    plt.title(values_names[i], fontsize=22)
    plt.xlabel("FOV #", fontsize=18)
    plt.ylabel("Scaling", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(0.0, 1.0)
    # NOTE(review): the dict lookup raises KeyError for any FOV that is absent
    # from the subsample -- confirm every FOV is represented.
    label_scaling = lineage_df["fov"].apply(lambda x: fov_correction_dict[x]).persist()
    lineage_df[label + ": FOV Corrected"] = (
        lineage_df[label] / label_scaling
    ).persist()
plt.savefig("FOV_correction.png", dpi=500)

### Variables over time

In [None]:
# Earlier hard-coded version of the list below (four variables only):
# values_to_rescale = ['mCherry mean_intensity: FOV Corrected','area: FOV Corrected', 'major_axis_length: FOV Corrected', 'minor_axis_length: FOV Corrected']
# Names of the FOV-corrected columns; these are the inputs of the time correction.
values_to_rescale_step_2 = [value + ": FOV Corrected" for value in values_to_rescale]

In [None]:
# In-memory 1% random sample across all timepoints (replaces the earlier
# subsample), used to estimate the per-timepoint scaling.
lineage_df_subsample = lineage_df.sample(frac=0.01).compute()

In [None]:
# Per-timepoint correction: for every FOV-corrected variable, estimate the mode
# per timepoint with bootstrap_hrm on the subsample, normalise by the largest
# mode, plot the scaling, and divide the full lineage column by its
# timepoint's scaling factor.
fig = plt.figure(figsize=(30, 20))
# Plot titles, in the same order as values_to_rescale_step_2.
values_names = [
    "Mean mCherry Intensity",
    "Area",
    "Major Axis Length",
    "Minor Axis Length",
    "Volume",
    "Surface Area",
]
for i, label in enumerate(values_to_rescale_step_2):
    time_series_groupby = lineage_df_subsample.groupby("timepoints")[label]
    time_mode_series = time_series_groupby.apply(
        lambda x: bootstrap_hrm(x)
    ).sort_index()
    # Scaling factors are at most 1.0: the timepoint with the largest mode maps to 1.
    time_correction_series = time_mode_series / np.max(time_mode_series)
    time_correction_dict = time_correction_series.to_dict()
    plt.subplot(2, 3, i + 1)
    plt.plot(time_correction_series)
    plt.title(values_names[i], fontsize=22)
    # NOTE(review): the axis label says 3 min steps while
    # get_growth_and_division_stats defaults to delta_t_min=4 -- confirm the frame interval.
    plt.xlabel("Timepoint (3 min steps)", fontsize=18)
    plt.ylabel("Scaling", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(0.0, 1.0)
    # NOTE(review): the dict lookup raises KeyError for any timepoint that is
    # absent from the subsample -- confirm every timepoint is represented.
    label_scaling = lineage_df["timepoints"].apply(lambda x: time_correction_dict[x])
    lineage_df[label + ": Time Corrected"] = (
        lineage_df[label] / label_scaling
    ).persist()
plt.savefig("Time_correction.png", dpi=500)

The half-sample mode (HSM) method [2] iteratively divides the data set into samples half the size of the original set and keeps the half-sample with the minimum range, where range is defined as the difference between the maximum and the minimum value of the sample. The method terminates when the half-sample contains three or fewer data points; the average of these remaining values is taken as the mode. The half-range mode (HRM) method [2] is similar but keeps the sub-sample lying in the densest half-range, where range is defined as the absolute difference between the maximum and the minimum values in a sample. Of these two methods, only the HRM was used in this study because HRM has been shown to have lower bias with increasing contamination and asymmetry [2].

### Overwrite Variables with Correction

In [None]:
# Replace each raw column by its FOV- and time-corrected counterpart created
# in the two correction cells above.
for label in values_to_rescale:
    lineage_df[label] = lineage_df[label + ": FOV Corrected: Time Corrected"]

In [None]:
# Keep only the listed bookkeeping columns plus the (now corrected) measurement
# columns; this drops the intermediate ": FOV Corrected" / ": Time Corrected"
# columns. The result is persisted.
lineage_df = lineage_df[
    [
        "fov",
        "row",
        "trench",
        "timepoints",
        "time (s)",
        "lane orientation",
        "y (local)",
        "x (local)",
        "File Index",
        "File Trench Index",
        "trenchid",
        "Trenchid Timepoint Index",
        "CellID",
        "Global CellID",
        "Trench Score",
        "Mother CellID",
        "Daughter CellID 1",
        "Daughter CellID 2",
        "Sister CellID",
        "Centroid X",
        "Centroid Y",
        "FOV Parquet Index",
    ]
    + values_to_rescale
].persist()

### Growth/Div Function

In [None]:
# Reference set: every row whose trench score is below -75.
# (filter_df already persists its result, so the trailing .persist() is redundant.)
reference_df = filter_df(
    lineage_df, ["`Trench Score` < -75"], client=dask_controller, repartition=False
).persist()
# Query set: same trench-score cut, restricted to cells whose mother, sister
# and both daughter ids differ from -1.
query_df = filter_df(
    lineage_df,
    [
        "`Mother CellID` != -1",
        "`Daughter CellID 1` != -1",
        "`Daughter CellID 2` != -1",
        "`Sister CellID` != -1",
        "`Trench Score` < -75",
    ],
    client=dask_controller,
    repartition=False,
).persist()
init_cells = get_growth_and_division_stats(query_df, reference_df)

# Drop the notebook-level names of the large inputs; only init_cells is used below.
del reference_df
del query_df
del lineage_df

#### Get Trench Mapping

In [None]:
# Kymograph metadata folders of the phenotype (growth/division) run and of the barcode run.
phenotype_kymopath = "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Growth_Division/kymograph/metadata"
barcode_kymopath = "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/Barcodes/kymograph/metadata"

# Mapping used below as {barcode trenchid: phenotype trenchid}.
trenchid_map = tr.files_to_trenchid_map(phenotype_kymopath, barcode_kymopath)

In [None]:
# Phenotype trenchids that have at least one cell in init_cells.
init_df_idx = init_cells["trenchid"].unique().compute().tolist()
# Barcode rows whose trenchid has an entry in the trench mapping (pulled into memory).
valid_barcode_df = barcode_df[
    barcode_df["trenchid"].isin(trenchid_map.keys())
].compute()
# Translate barcode trenchids into phenotype trenchids.
barcode_df_mapped_trenchids = valid_barcode_df["trenchid"].apply(
    lambda x: trenchid_map[x]
)
# Keep only barcode rows whose mapped trench actually appears in init_cells.
valid_init_df_indices = barcode_df_mapped_trenchids.isin(init_df_idx)
barcode_df_mapped_trenchids = barcode_df_mapped_trenchids[valid_init_df_indices]
final_valid_barcode_df_indices = barcode_df_mapped_trenchids.index.to_list()
called_df = barcode_df.loc[final_valid_barcode_df_indices]
called_df["phenotype trenchid"] = barcode_df_mapped_trenchids
# Re-key the barcode rows on the phenotype trenchid and bring them into memory.
called_df = (
    called_df.reset_index()
    .set_index("phenotype trenchid", drop=True, sorted=False)
    .compute()
)
# called_df = called_df.repartition(npartitions=1).persist()
# Give init_cells the same key so both tables can be merged on their index.
init_cells = init_cells.rename(columns={"trenchid": "phenotype trenchid"})
init_cells = (
    init_cells.reset_index()
    .set_index("phenotype trenchid", drop=True, sorted=False)
    .compute()
)
# Inner merge: only cells in trenches with a called barcode survive.
init_cells = init_cells.merge(called_df, how="inner", left_index=True, right_index=True)
init_cells = init_cells.drop(["Barcode Signal"], axis=1)
# Restore "Global CellID" as the sorted index and hand the table back to dask.
init_cells = init_cells.reset_index().set_index("Global CellID")
init_cells = init_cells.sort_index()
final_output_df = dd.from_pandas(init_cells, npartitions=200).persist()

In [None]:
# Write the merged per-cell table to parquet, replacing any existing output at this path.
final_output_df.to_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/2021-09-24_lDE20_Lineage_Analysis",
    engine="fastparquet",
    overwrite=True,
)

In [None]:
# output_df_trenchid_idx = trenchid_df.reset_index(drop=False).set_index("trenchid",sorted=True)

# init_df_idx = init_cells["trenchid"].unique().compute().tolist()
# valid_barcode_df = barcode_df[barcode_df["trenchid"].isin(trenchid_map.keys())].compute()
# barcode_df_mapped_trenchids = valid_barcode_df["trenchid"].apply(lambda x: trenchid_map[x])
# valid_init_df_indices = barcode_df_mapped_trenchids.isin(init_df_idx)
# barcode_df_mapped_trenchids = barcode_df_mapped_trenchids[valid_init_df_indices]
# final_valid_barcode_df_indices = barcode_df_mapped_trenchids.index.to_list()
# called_df = barcode_df.loc[final_valid_barcode_df_indices]
# called_df["phenotype trenchid"] = barcode_df_mapped_trenchids
# # called_df = called_df.set_index("phenotype trenchid")
# final_output_df = trenchid_df.loc[called_df.index.compute().tolist()].join(called_df)
# final_output_df["phenotype trenchid"] = final_output_df.index
# final_output_df = final_output_df.reset_index(drop=True).set_index("File Parquet Index",sorted=True)

# del final_output_df["Barcode Signal"]
# final_output_df = final_output_df.compute()

In [None]:
# Re-open the table written above as a lazy dask dataframe.
final_output_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-06-14_lDE20_biofloat_fullrun_1/2021-09-24_lDE20_Lineage_Analysis"
)

In [None]:
# Pull the rows with index label up to 1000000 into memory for inspection.
test = final_output_df.loc[:1000000].compute()

In [None]:
# Inspect the column names of the reloaded table.
test.columns

In [None]:
# Inspect one of the corrected size columns.
test["major_axis_length"]

In [None]:
# Shut the dask controller down once the analysis is finished.
dask_controller.shutdown()