## Mapping Barcodes and Cleaning Data

In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da

import scipy.stats
from sklearn.linear_model import LinearRegression

from matplotlib import pyplot as plt
import holoviews as hv

hv.extension("bokeh")

In [None]:
headpath = "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/barcodes"

In [None]:
dask_controller = tr.trcluster.dask_controller(
    walltime="04:00:00",
    local=False,
    n_workers=50,
    death_timeout=5.0,
    memory="16GB",
    working_directory="/home/de64/scratch/de64/temp/dask",
)
dask_controller.startdask()

In [None]:
dask_controller.displaydashboard()

#### Import Barcode Dataframe

In [None]:
meta_handle = tr.pandas_hdf5_handler(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/barcodes/metadata.hdf5"
)
pandas_barcode_df = meta_handle.read_df("barcodes", read_metadata=True)
barcode_df = dd.from_pandas(pandas_barcode_df, npartitions=500, sort=True)
barcode_df = barcode_df.persist()

In [None]:
ttl_called = len(barcode_df.index)
ttl_trenches = pandas_barcode_df.metadata["Total Trenches"]
ttl_trenches_w_cells = pandas_barcode_df.metadata["Total Trenches With Cells"]
percent_called = ttl_called / ttl_trenches
percent_called_w_cells = ttl_called / ttl_trenches_w_cells

In [None]:
print(ttl_called)
print(ttl_trenches)
print(ttl_trenches_w_cells)
print(percent_called)
print(percent_called_w_cells)

### Import Analysis 

In [None]:
# Open the analysis parquet dataset as a dask dataframe.
analysis_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/GFP/analysis"
)
# `tail(1)` returns the final row as a pandas object; take the scalar with
# `.iloc[0]` because calling int() on a one-element Series is deprecated in
# recent pandas releases.
last_trenchid = int(analysis_df.tail(1)["trenchid"].iloc[0])

In [None]:
import skimage as sk


def hrm_find_mode(series, max_iter=1000, min_binsize=50):
    """Estimate the mode of ``series`` by repeatedly halving its value range.

    On every pass the current values are split at the midpoint between their
    minimum and maximum, and the more populated half is kept (ties keep the
    lower half). From the second pass onward the search stops as soon as fewer
    than ``min_binsize`` values remain or the midpoint no longer moves, and the
    mean of the surviving values is returned.

    Args:
        series: Values supporting boolean-mask indexing (the callers in this
            file pass pandas Series).
        max_iter: Upper bound on the number of halving passes.
        min_binsize: Stop once the retained half holds fewer values than this.

    Returns:
        Mean of the values left when the search stops. If ``max_iter`` is
        exhausted before a stop condition is met, the mean of whatever is left
        at that point is returned (previously this fell through and yielded
        ``None``).
    """
    working_series = series
    last_midpoint = None
    for i in range(max_iter):
        range_max, range_min = np.max(working_series), np.min(working_series)
        midpoint = (range_max + range_min) / 2
        above_middle = working_series[working_series > midpoint]
        below_middle = working_series[working_series <= midpoint]

        # Keep the denser half; a tie deliberately resolves to the lower half.
        if len(above_middle) > len(below_middle):
            working_series = above_middle
        else:
            working_series = below_middle

        # The stop conditions are only evaluated from the second pass onward,
        # matching the original behaviour (at least two halvings are performed
        # whenever max_iter allows it).
        if i > 0 and (
            len(working_series) < min_binsize or last_midpoint == midpoint
        ):
            return np.mean(working_series)

        last_midpoint = midpoint

    # Iteration budget exhausted: report the best estimate instead of None.
    return np.mean(working_series)


def bootstrap_hrm(series, n_bootstraps=100, max_n_per_bootstrap=100):
    """Average ``hrm_find_mode`` over repeated random subsamples of ``series``.

    Args:
        series: Object exposing ``len()`` and ``.sample(n=...)`` (a pandas
            Series in the callers visible in this file).
        n_bootstraps: Number of subsamples to draw.
        max_n_per_bootstrap: Cap on the size of each subsample.

    Returns:
        Mean of the per-subsample mode estimates.
    """
    # A draw can never be larger than the data itself, so clamp the size.
    draw_size = min(len(series), max_n_per_bootstrap)
    estimates = [
        hrm_find_mode(series.sample(n=draw_size)) for _ in range(n_bootstraps)
    ]
    return np.mean(estimates)


def get_GFPpos_modes(
    GFP_series, series_groupby, frac=0.01, n_bootstraps=100, max_n_per_bootstrap=100
):
    """Per-group bootstrap mode of the values above a triangle threshold.

    Args:
        GFP_series: Lazy series from which a random fraction is sampled and
            computed to derive the threshold.
        series_groupby: Lazy grouped series; each group is filtered to values
            above the threshold before its mode is estimated.
        frac: Fraction of ``GFP_series`` sampled for the threshold estimate.
        n_bootstraps: Forwarded to ``bootstrap_hrm``.
        max_n_per_bootstrap: Forwarded to ``bootstrap_hrm``.

    Returns:
        Computed per-group mode estimates, sorted by group key.
    """
    # Only the sampled fraction is pulled into local memory for thresholding.
    sampled_vals = GFP_series.sample(frac=frac).compute()
    tri_thr = sk.filters.threshold_triangle(sampled_vals)

    def _mode_above_threshold(group):
        # Restrict the group to values above the threshold before estimating.
        return bootstrap_hrm(
            group[group > tri_thr],
            n_bootstraps=n_bootstraps,
            max_n_per_bootstrap=max_n_per_bootstrap,
        )

    per_group_modes = series_groupby.apply(_mode_above_threshold)
    return per_group_modes.compute().sort_index()

### Variables over FOV

In [None]:
analysis_df

In [None]:
analysis_df_nobkd = analysis_df[analysis_df["Objectid"] != 0]

In [None]:
values_to_rescale = ["RFP-Penta mean_intensity", "GFP-Penta mean_intensity"]

In [None]:
fig = plt.figure(figsize=(30, 20))
values_names = ["Median mCherry Intensity", "Median GFPmut2 Intensity"]

for i, label in enumerate(values_to_rescale):
    fov_series_groupby = analysis_df_nobkd.groupby("fov")[label]
    if label == "RFP-Penta mean_intensity":
        fov_mode_series = (
            fov_series_groupby.apply(
                lambda x: bootstrap_hrm(x, max_n_per_bootstrap=100)
            )
            .compute()
            .sort_index()
        )
    elif label == "GFP-Penta mean_intensity":
        fov_mode_series = get_GFPpos_modes(
            analysis_df["GFP-Penta mean_intensity"],
            fov_series_groupby,
            max_n_per_bootstrap=100,
        )
    else:
        print("Weird Label")

    fov_correction_series = fov_mode_series / np.max(fov_mode_series)
    fov_correction_dict = fov_correction_series.to_dict()
    plt.subplot(2, 3, i + 1)
    plt.plot(fov_correction_series)
    plt.title(values_names[i], fontsize=22)
    plt.xlabel("FOV #", fontsize=18)
    plt.ylabel("Scaling", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(0.0, 1.0)
    label_scaling = analysis_df["fov"].apply(lambda x: fov_correction_dict[x]).persist()
    analysis_df[label + ": FOV Corrected"] = (
        analysis_df[label] / label_scaling
    ).persist()
plt.savefig("FOV_correction.png", dpi=500)

### Variables over time

In [None]:
analysis_df_nobkd = analysis_df[analysis_df["Objectid"] != 0]
values_to_rescale_step_2 = [value + ": FOV Corrected" for value in values_to_rescale]

In [None]:
# add real time later when fixed

In [None]:
# Estimate a per-timepoint scaling factor for each FOV-corrected intensity
# column, plot it, and add a ": Time Corrected" column to analysis_df.
fig = plt.figure(figsize=(30, 20))
values_names = ["Median mCherry Intensity", "Median GFPmut2 Intensity"]

for i, label in enumerate(values_to_rescale_step_2):
    time_series_groupby = analysis_df_nobkd.groupby("timepoints")[label]
    if label == "RFP-Penta mean_intensity: FOV Corrected":
        time_mode_series = (
            time_series_groupby.apply(
                lambda x: bootstrap_hrm(x, max_n_per_bootstrap=100)
            )
            .compute()
            .sort_index()
        )
    elif label == "GFP-Penta mean_intensity: FOV Corrected":
        # Derive the GFP-positive threshold from the same FOV-corrected column
        # that is being grouped; the raw column lives on a different scale than
        # the values the threshold is compared against.
        time_mode_series = get_GFPpos_modes(
            analysis_df[label],
            time_series_groupby,
            max_n_per_bootstrap=100,
        )
    else:
        # No mode series exists for an unexpected label; skip it rather than
        # reuse the previous iteration's series (or hit a NameError).
        print("Weird Label")
        continue

    # Normalise so the brightest timepoint has a scaling factor of 1.
    time_correction_series = time_mode_series / np.max(time_mode_series)
    time_correction_dict = time_correction_series.to_dict()
    plt.subplot(2, 3, i + 1)
    plt.plot(time_correction_series)
    plt.title(values_names[i], fontsize=22)
    plt.xlabel("Timepoint (3 min steps)", fontsize=18)
    plt.ylabel("Scaling", fontsize=18)
    plt.xticks(fontsize=18)
    plt.yticks(fontsize=18)
    plt.ylim(0.0, 1.2)
    # Look up each row's scaling factor by its timepoint and divide it out.
    label_scaling = analysis_df["timepoints"].apply(lambda x: time_correction_dict[x])
    analysis_df[label + ": Time Corrected"] = (
        analysis_df[label] / label_scaling
    ).persist()
plt.savefig("Time_correction.png", dpi=500)

### Overwrite Variables with Correction

In [None]:
for label in values_to_rescale:
    analysis_df[label] = analysis_df[label + ": FOV Corrected: Time Corrected"]

In [None]:
analysis_df = analysis_df[
    [
        "File Index",
        "File Trench Index",
        "timepoints",
        "Objectid",
        "centroid_y",
        "centroid_x",
        "area",
        "fov",
        "row",
        "trench",
        "time (s)",
        "lane orientation",
        "y (local)",
        "x (local)",
        "y (global)",
        "x (global)",
        "trenchid",
        "Trenchid Timepoint Index",
    ]
    + values_to_rescale
].persist()

In [None]:
analysis_df

### GFP Quantification Function

In [None]:
def local_background_subtract(series, intensity_key):
    """Subtract a group's background intensity from every row in that group.

    The background level is the ``intensity_key`` value of the first row whose
    ``"Objectid"`` equals 0.

    Args:
        series: Table of rows with an ``"Objectid"`` column and an
            ``intensity_key`` column (despite the name, it is indexed like a
            DataFrame).
        intensity_key: Name of the intensity column to correct.

    Returns:
        Dict mapping each row's index label to its background-subtracted value.

    Raises:
        IndexError: If the group contains no row with ``Objectid == 0``.
    """
    is_background = series["Objectid"] == 0
    background_level = series[is_background][intensity_key].iloc[0]
    return (series[intensity_key] - background_level).to_dict()

In [None]:
analysis_df_trenchtimepoint_sorted = (
    analysis_df.reset_index(drop=False)
    .set_index("Trenchid Timepoint Index", sorted=False)
    .persist()
)
analysis_df_trenchtimepoint_groupby = analysis_df_trenchtimepoint_sorted.groupby(
    "Trenchid Timepoint Index"
)

In [None]:
gfp_intensity_wo_bkd = (
    analysis_df_trenchtimepoint_groupby.apply(
        lambda x: local_background_subtract(x, "GFP-Penta mean_intensity"),
        meta=("GFP-Penta mean_intensity", float),
    )
    .compute()
    .reset_index(drop=True)
    .to_list()
)
gfp_intensity_wo_bkd = {k: v for d in gfp_intensity_wo_bkd for k, v in d.items()}
gfp_intensity_wo_bkd = pd.DataFrame.from_dict(
    gfp_intensity_wo_bkd, orient="index", columns=["GFP-Penta mean_intensity_wo_bkd"]
)

In [None]:
mchy_intensity_wo_bkd = (
    analysis_df_trenchtimepoint_groupby.apply(
        lambda x: local_background_subtract(x, "RFP-Penta mean_intensity"),
        meta=("RFP-Penta mean_intensity", float),
    )
    .compute()
    .reset_index(drop=True)
    .to_list()
)
mchy_intensity_wo_bkd = {k: v for d in mchy_intensity_wo_bkd for k, v in d.items()}
mchy_intensity_wo_bkd = pd.DataFrame.from_dict(
    mchy_intensity_wo_bkd, orient="index", columns=["RFP-Penta mean_intensity_wo_bkd"]
)

In [None]:
analysis_df = analysis_df.join([mchy_intensity_wo_bkd, gfp_intensity_wo_bkd]).persist()

In [None]:
analysis_df_nobkd = analysis_df[analysis_df["Objectid"] != 0]
analysis_df_nobkd["Object Parquet Index"] = analysis_df_nobkd.apply(
    lambda x: int(
        f"{x['File Index']:04n}{x['File Trench Index']:04n}{x['timepoints']:04n}{x['Objectid']:02n}"
    ),
    axis=1,
)
analysis_df_nobkd = analysis_df_nobkd.set_index("Object Parquet Index")

In [None]:
ratio_series = (
    analysis_df_nobkd["GFP-Penta mean_intensity_wo_bkd"]
    / analysis_df_nobkd["RFP-Penta mean_intensity_wo_bkd"]
)
analysis_df_nobkd["gfp/mchy Ratio"] = ratio_series

In [None]:
trenchid_groupby = analysis_df_nobkd.groupby("trenchid")
median_ratio = trenchid_groupby["gfp/mchy Ratio"].apply(np.median).compute()
median_ratio = median_ratio.sort_index()

In [None]:
threshold = 1.0

dark_gfp = median_ratio < threshold
perc_gfp = 1.0 - (np.sum(dark_gfp) / len(median_ratio))
print(perc_gfp)

plt.hist(
    median_ratio[median_ratio < threshold],
    range=(0, 10),
    bins=50,
    color="grey",
    label="Measured Dark GFP",
    density=True,
)
plt.hist(
    median_ratio[median_ratio > threshold],
    range=(0, 10),
    bins=50,
    color="green",
    label="Measured GFP",
    density=True,
)
plt.xlabel("Mean Intensity Ratio", fontsize=26)
plt.xticks(fontsize=26)
plt.yticks(fontsize=26)
plt.legend(fontsize=26)
# plt.savefig("./GFP_Threshold_Distribution_1.png",dpi=300,bbox_inches="tight")
plt.show()

In [None]:
# write this...
# reference_df = filter_df(lineage_df,["`Trench Score` < -75"],client=dask_controller,repartition=False).persist()
# query_df = filter_df(lineage_df,["`Mother CellID` != -1","`Daughter CellID 1` != -1","`Daughter CellID 2` != -1",\
#                                               "`Sister CellID` != -1","`Trench Score` < -75"],client=dask_controller,repartition=False).persist()
# init_cells = get_growth_and_division_stats(query_df,reference_df)

# del reference_df
# del query_df
# del lineage_df

#### Compute Call Rate

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd

from matplotlib import pyplot as plt

from matplotlib.ticker import FuncFormatter

sns.set()
sns.set_style("ticks")

In [None]:
# Re-open the analysis parquet dataset as a dask dataframe.
analysis_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/GFP/analysis"
)
# `tail(1)` returns the final row as a pandas object; take the scalar with
# `.iloc[0]` because calling int() on a one-element Series is deprecated in
# recent pandas releases.
last_trenchid = int(analysis_df.tail(1)["trenchid"].iloc[0])

In [None]:
# `barcode_df` is the dask copy of the barcode table at this point; the
# element-wise Python comparisons and the `.metadata` attribute used below are
# only available on the original pandas frame, so read from that instead.
dark_gfp_calls = pandas_barcode_df["dark_gfp"].tolist()
ttl_true = np.sum([item == True for item in dark_gfp_calls])
ttl_false = np.sum([item == False for item in dark_gfp_calls])
ttl_none = np.sum([item == "Unknown" for item in dark_gfp_calls])
# A barcode counts as "called" when its dark_gfp entry is True or False.
ttl_called = ttl_true + ttl_false
ttl_trenches = pandas_barcode_df.metadata["Total Trenches"]
ttl_trenches_w_signal = pandas_barcode_df.metadata["Total Trenches With Cells"]
percent_called = ttl_called / ttl_trenches
percent_called_w_signal = ttl_called / ttl_trenches_w_signal

# NOTE(review): these divide by the last trenchid, not a trench count; if
# trenchids start at 0 the denominator is one short. Confirm.
percent_called_w_gfp_call = ttl_called / last_trenchid
percent_signal_w_gfp_call = ttl_trenches_w_signal / last_trenchid

In [None]:
percent_called

In [None]:
percent_called_w_signal

In [None]:
percent_called_w_gfp_call

In [None]:
percent_signal_w_gfp_call

#### Estimating Error from Recall Rate

The math for this is in the scanned document; the estimate below uses the empirical Hamming distance distribution.

In [None]:
N = 29
p_err_out = 1.0 - percent_called_w_signal
H_vals, H_counts = np.unique(
    barcode_df["Closest Hamming Distance"].values, return_counts=True
)
p_Hdist = H_counts / np.sum(H_counts)

In [None]:
coeffs = [-p_err_out]
for idx, j in enumerate(H_vals):
    bin_coeff = sp.special.binom(N, j)
    coeff = bin_coeff - p_Hdist[idx]
    coeffs.append(coeff)
coeffs = np.array(coeffs)[::-1]
roots = np.roots(coeffs)
epsilon = np.real(roots[~np.iscomplex(roots)])[0]

In [None]:
p_err_in = 0.0
for idx, j in enumerate(H_vals):
    p_err_in += p_Hdist[idx] * (epsilon**j)

In [None]:
print("Epsilon Estimate: " + str(epsilon))
print("P(error in) Estimate: " + str(p_err_in))

Guessing epsilon based on the single-bit error rate (using soft matching)

In [None]:
output = []
eps_range = np.linspace(0.0, 0.2)
for epsilon in eps_range:
    #     epsilon = 0.05
    p_err_in = 0.0
    for idx, j in enumerate(H_vals):
        p_err_in += p_Hdist[idx] * (epsilon**j)
    output.append(p_err_in * 100)


plt.plot(eps_range, output)
plt.xlabel("Epsilon", fontsize=20)
plt.ylabel("Error Rate (%)", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

#### Get trenchwise GFP signal

In [None]:
mchy_df = analysis_df[analysis_df["Intensity Channel"] == "RFP-Penta"]
mchy_groupby = mchy_df.groupby(["trenchid", "timepoints"])

gfp_df = analysis_df[analysis_df["Intensity Channel"] == "GFP-Penta"]
gfp_groupby = gfp_df.groupby(["trenchid", "timepoints"])

gfp_intensity_wo_bkd = (
    gfp_groupby.apply(
        lambda x: (
            x["mean_intensity"] - x[x["Objectid"] == 0]["mean_intensity"].iloc[0]
        ).to_dict(),
        meta=("mean_intensity", float),
    )
    .reset_index(drop=True)
    .compute()
    .to_list()
)
gfp_intensity_wo_bkd = {k: v for d in gfp_intensity_wo_bkd for k, v in d.items()}
gfp_intensity_wo_bkd = pd.DataFrame.from_dict(
    gfp_intensity_wo_bkd, orient="index", columns=["mean_intensity_wo_bkd"]
)
gfp_df = gfp_df.join(gfp_intensity_wo_bkd).persist()
del gfp_intensity_wo_bkd

mchy_intensity_wo_bkd = (
    mchy_groupby.apply(
        lambda x: (
            x["mean_intensity"] - x[x["Objectid"] == 0]["mean_intensity"].iloc[0]
        ).to_dict(),
        meta=("mean_intensity", float),
    )
    .reset_index(drop=True)
    .compute()
    .to_list()
)
mchy_intensity_wo_bkd = {k: v for d in mchy_intensity_wo_bkd for k, v in d.items()}
mchy_intensity_wo_bkd = pd.DataFrame.from_dict(
    mchy_intensity_wo_bkd, orient="index", columns=["mean_intensity_wo_bkd"]
)
mchy_df = mchy_df.join(mchy_intensity_wo_bkd).persist()
del mchy_intensity_wo_bkd

gfp_df_nobkd = gfp_df[gfp_df["Objectid"] != 0]
gfp_df_nobkd["Object Parquet Index"] = gfp_df_nobkd.apply(
    lambda x: int(
        f"{x['File Index']:04}{x['File Trench Index']:04}{x['timepoints']:04}{x['Objectid']:02}"
    ),
    axis=1,
)
gfp_df_nobkd = gfp_df_nobkd.set_index("Object Parquet Index")

mchy_df_nobkd = mchy_df[mchy_df["Objectid"] != 0]
mchy_df_nobkd["Object Parquet Index"] = mchy_df_nobkd.apply(
    lambda x: int(
        f"{x['File Index']:04}{x['File Trench Index']:04}{x['timepoints']:04}{x['Objectid']:02}"
    ),
    axis=1,
)
mchy_df_nobkd = mchy_df_nobkd.set_index("Object Parquet Index")

ratio_series = (
    gfp_df_nobkd["mean_intensity_wo_bkd"] / mchy_df_nobkd["mean_intensity_wo_bkd"]
)
gfp_df_nobkd["gfp/mchy Ratio"] = ratio_series

trenchid_groupby = gfp_df_nobkd.groupby("trenchid")
median_ratio = trenchid_groupby["gfp/mchy Ratio"].apply(np.median).compute()
median_ratio = median_ratio.sort_index()

In [None]:
plt.hist(
    median_ratio,
    range=(0, 10),
    bins=50,
    color="green",
    label="Measured Dark GFP",
    density=True,
)
plt.xlabel("Mean Intensity Ratio", fontsize=26)
plt.xticks(fontsize=26)
plt.yticks(fontsize=26)
plt.legend(fontsize=26)
# plt.savefig("./GFP_Intensity_Ratio_Dist.png",dpi=300,bbox_inches="tight")
plt.show()

#### Apply GFP Signal Threshold

In [None]:
threshold = 1.0

dark_gfp = median_ratio < threshold
perc_gfp = 1.0 - (np.sum(dark_gfp) / len(median_ratio))
print(perc_gfp)

plt.hist(
    median_ratio[median_ratio < threshold],
    range=(0, 10),
    bins=50,
    color="grey",
    label="Measured Dark GFP",
    density=True,
)
plt.hist(
    median_ratio[median_ratio > threshold],
    range=(0, 10),
    bins=50,
    color="green",
    label="Measured GFP",
    density=True,
)
plt.xlabel("Mean Intensity Ratio", fontsize=26)
plt.xticks(fontsize=26)
plt.yticks(fontsize=26)
plt.legend(fontsize=26)
# plt.savefig("./GFP_Threshold_Distribution_1.png",dpi=300,bbox_inches="tight")
plt.show()

#### Get Trench Mapping

In [None]:
gfp_kymo_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/GFP/kymograph/metadata"
)
barcode_kymo_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/barcodes/kymograph/metadata"
)

max_gfp_tpt = gfp_kymo_df.loc[:1000]["timepoints"].max().compute()
min_barcode_tpt = barcode_kymo_df.loc[:1000]["timepoints"].min().compute()

last_gfp_tpt_df = gfp_kymo_df[gfp_kymo_df["timepoints"] == max_gfp_tpt].compute()
first_barcode_tpt_df = barcode_kymo_df[
    barcode_kymo_df["timepoints"] == min_barcode_tpt
].compute()

trenchid_map = tr.get_trenchid_map(first_barcode_tpt_df, last_gfp_tpt_df)

#### Get GFP Call Error and Recovery Rate

In [None]:
barcode_df["Measured Dark GFP"] = barcode_df.apply(
    tr.map_Series, axis=1, args=(dark_gfp, trenchid_map)
)
barcode_df["Measured GFP Ratio"] = barcode_df.apply(
    tr.map_Series, axis=1, args=(median_ratio, trenchid_map)
)
called_df = barcode_df[barcode_df["Measured Dark GFP"] != "Unknown"]
ttl_correct = np.sum(called_df["dark_gfp"] == called_df["Measured Dark GFP"])
ttl_called = len(called_df)
recovery_rate = len(called_df) / len(dark_gfp)
n_barcodes = called_df["barcodeid"].nunique()
n_trenches = called_df["trenchid"].nunique()

In [None]:
print("Error Rate:" + str(1.0 - ttl_correct / ttl_called))
print("Recovery Rate:" + str(recovery_rate))
print("Unique Barcodes:" + str(n_barcodes))
print("Total Trenches:" + str(n_trenches))

In [None]:
gfp_kymo_df

In [None]:
gfp_kymo_idx = gfp_kymo_df["trenchid"].unique().compute().tolist()
valid_barcode_df = barcode_df[
    barcode_df["trenchid"].isin(trenchid_map.keys())
].compute()
barcode_df_mapped_trenchids = valid_barcode_df["trenchid"].apply(
    lambda x: trenchid_map[x]
)
valid_init_df_indices = barcode_df_mapped_trenchids.isin(gfp_kymo_idx)
barcode_df_mapped_trenchids = barcode_df_mapped_trenchids[valid_init_df_indices]
final_valid_barcode_df_indices = barcode_df_mapped_trenchids.index.to_list()
called_df = barcode_df.loc[final_valid_barcode_df_indices]
called_df["phenotype trenchid"] = barcode_df_mapped_trenchids
called_df = (
    called_df.reset_index()
    .set_index("phenotype trenchid", drop=True, sorted=False)
    .compute()
)
# called_df = called_df.repartition(npartitions=1).persist()

In [None]:
init_cells = init_cells.rename(columns={"trenchid": "phenotype trenchid"})
init_cells = (
    init_cells.reset_index()
    .set_index("phenotype trenchid", drop=True, sorted=False)
    .compute()
)
init_cells = init_cells.merge(called_df, how="inner", left_index=True, right_index=True)
init_cells = init_cells.drop(["Barcode Signal"], axis=1)
init_cells = init_cells.reset_index().set_index("Global CellID")
init_cells = init_cells.sort_index()
final_output_df = dd.from_pandas(init_cells, npartitions=200).persist()

In [None]:
complete_barcode_df = barcode_df[barcode_df["trenchid"].isin(trenchid_map.keys())]

In [None]:
complete_barcode_df["kymograph"] = complete_barcode_df.apply(
    lambda x: trenchid_map[x["trenchid"]], axis=1
)

In [None]:
fig = plt.figure(figsize=(12, 8))

plt.hist(
    called_df[called_df["dark_gfp"] == True]["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="grey",
    alpha=0.7,
    label="Predicted Dark GFP",
    density=False,
)
plt.hist(
    called_df[called_df["dark_gfp"] == False]["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="green",
    alpha=0.7,
    label="Predicted GFP",
    density=False,
)
plt.xlabel("Lineage GFP/mCherry Intensity Ratio", fontsize=26)
plt.ylabel("Lineages", fontsize=26)
plt.xticks(fontsize=26)
plt.yticks(fontsize=26)
plt.savefig("./GFP_Threshold_Distribution_2.svg", dpi=300, bbox_inches="tight")
plt.show()

#### Get Confusion Matrix

In [None]:
def get_confusion_mat(df):
    """Return overall, false-positive and false-negative error fractions.

    ``df["dark_gfp"]`` is compared with ``df["Measured Dark GFP"]``. The
    "positive" class is GFP-on, i.e. ``False`` in both columns. Rows whose
    entries are neither ``True`` nor ``False`` fall into none of the four
    cells and are therefore left out of the denominator.

    Args:
        df: Table with ``"dark_gfp"`` and ``"Measured Dark GFP"`` columns.

    Returns:
        Tuple ``(error, FP_error, FN_error)``, each a fraction of the rows
        that landed in one of the four confusion-matrix cells.
    """
    predicted = df["dark_gfp"]
    measured = df["Measured Dark GFP"]

    true_pos = np.sum((predicted == False) & (measured == False))
    true_neg = np.sum((predicted == True) & (measured == True))
    false_pos = np.sum((predicted == False) & (measured == True))
    false_neg = np.sum((predicted == True) & (measured == False))

    n_total = true_pos + true_neg + false_pos + false_neg
    n_wrong = false_pos + false_neg
    return n_wrong / n_total, false_pos / n_total, false_neg / n_total

In [None]:
error, FP_error, FN_error = get_confusion_mat(called_df)
print("Error: " + str(error))
print("FP error: " + str(FP_error))
print("FN error: " + str(FN_error))

In [None]:
# Sweep the minimum "Closest Hamming Distance" required of a barcode and record
# the resulting error rate (in percent) and the number of rows that survive.
hamming_filters = list(range(1, 5))
hamming_n_barcodes = []
hamming_errors = []
for i in hamming_filters:
    filtered_df = called_df[called_df["Closest Hamming Distance"] >= i]
    n_barcode = len(filtered_df)
    error, FP_error, FN_error = get_confusion_mat(filtered_df)
    error = np.round(100 * error, decimals=2)
    hamming_errors.append(error)
    hamming_n_barcodes.append(n_barcode)

fig = plt.figure(figsize=(8, 6))

# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(
    x=hamming_filters, y=hamming_errors, linewidth=4, marker="o", markersize=15
)
plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, _: int(x)))
plt.xlabel("Minimum Hamming Distance", fontsize=26)
plt.ylabel("Error Rate (%)", fontsize=26)
plt.xticks([1, 2, 3, 4], fontsize=26)
plt.yticks(fontsize=26)
plt.ylim(0.0, 1.0)
plt.savefig("./Hamming_Dist_vs_Error.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Same error-vs-Hamming-distance curve, drawn on a full 0-100 % axis.
fig = plt.figure(figsize=(8, 6))


# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(
    x=hamming_filters, y=hamming_errors, linewidth=4, marker="o", markersize=15
)
plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, _: int(x)))
plt.xlabel("Minimum Hamming Distance", fontsize=26)
plt.ylabel("Error Rate (%)", fontsize=26)
plt.xticks([1, 2, 3, 4], fontsize=26)
plt.yticks(fontsize=26)
plt.ylim(-5.0, 100.0)

sns.despine()

plt.savefig("./Hamming_Dist_vs_Error_big.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Rows retained at each Hamming cutoff, as a percentage of the count at the
# loosest cutoff (the first entry of hamming_n_barcodes).
perc_lib = 100 * (np.array(hamming_n_barcodes) / hamming_n_barcodes[0])

fig = plt.figure(figsize=(8, 6))

# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(x=hamming_filters, y=perc_lib, linewidth=4, marker="o", markersize=15)
plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, _: int(x)))
plt.ylim(0, 110)
plt.xlabel("Minimum Hamming Distance", fontsize=26)
plt.ylabel("Percent of Library", fontsize=26)
plt.xticks([1, 2, 3, 4], fontsize=26)
plt.yticks(
    [0, 20, 40, 60, 80, 100],
    fontsize=26,
)
plt.savefig("./Hamming_Dist_vs_Lib_Size.svg", dpi=300, bbox_inches="tight")
plt.show()

#### Varying The GFP Threshold

In [None]:
called_df

In [None]:
negative_threshold = 1.0
called_negative_df = called_df[called_df["Measured GFP Ratio"] < negative_threshold]

In [None]:
TN = np.sum(
    (called_negative_df["dark_gfp"] == True)
    & (called_negative_df["Measured Dark GFP"] == True)
)
FP = np.sum(
    (called_negative_df["dark_gfp"] == False)
    & (called_negative_df["Measured Dark GFP"] == True)
)

In [None]:
# Fraction of the rows in called_negative_df whose barcode also predicted dark
# GFP. Only TN and FP are computed for that subset; `FN` exists solely as a
# local variable inside get_confusion_mat, so referencing it here raised a
# NameError.
precision = TN / (TN + FP)

In [None]:
FP / (TN + FP)

In [None]:
def get_confusion_mat(df):
    """Return overall, false-positive and false-negative error fractions.

    ``df["dark_gfp"]`` is compared with ``df["Measured Dark GFP"]``; the
    "positive" class is GFP-on (``False`` in both columns). Rows whose entries
    are neither ``True`` nor ``False`` are counted in no cell and so do not
    contribute to the denominator.

    Args:
        df: Table with ``"dark_gfp"`` and ``"Measured Dark GFP"`` columns.

    Returns:
        Tuple ``(error, FP_error, FN_error)`` as fractions of the rows that
        fell into one of the four confusion-matrix cells.
    """

    def _count(dark_gfp_value, measured_value):
        # Number of rows matching one (dark_gfp, measured) combination.
        return np.sum(
            (df["dark_gfp"] == dark_gfp_value)
            & (df["Measured Dark GFP"] == measured_value)
        )

    n_tp = _count(False, False)
    n_tn = _count(True, True)
    n_fp = _count(False, True)
    n_fn = _count(True, False)

    denominator = n_tp + n_tn + n_fp + n_fn
    return (n_fp + n_fn) / denominator, n_fp / denominator, n_fn / denominator

In [None]:
def get_gmm_params(values):
    """Fit a two-component Gaussian mixture to 1-D data.

    Args:
        values: NumPy array of observations; it is reshaped to a single
            feature column before fitting.

    Returns:
        Tuple ``(means, stds)`` of length-2 arrays, in the order the fitted
        model stores its components (not sorted by mean). ``stds`` is the
        square root of each component's variance.
    """
    # Import the submodule explicitly: a bare `import sklearn` is not guaranteed
    # to expose `sklearn.mixture` as an attribute on every scikit-learn release.
    from sklearn.mixture import GaussianMixture

    gmm = GaussianMixture(n_components=2, n_init=10)
    gmm.fit(values.reshape(-1, 1))
    # covariances_ is indexed [component, 0, 0] to pull out each variance.
    return gmm.means_[:, 0], ((gmm.covariances_) ** (1 / 2))[:, 0, 0]

In [None]:
test_std = 0.5

means, stds = get_gmm_params(called_df["Measured GFP Ratio"].values)
# Order the two components so that index 0 is the lower-mean peak.
if means[0] > means[1]:
    means = means[::-1]
    stds = stds[::-1]

# Bounds sit `test_std` standard deviations from each peak. Scale by each
# component's own sigma (as the later n_std sweep does) rather than adding the
# multiplier itself to the mean.
upper_bound = means + stds * test_std
lower_bound = means - stds * test_std

# Keep rows below the lower peak's upper bound or above the upper peak's lower
# bound; everything in between is excluded.
valid_dark = called_df["Measured GFP Ratio"] < upper_bound[0]
valid_gfp = called_df["Measured GFP Ratio"] > lower_bound[1]
valid = valid_dark | valid_gfp

filtered_df = called_df[valid]
filtered_df_complement = called_df[~valid]

plt.title(f"{test_std} Standard Deviations", fontsize=20)
plt.hist(
    filtered_df_complement["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="grey",
    alpha=0.7,
    label="Excluded by Filter",
    density=False,
)
plt.hist(
    filtered_df["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="red",
    alpha=0.7,
    label="Retained by Filter",
    density=False,
)
plt.show()

In [None]:
test_std = 1.0

means, stds = get_gmm_params(called_df["Measured GFP Ratio"].values)
# Order the two components so that index 0 is the lower-mean peak.
if means[0] > means[1]:
    means = means[::-1]
    stds = stds[::-1]

# Bounds sit `test_std` standard deviations from each peak. Scale by each
# component's own sigma (as the later n_std sweep does) rather than adding the
# multiplier itself to the mean.
upper_bound = means + stds * test_std
lower_bound = means - stds * test_std

# Keep rows below the lower peak's upper bound or above the upper peak's lower
# bound; everything in between is excluded.
valid_dark = called_df["Measured GFP Ratio"] < upper_bound[0]
valid_gfp = called_df["Measured GFP Ratio"] > lower_bound[1]
valid = valid_dark | valid_gfp

filtered_df = called_df[valid]
filtered_df_complement = called_df[~valid]

plt.title(f"{test_std} Standard Deviations", fontsize=20)
plt.hist(
    filtered_df_complement["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="grey",
    alpha=0.7,
    label="Excluded by Filter",
    density=False,
)
plt.hist(
    filtered_df["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="red",
    alpha=0.7,
    label="Retained by Filter",
    density=False,
)
plt.show()

In [None]:
test_std = 1.5

means, stds = get_gmm_params(called_df["Measured GFP Ratio"].values)
# Order the two components so that index 0 is the lower-mean peak.
if means[0] > means[1]:
    means = means[::-1]
    stds = stds[::-1]

# Bounds sit `test_std` standard deviations from each peak. Scale by each
# component's own sigma (as the later n_std sweep does) rather than adding the
# multiplier itself to the mean.
upper_bound = means + stds * test_std
lower_bound = means - stds * test_std

# Keep rows below the lower peak's upper bound or above the upper peak's lower
# bound; everything in between is excluded.
valid_dark = called_df["Measured GFP Ratio"] < upper_bound[0]
valid_gfp = called_df["Measured GFP Ratio"] > lower_bound[1]
valid = valid_dark | valid_gfp

filtered_df = called_df[valid]
filtered_df_complement = called_df[~valid]

plt.title(f"{test_std} Standard Deviations", fontsize=20)
plt.hist(
    filtered_df_complement["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="grey",
    alpha=0.7,
    label="Excluded by Filter",
    density=False,
)
plt.hist(
    filtered_df["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="red",
    alpha=0.7,
    label="Retained by Filter",
    density=False,
)
plt.show()

In [None]:
import seaborn as sns

sns.set()

n_std = np.linspace(0, 2, 20)
n_barcodes = []
errors = []
FP_errors = []
FN_errors = []

means, stds = get_gmm_params(called_df["Measured GFP Ratio"].values)

if means[0] > means[1]:
    means = means[::-1]
    stds = stds[::-1]

for i in n_std:
    upper_bound = means + stds * i
    lower_bound = means - stds * i

    #     valid_dark = (called_df_barcodes["Measured Median GFP"] < upper_bound[0]) &\
    #     (called_df_barcodes["Measured Median GFP"] > lower_bound[0])
    #     valid_gfp = (called_df_barcodes["Measured Median GFP"] < upper_bound[1]) &\
    #     (called_df_barcodes["Measured Median GFP"] > lower_bound[1])
    #     valid = valid_dark|valid_gfp
    valid_dark = called_df["Measured GFP Ratio"] < upper_bound[0]
    valid_gfp = called_df["Measured GFP Ratio"] > lower_bound[1]
    valid = valid_dark | valid_gfp

    filtered_df = called_df[valid]
    n_barcode = len(filtered_df)
    error, FP_error, FN_error = get_confusion_mat(filtered_df)
    error = np.round(100 * error, decimals=2)
    FP_error = np.round(100 * FP_error, decimals=2)
    FN_error = np.round(100 * FN_error, decimals=2)
    errors.append(error)
    FP_errors.append(FP_error)
    FN_errors.append(FN_error)
    n_barcodes.append(n_barcode)

In [None]:
# Error rates versus the number of sigmas kept around each mixture peak.
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally.
sns.lineplot(x=n_std, y=errors, linewidth=5, label="Error")
sns.lineplot(x=n_std, y=FP_errors, linewidth=5, label="FP Error")
sns.lineplot(x=n_std, y=FN_errors, linewidth=5, label="FN Error")
plt.xticks(fontsize=20)
plt.yticks(
    fontsize=20,
)
plt.ylim(0.0, 1.0)
# Raw string: "\s" is not a valid escape sequence in a normal string literal.
plt.xlabel(r"N $\sigma$s Around Peak", fontsize=20)
plt.ylabel("Error Rate (%)", fontsize=20)
plt.legend(fontsize=20)
plt.savefig("./GFP_Error_vs_Sigma.png", dpi=300, bbox_inches="tight")
plt.show()


# Number of rows passing the filter at each sigma setting.
sns.lineplot(x=n_std, y=n_barcodes, linewidth=5)
plt.xticks(fontsize=20)
plt.yticks(
    fontsize=20,
)
plt.xlabel(r"N $\sigma$s Around Peak", fontsize=20)
plt.ylabel("Library Size Past Filter", fontsize=20)
plt.savefig("./Library_Size_vs_Sigma.png", dpi=300, bbox_inches="tight")
plt.show()

#### Sources of error

There are around twice the number of false negatives (predicted to be a Dark GFP, but measured as bright) as there are false positives (predicted to be GFP, but measured as dark).

Some theories for these error classes:

False Positives:
    
    - Mutations in the promoter (should be constant within barcodes)
    
    - Strain variation (should be lower when averaging among strains)
    
    - Misread of barcodes
    
False Negatives:
    
    - Bleed from adjacent cells (should be corrected by averaging among strains)
    
    - Multiple strains per trench (?)
    
    - Misread of barcodes

#### Median GFP Approach

In [None]:
median_gfp_df = called_df.groupby("Barcode").apply(
    lambda x: x["Measured GFP Ratio"].median()
)

In [None]:
plt.hist(
    median_gfp_df[median_gfp_df < threshold],
    range=(0, 20),
    bins=50,
    color="grey",
    label="Measured Dark GFP",
)
plt.hist(
    median_gfp_df[median_gfp_df > threshold],
    range=(0, 20),
    bins=50,
    color="green",
    label="Measured GFP",
)
plt.xlabel("Mean Intensity Ratio", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
plt.savefig("./Pooled_correction.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Build a one-row-per-barcode table: index by "Barcode", attach the per-barcode
# median ratio (aligned on that index), then keep each barcode's first row.
# The former bare `reset_index(drop=False)` call was removed: its result was
# discarded, so it had no effect.
called_df_barcodes = called_df.set_index(["Barcode"]).sort_index()
called_df_barcodes["Measured Median GFP"] = median_gfp_df
called_df_barcodes = called_df_barcodes.groupby("Barcode").apply(lambda x: x.iloc[0])

In [None]:
ttl_correct = np.sum(
    called_df_barcodes["dark_gfp"]
    == (called_df_barcodes["Measured Median GFP"] < threshold)
)
ttl_called = len(called_df_barcodes)
print("Percent Correct:" + str(ttl_correct / ttl_called))

### Error With One Mismatch (Hamming Distance Up to 1, Eliminate Bad Bits)

#### Import Barcode Dataframe

In [None]:
meta_handle = tr.pandas_hdf5_handler(
    "/home/de64/scratch/de64/sync_folder/2021-03-07_lDE15/barcodes/barcode_df_hamming_1.hdf5"
)
barcode_df = meta_handle.read_df("barcodes", read_metadata=True)

#### Compute Call Rate

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd

from matplotlib import pyplot as plt

In [None]:
# Open the analysis parquet dataset of the 2021-03-07 run as a dask dataframe.
analysis_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-03-07_lDE15/gfp/analysis"
)
# `tail(1)` returns the final row as a pandas object; take the scalar with
# `.iloc[0]` because calling int() on a one-element Series is deprecated in
# recent pandas releases.
last_trenchid = int(analysis_df.tail(1)["trenchid"].iloc[0])

In [None]:
ttl_true = np.sum([item == True for item in barcode_df["dark_gfp"].tolist()])
ttl_false = np.sum([item == False for item in barcode_df["dark_gfp"].tolist()])
ttl_none = np.sum([item == "Unknown" for item in barcode_df["dark_gfp"].tolist()])
ttl_called = ttl_true + ttl_false
ttl_trenches = barcode_df.metadata["Total Trenches"]
ttl_trenches_w_signal = barcode_df.metadata["Total Trenches With Cells"]
percent_called = ttl_called / ttl_trenches
percent_called_w_signal = ttl_called / ttl_trenches_w_signal

percent_called_w_gfp_call = ttl_called / last_trenchid
percent_signal_w_gfp_call = ttl_trenches_w_signal / last_trenchid

In [None]:
percent_called

In [None]:
percent_called_w_signal

In [None]:
percent_called_w_gfp_call

In [None]:
percent_signal_w_gfp_call

#### Get GFP Call Error and Recovery Rate

In [None]:
barcode_df["Measured Dark GFP"] = barcode_df.apply(
    tr.map_Series, axis=1, args=(dark_gfp, trenchid_map)
)
barcode_df["Measured GFP Ratio"] = barcode_df.apply(
    tr.map_Series, axis=1, args=(median_ratio, trenchid_map)
)
called_df = barcode_df[barcode_df["Measured Dark GFP"] != "Unknown"]
ttl_correct = np.sum(called_df["dark_gfp"] == called_df["Measured Dark GFP"])
ttl_called = len(called_df)
recovery_rate = len(called_df) / len(dark_gfp)
n_barcodes = called_df["barcodeid"].nunique()
n_trenches = called_df["trenchid"].nunique()

In [None]:
print("Error Rate:" + str(1.0 - ttl_correct / ttl_called))
print("Recovery Rate:" + str(recovery_rate))
print("Unique Barcodes:" + str(n_barcodes))
print("Total Trenches:" + str(n_trenches))

In [None]:
fig = plt.figure(figsize=(12, 8))

plt.hist(
    called_df[called_df["dark_gfp"] == True]["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="grey",
    alpha=0.7,
    label="Predicted Dark GFP",
    density=False,
)
plt.hist(
    called_df[called_df["dark_gfp"] == False]["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="green",
    alpha=0.7,
    label="Predicted GFP",
    density=False,
)
plt.xlabel("Lineage GFP/mCherry Intensity Ratio", fontsize=20)
plt.ylabel("Lineages", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
plt.savefig(
    "./GFP_Threshold_Distribution_2_hamming_1.png", dpi=300, bbox_inches="tight"
)
plt.show()

In [None]:
dask_controller.shutdown()