In [None]:
import paulssonlab.deaton.trenchripper.trenchripper as tr

import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd
import dask.array as da

from matplotlib.ticker import FuncFormatter

import scipy.stats
from sklearn.linear_model import LinearRegression

from matplotlib import pyplot as plt
import holoviews as hv

# Load the HoloViews "bokeh" plotting extension.
hv.extension("bokeh")

In [None]:
# Create the trenchripper Dask controller with the cluster settings below
# (local=False) and start it.
dask_controller = tr.trcluster.dask_controller(
    n_workers=50,
    memory="16GB",
    walltime="04:00:00",
    death_timeout=5.0,
    local=False,
    working_directory="/home/de64/scratch/de64/temp/dask",
)
dask_controller.startdask()

In [None]:
# Display the dashboard of the controller started above.
dask_controller.displaydashboard()

In [None]:
# NOTE(review): on a top-to-bottom run this shuts the controller down before the
# .persist()/.compute() calls in the cells below. Presumably this cell is only
# meant to be run by hand when the cluster needs a restart — confirm.
dask_controller.shutdown()

In [None]:
# Open the filtered lDE15 analysis output (parquet, pyarrow engine) as a Dask dataframe.
output_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-10-21_lDE15_Final_1/2021-11-17_lDE15_Analysis_Filtered",
    engine="pyarrow",
)

In [None]:
# Re-index the analysis table by "trenchid" and persist it on the cluster.
output_df_trenchid_sorted = output_df.reset_index()
output_df_trenchid_sorted = output_df_trenchid_sorted.set_index(
    "trenchid", drop=True, sorted=False
)
output_df_trenchid_sorted = output_df_trenchid_sorted.persist()

# Reduce every trench to the NaN-ignoring median of its "gfp/mchy Ratio".
output_df_trenchid_groupby = output_df_trenchid_sorted.groupby("trenchid")
trench_ratio_groups = output_df_trenchid_groupby["gfp/mchy Ratio"]
median_ratio = trench_ratio_groups.apply(np.nanmedian).compute()

In [None]:
# Trenches whose median ratio is below this value are called "dark".
threshold = 1.0

dark_gfp = median_ratio < threshold
perc_gfp = 1.0 - (np.sum(dark_gfp) / len(median_ratio))
print(perc_gfp)

fig = plt.figure(figsize=(12, 8))
# Both populations share the same binning. The bright population uses ">=" so a
# trench sitting exactly on the threshold (which `dark_gfp` classifies as GFP)
# is not dropped from both histograms; NaN medians are still excluded.
for ratios, color, label in (
    (median_ratio[median_ratio < threshold], "grey", "Measured Dark GFP"),
    (median_ratio[median_ratio >= threshold], "green", "Measured GFP"),
):
    plt.hist(
        ratios,
        range=(0, 5),
        bins=50,
        color=color,
        label=label,
        density=True,
        log=True,
    )
plt.xlabel("Mean Intensity Ratio", fontsize=26)
plt.xticks(fontsize=26)
plt.yticks(fontsize=26)
plt.legend(fontsize=26)
# plt.savefig("./GFP_Threshold_Distribution_1.png",dpi=300,bbox_inches="tight")
plt.show()

In [None]:
# Attach the per-trench median ratio and the dark/bright call as new columns.
# Presumably the pandas Series are aligned on the current "trenchid" index —
# TODO confirm against Dask's assignment semantics.
output_df_trenchid_sorted["Measured Median GFP Ratio"] = median_ratio
output_df_trenchid_sorted["Measured Dark GFP"] = dark_gfp
# NOTE(review): this rebinds output_df_trenchid_sorted to a frame indexed by
# "File Parquet Index", so re-running this cell on its own starts from a
# differently indexed table than the first run did.
output_df_trenchid_sorted = output_df_trenchid_sorted.reset_index().set_index(
    "File Parquet Index", drop=True, sorted=False
)
# Keep the first row of every trench, then index the result by "trenchid".
output_single_trenchid_df = (
    output_df_trenchid_sorted.groupby("trenchid")
    .apply(lambda x: x.iloc[0])
    .set_index("trenchid")
)
# Materialise the one-row-per-trench table as an in-memory result.
output_single_trenchid_df = output_single_trenchid_df.compute()

In [None]:
# called_df = output_single_barcode_df[output_single_barcode_df["Measured Dark GFP"]!="Unknown"]
# Count the trenches whose `dark_gfp` label equals the measured dark call.
calls_agree = (
    output_single_trenchid_df["dark_gfp"]
    == output_single_trenchid_df["Measured Dark GFP"]
)
ttl_correct = np.sum(calls_agree)
ttl_called = len(output_single_trenchid_df)

# Number of distinct barcode ids among those trenches.
n_barcodes = output_single_trenchid_df["barcodeid"].nunique()

In [None]:
# Summary of the per-trench comparison computed above.
print(f"Error Rate:{1.0 - ttl_correct / ttl_called}")
# print("Recovery Rate:" + str(recovery_rate))
print(f"Unique Barcodes:{n_barcodes}")
print(f"Total Trenches:{ttl_called}")

In [None]:
fig = plt.figure(figsize=(12, 8))

# One histogram of the measured median ratio per `dark_gfp` class, with
# identical binning so the two are directly comparable.
for is_dark, color, label in (
    (True, "grey", "Predicted Dark GFP"),
    (False, "green", "Predicted GFP"),
):
    class_rows = output_single_trenchid_df["dark_gfp"] == is_dark
    plt.hist(
        output_single_trenchid_df.loc[class_rows, "Measured Median GFP Ratio"],
        range=(0, 10),
        bins=50,
        color=color,
        alpha=0.7,
        label=label,
        density=False,
    )
plt.xlabel("Lineage GFP/mCherry Intensity Ratio", fontsize=26)
plt.ylabel("Lineages", fontsize=26)
plt.xticks(fontsize=26)
plt.yticks(fontsize=26)
# The histograms carry labels, so draw the legend that displays them.
plt.legend(fontsize=26)
# plt.savefig("./GFP_Threshold_Distribution_2.svg",dpi=300,bbox_inches="tight")
plt.show()

In [None]:
def get_confusion_mat(df):
    """Compare `dark_gfp` labels with measured dark calls and return error fractions.

    ``dark_gfp`` is treated as the prediction and ``"Measured Dark GFP"`` as the
    measurement; the positive class is GFP (``False`` in both columns). Rows
    whose value is neither ``True`` nor ``False`` in either column fall into none
    of the four cells and are left out of the denominator.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"dark_gfp"`` and ``"Measured Dark GFP"``.

    Returns
    -------
    tuple
        ``(error, FP_error, FN_error)`` as fractions of all counted rows, where
        FP is "labelled GFP, measured dark" and FN is "labelled dark, measured
        GFP". All three are ``nan`` when no row could be counted.
    """
    predicted_dark = df["dark_gfp"]
    measured_dark = df["Measured Dark GFP"]

    TP = np.sum((predicted_dark == False) & (measured_dark == False))
    TN = np.sum((predicted_dark == True) & (measured_dark == True))
    FP = np.sum((predicted_dark == False) & (measured_dark == True))
    FN = np.sum((predicted_dark == True) & (measured_dark == False))

    total = TP + TN + FP + FN
    if total == 0:
        # An empty selection would otherwise divide 0 by 0 and emit a RuntimeWarning.
        return np.nan, np.nan, np.nan

    return (FP + FN) / total, FP / total, FN / total

In [None]:
# Error fractions over all trenches in the one-row-per-trench table.
error, FP_error, FN_error = get_confusion_mat(output_single_trenchid_df)
print(f"Error: {error}")
print(f"FP error: {FP_error}")
print(f"FN error: {FN_error}")

In [None]:
hamming_filters = list(range(1, 5))
hamming_n_barcodes = []
hamming_errors = []
for i in hamming_filters:
    # Keep only trenches whose "Closest Hamming Distance" is at least `i`.
    filtered_df = output_single_trenchid_df[
        output_single_trenchid_df["Closest Hamming Distance"] >= i
    ]
    n_barcode = len(filtered_df)
    error, FP_error, FN_error = get_confusion_mat(filtered_df)
    error = np.round(100 * error, decimals=2)  # fraction -> percent
    hamming_errors.append(error)
    hamming_n_barcodes.append(n_barcode)

fig = plt.figure(figsize=(8, 6))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.lineplot(
    x=hamming_filters, y=hamming_errors, linewidth=4, marker="o", markersize=15
)
# Tick formatters are expected to return the label text as a string.
plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, _: str(int(x))))
plt.xlabel("Minimum Hamming Distance", fontsize=26)
plt.ylabel("Error Rate (%)", fontsize=26)
plt.xticks([1, 2, 3, 4], fontsize=26)
plt.yticks(fontsize=26)
plt.ylim(0.0, 1.0)
plt.savefig("./Hamming_Dist_vs_Error.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Same error curve as the previous figure, on a 0-100 % axis.
fig = plt.figure(figsize=(8, 6))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.lineplot(
    x=hamming_filters, y=hamming_errors, linewidth=4, marker="o", markersize=15
)
# Tick formatters are expected to return the label text as a string.
plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, _: str(int(x))))
plt.xlabel("Minimum Hamming Distance", fontsize=26)
plt.ylabel("Error Rate (%)", fontsize=26)
plt.xticks([1, 2, 3, 4], fontsize=26)
plt.yticks(fontsize=26)
plt.ylim(-5.0, 100.0)

sns.despine()

plt.savefig("./Hamming_Dist_vs_Error_big.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Size of each filtered set as a percentage of the set kept by the first
# (least strict) Hamming filter.
perc_lib = 100 * (np.array(hamming_n_barcodes) / hamming_n_barcodes[0])

fig = plt.figure(figsize=(8, 6))

# x and y are passed by keyword: seaborn >= 0.12 rejects them as positional arguments.
sns.lineplot(x=hamming_filters, y=perc_lib, linewidth=4, marker="o", markersize=15)
# Tick formatters are expected to return the label text as a string.
plt.gca().xaxis.set_major_formatter(FuncFormatter(lambda x, _: str(int(x))))
plt.ylim(0, 110)
plt.xlabel("Minimum Hamming Distance", fontsize=26)
plt.ylabel("Percent of Library", fontsize=26)
plt.xticks([1, 2, 3, 4], fontsize=26)
plt.yticks(
    [0, 20, 40, 60, 80, 100],
    fontsize=26,
)
plt.savefig("./Hamming_Dist_vs_Lib_Size.svg", dpi=300, bbox_inches="tight")
plt.show()

### From Old Notebook

#### Estimating Error from Recall Rate

The math for this estimate is in the scanned document; it uses the empirical Hamming distance distribution.

In [None]:
# Inputs for the recall-based error estimate.
# NOTE(review): percent_called_w_signal and barcode_df are only assigned in cells
# further down this notebook, so this cell fails on a fresh top-to-bottom run.
N = 29  # presumably the barcode length in bits — TODO confirm
p_err_out = 1.0 - percent_called_w_signal

# Empirical distribution of the "Closest Hamming Distance" column.
closest_hamming = barcode_df["Closest Hamming Distance"].values
H_vals, H_counts = np.unique(closest_hamming, return_counts=True)
p_Hdist = H_counts / H_counts.sum()

In [None]:
# Build a polynomial in epsilon: constant term -p_err_out, then one coefficient
# per observed Hamming distance, in H_vals order.
# NOTE(review): the k-th appended coefficient multiplies epsilon**k, which only
# matches Hamming distance j if H_vals is exactly 1, 2, ..., k — confirm.
coeffs = [-p_err_out]
for idx, j in enumerate(H_vals):
    bin_coeff = sp.special.binom(N, j)
    coeff = bin_coeff - p_Hdist[idx]
    coeffs.append(coeff)
# np.roots expects the highest-degree coefficient first, hence the reversal.
coeffs = np.array(coeffs)[::-1]
roots = np.roots(coeffs)
# Take the first root with zero imaginary part.
# NOTE(review): this raises IndexError if no root is real and does not check the
# sign or range of the chosen root.
epsilon = np.real(roots[~np.iscomplex(roots)])[0]

In [None]:
# Sum of epsilon**j over the observed Hamming distances j, each weighted by its
# empirical frequency.
p_err_in = sum(
    (weight * (epsilon**dist) for weight, dist in zip(p_Hdist, H_vals)),
    0.0,
)

In [None]:
# Report the root-based estimate and the error rate derived from it.
print(f"Epsilon Estimate: {epsilon}")
print(f"P(error in) Estimate: {p_err_in}")

Guessing epsilon based on the single-bit error rate (using soft matching)

In [None]:
# Sweep candidate epsilon values and plot the implied error rate (in percent).
# The loop uses its own names so it does not overwrite the `epsilon` and
# `p_err_in` estimates computed in the cells above.
output = []
eps_range = np.linspace(0.0, 0.2)
for eps_candidate in eps_range:
    swept_p_err_in = sum(
        (weight * (eps_candidate**dist) for weight, dist in zip(p_Hdist, H_vals)),
        0.0,
    )
    output.append(swept_p_err_in * 100)


plt.plot(eps_range, output)
plt.xlabel("Epsilon", fontsize=20)
plt.ylabel("Error Rate (%)", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.show()

#### Varying The GFP Threshold

In [None]:
# Display called_df.
# NOTE(review): called_df is only assigned in a cell further down this notebook.
called_df

In [None]:
# Keep only the rows whose measured GFP ratio is below the negative threshold.
negative_threshold = 1.0
called_negative_df = called_df[called_df["Measured GFP Ratio"] < negative_threshold]

In [None]:
# Within the below-threshold subset, split the rows measured as dark by their
# `dark_gfp` label.
measured_dark = called_negative_df["Measured Dark GFP"] == True
labelled_dark = called_negative_df["dark_gfp"] == True
labelled_gfp = called_negative_df["dark_gfp"] == False

TN = np.sum(labelled_dark & measured_dark)
FP = np.sum(labelled_gfp & measured_dark)

In [None]:
# Fraction of measured-dark rows that were also labelled dark.
# NOTE(review): the original divided by (TN + FN), but FN is never assigned at
# notebook level (only TN and FP are computed above), so it raised NameError.
# FP is used here because the next cell reports the complementary FP / (TN + FP);
# confirm this is the intended quantity.
precision = TN / (TN + FP)

In [None]:
# Fraction of measured-dark rows that were labelled GFP.
FP / (TN + FP)

In [None]:
# NOTE(review): this repeats the get_confusion_mat defined earlier in the
# notebook; being later, it is the definition in effect for the cells below.
def get_confusion_mat(df):
    """Compare `dark_gfp` labels with measured dark calls and return error fractions.

    ``dark_gfp`` is treated as the prediction and ``"Measured Dark GFP"`` as the
    measurement; the positive class is GFP (``False`` in both columns). Rows
    whose value is neither ``True`` nor ``False`` in either column fall into none
    of the four cells and are left out of the denominator.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain the columns ``"dark_gfp"`` and ``"Measured Dark GFP"``.

    Returns
    -------
    tuple
        ``(error, FP_error, FN_error)`` as fractions of all counted rows, where
        FP is "labelled GFP, measured dark" and FN is "labelled dark, measured
        GFP". All three are ``nan`` when no row could be counted.
    """
    predicted_dark = df["dark_gfp"]
    measured_dark = df["Measured Dark GFP"]

    TP = np.sum((predicted_dark == False) & (measured_dark == False))
    TN = np.sum((predicted_dark == True) & (measured_dark == True))
    FP = np.sum((predicted_dark == False) & (measured_dark == True))
    FN = np.sum((predicted_dark == True) & (measured_dark == False))

    total = TP + TN + FP + FN
    if total == 0:
        # An empty selection would otherwise divide 0 by 0 and emit a RuntimeWarning.
        return np.nan, np.nan, np.nan

    return (FP + FN) / total, FP / total, FN / total

In [None]:
def get_gmm_params(values, random_state=None):
    """Fit a two-component 1-D Gaussian mixture and return its means and sigmas.

    Parameters
    ----------
    values : array-like
        One-dimensional collection of numeric samples.
    random_state : int, RandomState or None, optional
        Forwarded to ``GaussianMixture``; pass an integer to make the fit
        reproducible. The default (``None``) keeps the original unseeded
        behaviour.

    Returns
    -------
    tuple of numpy.ndarray
        ``(means, stds)``, each of length 2, in the component order chosen by
        the fit (not sorted).
    """
    # Imported explicitly: `import sklearn as skl` alone does not guarantee that
    # the `mixture` submodule has been loaded as an attribute of the package.
    from sklearn.mixture import GaussianMixture

    samples = np.asarray(values, dtype=float).reshape(-1, 1)
    gmm = GaussianMixture(n_components=2, n_init=10, random_state=random_state)
    gmm.fit(samples)
    # With one feature each covariance is a 1x1 matrix; its square root is the
    # component's standard deviation.
    return gmm.means_[:, 0], np.sqrt(gmm.covariances_[:, 0, 0])

In [None]:
test_std = 0.5

means, stds = get_gmm_params(called_df["Measured GFP Ratio"].values)
# Order the components so that index 0 is the lower-mean peak.
if means[0] > means[1]:
    means = means[::-1]
    stds = stds[::-1]

# Scale the window by each component's fitted standard deviation, so the cut is
# really `test_std` sigmas as the title states (the original added `test_std`
# directly to the means and never used `stds`).
upper_bound = means + stds * test_std
lower_bound = means - stds * test_std

# A row passes if it lies below the upper edge of the low peak or above the
# lower edge of the high peak.
valid_dark = called_df["Measured GFP Ratio"] < upper_bound[0]
valid_gfp = called_df["Measured GFP Ratio"] > lower_bound[1]
valid = valid_dark | valid_gfp

filtered_df = called_df[valid]
filtered_df_complement = called_df[~valid]

plt.title(f"{test_std} Standard Deviations", fontsize=20)
plt.hist(
    filtered_df_complement["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="grey",
    alpha=0.7,
    label="Excluded by filter",
    density=False,
)
plt.hist(
    filtered_df["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="red",
    alpha=0.7,
    label="Kept by filter",
    density=False,
)
plt.legend()
plt.show()

In [None]:
test_std = 1.0

means, stds = get_gmm_params(called_df["Measured GFP Ratio"].values)
# Order the components so that index 0 is the lower-mean peak.
if means[0] > means[1]:
    means = means[::-1]
    stds = stds[::-1]

# Scale the window by each component's fitted standard deviation, so the cut is
# really `test_std` sigmas as the title states (the original added `test_std`
# directly to the means and never used `stds`).
upper_bound = means + stds * test_std
lower_bound = means - stds * test_std

# A row passes if it lies below the upper edge of the low peak or above the
# lower edge of the high peak.
valid_dark = called_df["Measured GFP Ratio"] < upper_bound[0]
valid_gfp = called_df["Measured GFP Ratio"] > lower_bound[1]
valid = valid_dark | valid_gfp

filtered_df = called_df[valid]
filtered_df_complement = called_df[~valid]

plt.title(f"{test_std} Standard Deviations", fontsize=20)
plt.hist(
    filtered_df_complement["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="grey",
    alpha=0.7,
    label="Excluded by filter",
    density=False,
)
plt.hist(
    filtered_df["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="red",
    alpha=0.7,
    label="Kept by filter",
    density=False,
)
plt.legend()
plt.show()

In [None]:
test_std = 1.5

means, stds = get_gmm_params(called_df["Measured GFP Ratio"].values)
# Order the components so that index 0 is the lower-mean peak.
if means[0] > means[1]:
    means = means[::-1]
    stds = stds[::-1]

# Scale the window by each component's fitted standard deviation, so the cut is
# really `test_std` sigmas as the title states (the original added `test_std`
# directly to the means and never used `stds`).
upper_bound = means + stds * test_std
lower_bound = means - stds * test_std

# A row passes if it lies below the upper edge of the low peak or above the
# lower edge of the high peak.
valid_dark = called_df["Measured GFP Ratio"] < upper_bound[0]
valid_gfp = called_df["Measured GFP Ratio"] > lower_bound[1]
valid = valid_dark | valid_gfp

filtered_df = called_df[valid]
filtered_df_complement = called_df[~valid]

plt.title(f"{test_std} Standard Deviations", fontsize=20)
plt.hist(
    filtered_df_complement["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="grey",
    alpha=0.7,
    label="Excluded by filter",
    density=False,
)
plt.hist(
    filtered_df["Measured GFP Ratio"],
    range=(0, 10),
    bins=50,
    color="red",
    alpha=0.7,
    label="Kept by filter",
    density=False,
)
plt.legend()
plt.show()

In [None]:
import seaborn as sns

sns.set()

# Sweep the window half-width, in units of each mixture component's standard
# deviation, and record the error rates and the number of rows that pass.
n_std = np.linspace(0, 2, 20)
n_barcodes, errors, FP_errors, FN_errors = [], [], [], []

means, stds = get_gmm_params(called_df["Measured GFP Ratio"].values)

# Put the lower-mean component at index 0.
if means[0] > means[1]:
    means, stds = means[::-1], stds[::-1]

for i in n_std:
    upper_bound = means + stds * i
    lower_bound = means - stds * i

    # A row passes if it is below the upper edge of the low peak or above the
    # lower edge of the high peak.
    valid_dark = called_df["Measured GFP Ratio"] < upper_bound[0]
    valid_gfp = called_df["Measured GFP Ratio"] > lower_bound[1]
    valid = valid_dark | valid_gfp

    filtered_df = called_df[valid]
    n_barcode = len(filtered_df)

    # Convert the three fractions to percentages rounded to two decimals.
    error, FP_error, FN_error = (
        np.round(100 * fraction, decimals=2)
        for fraction in get_confusion_mat(filtered_df)
    )

    errors.append(error)
    FP_errors.append(FP_error)
    FN_errors.append(FN_error)
    n_barcodes.append(n_barcode)

In [None]:
# Error rates versus window width. x and y are passed by keyword because
# seaborn >= 0.12 rejects them as positional arguments.
sns.lineplot(x=n_std, y=errors, linewidth=5, label="Error")
sns.lineplot(x=n_std, y=FP_errors, linewidth=5, label="FP Error")
sns.lineplot(x=n_std, y=FN_errors, linewidth=5, label="FN Error")
plt.xticks(fontsize=20)
plt.yticks(
    fontsize=20,
)
plt.ylim(0.0, 1.0)
# Raw string: "\s" is an invalid escape sequence in a normal string literal.
plt.xlabel(r"N $\sigma$s Around Peak", fontsize=20)
plt.ylabel("Error Rate (%)", fontsize=20)
plt.legend(fontsize=20)
plt.savefig("./GFP_Error_vs_Sigma.png", dpi=300, bbox_inches="tight")
plt.show()


# Number of rows passing the filter versus window width.
sns.lineplot(x=n_std, y=n_barcodes, linewidth=5)
plt.xticks(fontsize=20)
plt.yticks(
    fontsize=20,
)
plt.xlabel(r"N $\sigma$s Around Peak", fontsize=20)
plt.ylabel("Library Size Past Filter", fontsize=20)
plt.savefig("./Library_Size_vs_Sigma.png", dpi=300, bbox_inches="tight")
plt.show()

#### Sources of error

There are around twice the number of false negatives (predicted to be a Dark GFP, but measured as bright) as there are false positives (predicted to be GFP, but measured as dark).

Some theories for these error classes:

False Positives:
    
    - Mutations in the promoter (should be constant within barcodes)
    
    - Strain variation (should be lower when averaging among strains)
    
    - Misread of barcodes
    
False Negatives:
    
    - Bleed from adjacent cells (should be corrected by averaging among strains)
    
    - Multiple strains per trench (?)
    
    - Misread of barcodes

#### Median GFP Approach

In [None]:
def _median_measured_ratio(barcode_rows):
    """Return the median "Measured GFP Ratio" of one barcode's rows."""
    return barcode_rows["Measured GFP Ratio"].median()


# One median per "Barcode" group.
median_gfp_df = called_df.groupby("Barcode").apply(_median_measured_ratio)

In [None]:
# Histograms of the per-barcode medians on either side of `threshold`. The
# bright side uses ">=" so a median exactly on the threshold (treated as not
# dark by the "< threshold" classification) is not dropped from both plots.
for ratios, color, label in (
    (median_gfp_df[median_gfp_df < threshold], "grey", "Measured Dark GFP"),
    (median_gfp_df[median_gfp_df >= threshold], "green", "Measured GFP"),
):
    plt.hist(
        ratios,
        range=(0, 20),
        bins=50,
        color=color,
        label=label,
    )
plt.xlabel("Mean Intensity Ratio", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
plt.savefig("./Pooled_correction.png", dpi=300, bbox_inches="tight")
plt.show()

In [None]:
# Index the called rows by barcode and attach each barcode's median ratio
# (aligned on the "Barcode" index).
called_df_barcodes = called_df.set_index(["Barcode"]).sort_index()
called_df_barcodes["Measured Median GFP"] = median_gfp_df
# "Barcode" stays as the index here; groupby resolves it by index-level name.
# (The original also called reset_index(drop=False) and discarded the result,
# which had no effect.) Keep the first row of every barcode.
called_df_barcodes = called_df_barcodes.groupby("Barcode").apply(lambda x: x.iloc[0])

In [None]:
# A barcode is measured as dark when its pooled median is below `threshold`;
# count the barcodes where that call equals the `dark_gfp` label.
measured_dark_from_median = called_df_barcodes["Measured Median GFP"] < threshold
ttl_correct = np.sum(called_df_barcodes["dark_gfp"] == measured_dark_from_median)
ttl_called = len(called_df_barcodes)
print(f"Percent Correct:{ttl_correct / ttl_called}")

### Error With One Mismatch (Hamming Distance Up to 1, Eliminate Bad Bits)

#### Import Barcode Dataframe

In [None]:
# Read the "barcodes" table, together with its metadata, from the
# barcode_df_hamming_1 HDF5 file.
meta_handle = tr.pandas_hdf5_handler(
    "/home/de64/scratch/de64/sync_folder/2021-03-07_lDE15/barcodes/barcode_df_hamming_1.hdf5"
)
barcode_df = meta_handle.read_df("barcodes", read_metadata=True)

#### Compute Call Rate

In [None]:
import numpy as np
import pandas as pd
import seaborn as sns
import scipy as sp
import sklearn as skl
import dask.dataframe as dd

from matplotlib import pyplot as plt

In [None]:
analysis_df = dd.read_parquet(
    "/home/de64/scratch/de64/sync_folder/2021-03-07_lDE15/gfp/analysis"
)
# "trenchid" of the final row of the table. The scalar is pulled out with
# .iloc[0] because calling int() on a one-element Series is deprecated in pandas.
# NOTE(review): this assumes the last row carries the largest trenchid — confirm.
last_trenchid = int(analysis_df.tail(1)["trenchid"].iloc[0])

In [None]:
# Tally the `dark_gfp` labels: True, False, or the string "Unknown".
dark_gfp_labels = barcode_df["dark_gfp"].tolist()
ttl_true = np.sum([label == True for label in dark_gfp_labels])
ttl_false = np.sum([label == False for label in dark_gfp_labels])
ttl_none = np.sum([label == "Unknown" for label in dark_gfp_labels])
ttl_called = ttl_true + ttl_false

# Trench totals stored in the barcode table's metadata.
ttl_trenches = barcode_df.metadata["Total Trenches"]
ttl_trenches_w_signal = barcode_df.metadata["Total Trenches With Cells"]

# Call rates relative to the metadata totals.
percent_called = ttl_called / ttl_trenches
percent_called_w_signal = ttl_called / ttl_trenches_w_signal

# The same numerators relative to the last trenchid of the GFP analysis table.
percent_called_w_gfp_call = ttl_called / last_trenchid
percent_signal_w_gfp_call = ttl_trenches_w_signal / last_trenchid

In [None]:
# Called barcodes divided by the metadata "Total Trenches".
percent_called

In [None]:
# Called barcodes divided by the metadata "Total Trenches With Cells".
percent_called_w_signal

In [None]:
# Called barcodes divided by the last trenchid of the GFP analysis table.
percent_called_w_gfp_call

In [None]:
# "Total Trenches With Cells" divided by the last trenchid of the GFP analysis table.
percent_signal_w_gfp_call

#### Get GFP Call Error and Recovery Rate

In [None]:
# Add the measured dark call and the measured ratio to every barcode row via
# tr.map_Series.
# NOTE(review): trenchid_map is not assigned anywhere in the visible notebook;
# presumably it maps barcode rows to trenchids and comes from an earlier
# session — confirm. dark_gfp and median_ratio come from the threshold cells
# near the top of the notebook.
barcode_df["Measured Dark GFP"] = barcode_df.apply(
    tr.map_Series, axis=1, args=(dark_gfp, trenchid_map)
)
barcode_df["Measured GFP Ratio"] = barcode_df.apply(
    tr.map_Series, axis=1, args=(median_ratio, trenchid_map)
)
# Drop rows whose measured call is the string "Unknown".
called_df = barcode_df[barcode_df["Measured Dark GFP"] != "Unknown"]
# Rows where the `dark_gfp` label equals the measured call.
ttl_correct = np.sum(called_df["dark_gfp"] == called_df["Measured Dark GFP"])
ttl_called = len(called_df)
# Called rows as a fraction of all trenches that have a measured call.
recovery_rate = len(called_df) / len(dark_gfp)
n_barcodes = called_df["barcodeid"].nunique()
n_trenches = called_df["trenchid"].nunique()

In [None]:
# Summary for the Hamming-distance-1 barcode table.
print(f"Error Rate:{1.0 - ttl_correct / ttl_called}")
print(f"Recovery Rate:{recovery_rate}")
print(f"Unique Barcodes:{n_barcodes}")
print(f"Total Trenches:{n_trenches}")

In [None]:
fig = plt.figure(figsize=(12, 8))

# One histogram of the measured ratio per `dark_gfp` label, with shared binning.
for is_dark, color, label in (
    (True, "grey", "Predicted Dark GFP"),
    (False, "green", "Predicted GFP"),
):
    plt.hist(
        called_df[called_df["dark_gfp"] == is_dark]["Measured GFP Ratio"],
        range=(0, 10),
        bins=50,
        color=color,
        alpha=0.7,
        label=label,
        density=False,
    )
plt.xlabel("Lineage GFP/mCherry Intensity Ratio", fontsize=20)
plt.ylabel("Lineages", fontsize=20)
plt.xticks(fontsize=20)
plt.yticks(fontsize=20)
plt.legend(fontsize=20)
plt.savefig(
    "./GFP_Threshold_Distribution_2_hamming_1.png", dpi=300, bbox_inches="tight"
)
plt.show()

In [None]:
# Shut the Dask controller down once the analysis is finished.
dask_controller.shutdown()