In [None]:
import re
import warnings
from pathlib import Path

import Bio.Restriction as Restriction
import numpy as np
import pandas as pd
import parasail
import pygsheets
import requests
import toml
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord

In [None]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded before each cell executes and edits to local packages take effect
# without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
import paulssonlab.api.geneious as geneious
import paulssonlab.cloning.enzyme as enzyme
import paulssonlab.cloning.io as io
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.workflow as workflow

# Setup

In [None]:
# Notebook settings are read from config.toml, resolved relative to the
# current working directory.
config = toml.load("config.toml")

In [None]:
# Authorize a pygsheets client with the service-account key stored in
# credentials.json (relative path; the file must be next to the notebook's
# working directory).
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Build the registry from the Sheets client and the folder configured under
# [registry] -> folder in config.toml.
reg = registry.Registry(gc, config["registry"]["folder"])

# Config

In [None]:
# Handles into the registry, looked up by (prefix, collection) keys.
# NOTE(review): none of these handles are used by the cells visible in this
# notebook; confirm whether they are still needed.
olib_oligos = reg[("oLIB", "oligos")]
olt_oligos = reg[("oLT", "oligos")]
plib_plasmids = reg[("pLIB", "plasmids")]
plib_maps = reg[("pLIB", "maps")]
flib_fragments = reg[("fLIB", "fragments")]
# Three-element key: a named sub-entry ("Part types") of the fLIB fragments.
part_types = reg[("fLIB", "fragments", "Part types")]

## Probe set

In [None]:
# Template for probe names, filled from the "Cycle" and "Channel" fields;
# the cycle is zero-padded to two digits (e.g. Cycle=3, Channel="GFP" -> "C03_GFP").
PROBE_FORMAT = "C{Cycle:02d}_{Channel}"

In [None]:
# Fetch the probe spreadsheet by its key and read its default worksheet into
# a DataFrame.
probes_spreadsheet = gc.open_by_key("1dL_I39dvgdQ7gw47gs0zYkU9uGmODg4hlJEPzFpUAIs")
probes_df = probes_spreadsheet.worksheet().get_as_df()

In [None]:
# Keep only "DE-A" oligos that actually carry a sequence: the "Sequence" cell
# must be neither empty nor made up solely of whitespace.
is_de_a_oligo = probes_df["Oligo Name"].str.startswith("DE-A")
has_characters = probes_df["Sequence"].str.len() != 0
is_whitespace_only = probes_df["Sequence"].str.isspace()
probes_df = probes_df[is_de_a_oligo & has_characters & ~is_whitespace_only].copy()

In [None]:
# probes_df["Sequence"].str.replace(r"\/[^/]*/", "", regex=True)

In [None]:
# Translate dye names into imaging-channel names; the "NA" dye becomes an
# empty channel. Dye values not listed here are left unchanged by replace().
dye_to_channel = {"AF488": "GFP", "AF647": "Cy5", "Alexa750": "Cy7", "NA": ""}
probes_df["Channel"] = probes_df["Dye"].replace(dye_to_channel)

In [None]:
# The probe set is every row labelled with one of the three imaging dyes.
labelled_dyes = ["AF488", "AF647", "Alexa750"]
is_labelled = probes_df["Dye"].isin(labelled_dyes)
probe_set_df = probes_df[is_labelled].copy()

In [None]:
def _format_probe_name(row):
    """Render PROBE_FORMAT using the fields of one probe row."""
    return PROBE_FORMAT.format(**row)


# Name every probe after its cycle and channel.
probe_set_df["Name"] = probe_set_df.apply(_format_probe_name, axis=1)

In [None]:
# Order probes by name. Names start with the zero-padded cycle number, so
# this lexical sort also orders them by cycle (for cycles below 100).
probe_set_df = probe_set_df.sort_values("Name")

In [None]:
# Rows whose "Channel" is the empty string, i.e. oligos that were not mapped
# to an imaging channel (the "NA" dye maps to "").
placeholder_set_df = probes_df[probes_df["Channel"] == ""]

In [None]:
# Give every placeholder the name of the probe that shares its "Bit".
# Any "Name" column left over from a previous run of this cell is dropped
# first; otherwise join() raises on the overlapping column and the cell could
# only be executed once per kernel.
# NOTE(review): placeholders whose Bit has no counterpart in probe_set_df end
# up with a missing (NaN) Name; confirm that every placeholder has a partner.
probe_names_by_bit = probe_set_df.set_index("Bit")[["Name"]]
placeholder_set_df = placeholder_set_df.drop(columns="Name", errors="ignore").join(
    probe_names_by_bit, on="Bit"
)

# Barcode decode

In [None]:
# Substitution matrix over the lowercase alphabet "acgt": +1 for a match,
# -2 for a mismatch.
sub_matrix = parasail.matrix_create("acgt", 1, -2)

In [None]:
# Every probe name expected in a read, whether as a probe or as a placeholder.
all_probes = set(probe_set_df["Name"]).union(placeholder_set_df["Name"])

In [None]:
# Decode results keyed by read name (file name, plus " (rc)" for the
# reverse-complemented read). Kept in its own cell so that re-running the
# decode loop adds to / overwrites entries instead of clearing them.
results = {}

In [None]:
# When True, each read is concatenated with itself before alignment so that
# probes spanning the wrap-around point of a circular sequence can match.
circular = True

In [None]:
# Sequencing reads to decode. Path.glob returns a lazy generator; it is
# materialised and sorted in the next cell.
# NOTE(review): this is a hard-coded absolute path on one machine; the
# commented-out lines are earlier datasets (GenBank files instead of .ab1).
# seq_files = Path("/Users/Jacob/Downloads/X4LSJG_results/X4LSJG_genbank_files").glob("*.gbk")
# seq_files = Path("/Users/jacob/Downloads/8J2L8P_results/8J2L8P_genbank_files").glob(
#     "*.gbk"
# )
seq_files = Path("/Users/jacob/Downloads/220304_Eaton_BCs_30-666511268_ab1").glob(
    "*.ab1"
)

In [None]:
# Materialise the glob results as a list in sorted path order.
seq_files = sorted(seq_files)

In [None]:
# Display the files that will be decoded.
seq_files

In [None]:
# Decode every read. Each read is tried forward first and then as its reverse
# complement; every probe and placeholder sequence is aligned against it and
# classified as an exact hit, a partial hit, or no hit.
for seq_file in seq_files:
    # Warnings raised while parsing the file are silenced.
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        orig_seq = io.read_file(seq_file)
    for rc in (False, True):
        seq = sequence.reverse_complement(orig_seq) if rc else orig_seq
        seq = workflow.normalize_seq(seq)
        if circular:
            seq = seq + seq  # to handle wrap-around in circular sequences
        on_probes, off_probes, partial_matches = set(), set(), {}
        for on, selected_probes_df in (
            (True, probe_set_df),
            (False, placeholder_set_df),
        ):
            # Exact probe hits mark a bit as on, exact placeholder hits as off.
            exact_hits = on_probes if on else off_probes
            partial_suffix = "" if on else "_off"
            for _, row in selected_probes_df.iterrows():
                probe_seq = workflow.normalize_seq(row["Sequence"])
                # Semi-global alignment of the probe against the read with
                # gap-open 4 and gap-extend 1.
                match = parasail.sg_dx_trace_striped_sat(
                    probe_seq, seq, 4, 1, sub_matrix
                )
                # A score equal to the probe length means every base matched.
                full_score = len(probe_seq)
                if match.score == full_score:
                    exact_hits.add(row["Name"])
                    continue
                if match.score > full_score / 2:
                    partial_match_name = row["Name"] + partial_suffix
                    partial_matches[partial_match_name] = {
                        "name": partial_match_name,
                        "score": match.score,
                        "cigar": match.cigar.decode.decode(),
                    }
        name = "{}{}".format(seq_file.name, " (rc)" if rc else "")
        missing_probes = all_probes - on_probes - off_probes
        conflicting_probes = on_probes & off_probes
        # A forward read with no signal at all is not recorded; go straight to
        # the reverse complement, which is always recorded.
        found_anything = on_probes or conflicting_probes or partial_matches
        if not found_anything and not rc:
            continue
        results[name] = {
            "missing_probes": missing_probes,
            "conflicting_probes": conflicting_probes,
            "on_probes": on_probes,
            "partial_matches": partial_matches,
        }
        # Every probe accounted for: no need to try the other orientation.
        if not missing_probes:
            break

In [None]:
# Print a text report for every decoded read.
# Each section is shown only when that read's own result has entries for it.
# The guards must read from `result`, not from the variables left behind by
# the last iteration of the decode loop, otherwise sections appear or vanish
# depending on whichever read was processed last. Probe names are sorted so
# the report is stable across runs (set iteration order is arbitrary).
for name, result in results.items():
    print(f"{name}:")
    if result["on_probes"]:
        print("    ON: {}".format(",".join(sorted(result["on_probes"]))))
    for partial_match in result["partial_matches"].values():
        print(
            "    PARTIAL: {name} score:{score} CIGAR:{cigar}".format(**partial_match)
        )
    if result["missing_probes"]:
        print("    MISSING: {}".format(",".join(sorted(result["missing_probes"]))))
    if result["conflicting_probes"]:
        print(
            "    CONFLICTING: {}".format(
                ",".join(sorted(result["conflicting_probes"]))
            )
        )
    print()

# Barcode table

In [None]:
# Remove hand-picked reads from the results so they do not show up in the
# barcode table; names that are not present are ignored.
for discarded_name in (
    # "NAO676_01.gbk (rc)",
    "NAO680_08.gbk (rc)",
    "pLIB317-oDE201.ab1 (rc)",
    "pLIB318-oDE201.ab1 (rc)",
    "pLIB318-oDE84.ab1 (rc)",
    "pLIB317-oDE84.ab1 (rc)",
):
    results.pop(discarded_name, None)

In [None]:
# The number of table rows is the largest "Cycle" value in the probe set.
num_cycles = probe_set_df["Cycle"].max()
# channels = set(probe_set_df["Channel"])
# A fixed list (rather than the set above) gives the table a defined channel order.
channels = ["GFP", "Cy5", "Cy7"]

In [None]:
def result_column(result, channel):
    """Build one table column: the cell symbols of `channel` for every cycle.

    Cycles run from 1 to the module-level `num_cycles`; each probe name is
    produced with `PROBE_FORMAT` and rendered by `_result_cell`.
    """
    cells = []
    for cycle in range(1, num_cycles + 1):
        probe_name = PROBE_FORMAT.format(Cycle=cycle, Channel=channel)
        cells.append(_result_cell(result, probe_name))
    return cells


def _result_cell(result, probe_name):
    if probe_name in result["on_probes"]:
        return "■"
    else:
        if probe_name in result["partial_matches"]:
            return "■?"
        elif f"{probe_name}_off" in result["partial_matches"]:
            return "□?"
        else:
            return "□"

In [None]:
# Shorthand for building MultiIndex slices in the Styler `subset` arguments.
IDX = pd.IndexSlice

In [None]:
# One column per (read name, channel) pair and one row per cycle.
column_keys = [
    (read_name, channel) for read_name in results for channel in channels
]
table = pd.DataFrame(
    {
        (read_name, channel): result_column(results[read_name], channel)
        for read_name, channel in column_keys
    }
)

In [None]:
# Colour the cells of every channel's columns with that channel's text colour.
styled_table = table.style
for channel_name, text_color in (
    ("GFP", "#00ff00"),
    ("Cy5", "#00ffff"),
    ("Cy7", "#ff00ff"),
):
    styled_table = styled_table.set_properties(
        subset=IDX[:, IDX[:, channel_name]], **{"color": text_color}
    )
styled_table

In [None]:
# Display colour (hex) for each imaging channel in the transposed table.
colors = {"GFP": "#A6D629", "Cy5": "#29A6D6", "Cy7": "#D629A6"}

In [None]:
def _channel_header_css(level_values):
    """Return one CSS string per index label: white text on the channel colour.

    Labels that are not one of `channels` get an empty style. Without the
    explicit `default=""`, np.select falls back to the integer 0, which mixes
    a number into an array of CSS strings.
    """
    return np.select(
        [level_values == channel for channel in channels],
        [f"color:white;background-color:{colors[channel]};" for channel in channels],
        default="",
    )


# Transposed barcode table: one row per (read, channel), one column per cycle.
# Requires numpy (`np`), which is imported in the notebook's first cell.
table.T.style.set_properties(
    **{"color": colors["GFP"]}, subset=IDX[IDX[:, "GFP"], :]
).set_properties(
    **{"color": colors["Cy5"], "text-align": "left"}, subset=IDX[IDX[:, "Cy5"], :]
).set_properties(
    **{"color": colors["Cy7"], "text-align": "left"}, subset=IDX[IDX[:, "Cy7"], :]
).apply_index(
    # Colour the channel level (level 1) of the row headers.
    _channel_header_css,
    level=1,
    axis=0,
).set_table_styles(
    # Base style for every cell; this first call replaces any earlier table styles.
    [{"selector": "td, th", "props": "width:20px;text-align:left;"}]
).set_table_styles(
    # Extra left padding on the first data column (column label 0).
    {0: [{"selector": "td, th", "props": "padding-left:15px;"}]},
    overwrite=False,
    axis=0,
).set_table_styles(
    # Rule under every "Cy7" row, i.e. after the last channel of each read.
    {
        c: [{"selector": "td, th", "props": "border-bottom: 2px solid #aaa;"}]
        for c in table.columns
        if c[1] == "Cy7"
    },
    overwrite=False,
    axis=1,
)

# Probe sequences

In [None]:
# Tab-separated listing of probe names and sequences.
for probe_name, probe_sequence in zip(probe_set_df["Name"], probe_set_df["Sequence"]):
    print("{}\t{}".format(probe_name, probe_sequence))

In [None]:
# Tab-separated listing of placeholder names (suffixed "_off") and sequences.
for placeholder_name, placeholder_sequence in zip(
    placeholder_set_df["Name"], placeholder_set_df["Sequence"]
):
    print("{}_off\t{}".format(placeholder_name, placeholder_sequence))