In [None]:
import pandas as pd
import re
import toml
import warnings
from pathlib import Path
import pygsheets
import requests
import Bio.Restriction as Restriction
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
import parasail

In [None]:
%load_ext autoreload
%autoreload 2

In [None]:
import paulssonlab.api as api
import paulssonlab.cloning.registry as registry
import paulssonlab.cloning.io as io
import paulssonlab.cloning.workflow as workflow
import paulssonlab.cloning.sequence as sequence
import paulssonlab.cloning.enzyme as enzyme
import paulssonlab.api.geneious as geneious

# Setup

In [None]:
# Load notebook settings from config.toml (a relative path, so it is resolved
# against the kernel's current working directory).
config = toml.load("config.toml")

In [None]:
# Authorize the Google Sheets client using the service-account key file
# credentials.json (also a relative path).
gc = pygsheets.authorize(service_account_file="credentials.json")

In [None]:
# Build the registry wrapper from the Sheets client and the folder configured
# under config["registry"]["folder"].
reg = registry.Registry(gc, config["registry"]["folder"])

# Config

In [None]:
# Look up the registry entries for each collection by key tuple.
# NOTE(review): the keys look like (prefix, type[, sheet name]) -- confirm
# against registry.Registry.__getitem__.
olib_oligos = reg[("oLIB", "oligos")]
olt_oligos = reg[("oLT", "oligos")]
plib_plasmids = reg[("pLIB", "plasmids")]
plib_maps = reg[("pLIB", "maps")]
flib_fragments = reg[("fLIB", "fragments")]
part_types = reg[("fLIB", "fragments", "Part types")]

## Probe set

In [None]:
# Fetch the probe table from its Google Sheet as a DataFrame; worksheet() is
# called with its default arguments.
probes_spreadsheet = gc.open_by_key("1dL_I39dvgdQ7gw47gs0zYkU9uGmODg4hlJEPzFpUAIs")
probes_df = probes_spreadsheet.worksheet().get_as_df()

In [None]:
# Keep only the "DE-A" oligos whose Sequence is neither empty nor whitespace-only.
is_de_a = probes_df["Oligo Name"].str.startswith("DE-A")
has_sequence = probes_df["Sequence"].str.len() != 0
is_blank = probes_df["Sequence"].str.isspace()
probes_df = probes_df[is_de_a & has_sequence & ~is_blank].copy()

In [None]:
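# Unused alternative (kept for reference): strip every /.../-delimited token
# from the probe sequences.
#probes_df["Sequence"].str.replace(r"\/[^/]*/", "", regex=True)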

In [None]:
# Translate each dye into the channel label used in probe names. "NA" becomes
# an empty string; dyes not listed here are left unchanged.
dye_to_channel = {
    "AF488": "GFP",
    "AF647": "Cy5",
    "Alexa750": "Cy7",
    "NA": "",
}
probes_df["Channel"] = probes_df["Dye"].replace(dye_to_channel)

In [None]:
# Probes labelled with one of these three dyes make up the "on" probe set.
labelled_dyes = ["AF488", "AF647", "Alexa750"]
has_labelled_dye = probes_df["Dye"].isin(labelled_dyes)
probe_set_df = probes_df.loc[has_labelled_dye].copy()

In [None]:
def _probe_name(row):
    """Return "C<two-digit Cycle>_<Channel>" for one probe row, e.g. "C03_GFP"."""
    return "C{Cycle:02d}_{Channel}".format(**row)


# Name every probe in the set from its cycle number and channel.
probe_set_df["Name"] = probe_set_df.apply(_probe_name, axis=1)

In [None]:
# Order the probes by Name (plain string sort on the "C<cycle>_<channel>" labels).
probe_set_df = probe_set_df.sort_values("Name")

In [None]:
# Rows with an empty Channel (e.g. Dye "NA", which is mapped to "") form the
# placeholder ("off") probe set.
placeholder_set_df = probes_df[probes_df["Channel"] == ""]

In [None]:
# Give each placeholder the Name of the labelled probe that shares its Bit.
# Any existing "Name" column is dropped first so this cell can be re-run:
# joining a second time would otherwise raise because the column overlaps.
bit_to_name = probe_set_df.set_index("Bit")[["Name"]]
placeholder_set_df = placeholder_set_df.drop(columns="Name", errors="ignore").join(
    bit_to_name, on="Bit"
)

# Barcode decode

In [None]:
# When True, the decode loop concatenates each read with itself so that probes
# spanning the wrap-around point of a circular sequence can still align.
circular = True

In [None]:
# Directory holding the .ab1 files to decode. This is a machine-specific
# absolute path; edit it when running elsewhere.
# Alternative input (GenBank files):
#   Path("/Users/Jacob/Downloads/X4LSJG_results/X4LSJG_genbank_files").glob("*.gbk")
seq_dir = Path("/Users/jacob/Downloads/220304_Eaton_BCs_30-666511268_ab1")
seq_files = seq_dir.glob("*.ab1")

In [None]:
# Materialize the glob results as a list in sorted path order.
seq_files = sorted(seq_files)

In [None]:
# Display the files that will be decoded.
seq_files

In [None]:
# Substitution matrix over the alphabet "acgt": +1 for a match, -2 for a mismatch.
# With a match score of 1, a gap-free, mismatch-free probe alignment scores
# exactly len(probe), which is what the decode loop tests for.
sub_matrix = parasail.matrix_create("acgt", 1, -2)

In [None]:
# Every probe name from both the labelled and the placeholder sets; the decode
# loop reports any of these it cannot find as MISSING.
all_probes = set(probe_set_df["Name"]).union(placeholder_set_df["Name"])

In [None]:
# Decode the barcode in each sequencing read by aligning every probe against it.
#
# For each file, the read is tried as given and then as its reverse complement.
# Each probe is aligned with the probe as query and the read as database:
#   * score == len(probe)     -> exact hit, recorded as ON (labelled probe set)
#                                or OFF (placeholder set);
#   * score >  len(probe) / 2 -> reported as PARTIAL together with its CIGAR;
#   * otherwise               -> ignored.
# Probes with no exact hit are listed as MISSING; names hit by both an ON and an
# OFF probe are listed as CONFLICTING.

# Gap penalties passed to parasail (open, extend).
GAP_OPEN = 4
GAP_EXTEND = 1

# Normalize every probe once up front: this does not depend on the read, so
# there is no need to redo it for every file and orientation.
# Each entry is (is_on_probe, probe name, normalized probe sequence).
normalized_probes = [
    (on, row["Name"], workflow.normalize_seq(row["Sequence"]))
    for on, selected_probes_df in [(True, probe_set_df), (False, placeholder_set_df)]
    for _, row in selected_probes_df.iterrows()
]

for seq_file in seq_files:
    with warnings.catch_warnings():
        # Silence any warnings emitted while reading the sequence file.
        warnings.simplefilter("ignore")
        orig_seq = io.read_file(seq_file)
    for rc in (False, True):
        seq = sequence.reverse_complement(orig_seq) if rc else orig_seq
        seq = workflow.normalize_seq(seq)
        if circular:
            seq = seq + seq  # to handle wrap-around in circular sequences
        on_probes = []
        off_probes = []
        partial_matches = []
        for on, probe_name, probe_seq in normalized_probes:
            match = parasail.sg_dx_trace_striped_sat(
                probe_seq, seq, GAP_OPEN, GAP_EXTEND, sub_matrix
            )
            if match.score == len(probe_seq):
                if on:
                    on_probes.append(probe_name)
                else:
                    off_probes.append(probe_name)
            elif match.score > len(probe_seq) / 2:
                partial_matches.append(
                    (
                        probe_name + ("_off" if not on else ""),
                        match.score,
                        match.cigar.decode.decode(),
                    )
                )
        prefix = "{}{}:".format(seq_file.name, " (rc)" if rc else "")
        # Sort the set-derived results so the report is identical from run to
        # run (set iteration order for strings varies between kernels).
        missing_probes = sorted(all_probes - set(on_probes) - set(off_probes))
        conflicting_probes = sorted(set(on_probes) & set(off_probes))
        # Nothing found in the forward orientation: stay quiet and try the
        # reverse complement, which is always reported.
        if not on_probes and not conflicting_probes and not partial_matches and not rc:
            continue
        print(prefix)
        if on_probes:
            print("    ON: {}".format(",".join(on_probes)))
        if partial_matches:
            for partial_name, score, cigar in partial_matches:
                print("    PARTIAL: {} score:{} CIGAR:{}".format(partial_name, score, cigar))
        if missing_probes:
            print("    MISSING: {}".format(",".join(missing_probes)))
        if conflicting_probes:
            print("    CONFLICTING: {}".format(",".join(conflicting_probes)))
        print()
        # Every probe was found in this orientation; skip the other one.
        if not missing_probes:
            break

In [None]:
# Print one "name<TAB>sequence" line per labelled probe.
for probe_label, probe_sequence in zip(probe_set_df["Name"], probe_set_df["Sequence"]):
    print("{}\t{}".format(probe_label, probe_sequence))

In [None]:
# Print one "name_off<TAB>sequence" line per placeholder probe.
for placeholder_label, placeholder_sequence in zip(
    placeholder_set_df["Name"], placeholder_set_df["Sequence"]
):
    print("{}_off\t{}".format(placeholder_label, placeholder_sequence))