In [None]:
import collections
import re

import ipywidgets
import numpy as np
import pandas as pd
import qgrid

In [None]:
# Reload imported modules before each cell executes, so edits to local
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [None]:
from cloning import *

In [None]:
rcvenus_seq = "atgagtaaaggcgaagaattgttcactggcgtggtaccgatcctggtagaactggatggcgacgttaatggtcacaagttcagcgttagtggagagggtgaaggtgatgcgacctatggcaaactgaccctgaagctgatctgcacaaccggcaagctgcctgttccttggccgacactggttacaacgctgggctatggcgtacaatgtttcgcacggtacccggaccacatgaagcaacatgacttcttcaagagcgctatgcctgaaggctatgtccaagaaaggactatcttcttcaaagacgacggcaattacaagacacgggccgaagtcaaattcgaaggcgatacgctggtcaacagaatcgagctgaaaggcatcgacttcaaggaagatggcaacatcctgggccataaactggaatataattataacagtcataatgtgtatatcaccgctgacaaacaaaagaatggcatcaaggccaacttcaaaatcagacataacatcgaagatggaggtgttcaactggcagaccactaccaacaaaatactccgatcggcgatggcccggtgctgctgccggataaccattatctgagttatcaaagtaagctgagcaaggatccgaacgaaaaaagagatcatatggttctgctggaattcgtaacggccgcgggcatcacgcatggcatggacgagctgtataaataa"
mvenus_seq = "ATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTGATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTTTGGGTTATGGTGTTCAATGCTTTGCGAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTATAACTCACACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCGAACTTCAAAATTAGACACAACATTGAAGATGGAGGTGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCTACCAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGACCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAA".lower()
mscfp3_seq = "ATGAGTAAAGGAGAAGAACTTTTCACTGGAGTTGTCCCAATTCTTGTTGAATTAGATGGTGATGTTAATGGGCACAAATTTTCTGTCAGTGGAGAGGGTGAAGGTGATGCAACATACGGAAAACTTACCCTTAAATTTATTTGCACTACTGGAAAACTACCTGTTCCATGGCCAACACTTGTCACTACTCTCACTTGGGGTGTTCAATGCTTTGCAAGATACCCAGATCATATGAAACAGCATGACTTTTTCAAGAGTGCCATGCCCGAAGGTTATGTACAGGAAAGAACTATATTTTTCAAAGATGACGGGAACTACAAGACACGTGCTGAAGTCAAGTTTGAAGGTGATACCCTTGTTAATAGAATCGAGTTAAAAGGTATTGATTTTAAAGAAGATGGAAACATTCTTGGACACAAATTGGAATACAACTACATCTCAGACAATGTATACATCACGGCAGACAAACAAAAGAATGGAATCAAAGCTAACTTCAAAATTAGACACAACATTGAAGATGGAGGCGTTCAACTAGCAGACCATTATCAACAAAATACTCCAATTGGCGATGGCCCTGTCCTTTTACCAGACAACCATTACCTGTCCACACAATCTAAGCTTTCGAAAGATCCCAACGAAAAGAGAGACCACATGGTCCTTCTTGAGTTTGTAACAGCTGCTGGGATTACACATGGCATGGATGAACTATACAAATAA".lower()
mscarlet_seq = "ATGAGTAAAGGAGAAGCTGTGATTAAAGAGTTCATGCGCTTCAAAGTTCACATGGAGGGTTCTATGAACGGTCACGAGTTCGAGATCGAAGGCGAAGGCGAGGGCCGTCCGTATGAAGGCACCCAGACCGCCAAACTGAAAGTGACTAAAGGCGGCCCGCTGCCTTTTTCCTGGGACATCCTGAGCCCGCAATTTATGTACGGTTCTAGGGCGTTCATCAAACACCCAGCGGATATCCCGGACTATTATAAGCAGTCTTTTCCGGAAGGTTTCAAGTGGGAACGCGTAATGAATTTTGAAGATGGTGGTGCCGTGACCGTCACTCAGGACACCTCCCTGGAGGATGGCACCCTGATCTATAAAGTTAAACTGCGTGGTACTAATTTTCCACCTGATGGCCCGGTGATGCAGAAAAAGACGATGGGTTGGGAGGCGTCTACCGAACGCTTGTATCCGGAAGATGGTGTGCTGAAAGGCGACATTAAAATGGCCCTGCGCCTGAAAGATGGCGGCCGCTATCTGGCTGACTTCAAAACCACGTACAAAGCCAAGAAACCTGTGCAGATGCCTGGCGCGTACAATGTGGACCGCAAACTGGACATCACCTCTCATAATGAAGATTATACGGTGGTAGAGCAATATGAGCGCTCCGAGGGTCGTCATTCTACCGGTGGCATGGATGAACTATACAAATAA".lower()

In [None]:
# Template sequence that the Kondrashov primer-design loop reads via the global `seq`.
seq = rcvenus_seq

# Kondrashov 2016

In [None]:
def parse_kondrashov_mutations(s):
    """Parse a colon-separated mutation string into a tuple of mutations.

    Parameters
    ----------
    s : str or missing value
        Mutation codes joined by ":". Anything ``pd.isna`` reports as missing
        is treated as "no mutations".

    Returns
    -------
    tuple
        One ``parse_kondrashov_mutation`` result per code, or ``()`` when
        ``s`` is missing.

    Raises
    ------
    Exception
        Whatever the per-code parser raises. The offending input is printed
        first so it can be located in the source table.
    """
    if pd.isna(s):
        return ()
    try:
        return tuple(parse_kondrashov_mutation(code) for code in s.split(":"))
    except Exception:
        # `Exception` (not a bare except) so KeyboardInterrupt / SystemExit
        # propagate without triggering the debug print.
        print(">>>", s)
        raise


def parse_kondrashov_mutation(s):
    """Parse a single mutation code such as ``"SK3R"``.

    The code is a type letter (``S``, ``I`` or ``D``), an uppercase letter for
    the original symbol, a decimal position, and an uppercase letter or ``*``
    for the new symbol.

    Parameters
    ----------
    s : str
        The mutation code. Only the start of the string has to match.

    Returns
    -------
    tuple
        ``(mut_type, mut_from, mut_res, mut_to)`` where ``mut_res`` is the
        parsed position plus 2 (see the comment in the body).

    Raises
    ------
    ValueError
        If ``s`` does not start with a recognizable mutation code.
    """
    match = re.match(r"(S|I|D)([A-Z])(\d+)([A-Z*])", s)
    if match is None:
        # Fail with the offending text instead of an opaque AttributeError on None.
        raise ValueError("unrecognized mutation code: {!r}".format(s))
    mut_type, mut_from, mut_res, mut_to = match.groups()
    # Kondrashov uses MixCR 0-based indexing, also is missing initial methionine, so need to add 2 to index
    # NOTE(review): the same +2 is applied when this parser is run on the
    # nucleotide mutation column; confirm before using those positions as coordinates.
    mut_res = int(mut_res) + 2
    return (mut_type, mut_from, mut_res, mut_to)

In [None]:
# Load the genotype/brightness table and attach parsed mutation tuples,
# per-genotype mutation counts, and (for single-aa mutants) the residue number.
kondrashov = pd.read_csv(
    "2016kondrashov/nucleotide_genotypes_to_brightness.tsv",
    sep="\t",
)
kondrashov["aa_mutations"] = kondrashov["aaMutations"].map(parse_kondrashov_mutations)
kondrashov["nt_mutations"] = kondrashov["nMutations"].map(parse_kondrashov_mutations)
kondrashov["num_aa_mutations"] = kondrashov["aa_mutations"].map(len)
kondrashov["num_nt_mutations"] = kondrashov["nt_mutations"].map(len)

# `res` is only defined for genotypes with exactly one amino-acid mutation;
# index alignment leaves it missing everywhere else.
kondrashov["res"] = kondrashov.loc[
    kondrashov["num_aa_mutations"] == 1, "aa_mutations"
].map(lambda mutations: mutations[0][2])

kondrashov_single = kondrashov.loc[kondrashov["num_aa_mutations"] == 1]
kondrashov_single_nt = kondrashov.loc[kondrashov["num_nt_mutations"] == 1]
# Optional interactive inspection: qgrid.show_grid(kondrashov)

In [None]:
# Pool all single-aa genotypes that share the same amino-acid mutation:
# residue number, mean/std of median brightness, and total barcode count.
kondrashov_aa_grouped = kondrashov_single.groupby("aa_mutations").agg(
    {
        "res": ["first"],
        "medianBrightness": ["mean", "std"],
        "uniqueBarcodes": ["sum"],
    }
)
# Flatten the (column, statistic) pairs into names such as "medianBrightness_mean".
kondrashov_aa_grouped.columns = [
    "_".join(column_and_stat)
    for column_and_stat in kondrashov_aa_grouped.columns.values
]

## New

In [None]:
def dim_frac(x, threshold=3):
    """Return the fraction of values in ``x`` strictly below ``threshold``.

    Parameters
    ----------
    x : pandas.Series
        Values to classify; only ``x.values`` and ``len(x)`` are used.
    threshold : number, optional
        Cutoff below which a value counts as dim. Defaults to 3, the cutoff
        used throughout this notebook.

    Returns
    -------
    float
        Count of values below ``threshold`` divided by ``len(x)``.
    """
    return (x.values < threshold).sum() / len(x)


# Per-residue summary of single-mutant mean brightness.
# A plain list of aggregations yields flat columns named "mean", "size",
# "max", "min" and "dim_frac". A dict-of-lists ("nested renamer") on a
# SeriesGroupBy raises SpecificationError on pandas >= 1.0.
kondrashov_spread = kondrashov_aa_grouped.groupby(["res_first"])[
    "medianBrightness_mean"
].agg(["mean", "size", "max", "min", dim_frac])
# Spread of each residue's mutants relative to its dimmest mutant.
kondrashov_spread["mean_minus_min"] = (
    kondrashov_spread["mean"] - kondrashov_spread["min"]
)
kondrashov_spread["max_minus_min"] = kondrashov_spread["max"] - kondrashov_spread["min"]

In [None]:
# Residues with more than 3 observed mutants, at least one of them below
# brightness 3, ordered from the smallest dim fraction upward.
low_dim_frac = kondrashov_spread.loc[
    lambda spread: (spread["size"] > 3) & (spread["min"] < 3)
].sort_values("dim_frac")

In [None]:
# Residues where every observed single mutant falls below the dim cutoff.
all_dim = kondrashov_spread.loc[lambda spread: spread["dim_frac"] == 1]

## Order 180328

In [None]:
# Residues of interest: the union of the two residue selections above.
good_res = set(all_dim.index).union(low_dim_frac.index)

In [None]:
# For each residue of interest, the index label of its dimmest single mutant.
dimmest_muts = (
    kondrashov_aa_grouped.loc[lambda grouped: grouped["res_first"].isin(good_res)]
    .groupby("res_first")["medianBrightness_mean"]
    .idxmin()
)
len(dimmest_muts)

In [None]:
# Every dim (mean brightness < 3) single mutant at a residue of interest.
dim_muts = kondrashov_aa_grouped.loc[
    lambda grouped: (grouped["medianBrightness_mean"] < 3)
    & (grouped["res_first"].isin(good_res))
]
len(dim_muts)

In [None]:
# Design substitution primers for every dim mutation, keeping one primer
# record per distinct single-nucleotide change ("x->y") for each mutation.
# codon_to_aa, aa_to_codon, site_diff, iva_substitution_primers and
# mutation_name are presumably supplied by `from cloning import *`.
primers = []
for mut in dim_muts.index.values:
    # Each index label wraps a single (type, from_aa, residue, to_aa) tuple.
    mut = mut[0]
    res = mut[2]
    old_aa = mut[1].lower()
    new_aa = mut[3].lower()
    # Offset of the codon in `seq`, treating `res` as a 1-based residue number.
    seq_idx = 3 * (res - 1)
    old_codon = seq[seq_idx : seq_idx + 3]
    if codon_to_aa[old_codon] != old_aa:
        # The template does not carry the expected residue here: report and skip.
        print(
            "expecting aa {} at residue {}, found {} instead".format(
                old_aa, res, codon_to_aa[old_codon]
            )
        )
        continue
    # Referenced through `collections` (imported at the top of the notebook)
    # rather than relying on a bare name leaked by the star import.
    primers_for_transition = collections.defaultdict(list)
    for new_codon in aa_to_codon[new_aa]:
        # NOTE(review): this iterates over every codon for old_aa, not only the
        # codon actually present in `seq` (old_codon), while the substitution is
        # applied to `seq` and the name is built from old_codon. Confirm that a
        # diff computed from a different synonymous codon is intended.
        for alt_old_codon in aa_to_codon[old_aa]:
            diff = site_diff(alt_old_codon, new_codon)
            if len(diff) == 1:
                # diff[0] is indexed as (offset within codon, old base, new base).
                primer = iva_substitution_primers(seq, diff[0][2], seq_idx + diff[0][0])
                primer["mutation"] = mutation_name(old_codon, new_codon, res)
                transition = "{}->{}".format(diff[0][1], diff[0][2])
                primer["transition"] = transition
                primers_for_transition[transition].append(primer)
    # Keep only the first primer found for each transition of this mutation.
    for transition, transition_primers in primers_for_transition.items():
        primers.append(transition_primers[0])
len(primers)

In [None]:
# Alias so the filtering/display cells that follow can operate on `ps`.
ps = primers

In [None]:
# Keep only records that carry a "forward_primer" entry, then tabulate them.
ps_nonnull = list(filter(lambda record: "forward_primer" in record, ps))
df = pd.DataFrame(ps_nonnull)

In [None]:
# Interactive table for hand-picking primers: a "selected" checkbox column,
# primer lengths, and every remaining field except the raw primer sequences
# and the "_dG" columns.
df_display = df.assign(
    selected=False,
    forward_len=df["forward_primer"].map(len),
    reverse_len=df["reverse_primer"].map(len),
)
cols = ["selected", "mutation", "transition", "forward_len", "reverse_len"] + [
    c
    for c in ps_nonnull[0].keys()
    if c not in ("mutation", "transition") and not c.endswith(("_dG", "_primer"))
]
df_display = df_display[cols]
table = qgrid.show_grid(
    df_display,
    grid_options=dict(
        forceFitColumns=False,
        defaultColumnWidth=120,
        autoEdit=True,
    ),
    precision=1,
)
table

In [None]:
# Read the grid state back and keep the rows ticked as "selected".
# NOTE(review): the grid's index labels are used positionally via .iloc,
# which matches only while `df` keeps its default 0..n-1 index.
df2 = table.get_changed_df()
selected_primers = df.iloc[df2.loc[df2["selected"]].index].sort_index()
selected_primers

In [None]:
# Tally how many selected primers fall under each transition label.
collections.Counter(selected_primers["transition"])

In [None]:
# Persist the hand-picked selection so the order can be reproduced without the widget.
selected_primers.to_json("180328primers_fixed.json")

In [None]:
# First primer number and construct label used to build the "JQS<num>_<prefix>_..." names.
jqs_num = 114
prefix = "rcvenus"

In [None]:
# Print tab-separated "name<TAB>sequence" order lines: all forward primers
# first, then all reverse primers. A sequence that was already listed is
# skipped and does not consume a number.
# `jqs_num` advances as names are assigned, so re-run the cell that sets it
# before re-running this one.
ordered_seqs = set()
primers_for_output = collections.defaultdict(list)
for p in selected_primers.itertuples():
    # The loop variable is `primer_seq`, not `seq`, so the global template
    # sequence `seq` read by the primer-design cells is not overwritten.
    for suffix, primer_seq in (("f", p.forward_primer), ("r", p.reverse_primer)):
        name = "JQS{}_{}_{}_{}".format(jqs_num, prefix, p.mutation, suffix)
        order = "{}\t{}".format(name, primer_seq)
        if primer_seq in ordered_seqs:
            continue
        ordered_seqs.add(primer_seq)
        primers_for_output[suffix].append(order)
        jqs_num += 1
print("\n".join(primers_for_output["f"] + primers_for_output["r"]))

# mSCFP3 stop codons

In [None]:
# Candidate primers for the mSCFP3 template; stop_codon_primers is presumably
# provided by the `cloning` star import.
ps = stop_codon_primers(mscfp3_seq)

In [None]:
# Keep only records that carry a "forward_primer" entry, then tabulate them.
ps_nonnull = list(filter(lambda record: "forward_primer" in record, ps))
df = pd.DataFrame(ps_nonnull)

In [None]:
# Interactive table for hand-picking primers: a "selected" checkbox column,
# primer lengths, and every remaining field except the raw primer sequences
# and the "_dG" columns.
df_display = df.assign(
    selected=False,
    forward_len=df["forward_primer"].map(len),
    reverse_len=df["reverse_primer"].map(len),
)
cols = ["selected", "mutation", "transition", "forward_len", "reverse_len"] + [
    c
    for c in ps_nonnull[0].keys()
    if c not in ("mutation", "transition") and not c.endswith(("_dG", "_primer"))
]
df_display = df_display[cols]
table = qgrid.show_grid(
    df_display,
    grid_options=dict(
        forceFitColumns=False,
        defaultColumnWidth=120,
        autoEdit=True,
    ),
    precision=1,
)
table

In [None]:
# Read the grid state back and keep the rows ticked as "selected".
# NOTE(review): the grid's index labels are used positionally via .iloc,
# which matches only while `df` keeps its default 0..n-1 index.
df2 = table.get_changed_df()
selected_primers = df.iloc[df2.loc[df2["selected"]].index].sort_index()
selected_primers

In [None]:
# Persist the hand-picked mSCFP3 selection so the order can be reproduced without the widget.
selected_primers.to_json("180423primers_mscfp3.json")

In [None]:
# First primer number and construct label used to build the "JQS<num>_<prefix>_..." names.
jqs_num = 143
prefix = "mSCFP3"

In [None]:
# Print tab-separated "name<TAB>sequence" order lines: all forward primers
# first, then all reverse primers. A sequence that was already listed is
# skipped and does not consume a number.
# `jqs_num` advances as names are assigned, so re-run the cell that sets it
# before re-running this one.
ordered_seqs = set()
primers_for_output = collections.defaultdict(list)
for p in selected_primers.itertuples():
    # The loop variable is `primer_seq`, not `seq`, so the global template
    # sequence `seq` read by the primer-design cells is not overwritten.
    for suffix, primer_seq in (("f", p.forward_primer), ("r", p.reverse_primer)):
        name = "JQS{}_{}_{}_{}".format(jqs_num, prefix, p.mutation, suffix)
        order = "{}\t{}".format(name, primer_seq)
        if primer_seq in ordered_seqs:
            continue
        ordered_seqs.add(primer_seq)
        primers_for_output[suffix].append(order)
        jqs_num += 1
print("\n".join(primers_for_output["f"] + primers_for_output["r"]))

In [None]:
# Disabled cell: when uncommented it prints one "rev_f" order line per selected
# primer from its `reversion_forward_primer` field, numbering from JQS107.
# jqs_num = 107
# for p in selected_primers.itertuples():
#     for suffix, seq in (('rev_f', p.reversion_forward_primer),):
#         name = 'JQS{}_{}_{}_{}'.format(jqs_num, prefix, p.mutation, suffix)
#         print("{}\t{}".format(name, seq))
#         jqs_num += 1

# mScarlet stop codons

In [None]:
# Candidate primers for the mScarlet template; stop_codon_primers is presumably
# provided by the `cloning` star import.
ps = stop_codon_primers(mscarlet_seq)

In [None]:
# Keep only records that carry a "forward_primer" entry, then tabulate them.
ps_nonnull = list(filter(lambda record: "forward_primer" in record, ps))
df = pd.DataFrame(ps_nonnull)

In [None]:
# Interactive table for hand-picking primers: a "selected" checkbox column,
# primer lengths, and every remaining field except the raw primer sequences
# and the "_dG" columns.
df_display = df.assign(
    selected=False,
    forward_len=df["forward_primer"].map(len),
    reverse_len=df["reverse_primer"].map(len),
)
cols = ["selected", "mutation", "transition", "forward_len", "reverse_len"] + [
    c
    for c in ps_nonnull[0].keys()
    if c not in ("mutation", "transition") and not c.endswith(("_dG", "_primer"))
]
df_display = df_display[cols]
table = qgrid.show_grid(
    df_display,
    grid_options=dict(
        forceFitColumns=False,
        defaultColumnWidth=120,
        autoEdit=True,
    ),
    precision=1,
)
table

In [None]:
# Read the grid state back and keep the rows ticked as "selected".
# NOTE(review): the grid's index labels are used positionally via .iloc,
# which matches only while `df` keeps its default 0..n-1 index.
df2 = table.get_changed_df()
selected_primers = df.iloc[df2.loc[df2["selected"]].index].sort_index()
selected_primers

In [None]:
# Persist the hand-picked mScarlet selection so the order can be reproduced without the widget.
selected_primers.to_json("180423primers_mscarlet.json")

In [None]:
# First primer number and construct label used to build the "JQS<num>_<prefix>_..." names.
jqs_num = 162
prefix = "mScarlet"

In [None]:
# Print tab-separated "name<TAB>sequence" order lines: all forward primers
# first, then all reverse primers. A sequence that was already listed is
# skipped and does not consume a number.
# `jqs_num` advances as names are assigned, so re-run the cell that sets it
# before re-running this one.
ordered_seqs = set()
primers_for_output = collections.defaultdict(list)
for p in selected_primers.itertuples():
    # The loop variable is `primer_seq`, not `seq`, so the global template
    # sequence `seq` read by the primer-design cells is not overwritten.
    for suffix, primer_seq in (("f", p.forward_primer), ("r", p.reverse_primer)):
        name = "JQS{}_{}_{}_{}".format(jqs_num, prefix, p.mutation, suffix)
        order = "{}\t{}".format(name, primer_seq)
        if primer_seq in ordered_seqs:
            continue
        ordered_seqs.add(primer_seq)
        primers_for_output[suffix].append(order)
        jqs_num += 1
print("\n".join(primers_for_output["f"] + primers_for_output["r"]))

# mSCFP3 chromophore: G91A, L60H

In [None]:
# Switch the global template sequence to mSCFP3 for the chromophore primer design below.
seq = mscfp3_seq

In [None]:
# Parse the requested chromophore mutations; the design loop below indexes
# each result as (old_aa, new_aa, residue).
muts = list(map(parse_mutation_name, ["G91A", "L60H"]))
muts

In [None]:
# Design substitution primers for each requested mutation, keeping one primer
# record per distinct single-nucleotide change ("x->y") for each mutation.
# codon_to_aa, aa_to_codon, site_diff, iva_substitution_primers and
# mutation_name are presumably supplied by `from cloning import *`.
primers = []
for mut in muts:
    # Each parsed mutation is indexed as (old_aa, new_aa, residue).
    res = mut[2]
    old_aa = mut[0].lower()
    new_aa = mut[1].lower()
    # Offset of the codon in `seq`, treating `res` as a 1-based residue number.
    seq_idx = 3 * (res - 1)
    old_codon = seq[seq_idx : seq_idx + 3]
    if codon_to_aa[old_codon] != old_aa:
        # The template does not carry the expected residue here: report and skip.
        print(
            "expecting aa {} at residue {}, found {} instead".format(
                old_aa, res, codon_to_aa[old_codon]
            )
        )
        continue
    # Referenced through `collections` (imported at the top of the notebook)
    # rather than relying on a bare name leaked by the star import.
    primers_for_transition = collections.defaultdict(list)
    for new_codon in aa_to_codon[new_aa]:
        # NOTE(review): this iterates over every codon for old_aa, not only the
        # codon actually present in `seq` (old_codon), while the substitution is
        # applied to `seq` and the name is built from old_codon. Confirm that a
        # diff computed from a different synonymous codon is intended.
        for alt_old_codon in aa_to_codon[old_aa]:
            diff = site_diff(alt_old_codon, new_codon)
            if len(diff) == 1:
                # diff[0] is indexed as (offset within codon, old base, new base).
                primer = iva_substitution_primers(seq, diff[0][2], seq_idx + diff[0][0])
                primer["mutation"] = mutation_name(old_codon, new_codon, res)
                transition = "{}->{}".format(diff[0][1], diff[0][2])
                primer["transition"] = transition
                primers_for_transition[transition].append(primer)
    # Keep only the first primer found for each transition of this mutation.
    for transition, transition_primers in primers_for_transition.items():
        primers.append(transition_primers[0])
len(primers)

In [None]:
# No manual selection step here: every designed primer is ordered.
selected_primers = pd.DataFrame(primers)
selected_primers

In [None]:
# First primer number and construct label used to build the "JQS<num>_<prefix>_..." names.
jqs_num = 174
prefix = "mSCFP3"

In [None]:
# Print tab-separated "name<TAB>sequence" order lines: all forward primers
# first, then all reverse primers. A sequence that was already listed is
# skipped and does not consume a number.
# `jqs_num` advances as names are assigned, so re-run the cell that sets it
# before re-running this one.
ordered_seqs = set()
primers_for_output = collections.defaultdict(list)
for p in selected_primers.itertuples():
    # The loop variable is `primer_seq`, not `seq`, so the global template
    # sequence `seq` read by the primer-design cells is not overwritten.
    for suffix, primer_seq in (("f", p.forward_primer), ("r", p.reverse_primer)):
        name = "JQS{}_{}_{}_{}".format(jqs_num, prefix, p.mutation, suffix)
        order = "{}\t{}".format(name, primer_seq)
        if primer_seq in ordered_seqs:
            continue
        ordered_seqs.add(primer_seq)
        primers_for_output[suffix].append(order)
        jqs_num += 1
print("\n".join(primers_for_output["f"] + primers_for_output["r"]))