In [None]:
import ast
import csv
import re

import numpy as np
import pandas as pd
import seaborn as sns
from Bio import SeqIO
from matplotlib import pyplot as plt

In [None]:
def align_read(querystr, cigarstr, pattern=re.compile("[0-9]{0,10}[MDI]")):
    """Project a query sequence onto reference coordinates using a CIGAR string.

    Only the tokens matched by ``pattern`` are interpreted (by default a run
    of up to ten digits followed by ``M``, ``D`` or ``I``):

    * ``M`` copies the next ``n`` query characters to the output.
    * ``D`` emits ``n`` gap characters (``-``) and consumes no query.
    * ``I`` skips ``n`` query characters and emits nothing.

    Note: ``align_read`` is defined a second time further down in this same
    cell with an identical body; that later definition is the one left bound.

    Args:
        querystr: The query (read / consensus) sequence.
        cigarstr: CIGAR string describing the alignment.
        pattern: Compiled regex used to tokenize ``cigarstr``.

    Returns:
        The query rewritten in reference coordinates, as a string.

    Raises:
        ValueError: If a matched token has no leading count (``int("")``).
    """
    pieces = []
    cursor = 0  # position of the next unconsumed query character
    for match in pattern.finditer(cigarstr):
        token = match.group(0)
        op, length = token[-1], int(token[:-1])
        if op == "M":
            pieces.append(querystr[cursor : cursor + length])
            cursor += length
        elif op == "D":
            pieces.append("-" * length)
        elif op == "I":
            cursor += length
    return "".join(pieces)


def cigarsfromsam(samfilepath):
    """Map each read name in a SAM file to its CIGAR string.

    Args:
        samfilepath: Path to a tab-separated SAM text file.

    Returns:
        dict mapping column 1 (query name) to column 6 (CIGAR) of every
        non-header line. If a name occurs more than once, the last line wins.
    """
    cigars = {}
    with open(samfilepath, "r") as samfile:
        for line in samfile:
            # Header lines start with "@": skip exactly this line. Advancing
            # the iterator with next() here would also throw away the line
            # that follows and raise StopIteration if the header ends the file.
            if line.startswith("@"):
                continue
            # Tolerate blank lines (e.g. a trailing empty line).
            if not line.strip():
                continue
            splitline = line.rstrip("\n").split("\t")
            cigars[splitline[0]] = splitline[5]
    return cigars


def strsfromfasta(fastafilepath):
    """Load a FASTA file into a dictionary of plain sequence strings.

    Args:
        fastafilepath: Path (or handle) accepted by ``SeqIO.parse``.

    Returns:
        dict keyed as produced by ``SeqIO.to_dict``; each value is
        ``str(record.seq)`` for the corresponding record.
    """
    sequences = {}
    for record_key, record in SeqIO.to_dict(
        SeqIO.parse(fastafilepath, "fasta")
    ).items():
        sequences[record_key] = str(record.seq)
    return sequences


def make_seg_dict(gfafile):
    """Read the segment (``S``) lines of a GFA file into ``{name: sequence}``.

    Args:
        gfafile: Path to a tab-separated GFA text file.

    Returns:
        dict mapping field 2 (segment name) to field 3 (sequence) of every
        line that starts with ``S``.
    """
    segment_dict = {}
    with open(gfafile, "r") as infile:
        for line in infile:
            if line.startswith("S"):
                # Strip the line terminator before splitting. Slicing the
                # last character off field 3 instead removes a real base
                # whenever the line carries extra tag fields or is the final
                # line of a file without a trailing newline.
                splitline = line.rstrip("\r\n").split("\t")
                segment_dict[splitline[1]] = splitline[2]
    return segment_dict


def get_ref_intervals(gfafile):
    """Compute half-open ``(start, end)`` intervals for the segments of a GFA.

    Segments are laid end to end in file order. Segments whose name contains
    ``"OFF"`` are skipped entirely (they do not advance the running offset).
    When a name contains ``"ON"`` its last two characters are dropped, so
    e.g. ``"BIT3ON"`` is stored as ``"BIT3"``.

    Args:
        gfafile: Path to a tab-separated GFA text file.

    Returns:
        dict mapping segment name to ``(start, end)`` offsets into the
        concatenation of the retained segment sequences.
    """
    segment_dict = {}
    current_idx = 0
    with open(gfafile, "r") as infile:
        for line in infile:
            if not line.startswith("S"):
                continue
            # Strip the terminator up front so the sequence length is right
            # even with trailing tag fields or no newline at end of file.
            splitline = line.rstrip("\r\n").split("\t")
            name = splitline[1]
            if "OFF" in name:
                continue
            strlen = len(splitline[2])
            if "ON" in name:
                name = name[:-2]
            segment_dict[name] = (current_idx, current_idx + strlen)
            current_idx += strlen
    return segment_dict


def align_read(querystr, cigarstr, pattern=re.compile("[0-9]{0,10}[MDI]")):
    """Rewrite ``querystr`` in reference coordinates according to ``cigarstr``.

    This repeats the ``align_read`` defined earlier in the same cell; being
    the later definition, it is the one callers below actually use.

    Each token matched by ``pattern`` is ``<count><op>``:

    * ``M``: append the next ``count`` query characters.
    * ``D``: append ``count`` ``-`` characters; the query is not advanced.
    * ``I``: advance the query by ``count`` characters; nothing is appended.

    Args:
        querystr: Query sequence to project.
        cigarstr: CIGAR string for the query.
        pattern: Compiled regex that tokenizes ``cigarstr``.

    Returns:
        The projected sequence as a string.

    Raises:
        ValueError: If a matched token carries no digits before the op.
    """
    operations = [
        (found.group(0)[-1], int(found.group(0)[:-1]))
        for found in pattern.finditer(cigarstr)
    ]
    chunks = []
    query_pos = 0
    for op_code, run_length in operations:
        if op_code == "M":
            next_pos = query_pos + run_length
            chunks.append(querystr[query_pos:next_pos])
            query_pos = next_pos
        elif op_code == "D":
            chunks.append("-" * run_length)
        elif op_code == "I":
            query_pos += run_length
    return "".join(chunks)


def splitstr(instr, ref_intervals):
    """Slice ``instr`` into named pieces.

    Args:
        instr: Sequence to slice.
        ref_intervals: Mapping of name to an indexable ``(start, end)`` pair.

    Returns:
        dict mapping each name to ``instr[start:end]``.
    """
    pieces = {}
    for name, bounds in ref_intervals.items():
        start, end = bounds[0], bounds[1]
        pieces[name] = instr[start:end]
    return pieces


def slow_hamming_distance(s1, s2):
    """Count mismatching positions between two equal-length sequences.

    Positions where either sequence has ``"N"`` are never counted.

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        Number of positions where the characters differ and neither is "N".

    Raises:
        ValueError: If the sequences differ in length (both are printed first).
    """
    if len(s1) != len(s2):
        print(s1, s2)
        raise ValueError("Strand lengths are not equal!")
    mismatches = 0
    for left, right in zip(s1, s2):
        if left == "N" or right == "N":
            continue  # ambiguous base: ignored
        if left != right:
            mismatches += 1
    return mismatches


def get_dict_dist(dict1, dict2):
    """Compute per-key Hamming distances between two dictionaries of sequences.

    Args:
        dict1: Mapping whose keys drive the comparison.
        dict2: Mapping that must contain every key of ``dict1``.

    Returns:
        dict mapping each key of ``dict1`` to
        ``slow_hamming_distance(dict1[key], dict2[key])``.
    """
    hamming_dict = {}
    for key in dict1:
        hamming_dict[key] = slow_hamming_distance(dict1[key], dict2[key])
    return hamming_dict

In [None]:
# ORIGINAL PATHS
# R9_data = pd.read_csv("/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/output.tsv",delimiter="\t")
# R10_data = pd.read_csv("/home/de64/scratch/de64/2020-10-18_snakemake_2020-10-14_lDE11_R10-3_merged/output.tsv",delimiter="\t")
# ref_intervals = get_ref_intervals("/home/de64/scratch/de64/2020-10-18_snakemake_2020-10-14_lDE11_R10-3_merged/ref.gfa")

In [None]:
# Load the per-row output tables of the two runs (tab-separated) and the
# segment intervals of the reference graph.
R9_data = pd.read_csv(
    "/home/de64/scratch/de64/2020-12-05_DAC/R9_output.tsv", delimiter="\t"
)
R10_data = pd.read_csv(
    "/home/de64/scratch/de64/2020-12-05_DAC/R10_output.tsv", delimiter="\t"
)
ref_intervals = get_ref_intervals("./ref.gfa")

# Barcode sets per run and their overlap.
R9_barcodes = set(R9_data["barcode"].tolist())
R10_barcodes = set(R10_data["barcode"].tolist())

R9_only_barcodes = R9_barcodes - R10_barcodes
R10_only_barcodes = R10_barcodes - R9_barcodes
shared_barcodes = R10_barcodes & R9_barcodes
barcode_count_arr = np.array(
    [len(R9_only_barcodes), len(R10_only_barcodes), len(shared_barcodes)]
)

# Keep only barcodes present in both runs. .copy() makes these independent
# frames: later cells add columns to them, and writing into a boolean-mask
# slice triggers pandas' SettingWithCopyWarning.
R9_data = R9_data[R9_data["barcode"].isin(shared_barcodes)].copy()
R10_data = R10_data[R10_data["barcode"].isin(shared_barcodes)].copy()

In [None]:
# Bar chart of barcode counts: only in R9, only in R10, and in both runs.
ax = sns.barplot(x=["R9 Only", "R10 Only", "Both"], y=barcode_count_arr)

In [None]:
# One row per shared barcode, one integer column per barcode character.
# The barcodes are sorted first: iterating the set directly gives a row order
# that is not reproducible from one kernel session to the next, which would
# silently permute every per-row result derived from bit_arr.
bit_arr = np.array([list(item) for item in sorted(shared_barcodes)]).astype(int)
# Column-wise mean over all shared barcodes.
bit_freq = np.mean(bit_arr, axis=0)

In [None]:
# Number of columns in bit_arr (one mean per barcode position).
len(bit_freq)

In [None]:
# Display the barcode matrix built from the shared barcodes.
bit_arr

In [None]:
# Per-position mean of the barcode matrix. The x positions are derived from
# bit_freq itself so the plot still works if the barcode length changes.
ax = sns.barplot(x=list(range(len(bit_freq))), y=bit_freq, color="grey")

In [None]:
# Pairwise comparison of all barcodes (assumes bit_arr holds only 0/1 — TODO confirm).
# both_on[i, j]  : positions where barcodes i and j are both 1.
# both_off[i, j] : positions where barcodes i and j are both 0.
both_on = bit_arr @ bit_arr.T
both_off = (1 - bit_arr) @ (1 - bit_arr).T
# Total number of agreeing positions for every pair.
ttl_match = both_on + both_off
# Overwrite the self-comparisons on the diagonal with the sentinel 100.
np.fill_diagonal(ttl_match, 100)

In [None]:
# Column-wise minimum of ttl_match (the diagonal holds the sentinel 100, so
# self-pairs never win the minimum).
# NOTE(review): ttl_match is built as a count of *agreeing* positions, so this
# minimum selects the least similar barcode for each column. If the intent
# is the nearest neighbour by Hamming distance, a mismatch count is needed
# instead — confirm.
closest_match = np.min(ttl_match, axis=0)

In [None]:
# Histogram of closest_match restricted to the range 0-10.
plt.hist(closest_match, range=(0, 10))
plt.show()

In [None]:
# Number of barcodes whose closest_match value is exactly 1.
np.sum(closest_match == 1)

In [None]:
# Histogram of a random sample of pairwise match counts. The sample size is
# capped at the number of matrix entries: sampling without replacement raises
# ValueError when asked for more values than exist.
plt.hist(
    np.random.choice(
        ttl_match.flatten(), min(50000, ttl_match.size), replace=False
    ),
    range=(0, 27),
    bins=27,
)
plt.show()

In [None]:
# Add the same derived columns to both runs. The frames are modified in place,
# R9 first and then R10, so the temporaries below end up holding R10's values.
for frame in (R9_data, R10_data):
    # Consensus sequence projected onto reference coordinates via its CIGAR.
    aligned_cons = frame.apply(
        lambda row: align_read(row["consensus"], row["cigar"]), axis=1
    )
    frame["aligned_cons"] = aligned_cons

    # Cut reference and aligned consensus into the named reference intervals.
    split_ref = frame.apply(
        lambda row: splitstr(row["reference"], ref_intervals), axis=1
    )
    split_align = frame.apply(
        lambda row: splitstr(row["aligned_cons"], ref_intervals), axis=1
    )
    frame["split_ref"] = split_ref
    frame["split_align"] = split_align

    ## This is a hack until I can repull the alignment data...then I'll add padding for the unaligned parts
    # Keep only the "GFP" entry of each per-row dictionary.
    for column in ("split_ref", "split_align"):
        frame[column] = frame[column].apply(
            lambda segments: {
                key: val for key, val in segments.items() if key == "GFP"
            }
        )

    # Per-segment Hamming distance between aligned consensus and reference.
    hamm_ref = frame.apply(
        lambda row: get_dict_dist(row["split_align"], row["split_ref"]), axis=1
    )
    frame["hamm_ref"] = hamm_ref

    # True when the aligned consensus differs from the reference anywhere in
    # GFP[623:625].
    dark_gfp = (
        frame.apply(
            lambda row: slow_hamming_distance(
                row["split_align"]["GFP"][623:625], row["split_ref"]["GFP"][623:625]
            ),
            axis=1,
        )
        > 0
    )
    frame["dark_gfp"] = dark_gfp

In [None]:
# For each run, treat the subsample == 200 rows as ground truth and mark every
# row whose dark_gfp value equals the ground-truth value for its barcodeid.
# R9 is processed first, so gt_data / gt_lookup end up holding R10's values.
for frame in (R9_data, R10_data):
    gt_data = frame[frame["subsample"] == 200]
    gt_lookup = dict(zip(gt_data["barcodeid"], gt_data["dark_gfp"]))

    frame["call"] = frame.apply(
        lambda row: row["dark_gfp"] == gt_lookup[row["barcodeid"]], axis=1
    )

In [None]:
# Group each run's rows by their subsample value.
R9_subsample_group = R9_data.groupby("subsample")
R10_subsample_group = R10_data.groupby("subsample")

In [None]:
# Mean of the boolean "call" column per subsample level, i.e. the fraction of
# rows whose dark-GFP value matches the subsample == 200 reference.
R9_call_curve = R9_subsample_group.aggregate({"call": "mean"})
R10_call_curve = R10_subsample_group.aggregate({"call": "mean"})

In [None]:
# Convert the miscall fraction (1 - call rate) to a Phred-style score,
# Q = -10 * log10(error). A level with a call rate of exactly 1.0 gives
# log10(0) and therefore an infinite score.
R9_q_scores = -10 * np.log10(1.0 - R9_call_curve)
R10_q_scores = -10 * np.log10(1.0 - R10_call_curve)

In [None]:
# Display the R9 scores per subsample level.
R9_q_scores

In [None]:
# R10 miscall fraction per subsample level, expressed in percent.
(1.0 - R10_call_curve) * 100

In [None]:
# R10 miscall percentage against subsample level, zoomed to 0-1 % on the
# y axis and 0-100 on the x axis.
plt.plot((1.0 - R10_call_curve) * 100)
plt.ylim(0.0, 1.0)
plt.xlim(0, 100)
plt.xticks([2, 25, 50, 100])
plt.show()

In [None]:
# Miscall fraction of both runs (R9 plotted first, then R10) on a log y axis.
plt.plot(1.0 - R9_call_curve)
plt.plot(1.0 - R10_call_curve)
plt.xlim(0, 100)
plt.yscale("log")
plt.show()

In [None]:
# Call scores of both runs (R9 plotted first, then R10).
plt.plot(R9_q_scores)
plt.plot(R10_q_scores)
plt.xlim(0, 100)
# Tick at every subsample level except the last three index values.
plt.xticks(R9_q_scores.index.values[:-3])
plt.show()

#### Whole GFP Error

In [None]:
def _gfp_error_fraction(row):
    """Hamming distance over the GFP segment divided by the reference GFP length."""
    gfp_aligned = row["split_align"]["GFP"]
    gfp_reference = row["split_ref"]["GFP"]
    return slow_hamming_distance(gfp_aligned, gfp_reference) / len(gfp_reference)


# The error fraction is computed only for rows whose "call" is False; when the
# result is assigned back as a column, all other rows are left as NaN.
R9_gfp_error = R9_data.loc[R9_data["call"] == False].apply(
    _gfp_error_fraction, axis=1
)
R10_gfp_error = R10_data.loc[R10_data["call"] == False].apply(
    _gfp_error_fraction, axis=1
)

R9_data["GFP Error"] = R9_gfp_error
R10_data["GFP Error"] = R10_gfp_error

In [None]:
# Rebuild the per-subsample groups now that the frames have a "GFP Error" column.
R9_subsample_group = R9_data.groupby("subsample")
R10_subsample_group = R10_data.groupby("subsample")

In [None]:
# Mean "GFP Error" per subsample level, then the same value as a Phred-style
# score, Q = -10 * log10(error).
# Note: this rebinds R9_q_scores / R10_q_scores, replacing the call-based
# scores computed earlier in the notebook.
R9_error_curve = R9_subsample_group.aggregate({"GFP Error": "mean"})
R10_error_curve = R10_subsample_group.aggregate({"GFP Error": "mean"})
R9_q_scores = -10 * np.log10(R9_error_curve)
R10_q_scores = -10 * np.log10(R10_error_curve)

In [None]:
# GFP-error scores of both runs (R9 plotted first, then R10), x axis limited to 0-25.
plt.plot(R9_q_scores)
plt.plot(R10_q_scores)
plt.xlim(0, 25)
# Tick at every subsample level except the last four index values.
plt.xticks(R9_q_scores.index.values[:-4])
plt.show()

In [None]:
# Display the mean GFP error per subsample level for R9.
R9_error_curve

In [None]:
# Display the mean GFP error per subsample level for R10.
R10_error_curve

### UMI Error

In [None]:
# Re-read the R10 table from disk. This replaces the filtered, annotated
# R10_data built earlier, so every derived column is recomputed below.
R10_data = pd.read_csv(
    "/home/de64/scratch/de64/2020-12-05_DAC/R10_output.tsv", delimiter="\t"
)
aligned_cons = R10_data.apply(lambda x: align_read(x["consensus"], x["cigar"]), axis=1)
R10_data["aligned_cons"] = aligned_cons

split_ref = R10_data.apply(lambda x: splitstr(x["reference"], ref_intervals), axis=1)
split_align = R10_data.apply(
    lambda x: splitstr(x["aligned_cons"], ref_intervals), axis=1
)
R10_data["split_ref"] = split_ref
R10_data["split_align"] = split_align

# Window of the aligned GFP segment that is compared between subsamples.
# The error-rate denominator is derived from the same bounds, so the two
# cannot drift apart if the window is changed.
nmer_window = slice(925, 940)
R10_data["Nmer"] = R10_data["split_align"].apply(lambda x: x["GFP"][nmer_window])

# The subsample == 200 row of each barcodeid serves as the reference N-mer.
gt_data = R10_data[R10_data["subsample"] == 200]
gt_lookup = dict(zip(gt_data["barcodeid"], gt_data["Nmer"]))
R10_data["Nmer Errors"] = R10_data.apply(
    lambda x: slow_hamming_distance(x["Nmer"], gt_lookup[x["barcodeid"]]), axis=1
)
R10_data["Nmer Error Rate"] = R10_data["Nmer Errors"] / (
    nmer_window.stop - nmer_window.start
)

In [None]:
# Group the reloaded R10 rows by subsample level.
R10_subsample_group = R10_data.groupby("subsample")

In [None]:
# Mean N-mer error rate per subsample level.
# Note: this rebinds R10_call_curve, which previously held the mean "call" rate.
R10_call_curve = R10_subsample_group.aggregate({"Nmer Error Rate": "mean"})

In [None]:
# Display the mean N-mer error rate per subsample level.
R10_call_curve

In [None]:
# Figure 1: mean R10 N-mer error (in percent) against subsample level
# (the x axis is labelled "Depth").
plt.plot(R10_call_curve * 100, linewidth=3.0)
# Dashed vertical marker at x = 24.
plt.axvline(24, color="C1", ls="--")
plt.ylim(0.0, 2.0)
plt.xlim(0, 100)
plt.xticks([2, 25, 50, 100], fontsize=16)
plt.yticks([0.0, 0.5, 1.0, 1.5, 2.0], fontsize=16)
plt.xlabel("Depth", fontsize=16)
plt.ylabel("Error (%)", fontsize=16)
plt.tight_layout()
# Written next to the notebook at 150 dpi.
plt.savefig("./Figure_1.png", dpi=150)

In [None]:
# Score each row's dark-GFP value against the subsample == 200 reference of
# the same barcodeid, for both runs.
gt_data = R9_data[R9_data["subsample"] == 200]
gt_lookup = dict(zip(gt_data["barcodeid"], gt_data["dark_gfp"]))

R9_data["call"] = R9_data.apply(
    lambda x: x["dark_gfp"] == gt_lookup[x["barcodeid"]], axis=1
)

# R10_data is re-read from disk in the UMI-error cell above, which discards
# the derived dark_gfp column; rebuild it from the split columns if missing.
if "dark_gfp" not in R10_data.columns:
    R10_data["dark_gfp"] = (
        R10_data.apply(
            lambda x: slow_hamming_distance(
                x["split_align"]["GFP"][623:625], x["split_ref"]["GFP"][623:625]
            ),
            axis=1,
        )
        > 0
    )

gt_data = R10_data[R10_data["subsample"] == 200]
gt_lookup = dict(zip(gt_data["barcodeid"], gt_data["dark_gfp"]))

# The original statement here was a half-finished edit that did not parse;
# it is restored to the same comparison used for R9.
R10_data["call"] = R10_data.apply(
    lambda x: x["dark_gfp"] == gt_lookup[x["barcodeid"]], axis=1
)

In [None]:
# Collect, for each of the segments BIT0..BIT26, the per-row mismatch counts
# stored under that key in the "hamm_ref" dictionaries.
# NOTE(review): `data` is not assigned anywhere above this cell; it is first
# bound further down the notebook, so this cell fails on a fresh top-to-bottom
# run. It presumably expects a frame whose hamm_ref dicts still contain the
# BIT* keys — confirm which frame was meant.
bitlist = ["BIT" + str(i) for i in range(27)]
bit_mismatch_dict = {}
for bit in bitlist:
    mismatch_list = data.apply(lambda x: x["hamm_ref"][bit], axis=1).values
    bit_mismatch_dict[bit] = mismatch_list

In [None]:
# Display the per-segment mismatch arrays.
bit_mismatch_dict

In [None]:
from matplotlib import pyplot as plt

# One 20-bin histogram of mismatch counts per segment, each shown as its own figure.
for key in bit_mismatch_dict.keys():
    plt.hist(bit_mismatch_dict[key], bins=20)
    plt.show()

In [None]:
# Boolean mask of rows with more than 5 mismatches in the BIT26 segment.
miscalls = data.apply(lambda x: x["hamm_ref"]["BIT26"], axis=1).values > 5

In [None]:
# Value of barcode character 26 (the last of 27) among the flagged rows.
plt.hist([int(item[26]) for item in data[miscalls]["barcode"]])

In [None]:
# Same histogram for the rows that were not flagged.
plt.hist([int(item[26]) for item in data[~miscalls]["barcode"]])

In [None]:
# Same histogram over all rows.
plt.hist([int(item[26]) for item in data["barcode"]])

So the last bit was always being called as 0. This is now fixed: the cause was a missing newline at the end of the .gaf file.

In [None]:
# Scratch check: draw all three values without replacement, i.e. a random
# permutation of [0, 1, 2] (unseeded, so the output differs between runs).
list(np.random.choice([0, 1, 2], size=3, replace=False))

In [None]:
import csv
import os

import numpy as np
from matplotlib import pyplot as plt

In [None]:
# Paths of every file in the graph_output directory whose name starts with
# "read" and ends with "tsv".
graph_output_dir = (
    "/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/graph_output"
)
inpathlist = [
    graph_output_dir + "/" + filename
    for filename in os.listdir(graph_output_dir)
    if filename.startswith("read") and filename.endswith("tsv")
]

In [None]:
### Get Barcode Histogram ###

# Map column 1 of every input file to its column 2 (presumably read id to
# barcode — confirm against the file header).
barcode_dict = {}
for filepath in inpathlist:
    with open(filepath, "r") as infile:
        # Skip the first line of each file; the default keeps an empty file
        # from raising StopIteration.
        next(infile, None)
        for line in infile:
            # Strip the terminator before splitting so that a two-column file
            # does not store "value\n" (and count a final unterminated line
            # as a different value). A separate name is used so the loop does
            # not clobber the `data` DataFrame used by other cells.
            fields = line.rstrip("\n").split("\t")
            barcode_dict[fields[0]] = fields[1]

In [None]:
# Count how many keys of barcode_dict map to each distinct value.
barcode_arr = np.array(list(barcode_dict.values()))
# (A stray trailing "=" made this statement a syntax error.)
unique, counts = np.unique(barcode_arr, return_counts=True)

In [None]:
# Histogram bounds spanning 2 up to the largest count. The original indexed
# `counts` with itself (counts[counts]), which uses the count values as
# positions and either raises IndexError or picks unrelated entries.
vmin, vmax = (2, np.max(counts))
nbins = vmax - vmin

In [None]:
# Display the number of bins computed above.
nbins

In [None]:
# Histogram of per-barcode counts from 0 up to the 99.9th percentile, using
# at most 200 bins and a log-scaled y axis.
vmin, vmax = (0, int(np.percentile(counts, 99.9)))
nbins = min(200, vmax - vmin)

plt.hist(counts, range=(vmin, vmax), bins=nbins)
plt.yscale("log")
# Vertical marker at a count of 200.
plt.axvline(200, color="salmon")
plt.show()

In [None]:
# Load the inverse codebook table (tab-separated); the cells below read its
# "barcodeid" and "readlist" columns.
data = pd.read_csv(
    "/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/graph_output/inv_codebook.tsv",
    delimiter="\t",
)

In [None]:
# Display the barcodeid column of the inverse codebook.
data["barcodeid"]

In [None]:
# Keep a handle on the readlist column for inspection.
test = data["readlist"]

In [None]:
# Map each integer barcodeid to its read list, parsed from the literal stored
# as text in the "readlist" column.
inv_barcode_codebook = {
    int(barcode_id): ast.literal_eval(readlist_text)
    for barcode_id, readlist_text in zip(data["barcodeid"], data["readlist"])
}

In [None]:
# Display the read list stored for barcodeid 0.
inv_barcode_codebook[0]

In [None]:
# Scratch check of dict.update: merge a second mapping into `d` in place.
d = {1: 2}
d.update({3: 4})

In [None]:
# Display the merged scratch dictionary.
d

In [None]:
# Scratch prototype: merge a list of dictionaries into one, dropping each
# source dictionary's references as soon as it has been merged.
print("making fastq dict")
record_dict_list = [{1: 2}, {3: 4}, {5: 6}]
record_dict = {}
for i in range(len(record_dict_list)):
    subdict = record_dict_list[i]
    record_dict.update(subdict)
    # Remove both references to the merged dictionary (the local name and the
    # list slot).
    del subdict
    record_dict_list[i] = None
del record_dict_list
print("finished making fastq dict")

In [None]:
# Display the merged dictionary.
record_dict

In [None]:
import pandas as pd

In [None]:
# Load the output table of the 2020-09-24 oDEPool3 run (tab-separated).
# Note: this rebinds `data`, replacing the inverse-codebook frame loaded earlier.
data = pd.read_csv(
    "/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/output.tsv",
    delimiter="\t",
)

In [None]:
# Rows whose subsample value is 200.
test = data[data["subsample"] == 200]

In [None]:
# Display the subsample == 200 rows.
test