In [None]:
import csv
import ast
import re
import numpy as np

from Bio import SeqIO

from matplotlib import pyplot as plt

In [None]:
def align_read(querystr, cigarstr, pattern=re.compile("[0-9]{0,10}[MDI]")):
    """Project a query sequence onto reference coordinates using a CIGAR string.

    Args:
        querystr: The query (read / consensus) sequence.
        cigarstr: CIGAR string describing the alignment of the query.
        pattern: Compiled regex that extracts individual CIGAR operations.
            With the default, only ``M``, ``D`` and ``I`` operations are
            matched; any other operation in ``cigarstr`` is not matched and
            is therefore ignored.

    Returns:
        The query rewritten in reference coordinates: matched stretches are
        copied, deletions become runs of ``"-"``, and inserted bases are
        dropped.

    Raises:
        ValueError: If a matched operation has no length prefix.
    """
    operations = [
        (match.group(0)[-1], int(match.group(0)[:-1]))
        for match in pattern.finditer(cigarstr)
    ]
    pieces = []
    query_pos = 0
    for op, length in operations:
        if op == "M":
            # Aligned stretch: copy the bases and advance in the query.
            pieces.append(querystr[query_pos : query_pos + length])
            query_pos += length
        elif op == "D":
            # Deletion relative to the reference: pad with gap characters.
            pieces.append("-" * length)
        elif op == "I":
            # Insertion relative to the reference: skip the extra query bases.
            query_pos += length
    return "".join(pieces)

In [None]:
# IPython shell escape: print the first lines of the example SAM file to inspect its layout.
!head "./alignment/chunk_0/group_0.sam"

In [None]:
def cigarsfromsam(samfilepath):
    """Map each read name in a SAM file to its CIGAR string.

    Args:
        samfilepath: Path to a tab-separated SAM file.

    Returns:
        dict of ``{first column: sixth column}`` for every alignment line.
        If a name occurs more than once, the last occurrence wins.

    Raises:
        IndexError: If a non-header, non-blank line has fewer than six
            tab-separated fields.
    """
    cigars = {}
    with open(samfilepath, "r") as samfile:
        for line in samfile:
            # Header lines start with "@". Skip only that line: calling
            # next() here would silently discard the following line, which
            # may be the first alignment record.
            if line.startswith("@") or not line.strip():
                continue
            fields = line.rstrip("\n").split("\t")
            cigars[fields[0]] = fields[5]
    return cigars


def strsfromfasta(fastafilepath):
    """Load a FASTA file as a dict of plain sequence strings.

    Args:
        fastafilepath: Path to the FASTA file.

    Returns:
        dict mapping each key produced by ``SeqIO.to_dict`` to ``str`` of the
        record's ``seq`` attribute.
    """
    records_by_id = SeqIO.to_dict(SeqIO.parse(fastafilepath, "fasta"))
    sequences = {}
    for record_id, record in records_by_id.items():
        sequences[record_id] = str(record.seq)
    return sequences

In [None]:
# Load the three inputs for one example group (chunk_0 / group_0):
# CIGAR strings from the SAM file, and sequences from the consensus and reference FASTA files.
cigars = cigarsfromsam("./alignment/chunk_0/group_0.sam")
queries = strsfromfasta("./consensus/chunk_0/group_0/consensus.fasta")
references = strsfromfasta("./grouprefs/chunk_0/group_0.fasta")

In [None]:
# Parse the integer that follows the first "_" in the first read name.
# NOTE(review): presumably an id embedded in the read name; confirm the naming scheme.
int(list(cigars.keys())[0].split("_")[1])

In [None]:
# Take the first entry of each dict as a worked example.
# NOTE(review): the three values are paired purely by dict insertion order, not by key;
# confirm the first CIGAR, query and reference belong to the same read.
cigar = list(cigars.values())[0]
query = list(queries.values())[0]
reference = list(references.values())[0]

In [None]:
# Project the example query onto reference coordinates.
align_read(query, cigar)

In [None]:
# Show the reference sequence for visual comparison with the aligned query.
reference

In [None]:
def make_seg_dict(gfafile):
    """Read the segment (``S``) lines of a GFA file into a name -> sequence dict.

    Args:
        gfafile: Path to a tab-separated GFA file.

    Returns:
        dict mapping the second field of every line starting with ``S`` to
        its third field.
    """
    segment_dict = {}
    with open(gfafile, "r") as infile:
        for line in infile:
            if line.startswith("S"):
                # Strip the line terminator before splitting instead of
                # chopping the last character of the sequence field. The old
                # slice lost a real base whenever the line had optional tag
                # fields or the file did not end with a newline.
                fields = line.rstrip("\r\n").split("\t")
                segment_dict[fields[1]] = fields[2]
    return segment_dict

In [None]:
def get_ref_intervals(gfafile):
    """Compute half-open ``(start, end)`` intervals for the segments of a GFA file.

    Segments are laid end to end in file order. Segments whose name contains
    ``"OFF"`` are skipped and take up no space. For names containing ``"ON"``
    the last two characters of the name are dropped before use as the key.

    Args:
        gfafile: Path to a tab-separated GFA file.

    Returns:
        dict mapping segment name to ``(start, end)`` offsets in the
        concatenated sequence.
    """
    segment_dict = {}
    current_idx = 0
    with open(gfafile, "r") as infile:
        for line in infile:
            if not line.startswith("S"):
                continue
            # Strip the line terminator before splitting. Slicing the last
            # character off the sequence field shortened the interval when
            # the line carried optional tags or the file lacked a final
            # newline.
            fields = line.rstrip("\r\n").split("\t")
            name = fields[1]
            if "OFF" in name:
                continue
            # NOTE(review): this tests for "ON" anywhere in the name but
            # strips a two-character suffix; presumably names end in "ON".
            if "ON" in name:
                name = name[:-2]
            strlen = len(fields[2])
            segment_dict[name] = (current_idx, current_idx + strlen)
            current_idx += strlen
    return segment_dict

In [None]:
# Segment name -> (start, end) offsets, computed from the GFA in the working directory.
ref_intervals = get_ref_intervals("ref.gfa")

In [None]:
# Inspect the computed intervals.
ref_intervals

In [None]:
import pandas as pd

In [None]:
# Load the pipeline's tab-separated output table.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
data = pd.read_csv(
    "/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/output.tsv",
    delimiter="\t",
)

In [None]:
# Display the loaded table.
data

In [None]:
def align_read(querystr, cigarstr, pattern=re.compile("[0-9]{0,10}[MDI]")):
    """Project a query sequence onto reference coordinates using a CIGAR string.

    Args:
        querystr: The query (read / consensus) sequence.
        cigarstr: CIGAR string describing the alignment of the query.
        pattern: Compiled regex that extracts individual CIGAR operations.
            With the default, only ``M``, ``D`` and ``I`` operations are
            matched; any other operation in ``cigarstr`` is not matched and
            is therefore ignored.

    Returns:
        The query rewritten in reference coordinates: matched stretches are
        copied, deletions become runs of ``"-"``, and inserted bases are
        dropped.

    Raises:
        ValueError: If a matched operation has no length prefix.
    """
    operations = [
        (match.group(0)[-1], int(match.group(0)[:-1]))
        for match in pattern.finditer(cigarstr)
    ]
    pieces = []
    query_pos = 0
    for op, length in operations:
        if op == "M":
            # Aligned stretch: copy the bases and advance in the query.
            pieces.append(querystr[query_pos : query_pos + length])
            query_pos += length
        elif op == "D":
            # Deletion relative to the reference: pad with gap characters.
            pieces.append("-" * length)
        elif op == "I":
            # Insertion relative to the reference: skip the extra query bases.
            query_pos += length
    return "".join(pieces)


def splitstr(instr, ref_intervals):
    """Cut a string into named pieces.

    Args:
        instr: The string to slice.
        ref_intervals: Mapping of name -> indexable whose first two items are
            the start and end offsets of the slice.

    Returns:
        dict mapping each name to ``instr[start:end]``.
    """
    strassign = {}
    for name, bounds in ref_intervals.items():
        start, end = bounds[0], bounds[1]
        strassign[name] = instr[start:end]
    return strassign


def slow_hamming_distance(s1, s2):
    """Count mismatching positions between two equal-length sequences.

    A position where either sequence has ``"N"`` never counts as a mismatch.

    Args:
        s1: First sequence.
        s2: Second sequence.

    Returns:
        Number of positions where the characters differ and neither is ``"N"``.

    Raises:
        ValueError: If the two sequences differ in length.
    """
    if len(s1) != len(s2):
        raise ValueError("Strand lengths are not equal!")
    return sum(
        1
        for left, right in zip(s1, s2)
        if left != "N" and right != "N" and left != right
    )


def get_dict_dist(dict1, dict2):
    """Compute the per-key Hamming distance between two dicts of sequences.

    Args:
        dict1: Mapping of name -> sequence; its keys drive the comparison.
        dict2: Mapping of name -> sequence; must contain every key of ``dict1``.

    Returns:
        dict mapping each key of ``dict1`` to
        ``slow_hamming_distance(dict1[key], dict2[key])``.
    """
    hamming_dict = {}
    for key, seq in dict1.items():
        hamming_dict[key] = slow_hamming_distance(seq, dict2[key])
    return hamming_dict

In [None]:
# Project every consensus onto reference coordinates using its CIGAR string.
aligned_cons = data.apply(lambda x: align_read(x["consensus"], x["cigar"]), axis=1)
data["aligned_cons"] = aligned_cons

# Slice both the reference and the aligned consensus into named segments using ref_intervals.
split_ref = data.apply(lambda x: splitstr(x["reference"], ref_intervals), axis=1)
split_align = data.apply(lambda x: splitstr(x["aligned_cons"], ref_intervals), axis=1)
data["split_ref"] = split_ref
data["split_align"] = split_align

# Drop the "SPACER4" segment from both per-row dicts so it is excluded from the distances below.
data["split_ref"] = data["split_ref"].apply(
    lambda x: {key: val for key, val in x.items() if key != "SPACER4"}
)  ## This is a hack until I can repull the alignment data...
data["split_align"] = data["split_align"].apply(
    lambda x: {key: val for key, val in x.items() if key != "SPACER4"}
)  ## This is a hack until I can repull the alignment data...

# Per-segment Hamming distance between aligned consensus and reference (dict per row).
hamm_ref = data.apply(lambda x: get_dict_dist(x["split_align"], x["split_ref"]), axis=1)
data["hamm_ref"] = hamm_ref

# Flag rows whose aligned consensus differs from the reference at offsets 623-624 of the GFP segment.
# NOTE(review): 623:625 is a magic range; presumably the site that distinguishes dark GFP. Confirm.
dark_gfp = (
    data.apply(
        lambda x: slow_hamming_distance(
            x["split_align"]["GFP"][623:625], x["split_ref"]["GFP"][623:625]
        ),
        axis=1,
    )
    > 0
)
data["dark_gfp"] = dark_gfp

In [None]:
# Use the rows at subsample == 200 as the reference ("gt") dark_gfp value for each barcodeid.
# If a barcodeid appears more than once at that level, dict(zip(...)) keeps the last value.
gt_data = data[data["subsample"] == 200]
gt_lookup = dict(zip(gt_data["barcodeid"], gt_data["dark_gfp"]))

# "call" is True where a row's dark_gfp matches the subsample-200 value for the same barcodeid.
# Raises KeyError for any barcodeid that has no subsample-200 row.
data["call"] = data.apply(lambda x: x["dark_gfp"] == gt_lookup[x["barcodeid"]], axis=1)

In [None]:
# Group rows by subsample level.
subsample_group = data.groupby("subsample")

In [None]:
# Fraction of rows per subsample level whose call agrees with the subsample-200 value.
call_curve = subsample_group.aggregate({"call": "mean"})

In [None]:
# Express the disagreement rate as -10 * log10(error).
# NOTE(review): any level whose mean is exactly 1.0 gives log10(0), i.e. an infinite score
# plus a divide-by-zero warning; the subsample-200 rows are compared against themselves.
q_scores = -10 * np.log10(1.0 - call_curve)

In [None]:
# Inspect the agreement fraction per subsample level.
call_curve

In [None]:
# Agreement fraction versus subsample level.
plt.plot(call_curve)

In [None]:
# Disagreement fraction versus subsample level, on a log y-axis.
plt.plot(1.0 - call_curve)
plt.yscale("log")
plt.show()

In [None]:
# Inspect the -10 * log10(error) scores.
q_scores

In [None]:
# Scores versus subsample level.
plt.plot(q_scores)

In [None]:
# Collect, for each of the segments "BIT0" .. "BIT26", the per-row Hamming distance to the reference.
# NOTE(review): 27 is hard-coded; it must match the BIT segments present in hamm_ref.
bitlist = ["BIT" + str(i) for i in range(27)]
bit_mismatch_dict = {}
for bit in bitlist:
    mismatch_list = data.apply(lambda x: x["hamm_ref"][bit], axis=1).values
    bit_mismatch_dict[bit] = mismatch_list

In [None]:
# Inspect the raw mismatch arrays (one array per bit).
bit_mismatch_dict

In [None]:
# NOTE(review): pyplot is already imported in the first cell; this re-import is redundant.
from matplotlib import pyplot as plt

# One histogram of mismatch counts per bit segment, each shown as its own figure.
for key in bit_mismatch_dict.keys():
    plt.hist(bit_mismatch_dict[key], bins=20)
    plt.show()

In [None]:
# Boolean mask: rows whose BIT26 segment has more than 5 mismatches against the reference.
miscalls = data.apply(lambda x: x["hamm_ref"]["BIT26"], axis=1).values > 5

In [None]:
# Distribution of the 27th barcode character (index 26) among the flagged rows.
plt.hist([int(item[26]) for item in data[miscalls]["barcode"]])

In [None]:
# Same distribution among the rows that were not flagged.
plt.hist([int(item[26]) for item in data[~miscalls]["barcode"]])

In [None]:
# Same distribution over all rows.
plt.hist([int(item[26]) for item in data["barcode"]])

So the last bit was always called as 0. This is now fixed: the cause was a missing newline at the end of the .gaf file.

In [None]:
# Scratch check: sampling 3 of 3 items without replacement yields a random permutation of [0, 1, 2].
# No seed is set, so the output differs between runs.
list(np.random.choice([0, 1, 2], size=3, replace=False))

In [None]:
import csv
import numpy as np
import os

from matplotlib import pyplot as plt

In [None]:
# Paths of every file in graph_output whose name starts with "read" and ends with "tsv".
inpathlist = [
    "/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/graph_output/"
    + filename
    for filename in os.listdir(
        "/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/graph_output"
    )
    if filename.endswith("tsv") and filename.startswith("read")
]

In [None]:
### Get Barcode Histogram ###

# Map the first column of every row to its second column, across all input files.
# The per-line list is named `fields` so it does not clobber the `data` DataFrame
# used elsewhere in the notebook.
barcode_dict = {}
for filepath in inpathlist:
    with open(filepath, "r") as infile:
        # Skip the first line (presumably a header); tolerate an empty file.
        next(infile, None)
        for line in infile:
            # Strip the newline first so a value taken from the last column does
            # not carry a trailing "\n" (which would also make a final line
            # without a newline look like a distinct value).
            fields = line.rstrip("\n").split("\t")
            barcode_dict[fields[0]] = fields[1]

In [None]:
# Count how many entries map to each distinct barcode value.
barcode_arr = np.array(list(barcode_dict.values()))
# The stray trailing "=" that made this cell a SyntaxError has been removed.
unique, counts = np.unique(barcode_arr, return_counts=True)

In [None]:
# Histogram range from 2 up to the largest per-barcode count.
# `counts[counts]` indexed the array by its own values, which is meaningless and
# raises IndexError when a count is >= len(counts); the maximum of `counts` is intended.
vmin, vmax = (2, np.max(counts))
nbins = vmax - vmin

In [None]:
# Inspect the bin count implied by the full range.
nbins

In [None]:
# Clip the histogram range at the 99.9th percentile of counts and cap the bin count at 200.
# NOTE(review): if that percentile is 0, nbins becomes 0 and plt.hist raises.
vmin, vmax = (0, int(np.percentile(counts, 99.9)))
nbins = min(200, vmax - vmin)

# Histogram of per-barcode counts on a log y-axis, with a vertical marker at a count of 200.
plt.hist(counts, range=(vmin, vmax), bins=nbins)
plt.yscale("log")
plt.axvline(200, color="salmon")
plt.show()

In [None]:
# Load the inverse codebook table; this rebinds `data`, replacing the table loaded earlier.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
data = pd.read_csv(
    "/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/graph_output/inv_codebook.tsv",
    delimiter="\t",
)

In [None]:
# Inspect the barcodeid column.
data["barcodeid"]

In [None]:
# Keep a handle on the readlist column for inspection.
test = data["readlist"]

In [None]:
# Build {int barcodeid: parsed readlist}; each readlist cell holds a Python literal
# stored as text, so it is parsed with ast.literal_eval.
inv_barcode_codebook = {
    int(barcode_id): ast.literal_eval(readlist_text)
    for barcode_id, readlist_text in zip(data["barcodeid"], data["readlist"])
}

In [None]:
# Inspect the parsed read list stored under barcode id 0.
inv_barcode_codebook[0]

In [None]:
# Scratch check: dict.update merges another dict into `d` in place.
d = {1: 2}
d.update({3: 4})

In [None]:
# Show the merged dict.
d

In [None]:
# Scratch check: merge a list of dicts into one dict while releasing each
# sub-dict as soon as it has been merged.
print("making fastq dict")
record_dict_list = [{1: 2}, {3: 4}, {5: 6}]
record_dict = {}
for i in range(len(record_dict_list)):
    # Take the sub-dict out of the list so the list no longer references it.
    subdict, record_dict_list[i] = record_dict_list[i], None
    record_dict.update(subdict)
    del subdict
del record_dict_list
print("finished making fastq dict")

In [None]:
# Show the merged dict.
record_dict

In [None]:
import pandas as pd

In [None]:
# Reload the pipeline output table; this rebinds `data` again.
# NOTE(review): absolute, user-specific path; consider a configurable data directory.
data = pd.read_csv(
    "/home/de64/scratch/de64/2020-10-18_snakemake_2020-09-24_oDEPool3/output.tsv",
    delimiter="\t",
)

In [None]:
# Keep only the rows at subsample level 200.
test = data[data["subsample"] == 200]

In [None]:
# Display the filtered rows.
test