In [None]:
import csv
import ast
import re
import numpy as np
import pandas as pd
import seaborn as sns

from Bio import SeqIO

from matplotlib import pyplot as plt

In [None]:
def align_read(querystr, cigarstr, pattern=re.compile("[0-9]{0,10}[MDI]")):
    """Lay a query string out along its reference using a CIGAR string.

    NOTE: a second ``align_read`` taking ``refstr``/``startpos`` is defined
    further down in this file and rebinds the name, so this two-argument
    version is shadowed once both definitions have run.

    Args:
        querystr: Query sequence.
        cigarstr: CIGAR text; every token found by ``pattern`` is read as
            ``<count><op>``.
        pattern: Compiled regex used to tokenise ``cigarstr``. The default
            only recognises the M, D and I operations.

    Returns:
        The query with ``M`` runs copied through, ``D`` runs rendered as
        ``-`` and ``I`` runs dropped.

    Raises:
        ValueError: If a token has no count (``int("")``).
    """
    pieces = []
    query_pos = 0
    for token in (found.group(0) for found in pattern.finditer(cigarstr)):
        op, count = token[-1], int(token[:-1])
        if op == "M":
            # Matched run: copy the bases and advance through the query.
            pieces.append(querystr[query_pos : query_pos + count])
            query_pos += count
        elif op == "D":
            # Deleted from the query: pad with gap characters.
            pieces.append("-" * count)
        elif op == "I":
            # Inserted in the query: skip the bases, emit nothing.
            query_pos += count
    return "".join(pieces)


def cigarsfromsam(samfilepath):
    """Collect the CIGAR string of every alignment record in a SAM file.

    Header lines (those starting with ``@``) and blank lines are skipped.
    Each remaining line is split on tabs; the first field is used as the key
    and the sixth field (the CIGAR column) as the value. When a name occurs
    on several lines, the last occurrence wins.

    Args:
        samfilepath: Path to a text SAM file.

    Returns:
        dict mapping the first column of each record to its sixth column.

    Raises:
        IndexError: If a non-header line has fewer than six tab-separated
            fields.
    """
    cigars = {}
    with open(samfilepath, "r") as samfile:
        for line in samfile:
            # Skip header/blank lines only. The previous `next(samfile)` here
            # also threw away the line *after* each header line, silently
            # dropping alignments, and raised StopIteration when a header
            # line was the last line of the file.
            if line.startswith("@") or not line.strip():
                continue
            fields = line.split("\t")
            cigars[fields[0]] = fields[5]
    return cigars


def strsfromfasta(fastafilepath):
    """Read a FASTA file into a plain ``{key: sequence string}`` dict.

    Records are parsed with ``SeqIO.parse(..., "fasta")`` and indexed with
    ``SeqIO.to_dict``; each record's ``seq`` is then converted with ``str``.

    Args:
        fastafilepath: Path (or handle) passed straight to ``SeqIO.parse``.

    Returns:
        dict keyed as produced by ``SeqIO.to_dict`` with string sequences.
    """
    records_by_key = SeqIO.to_dict(SeqIO.parse(fastafilepath, "fasta"))
    sequences = {}
    for key, record in records_by_key.items():
        sequences[key] = str(record.seq)
    return sequences


def make_seg_dict(gfafile):
    """Build a ``{segment name: sequence}`` dict from the S lines of a GFA file.

    Only lines starting with ``S`` are read. Each is split on tabs; the
    second field is the name and the third field is the sequence.

    Args:
        gfafile: Path to a tab-delimited GFA text file.

    Returns:
        dict mapping segment name to its sequence string.

    Raises:
        IndexError: If an ``S`` line has fewer than three fields.
    """
    segment_dict = {}
    with open(gfafile, "r") as infile:
        for line in infile:
            if not line.startswith("S"):
                continue
            # Strip the line terminator before splitting. The previous
            # `splitline[2][:-1]` assumed the sequence was the last field and
            # ended in "\n"; it chopped a real base whenever optional tags
            # followed the sequence or the final line had no newline.
            fields = line.rstrip("\r\n").split("\t")
            segment_dict[fields[1]] = fields[2]
    return segment_dict


def get_ref_intervals(gfafile):
    """Compute half-open ``(start, end)`` intervals for the segments of a GFA file.

    S lines are visited in file order. Segments whose name contains ``"OFF"``
    are ignored. Every other segment is assigned the interval that directly
    follows the previous kept segment, with a width equal to the length of
    its sequence field. Names containing ``"ON"`` lose their last two
    characters before being used as keys.

    Args:
        gfafile: Path to a tab-delimited GFA text file.

    Returns:
        dict mapping (possibly trimmed) segment name to ``(start, end)``.

    Raises:
        IndexError: If an ``S`` line has fewer than three fields.
    """
    segment_dict = {}
    current_idx = 0
    with open(gfafile, "r") as infile:
        for line in infile:
            if not line.startswith("S"):
                continue
            # Strip the terminator up front instead of slicing `[:-1]` off the
            # sequence field: the old slice lost a real base (and shifted all
            # following intervals) when tags followed the sequence or the
            # final line had no trailing newline.
            fields = line.rstrip("\r\n").split("\t")
            name = fields[1]
            if "OFF" in name:
                continue
            seq_len = len(fields[2])
            if "ON" in name:
                # NOTE(review): this is a substring test but the trim removes
                # the final two characters, so it presumably expects "ON" to
                # appear only as a suffix - confirm against the segment names.
                name = name[:-2]
            segment_dict[name] = (current_idx, current_idx + seq_len)
            current_idx += seq_len
    return segment_dict


def align_read(
    querystr, refstr, cigarstr, startpos=1, pattern=re.compile("[0-9]{0,10}[MDI]")
):
    """Lay a query out along the full reference using its CIGAR string.

    This definition replaces the earlier two-argument ``align_read`` in this
    file. The output is left-padded with ``-`` up to the alignment start,
    built from the CIGAR operations, and right-padded with ``-`` until it is
    at least as long as ``refstr``. It is never truncated.

    Args:
        querystr: Query sequence.
        refstr: Reference sequence; only its length is used.
        cigarstr: CIGAR text; every token found by ``pattern`` is read as
            ``<count><op>``.
        startpos: 1-based alignment start (the original author notes this is
            how it arrives from minimap).
        pattern: Compiled regex used to tokenise ``cigarstr``. The default
            only recognises M, D and I, so any other operation in the string
            (for example a soft clip) is skipped without moving through the
            query.

    Returns:
        The gapped query string in reference coordinates.

    Raises:
        ValueError: If a token has no count (``int("")``).
    """
    lead_gap = startpos - 1  # convert the 1-based start to a 0-based offset
    operations = [
        (found.group(0)[-1], int(found.group(0)[:-1]))
        for found in pattern.finditer(cigarstr)
    ]

    pieces = []
    if lead_gap > 0:
        pieces.append("-" * lead_gap)

    query_pos = 0
    for op, count in operations:
        if op == "M":
            pieces.append(querystr[query_pos : query_pos + count])
            query_pos += count
        elif op == "D":
            pieces.append("-" * count)
        elif op == "I":
            query_pos += count

    aligned = "".join(pieces)
    tail_gap = len(refstr) - len(aligned)
    if tail_gap > 0:
        aligned += "-" * tail_gap
    return aligned


def splitstr(instr, ref_intervals):
    """Cut ``instr`` into named pieces.

    Args:
        instr: String (or other sliceable) to cut up.
        ref_intervals: Mapping of name to a sequence whose first two items
            are the slice start and stop.

    Returns:
        dict mapping each name to ``instr[start:stop]``.
    """
    pieces = {}
    for name, bounds in ref_intervals.items():
        begin, end = bounds[0], bounds[1]
        pieces[name] = instr[begin:end]
    return pieces


def slow_hamming_distance(s1, s2):
    """Count differing positions between two equal-length strings.

    Positions where either string holds ``"N"`` are never counted.

    Args:
        s1: First string.
        s2: Second string.

    Returns:
        Number of positions at which the characters differ and neither is
        ``"N"``.

    Raises:
        ValueError: If the lengths differ; both inputs are printed first.
    """
    if len(s1) != len(s2):
        print(s1, s2)
        raise ValueError("Strand lengths are not equal!")
    return sum(
        left != right
        for left, right in zip(s1, s2)
        if not (left == "N" or right == "N")
    )


def get_dict_dist(dict1, dict2):
    """Apply ``slow_hamming_distance`` key by key to two dicts of strings.

    Args:
        dict1: Mapping whose keys drive the comparison.
        dict2: Mapping that must contain every key of ``dict1``.

    Returns:
        dict mapping each key of ``dict1`` to the distance between
        ``dict1[key]`` and ``dict2[key]``.

    Raises:
        KeyError: If ``dict2`` lacks a key of ``dict1``.
    """
    distances = {}
    for name in dict1:
        distances[name] = slow_hamming_distance(dict1[name], dict2[name])
    return distances

In [None]:
# Directory of the run whose outputs are analysed here. Kept in one place so
# the table and the reference graph cannot drift apart.
RUN_DIR = "/home/de64/scratch/de64/2020-10-20_snakemake_2020-10-14_lDE11_R10-3_merged_final"

R10_data = pd.read_csv(
    RUN_DIR + "/output.tsv",
    delimiter="\t",
    # Barcodes are split character-by-character when bit_arr is built, so
    # they must stay strings. Pinning the dtype avoids relying on pandas'
    # type inference, which could turn an all-digit column into numbers and
    # drop leading zeros.
    dtype={"barcode": str},
)
ref_intervals = get_ref_intervals(RUN_DIR + "/ref.gfa")
# Unique barcode strings present in the table.
R10_barcodes = set(R10_data["barcode"].tolist())

In [None]:
# One row per unique barcode, one column per character, converted to integers.
barcode_chars = [list(barcode) for barcode in R10_barcodes]
bit_arr = np.array(barcode_chars).astype(int)
# Column-wise mean across all barcodes.
bit_freq = bit_arr.mean(axis=0)

In [None]:
# Number of entries in bit_freq (one mean per column of bit_arr).
len(bit_freq)

In [None]:
# Display the integer barcode matrix built above for a quick visual check.
bit_arr

In [None]:
# Figure 1: how often each barcode position is set across the library.
fig = plt.figure(figsize=(16, 6))
# Bar positions follow the data instead of a hard-coded 27, so the cell keeps
# working if the barcode length changes. bit_freq is a per-column mean
# (a 0-1 fraction when barcodes are 0/1 strings), so it is scaled by 100 to
# agree with the "Percent" axis label.
sns.barplot(x=list(range(len(bit_freq))), y=100 * bit_freq, color="grey")
plt.xlabel("Bit Number", fontsize=20)
plt.ylabel("Percent Positive", fontsize=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig("./figure_1.png", dpi=300, bbox_inches="tight")

In [None]:
# Pairwise comparison of every barcode (row of bit_arr) with every other one.
both_on = bit_arr @ bit_arr.T  # positions where both barcodes are 1
both_off = (1 - bit_arr) @ (1 - bit_arr).T  # positions where both are 0

# both_on + both_off is the number of positions where two barcodes AGREE.
# The cells below take the minimum of ttl_match and label it "Closest Hamming
# Distance", so what has to be stored is the number of DISAGREEING positions:
# barcode length minus agreements. The name ttl_match is kept only because
# later cells read it.
n_bits = bit_arr.shape[1]
ttl_match = n_bits - (both_on + both_off)

# A barcode is at distance 0 from itself; overwrite the diagonal with a large
# sentinel so self-comparisons never win the minimum.
np.fill_diagonal(ttl_match, 100)

In [None]:
# Smallest entry in each column of ttl_match (the diagonal holds a sentinel).
closest_match = ttl_match.min(axis=0)

In [None]:
# Figure 2: distribution of closest_match over the 0-10 range.
ax = plt.gca()
ax.hist(closest_match, range=(0, 10))
ax.set_xlabel("Closest Hamming Distance", fontsize=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.savefig("./figure_2.png", dpi=300, bbox_inches="tight")

In [None]:
# Figure 3: the same distribution restricted to 0-2, three bins, with the
# tick marks relabelled as the integers 0, 1 and 2.
ax = plt.gca()
ax.hist(closest_match, range=(0, 2), bins=3)
ax.set_xlabel("Closest Hamming Distance", fontsize=20)
plt.xticks([0.25, 1.0, 1.75], [0, 1, 2], fontsize=16)
plt.yticks(fontsize=16)
plt.savefig("./figure_3.png", dpi=300, bbox_inches="tight")

In [None]:
# Number of entries of closest_match that equal exactly 1.
(closest_match == 1).sum()

In [None]:
# Histogram of a random sample of the pairwise matrix. The diagonal sentinel
# (100) lies outside the plotted 0-27 range and is therefore not drawn.
# A fixed seed makes the figure reproducible on Restart & Run All. The sample
# size is capped at the number of available entries, because sampling 50000
# values without replacement raises when the matrix is smaller than that.
pair_rng = np.random.default_rng(0)
pair_values = ttl_match.ravel()
pair_sample = pair_rng.choice(
    pair_values, size=min(50000, pair_values.size), replace=False
)
plt.hist(pair_sample, range=(0, 27), bins=27)
plt.show()

In [None]:
# Row by row, lay the consensus out in reference coordinates with align_read
# and keep the result as a new column.
aligned_cons = R10_data.apply(
    lambda row: align_read(
        row["consensus"],
        row["reference"],
        row["cigar"],
        startpos=row["alignmentstart"],
    ),
    axis=1,
)
R10_data["aligned_cons"] = aligned_cons

# Cut both the reference and the aligned consensus into per-segment pieces
# using the intervals in ref_intervals; each cell of the new columns is a dict.
split_ref = R10_data.apply(
    lambda row: splitstr(row["reference"], ref_intervals), axis=1
)
R10_data["split_ref"] = split_ref

split_align = R10_data.apply(
    lambda row: splitstr(row["aligned_cons"], ref_intervals), axis=1
)
R10_data["split_align"] = split_align

# Per-segment distance between the aligned consensus and the reference,
# stored as one dict per row.
hamm_ref = R10_data.apply(
    lambda row: get_dict_dist(row["split_align"], row["split_ref"]),
    axis=1,
)
R10_data["hamm_ref"] = hamm_ref

# Flag rows whose consensus differs from the reference at offsets 623-624 of
# the "GFP" segment.
# NOTE(review): presumably these two positions mark a change that leaves GFP
# non-fluorescent - confirm where the 623:625 window comes from.
dark_gfp = R10_data.apply(
    lambda row: slow_hamming_distance(
        row["split_align"]["GFP"][623:625],
        row["split_ref"]["GFP"][623:625],
    ),
    axis=1,
).gt(0)
R10_data["dark_gfp"] = dark_gfp

In [None]:
# Write the annotated table next to the notebook as tab-separated text
# (the row index is written as the first column).
R10_data.to_csv(path_or_buf="./lDE11_final_df.tsv", sep="\t")