## NOTE: There may be an error in the Hamming distance calculation

### Import Dependencies

In [None]:
import csv
import ast
import re
import numpy as np
import pandas as pd
import seaborn as sns

from Bio import SeqIO

from matplotlib import pyplot as plt

### Define Functions

In [None]:
def cigarsfromsam(samfilepath):
    """Map each read name in a SAM file to its CIGAR string.

    Header lines (those starting with "@") are ignored. Every other line is
    split on tabs; column 1 (read name) becomes the key and column 6 (CIGAR)
    the value. If a read name occurs more than once, the last line wins.

    Args:
        samfilepath: Path to a SAM text file.

    Returns:
        dict mapping read name -> CIGAR string.
    """
    cigars = {}
    with open(samfilepath, "r") as samfile:
        for line in samfile:
            # Skip only the header line itself. Calling next() on the file
            # here would also discard the line that follows the header
            # (e.g. the first alignment record).
            if line.startswith("@"):
                continue
            splitline = line.split("\t")
            cigars[splitline[0]] = splitline[5]
    return cigars


def strsfromfasta(fastafilepath):
    """Load a FASTA file as a dict of record id -> sequence string.

    Records are indexed with SeqIO.to_dict, then each record's ``seq`` is
    converted to a plain ``str``.
    """
    records = SeqIO.to_dict(SeqIO.parse(fastafilepath, "fasta"))
    return {name: str(record.seq) for name, record in records.items()}


def make_seg_dict(gfafile):
    """Map each segment name in a GFA file to its sequence.

    Only lines starting with "S" are read. The line is split on tabs;
    field 1 is the segment name and field 2 the sequence.

    Args:
        gfafile: Path to a GFA text file.

    Returns:
        dict mapping segment name -> sequence string.
    """
    segment_dict = {}
    with open(gfafile, "r") as infile:
        for line in infile:
            if line.startswith("S"):
                # Strip the line terminator before splitting. Slicing the
                # last character off the sequence field drops a real base
                # when optional tags follow the sequence or when the final
                # line has no trailing newline.
                fields = line.rstrip("\r\n").split("\t")
                segment_dict[fields[1]] = fields[2]
    return segment_dict


def get_ref_intervals(gfafile):
    """Compute half-open (start, end) intervals for segments of a GFA file.

    Segment ("S") lines are visited in file order. Segments whose name
    contains "OFF" are skipped. Every other segment is laid end to end, so
    each interval starts where the previous one stopped. If a name contains
    "ON", its last two characters are dropped before it is used as the key.

    Args:
        gfafile: Path to a GFA text file.

    Returns:
        dict mapping segment name -> (start, end) tuple of 0-based offsets.
    """
    segment_dict = {}
    current_idx = 0
    with open(gfafile, "r") as infile:
        for line in infile:
            if not line.startswith("S"):
                continue
            # Strip the line terminator before splitting. Slicing the last
            # character off the sequence field shortens the segment (and
            # shifts every later interval) when optional tags follow the
            # sequence or the final line has no trailing newline.
            fields = line.rstrip("\r\n").split("\t")
            name = fields[1]
            if "OFF" in name:
                continue
            strlen = len(fields[2])
            if "ON" in name:
                name = name[:-2]
            segment_dict[name] = (current_idx, current_idx + strlen)
            current_idx += strlen
    return segment_dict


def align_read(
    querystr,
    refstr,
    cigarstr,
    startpos=1,
    pattern=re.compile(r"[0-9]{0,10}[MIDNSHP=X]"),
):
    """Project a query sequence onto reference coordinates using a CIGAR string.

    The result has one character per reference position: query bases where
    the alignment matches or mismatches, "-" where the reference is not
    covered (before ``startpos``, inside deletions, and after the alignment
    ends). Inserted and soft-clipped query bases are dropped.

    CIGAR operations follow the SAM specification:
      * ``M``, ``=``, ``X`` consume query and reference (bases are copied),
      * ``D``, ``N`` consume reference only (gap characters are emitted),
      * ``I``, ``S`` consume query only (bases are skipped),
      * ``H``, ``P`` consume neither (ignored).
    The previous default pattern matched only ``M``/``D``/``I``, so any other
    operation was silently dropped and the query was read from the wrong
    offset for the rest of the alignment.

    Args:
        querystr: Query sequence. Must support slicing; pieces are joined with
            ``+`` so sequence-like objects that can be added to ``str`` work.
        refstr: Reference sequence; only its length is used, for padding.
        cigarstr: CIGAR string describing the alignment.
        startpos: 1-indexed reference position of the first aligned base.
        pattern: Compiled regex matching one ``<length><op>`` CIGAR element.

    Returns:
        The aligned sequence. It is padded with "-" up to ``len(refstr)``
        and is not truncated if it comes out longer than the reference.

    Raises:
        ValueError: If a matched CIGAR element has no length digits.
    """
    start_pos = startpos - 1  ##comes as 1 indexed from minimap
    cigar_seq = [
        (match.group(0)[-1], int(match.group(0)[:-1]))
        for match in pattern.finditer(cigarstr)
    ]
    # Pieces are accumulated with += (not "".join) so that non-str sequence
    # objects passed as querystr keep working.
    output_str = ""
    if start_pos > 0:
        output_str += "-" * start_pos
    current_idx = 0
    for op, length in cigar_seq:
        if op in "M=X":
            output_str += querystr[current_idx : current_idx + length]
            current_idx += length
        elif op in "DN":
            output_str += "-" * length
        elif op in "IS":
            current_idx += length
        # "H" and "P" consume neither query nor reference: nothing to do.
    remaining_len = len(refstr) - len(output_str)
    if remaining_len > 0:
        output_str += "-" * remaining_len
    return output_str


def splitstr(instr, ref_intervals):
    """Cut *instr* into named pieces.

    Args:
        instr: Sliceable sequence to split.
        ref_intervals: Mapping of name -> indexable (start, end) bounds.

    Returns:
        dict mapping each name to ``instr[start:end]``.
    """
    pieces = {}
    for name, bounds in ref_intervals.items():
        pieces[name] = instr[bounds[0] : bounds[1]]
    return pieces


def slow_hamming_distance(s1, s2):
    """Count the positions at which two equal-length sequences differ.

    A position is never counted when either sequence has "N" there.

    Args:
        s1: First sequence.
        s2: Second sequence, same length as ``s1``.

    Returns:
        Number of differing positions, as an int.

    Raises:
        ValueError: If the lengths differ (both inputs are printed first).
    """
    if len(s1) != len(s2):
        print(s1, s2)
        raise ValueError("Strand lengths are not equal!")
    return sum(
        1
        for base1, base2 in zip(s1, s2)
        if base1 != "N" and base2 != "N" and base1 != base2
    )


def get_dict_dist(dict1, dict2):
    """Hamming distance between two dicts of sequences, key by key.

    Iterates over the keys of ``dict1``; every key must also be in ``dict2``.

    Returns:
        dict mapping each key to ``slow_hamming_distance(dict1[key], dict2[key])``.
    """
    distances = {}
    for key in dict1:
        distances[key] = slow_hamming_distance(dict1[key], dict2[key])
    return distances

### Get Library Statistics

In [None]:
## Import data from nanopore snakemake pipeline
R10_data = pd.read_csv(
    "/home/de64/scratch/de64/2020-10-20_snakemake_2020-10-14_lDE11_R10-3_merged_final/output.tsv",
    delimiter="\t",
)

## Get set of all observed barcodes
R10_barcodes = set(R10_data["barcode"].tolist())

## Convert barcodes into array format (one row per barcode, one column per bit).
## Rows follow the iteration order of R10_barcodes, which stays fixed as long as
## the set is not modified; closest_match_dict below relies on that.
bit_arr = np.array([list(item) for item in R10_barcodes]).astype(int)

## Get frequency of the ON state for each bit
bit_freq = np.mean(bit_arr, axis=0)

## Determine the barcode hamming distance to the closest match for each observed barcode
## ttl_match[i, j] counts the bits on which barcodes i and j AGREE (both on + both off).
both_on = bit_arr @ bit_arr.T
both_off = (1 - bit_arr) @ (1 - bit_arr.T)
ttl_match = both_on + both_off

## Hamming distance is the number of DISAGREEING bits, i.e. n_bits - matches.
## (Taking the minimum of ttl_match itself gives the similarity to the farthest
## barcode, not the distance to the nearest one.)
n_bits = bit_arr.shape[1]
hamming_dist = n_bits - ttl_match

## Exclude each barcode's zero distance to itself with a sentinel larger than
## any possible distance.
np.fill_diagonal(hamming_dist, n_bits + 1)
closest_match = np.min(hamming_dist, axis=0)

## Assign closest match to each barcode (for constructing output df later)
closest_match_dict = {
    barcode: closest_match[k] for k, barcode in enumerate(R10_barcodes)
}

In [None]:
## Bar plot of the ON frequency of each barcode bit.
## The number of bars is taken from bit_freq instead of being hard-coded to 27,
## so the plot stays valid for a different barcode length.
fig = plt.figure(figsize=(16, 6))
sns.barplot(x=list(range(len(bit_freq))), y=bit_freq, color="grey")
plt.xlabel("Bit Number", fontsize=20)
## NOTE(review): bit_freq is a mean of 0/1 values, i.e. a fraction in [0, 1],
## while the axis label says "Percent" -- confirm which is intended.
plt.ylabel("Percent Positive", fontsize=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()
# plt.savefig("./figure_1.png",dpi=300,bbox_inches="tight")

## Distribution of the closest-barcode distance over the range 0-10
plt.hist(closest_match, range=(0, 10))
plt.xlabel("Closest Hamming Distance", fontsize=20)
plt.xticks(fontsize=16)
plt.yticks(fontsize=16)
plt.show()
# plt.savefig("./figure_2.png",dpi=300,bbox_inches="tight")

## Zoom on distances 0-2: three bins over (0, 2), with ticks relabelled 0, 1, 2
plt.hist(closest_match, range=(0, 2), bins=3)
plt.xlabel("Closest Hamming Distance", fontsize=20)
plt.xticks([0.25, 1.0, 1.75], [0, 1, 2], fontsize=16)
plt.yticks(fontsize=16)
plt.show()
# plt.savefig("./figure_3.png",dpi=300,bbox_inches="tight")

### Call GFP and Make Output DF

In [None]:
## Project every consensus sequence onto its reference via the cigar string
aligned_cons = R10_data.apply(
    lambda row: align_read(
        row["consensus"],
        row["reference"],
        row["cigar"],
        startpos=row["alignmentstart"],
    ),
    axis=1,
)
R10_data["aligned_cons"] = aligned_cons

## Annotation intervals come from the GFA reference
ref_intervals = get_ref_intervals(
    "/home/de64/scratch/de64/2020-10-20_snakemake_2020-10-14_lDE11_R10-3_merged_final/ref.gfa"
)

## Cut the reference and the aligned consensus into per-annotation pieces
split_ref = R10_data.apply(
    lambda row: splitstr(row["reference"], ref_intervals), axis=1
)
split_align = R10_data.apply(
    lambda row: splitstr(row["aligned_cons"], ref_intervals), axis=1
)
R10_data["split_ref"] = split_ref
R10_data["split_align"] = split_align

## Per-annotation hamming distance between aligned consensus and reference
hamm_ref = R10_data.apply(
    lambda row: get_dict_dist(row["split_align"], row["split_ref"]), axis=1
)
R10_data["hamm_ref"] = hamm_ref

## The library varies at GFP positions 623-624; any mismatch to the reference
## there marks the isolate as DarkGFP
gfp_variable_site = slice(623, 625)
dark_gfp = R10_data.apply(
    lambda row: slow_hamming_distance(
        row["split_align"]["GFP"][gfp_variable_site],
        row["split_ref"]["GFP"][gfp_variable_site],
    ),
    axis=1,
).gt(0)
R10_data["dark_gfp"] = dark_gfp

## Attach the closest-barcode distance computed in the library statistics step
R10_data["Closest Hamming Distance"] = R10_data["barcode"].apply(
    closest_match_dict.__getitem__
)

## Drop the leftover index column from the input TSV
R10_data.drop(columns="Unnamed: 0", inplace=True)

In [None]:
## Save the annotated table as a tab-separated file. The DataFrame index is
## written too (pandas default), so it comes back as an unnamed first column
## when the file is re-read.
R10_data.to_csv("./lDE11_final_df.tsv", sep="\t")

### Sanger Validation

#### Convert to fastq and group

In [None]:
## Merge both Sanger traces (primers oDE154 and oDE201) of samples 1-96 into
## one FASTQ file, keeping the order sample by sample, oDE154 before oDE201
with open("./ab1_files/merged.fastq", "w") as outfile:
    for sample in range(1, 97):
        prefix = f"./ab1_files/lDE11_validation_sample_{sample}"
        # Both traces are read before anything is written for this sample
        records = [
            SeqIO.read(f"{prefix}-{primer}.ab1", "abi")
            for primer in ("oDE154", "oDE201")
        ]
        SeqIO.write(records, outfile, "fastq")

#### Align to GAF

In [None]:
# Align the merged Sanger reads (./ab1_files/merged.fastq) to the graph in
# ./ref.gfa and write the alignments to ./ab1_files/aligned.gaf.
# See `GraphAligner --help` for the meaning of the tuning flags (-x, -b, -B, -C).
!GraphAligner -g ./ref.gfa -f ./ab1_files/merged.fastq -a ./ab1_files/aligned.gaf -x dbg --high-memory -b 20 -B 35 -C -1

#### Get cigar strings, barcodes and read sequences for each isolate

In [None]:
## Per read: (strand, field 7 as int, field 8 as int, path string, cigar).
## Strand is "+" when the path (field 5) contains ">", otherwise "-".
## The cigar is taken from field 15: the text after the last ":" with the
## final character (the line terminator) removed.
cigar_dict = {}
with open("./ab1_files/aligned.gaf", "r") as infile:
    for line in infile:
        data = line.split("\t")
        read_id = data[0].split(" ")[0]
        strand = "+" if ">" in data[5] else "-"
        cigar_dict[read_id] = (
            strand,
            int(data[7]),
            int(data[8]),
            data[5],
            data[15].split(":")[-1][:-1],
        )

## Barcodes are read from the oDE201 reads only. The path is split on "<",
## reversed, and its last element (the text before the first "<") is dropped.
## Each remaining node whose name contains "BIT" contributes one character:
## "1" if the name contains "ON", otherwise "0".
barcode_dict = {}
for key, cigar in cigar_dict.items():
    if "oDE201" not in key:
        continue
    path_nodes = cigar[3].split("<")[::-1][:-1]
    barcode = "".join(
        str(int("ON" in node)) for node in path_nodes if "BIT" in node
    )
    # Read ids look like lDE11_validation_sample_<n>-<primer>; key on <n>
    index = key.split("_")[3].split("-")[0]
    barcode_dict[int(index)] = barcode

## Index every Sanger read by its id
with open("./ab1_files/merged.fastq", "r") as infile:
    read_dict = SeqIO.to_dict(SeqIO.parse(infile, "fastq"))

#### Get agreement between reads and nanopore calls at GFP/DarkGFP site

In [None]:
R10_data = pd.read_csv("./lDE11_final_df.tsv", sep="\t")

## The annotation intervals depend only on the GFA file, so parse it once
## instead of once per sample.
ref_intervals = get_ref_intervals("./ref.gfa")

all_snp_hamming = []
for i in range(1, 97):
    ## Only score samples whose Sanger barcode matches exactly one library entry
    library_rows = R10_data[R10_data["barcode"] == barcode_dict[i]]
    if len(library_rows) != 1:
        continue
    library_row = library_rows.iloc[0]

    gfp_read_name = "lDE11_validation_sample_" + str(i) + "-oDE154"
    ## Pass the read as a plain string so align_read returns a plain string
    ## (no dependence on how SeqRecord objects combine with str).
    ## The stored start (element 1) is shifted by one because align_read
    ## expects a 1-indexed start position.
    aligned = align_read(
        str(read_dict[gfp_read_name].seq),
        library_row["reference"],
        cigar_dict[gfp_read_name][4],
        startpos=cigar_dict[gfp_read_name][1] + 1,
    )

    ## NOTE(review): the raw "consensus" column is sliced with reference
    ## coordinates here, while the Sanger read is first projected onto the
    ## reference. If a consensus has indels or an offset relative to its
    ## reference, the "aligned_cons" column may be the right one to compare
    ## against -- confirm before relying on this number.
    split_consensus = splitstr(library_row["consensus"], ref_intervals)
    split_align = splitstr(aligned, ref_intervals)

    ## Compare the two variable GFP nucleotides (positions 623-624)
    snp_hamming = slow_hamming_distance(
        split_align["GFP"][623:625], split_consensus["GFP"][623:625]
    )
    all_snp_hamming.append(snp_hamming)
all_snp_hamming = np.array(all_snp_hamming)

## Fraction of scored samples whose Sanger call agrees with the nanopore call
print("Percent Correct: " + str(np.sum(all_snp_hamming == 0) / len(all_snp_hamming)))

In [None]:
### Flag sanger samples whose barcode does not occur in the library table
ttl_bc = np.array(
    [np.sum(R10_data["barcode"] == barcode_dict[sample]) for sample in range(1, 97)]
)
no_found_barcode_mask = ttl_bc == 0

In [None]:
### Mean Phred quality of each sample's oDE201 sanger read
qualities = []
for sample in range(1, 97):
    ab1_path = f"./ab1_files/lDE11_validation_sample_{sample}-oDE201.ab1"
    phred_scores = SeqIO.read(ab1_path, "abi").letter_annotations["phred_quality"]
    qualities.append(np.mean(phred_scores))
qualities = np.array(qualities)

In [None]:
### Histogram of mean Phred quality, split by whether the sample's barcode
### was found in the library (drawn first) or not (drawn second)
hist_settings = {"range": (10, 50), "bins": 10}
for sample_mask, legend_label in (
    (~no_found_barcode_mask, "In Library"),
    (no_found_barcode_mask, "Out of Library"),
):
    plt.hist(qualities[sample_mask], label=legend_label, **hist_settings)
plt.legend()
plt.show()