In [None]:
import copy
import glob
import gzip
import os
import re
import shutil

import numpy as np
from Bio import SeqIO, motifs
from Bio.Seq import Seq
from matplotlib import pyplot as plt

In [None]:
# Root directory of this sequencing run; the merged FASTQ, the GAF alignment,
# the GFA graph and the read-group folder are all addressed as headpath + "/...".
headpath = "/n/scratch3/groups/hms/sysbio/paulsson/de64/2020-09-24_oDEPool3"
# Directory whose files are concatenated by merge_fastq.
fastqpath = (
    "/n/scratch3/groups/hms/sysbio/paulsson/de64/2020-09-24_oDEPool3/data/fastq_pass"
)

In [None]:
def merge_fastq(fastqpath, outputpath, ttl_files=None):
    """Concatenate the files found directly inside ``fastqpath`` into one file.

    Inputs are streamed into ``outputpath`` one at a time, so memory use does
    not grow with the size of the run. Files are merged in sorted path order,
    which makes the output reproducible between runs.

    Args:
        fastqpath: Directory containing the files to merge.
        outputpath: Path of the merged file; it is overwritten if it exists.
        ttl_files: Optional cap on the number of files merged. When given, a
            running count is printed after each file and merging stops once
            the count reaches this value. ``None`` merges every file.
    """
    out_abspath = os.path.abspath(outputpath)
    files = 0
    with open(outputpath, "w") as outfile:
        for filename in sorted(glob.iglob(fastqpath + "/*")):
            # Sub-directories cannot be opened as files, and the output must
            # never be fed back into itself if it lives in the same folder.
            if not os.path.isfile(filename):
                continue
            if os.path.abspath(filename) == out_abspath:
                continue
            with open(filename, "r") as infile:
                shutil.copyfileobj(infile, outfile)
            if ttl_files is not None:
                files += 1
                print(files)
                if files >= ttl_files:
                    break

### Merge FASTQ

In [None]:
merge_fastq(fastqpath, headpath + "/passed_reads.fastq", ttl_files=None)

In [None]:
!head '/n/scratch3/groups/hms/sysbio/paulsson/de64/2020-09-24_oDEPool3/passed_reads.fastq'

### Run GraphAligner

```
(base) [de64@compute-e-16-192 ~]$ source activate nanopore
(nanopore) [de64@compute-e-16-192 ~]$ GraphAligner -g ~/scratch/de64/2020-09-24_oDEPool3/aDE4.gfa -f ~/scratch/de64/2020-09-24_oDEPool3/FAK31569_pass_b3389333_46.fastq -a ~/scratch/de64/2020-09-24_oDEPool3/output.gaf -x vg
GraphAligner bioconda 1.0.12-
GraphAligner bioconda 1.0.12-
Load graph from /home/de64/scratch/de64/2020-09-24_oDEPool3/aDE4.gfa
Build alignment graph
118 original nodes
150 split nodes
2 ambiguous split nodes
246 edges
99 nodes with in-degree >= 2
Build minimizer seeder from the graph
Minimizer seeds, length 15, window size 20, density 10
Seed cluster size 1
Initial bandwidth 10
write alignments to /home/de64/scratch/de64/2020-09-24_oDEPool3/output.gaf
Align
Alignment finished
Input reads: 4000 (7071710bp)
Seeds found: 422660
Seeds extended: 12573
Reads with a seed: 3984 (7025587bp)
Reads with an alignment: 3984
Alignments: 4126 (6980856bp) (8413 additional alignments discarded)
End-to-end alignments: 3830 (6590459bp)
```


### Get Reads and CIGAR Strings

In [None]:
fastqfile = headpath + "/passed_reads.fastq"
gafpath = headpath + "/output.gaf"

In [None]:
# Map every read id in the merged FASTQ to its base sequence as a plain string.
read_dict = {
    record.id: str(record.seq) for record in SeqIO.parse(fastqfile, "fastq")
}

In [None]:
read_ids = read_dict.keys()

In [None]:
# Parse the GraphAligner GAF output into
#     read id -> (strand, path start, path end, path string, CIGAR string).
# A read id that appears on several lines keeps only its last line, because
# each line overwrites the previous entry for that id.
cigar_dict = {}
with open(gafpath, "r") as infile:
    for line in infile:
        # Remove only the line terminator. Slicing the last character off the
        # CIGAR column instead would eat a real CIGAR operation whenever the
        # final line has no trailing newline or more columns follow it.
        line = line.rstrip("\r\n")
        if not line:
            continue
        data = line.split("\t")
        read_id = data[0].split(" ")[0]
        # A ">" anywhere in the path column is recorded as "+"; every other
        # path is recorded as "-".
        strand = "+" if ">" in data[5] else "-"
        cigar_dict[read_id] = (
            strand,
            int(data[7]),
            int(data[8]),
            data[5],
            # Column 16 is expected to be the "cg:Z:<cigar>" tag; keep the
            # part after the last ":".
            data[15].split(":")[-1],
        )

In [None]:
cigar_ids = cigar_dict.keys()

In [None]:
set(cigar_ids) - set(read_ids)

### Trim Reads to Constant Length

In [None]:
cigar_dict

In [None]:
test_key = list(cigar_dict.keys())[600]

In [None]:
test_read = read_dict[test_key]
test_cigar = cigar_dict[test_key]

In [None]:
test_read

In [None]:
test_cigar

In [None]:
# Split the test alignment's CIGAR string into (operation, length) pairs,
# e.g. "12M" -> ("M", 12). Only M, D and I operations are recognised.
pattern = re.compile("[0-9]{0,10}[MDI]")
cigar_seq = []
for item in pattern.finditer(test_cigar[4]):
    token = item.group(0)
    cigar_seq.append((token[-1], int(token[:-1])))

In [None]:
def rev_comp(instr):
    """Return the reverse complement of ``instr`` as a ``str``.

    The input is upper-cased before it is handed to Biopython's ``Seq``.
    """
    return str(Seq(instr.upper()).reverse_complement())


def align_read(read, cigar, pattern=re.compile("[0-9]{0,10}[MDI]"), padding=1750):
    """Lay a read out against its alignment path as a fixed-width gapped string.

    Args:
        read: Read sequence as a string.
        cigar: Alignment tuple; index 0 is the strand ("+"/"-"), index 1 the
            number of leading gap characters, index 4 the CIGAR string.
        pattern: Compiled regex that tokenises the CIGAR string.
        padding: Exact length of the returned string.

    Returns:
        A string of length ``padding``: ``cigar[1]`` leading "-" characters,
        then read bases for M operations and "-" runs for D operations (bases
        covered by I operations are skipped), truncated or right-padded with
        "-". For the "-" strand the finished string is reverse-complemented.
    """
    pieces = ["-" * cigar[1]]
    read_pos = 0
    for match in pattern.finditer(cigar[4]):
        token = match.group(0)
        op, length = token[-1], int(token[:-1])
        if op == "M":
            pieces.append(read[read_pos : read_pos + length])
            read_pos += length
        elif op == "D":
            pieces.append("-" * length)
        elif op == "I":
            # Inserted bases have no column on the path: consume them silently.
            read_pos += length
    aligned = "".join(pieces)[:padding].ljust(padding, "-")
    if cigar[0] == "-":
        aligned = rev_comp(aligned)
    return aligned


def make_seg_dict(gfafile):
    """Read the segment ("S") lines of a GFA file into a name -> sequence dict.

    Args:
        gfafile: Path to a tab-separated GFA file.

    Returns:
        Dict mapping each segment name (second column) to its sequence (third
        column). Lines that do not start with "S" are ignored.
    """
    segment_dict = {}
    with open(gfafile, "r") as infile:
        for line in infile:
            if not line.startswith("S"):
                continue
            # Strip the terminator from the whole line before splitting, so
            # the sequence is intact whether or not optional tags follow it
            # and whether or not the file ends with a newline.
            fields = line.rstrip("\r\n").split("\t")
            segment_dict[fields[1]] = fields[2]
    return segment_dict


def generate_reference(cigar, segment_dict):
    """Build the reference sequence for the path an alignment traverses.

    Args:
        cigar: Alignment tuple; index 0 is the strand ("+" or "-") and index 3
            the path string (">seg>seg..." for "+", "<seg<seg..." for "-").
        segment_dict: Mapping from segment name to segment sequence.

    Returns:
        The concatenated segment sequences. For a "-" path the segment order
        is reversed before concatenation; the sequences themselves are not
        complemented.

    Raises:
        ValueError: If the strand is neither "+" nor "-".
        KeyError: If the path names a segment missing from ``segment_dict``.
    """
    strand, path = cigar[0], cigar[3]
    if strand == "+":
        traversal = path.split(">")[1:]
    elif strand == "-":
        traversal = path.split("<")[1:][::-1]
    else:
        raise ValueError("unknown strand %r; expected '+' or '-'" % (strand,))
    return "".join(segment_dict[seg] for seg in traversal)

In [None]:
# Turn each alignment path into a bit-string barcode: one character per path
# segment whose name contains "BIT", "1" if the name also contains "ON",
# otherwise "0".
aligned_read_dict = {}
barcode_dict = {}
for key, cigar in cigar_dict.items():
    # Only paths that include both a GFP and a SPACER4 segment are decoded.
    if not ("GFP" in cigar[3] and "SPACER4" in cigar[3]):
        continue
    if cigar[0] == "-":
        barcode = cigar[3].split("<")[::-1]
    elif cigar[0] == "+":
        barcode = cigar[3].split(">")
    # NOTE(review): for "-" paths this drops the empty string produced by the
    # leading "<"; for "+" paths it drops the last real segment instead (the
    # empty string stays at the front). Confirm the last segment is never a BIT.
    barcode = barcode[:-1]
    bits = ["1" if "ON" in item else "0" for item in barcode if "BIT" in item]
    barcode = "".join(bits)
    barcode_dict[key] = barcode

In [None]:
cigar_dict

In [None]:
len(barcode_dict)

In [None]:
# Count the reads carrying each distinct barcode and show the distribution of
# those per-barcode read counts.
barcode_arr = np.array(list(barcode_dict.values()))

unique, counts = np.unique(barcode_arr, return_counts=True)
fig, ax = plt.subplots()
ax.hist(counts, range=(3, 50), bins=48)
plt.show()

### Notes

OK, it looks like 50X depth, combined with very time-consuming alignment settings, is sufficient to recover the barcodes.
For the record, the command that was run is:
```
(nanopore) [de64@compute-e-16-192 ~]$ GraphAligner -g ~/scratch/de64/2020-09-24_oDEPool3/aDE4.gfa -f ~/scratch/de64/2020-09-24_oDEPool3/passed_reads.fastq -a ~/scratch/de64/2020-09-24_oDEPool3/output.gaf --high-memory --seeds-first-full-rows 64 -b 35 -C -1
```

In [None]:
np.sum(counts > 100)

In [None]:
# Keep barcodes seen in more than 100 reads and number them 0..n-1 in the
# order np.unique returned them.
well_covered = counts > 100
final_barcode_arr = unique[well_covered]
final_barcode_arr_counts = counts[well_covered]
final_barcode_idx = {
    barcode: group for group, barcode in enumerate(final_barcode_arr)
}
inv_barcode_idx = dict(enumerate(final_barcode_arr))

In [None]:
len(final_barcode_idx)

In [None]:
def get_barcode_codebook(barcode_dict, final_barcode_idx):
    """Assign every read to the group index of its barcode.

    Args:
        barcode_dict: Mapping read id -> barcode.
        final_barcode_idx: Mapping accepted barcode -> group index.

    Returns:
        Tuple ``(barcode_codebook, inv_barcode_codebook)``: the first maps
        read id -> group index, or ``None`` when the read's barcode is not
        accepted; the second maps each group index in
        ``range(len(final_barcode_idx))`` to the list of its read ids.
    """
    barcode_codebook = {}
    inv_barcode_codebook = {}
    for group in range(len(final_barcode_idx)):
        inv_barcode_codebook[group] = []

    for read_id, barcode in barcode_dict.items():
        if barcode not in final_barcode_idx:
            barcode_codebook[read_id] = None
            continue
        group = final_barcode_idx[barcode]
        barcode_codebook[read_id] = group
        inv_barcode_codebook[group].append(read_id)
    return barcode_codebook, inv_barcode_codebook


def get_perc_mapped(barcode_codebook):
    """Return the fraction (0-1) of codebook entries whose value is not ``None``."""
    mapped_flags = np.array(
        [group is not None for group in barcode_codebook.values()]
    )
    return mapped_flags.sum() / mapped_flags.shape[0]

In [None]:
barcode_codebook, inv_barcode_codebook = get_barcode_codebook(
    barcode_dict, final_barcode_idx
)

In [None]:
barcode_codebook

In [None]:
get_perc_mapped(barcode_codebook)

### Export Read Groups

In [None]:
readgroup_path = headpath + "/readgroups"
# Always start from an empty output directory. The directory must be created
# whether or not a previous one existed.
if os.path.exists(readgroup_path):
    shutil.rmtree(readgroup_path)
os.makedirs(readgroup_path)

seg_dict = make_seg_dict(headpath + "/aDE4.gfa")
record_dict = SeqIO.to_dict(SeqIO.parse(fastqfile, "fastq"))
for key, val in inv_barcode_codebook.items():
    if not val:
        # A group without reads has nothing to export and no alignment path
        # from which a reference could be derived.
        continue

    # Write complete four-line FASTQ records (with qualities). The sed and
    # medaka commands further down in this notebook expect that layout.
    with open(readgroup_path + "/group_" + str(key) + ".fastq", "w") as outfile:
        SeqIO.write((record_dict[name] for name in val), outfile, "fastq")

    # The group's reference is the path traversed by its last read.
    read_name = val[-1]
    ref_seq = generate_reference(cigar_dict[read_name], seg_dict)
    ref_str = ">group_" + str(key) + "\n" + ref_seq + "\n"

    with open(readgroup_path + "/ref_" + str(key) + ".fasta", "w") as outfile:
        outfile.write(ref_str)

# for idx, record in enumerate(SeqIO.parse(fastqfile, "fastq")):
#     if record.id in barcode_codebook.keys():
#         read_group = barcode_codebook[record.id]
#         with open(readgroup_path + "/group_" + str(read_group) + ".fastq","a") as outfile:
#             outfile.write("@" + str(record.id) + "\n")
#             outfile.write(str(record.seq) + "\n")

### Map to Reference Path with Minimap2

```

minimap2 -ax map-ont -a ./readgroups/ref_6.fasta ./readgroups/group_6.fastq > ./alignment_6.sam; samtools sort ./alignment_6.sam -o ./alignment_6.sorted.sam; cp ./readgroups/ref_6.fasta ./

```

### Nanopolish

```
sed -n '1~4s/^@/>/p;2~4p' group_6.fastq > group_6.fasta ##convert to fasta

minimap2 -ax map-ont -t 8 ref_6.fasta group_6.fasta | samtools sort -o group_6.sorted.bam -T group_6.tmp

samtools index group_6.sorted.bam

nanopolish variants --consensus group_6.cons -o group_6_polished.vcf -w "group_6" -r group_6.fasta -b group_6.sorted.bam -g ref_6.fasta

```

### Medaka
```
medaka_consensus -i group_6.fastq -d ref_6.fasta -o consensus_6


minimap2 -ax map-ont -t 8 ref_6.fasta group_6.fasta | samtools sort -o group_6.bam

samtools index group_6.bam group_6.bam.bai



```

In [None]:
minimap2 -ax map-ont -t 8 ref_6.fasta consensus.fasta | samtools sort -o consensus.sorted.bam -T consensus.tmp

samtools index consensus.sorted.bam

### Make consensuses

In [None]:
barcode_codebook

In [None]:
seg_dict

In [None]:
barcode_dict[inv_barcode_codebook[1][0]]

In [None]:
def group_reads(
    final_barcode_idx, reads_path, barcode_dict, handle_seq, min_read_len=142
):
    """Collect a fixed-length window of every assigned read, per barcode group.

    Everything the function needs is now passed in explicitly; previously the
    body read several names that were not parameters, and the four-argument
    calls made by ``get_all_grouped_reads`` did not match the signature.

    Args:
        final_barcode_idx: Sized collection with one entry per barcode group;
            only its length is used, to create groups ``0..n-1``.
        reads_path: Path to the FASTQ file to read.
        barcode_dict: Codebook indexed by the position of a read within
            ``reads_path``; gives the read's group index, or ``None`` when the
            read is unassigned.
        handle_seq: Sequence searched for in each read; the kept window starts
            at its first occurrence.
        min_read_len: Window length. Reads that cannot supply a full window
            are dropped. The default of 142 is the window length used by the
            inline grouping cell later in this notebook.

    Returns:
        Dict mapping group index -> list of ``Seq`` windows.
    """
    grouped_reads = {i: [] for i in range(len(final_barcode_idx))}

    for idx, record in enumerate(SeqIO.parse(reads_path, "fastq")):
        group = barcode_dict[idx]
        if group is None:
            continue
        handle_start = record.seq.find(handle_seq)
        if handle_start < 0:
            # Handle not present: there is no anchor for the window.
            continue
        window = record.seq[handle_start : handle_start + min_read_len]
        if len(window) == min_read_len:
            grouped_reads[group].append(window)
    return grouped_reads


def get_all_grouped_reads(
    key_list, handle_dict, fwdread_paths, revread_paths, Nmer_codebooks, final_Nmer_idx
):
    """Group the forward and reverse reads of every key in ``key_list``.

    For each key the forward and reverse FASTQ paths, the two handle
    sequences and the codebook are looked up, and ``group_reads`` is run once
    per direction.

    Returns:
        Dict mapping key -> ``[forward_groups, reverse_groups]``.
    """
    grouped_reads_dict = {}
    for key in key_list:
        paths = (fwdread_paths[key], revread_paths[key])
        handles = tuple(handle_dict[key])
        fwd_handle, rev_handle = handles
        codebook = Nmer_codebooks[key]
        grouped_reads_dict[key] = [
            group_reads(final_Nmer_idx, path, codebook, handle)
            for path, handle in zip(paths, (fwd_handle, rev_handle))
        ]
    return grouped_reads_dict

In [None]:
# Two handle sequences per key; get_all_grouped_reads unpacks each pair as
# (forward handle, reverse handle).
handle_dict = {
    "GFP": ["AAGTAGTGACAAGTGTTGGC", "AGGCTAGCTAACGTTACTGT"],
    "BC1": ["ACGAACGTTAGCAGCACTAT", "GTATCTGTTATGTAATTGCTAG"],
    "BC2": ["ACGAACGTTAGCAGCACTAT", "ATTACTGATGGCAATGTGAT"],
}
key_list = list(handle_dict)

# NOTE(review): fwdread_paths, revread_paths, Nmer_codebooks and final_Nmer_idx
# are not defined in any visible cell of this notebook — confirm where they
# come from before running on a fresh kernel.
grouped_reads_dict = get_all_grouped_reads(
    key_list,
    handle_dict,
    fwdread_paths,
    revread_paths,
    Nmer_codebooks,
    final_Nmer_idx,
)

#### Filter out barcodes with low representation in at least one group

In [None]:
def get_underrep_barcodes(grouped_reads_dict, min_count=5):
    """List the group indices that are short of reads anywhere.

    A group index is reported when, for any key, its read list at position 0
    or position 1 of that key's pair holds fewer than ``min_count`` reads.

    Returns:
        Sorted list of unique group indices.
    """
    underrep = {
        idx
        for directions in grouped_reads_dict.values()
        for groups in (directions[0], directions[1])
        for idx, reads in groups.items()
        if len(reads) < min_count
    }
    return sorted(underrep)

In [None]:
underrep_barcodes = get_underrep_barcodes(grouped_reads_dict)

In [None]:
len(underrep_barcodes)

In [None]:
def remove_underrep(grouped_reads, underrep_barcodes):
    """Drop under-represented groups and renumber the survivors from 0.

    The result is built from scratch. Starting from a copy of the input would
    leave the original entries behind at the highest indices, including the
    groups that were meant to be removed.

    Args:
        grouped_reads: Mapping group index -> list of reads.
        underrep_barcodes: Iterable of group indices to remove.

    Returns:
        New dict holding the kept read lists under consecutive indices
        ``0..k-1``, in the iteration order of ``grouped_reads``. The input
        dict is not modified.
    """
    excluded = set(underrep_barcodes)
    output_dict = {}
    for key, val in grouped_reads.items():
        if key in excluded:
            continue
        output_dict[len(output_dict)] = val
    return output_dict


def remove_all_underrep(grouped_reads_dict, underrep_barcodes):
    """Apply ``remove_underrep`` to both read-group dicts of every key.

    Each key gets a fresh list, so ``grouped_reads_dict`` and the lists inside
    it are left untouched. A shallow copy of the outer dict would share those
    inner lists and overwrite the caller's data in place.

    Args:
        grouped_reads_dict: Mapping key -> sequence whose items 0 and 1 are
            group-index -> reads dicts.
        underrep_barcodes: Group indices to remove everywhere.

    Returns:
        New dict with the same keys; items 0 and 1 of each value are the
        filtered, renumbered dicts.
    """
    represented_reads_dict = {}
    for key, directions in grouped_reads_dict.items():
        filtered = list(directions)
        filtered[0] = remove_underrep(directions[0], underrep_barcodes)
        filtered[1] = remove_underrep(directions[1], underrep_barcodes)
        represented_reads_dict[key] = filtered
    return represented_reads_dict

In [None]:
represented_reads_dict = remove_all_underrep(grouped_reads_dict, underrep_barcodes)

In [None]:
represented_reads_dict["BC1"][1]

In [None]:
def get_group_consensus(grouped_reads):
    """Return one consensus string per read group, as a NumPy array.

    A Biopython motif is built from each group's reads and its ``consensus``
    is converted to ``str``; the array follows the iteration order of
    ``grouped_reads``.
    """
    return np.array(
        [str(motifs.create(reads).consensus) for reads in grouped_reads.values()]
    )

In [None]:
consensus_seqs = get_group_consensus(represented_reads_dict["BC1"][1])

In [None]:
consensus_seqs[0]

#### Bit Extraction

In [None]:
def get_bitmap(start_bit, start_bit_idx, rev_read, bit_len=20, read_length=145):
    """Map bit numbers to the slice of the read that holds each bit.

    Window starts are taken every ``bit_len + 1`` positions from
    ``start_bit_idx`` up to ``read_length``, and the last start is discarded.
    Bit numbers begin at ``start_bit`` and count up, or count down when
    ``rev_read`` is truthy.

    Returns:
        Dict mapping bit number -> ``slice(start, start + bit_len)``.
    """
    step = -1 if rev_read else 1
    bit_map = {}
    bit = start_bit
    for bit_start in range(start_bit_idx, read_length, bit_len + 1)[:-1]:
        bit_map[bit] = slice(bit_start, bit_start + bit_len)
        bit += step
    return bit_map

In [None]:
bc1_f_map = get_bitmap(0, 56, False)
bc1_r_map = get_bitmap(9, 21, True)
bc2_f_map = get_bitmap(0, 56, False)
bc2_r_map = get_bitmap(29, 21, True)

In [None]:
bc1_r_map

#### Define Bit Sequence Reference

In [None]:
pos_seqs = [
    "ACACTACCACCATTTCCTAT",
    "AAACACACACTAAACCACCC",
    "ATCCTCCTTCAATACATCCC",
    "TATCTCATCAATCCCACACT",
    "ACTCCACTACTACTCACTCT",
    "AACTCATCTCAATCCTCCCA",
    "ACCACAACCCATTCCTTTCA",
    "TCTATCATCTCCAAACCACA",
    "ACCCTCTAACTTCCATCACA",
    "AATACTCTCCCACCTCAACT",
    "TTTCTACCACTAATCAACCC",
    "TCCAACTCATCTCTAATCTC",
    "TCCTATTCTCAACCTAACCT",
    "ATAAATCATTCCCACTACCC",
    "ACCCTTTACAAACACACCCT",
    "TTCCTAACAAATCACATCCC",
    "TATCCTTCAATCCCTCCACA",
    "ACCCAACACTCATAACATCC",
    "TTTACTCCCTACACCTCCAA",
    "ACTTTCCACATACTATCCCA",
    "ACATTACACCTCATTCTCCC",
    "TACTACAAACCCATAATCCC",
    "TTCTCCCTCTATCAACTCTA",
    "TTCTTCCCTCAATCTTCATC",
    "TCCTAACAACCAACTACTCC",
    "ACCTTTCTCCATACCCAACT",
    "ACCCTTACTACTACATCATC",
    "AATCTCACCTTCCACTTCAC",
    "TCTATCATTACCCTCCTCCT",
    "TCCTCATCTTACTCCCTCTA",
]

neg_seqs = [
    "TCACCTTTCTCCTTTCCTCT",
    "CCCTCTACTCTCCATCTTAT",
    "AACCTCCTCTCTCCATCATA",
    "TCACCATAATTCCTCCTCCT",
    "ACCAACTTCCACACATCACT",
    "CCCTCTTACTTATCTACCCA",
    "ACATCTTCTCTCCAACCTTC",
    "TATCATCCTCCTTCTCTCAC",
    "CTTCTTCTCTTACACCCTCT",
    "TCCCACCTTCACTTCACTAT",
    "CACCCTAACATACAACTCTC",
    "AAACTTCATCACTCTCCTCC",
    "TCAATCCACCATTCCTCAAC",
    "TAAAACCCATCCCACATCCT",
    "TTAAACAACCCATCCCACCA",
    "CATAACCCTACACACAACAC",
    "CTCTCTACACCCACCAATAA",
    "ATTCCATACCCACTCTCTTC",
    "CCCTTACCAACAACAATCCT",
    "TCAACTCATTACCCACAACC",
    "CATATCCAACCACAACCTCA",
    "CAACCACACTCAACTACCAT",
    "ACCTTCTACTCCCAACATTC",
    "CCTCTTCATCCTCTTTCAAC",
    "AACTCACAAACACCTCACCT",
    "CCCAAAACCACACACCAATT",
    "ATCCATATCCTTCTCACCCT",
    "CTCTTAACTACCCTCATTCC",
    "TTTCCTTCTTCCCACCAACT",
    "CAACCACCAACTTCAATCTC",
]

# Reference table with one row per bit (30 rows): column 0 is the entry from
# neg_seqs, column 1 the entry from pos_seqs.
bc_ref_map = np.array(list(zip(neg_seqs, pos_seqs)))

In [None]:
bc_ref_map.shape

#### Decode Consensus Bits

In [None]:
def str_to_int(string):
    """Encode a DNA string as an integer array (A=0, C=1, G=2, T=3).

    Any other character raises ``KeyError``.
    """
    code = {"A": 0, "C": 1, "G": 2, "T": 3}
    return np.array([code[base] for base in string])


def compare_seqs(target_arr, reference_arr):
    """Return the Hamming distance between every target and every reference.

    Both inputs are iterables of equal-length DNA strings, encoded with
    ``str_to_int``.

    Returns:
        Integer array of shape ``(len(target_arr), len(reference_arr))``.
    """
    targets = np.array([str_to_int(seq) for seq in target_arr], dtype="uint8")
    references = np.array([str_to_int(seq) for seq in reference_arr], dtype="uint8")

    # Broadcast to (target, reference, position) and count agreeing positions.
    matches = targets[:, np.newaxis, :] == references[np.newaxis, :, :]
    n_agree = np.sum(matches, axis=2, dtype=int)

    return matches.shape[2] - n_agree


def get_bit_assignment(seq_arr, bc_ref, single_bit_map, rev_read):
    """Call one bit for every sequence in ``seq_arr``.

    The window ``single_bit_map`` is cut out of each sequence; it is
    reverse-complemented unless ``rev_read`` is truthy. Each window is then
    compared with the candidates in ``bc_ref``.

    Returns:
        Array holding, per sequence, the index of the ``bc_ref`` entry with
        the smallest Hamming distance.
    """

    def extract(seq):
        window = seq[single_bit_map]
        return window if rev_read else rev_comp(window)

    bit_arr = np.array([extract(seq) for seq in seq_arr])
    return np.argmin(compare_seqs(bit_arr, bc_ref), axis=1)


def get_read_bit_assignment(seq_arr, bit_map, bc_ref_map, rev_read):
    """Call every bit listed in ``bit_map`` for every sequence in ``seq_arr``.

    Returns:
        Dict mapping bit number -> array of per-sequence calls, as produced by
        ``get_bit_assignment`` with that bit's row of ``bc_ref_map`` and its
        slice from ``bit_map``.
    """
    return {
        bit: get_bit_assignment(seq_arr, bc_ref_map[bit], window, rev_read)
        for bit, window in bit_map.items()
    }


def get_perc_matched(grouped_reads, bit_assignment, bit_map, bc_ref_map, rev_read):
    """Measure how often single reads agree with their group's bit call.

    Every read of a group is decoded on its own and compared, bit by bit, with
    the call stored for that group in ``bit_assignment``.

    Returns:
        Dict mapping bit number -> list with one agreement fraction (0-1) per
        group, in the iteration order of ``grouped_reads``.
    """
    perc_match_dict = {bit: [] for bit in bit_map}

    for read_idx, reads in grouped_reads.items():
        query_arr = np.array([str(read) for read in reads])
        query_assign = get_read_bit_assignment(query_arr, bit_map, bc_ref_map, rev_read)

        for bit, fractions in perc_match_dict.items():
            agrees = query_assign[bit] == bit_assignment[bit][read_idx]
            fractions.append(np.sum(agrees) / agrees.shape[0])

    return perc_match_dict

In [None]:
# BC1, item 0 of the pair: consensus per group, bit calls on the consensuses,
# then the per-group fraction of reads agreeing with each call.
bc1_f_reads = represented_reads_dict["BC1"][0]
consensus_seqs = get_group_consensus(bc1_f_reads)
bit_assignment = get_read_bit_assignment(consensus_seqs, bc1_f_map, bc_ref_map, False)
bc1_f_perc_match_dict = get_perc_matched(
    bc1_f_reads, bit_assignment, bc1_f_map, bc_ref_map, False
)

In [None]:
# Overlay the agreement histograms of BC1 bit 0 and bit 9 and save the figure.
# NOTE(review): bc1_r_perc_match_dict is only assigned in a later cell.
fig, ax = plt.subplots()
for values, label, color in (
    (bc1_f_perc_match_dict[0], "Bit 0", "grey"),
    (bc1_r_perc_match_dict[9], "Bit 9", "salmon"),
):
    ax.hist(values, range=(0.5, 1.0), bins=20, label=label, color=color, alpha=0.7)
ax.set_ylim(0, 4000)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
ax.legend()
fig.tight_layout()
fig.savefig("BC1.png", dpi=200)

In [None]:
# Overlay the agreement histograms of BC2 bit 0 and bit 29 and save the figure.
# NOTE(review): both BC2 dicts are only assigned in later cells.
fig, ax = plt.subplots()
for values, label, color in (
    (bc2_f_perc_match_dict[0], "Bit 0", "grey"),
    (bc2_r_perc_match_dict[29], "Bit 29", "salmon"),
):
    ax.hist(values, range=(0.5, 1.0), bins=20, label=label, color=color, alpha=0.7)
ax.set_ylim(0, 4000)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
ax.legend()
fig.tight_layout()
fig.savefig("BC2.png", dpi=200)

In [None]:
# Per-group agreement fractions for bit 0 of bc1_f_perc_match_dict.
fig, ax = plt.subplots()
ax.hist(bc1_f_perc_match_dict[0], range=(0.5, 1.0), bins=20)
ax.set_ylim(0, 4000)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit 0 Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
plt.show()

In [None]:
# Per-group agreement fractions for bit 3 of bc1_f_perc_match_dict.
fig, ax = plt.subplots()
ax.hist(bc1_f_perc_match_dict[3], range=(0.5, 1.0), bins=20)
ax.set_ylim(0, 4000)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit 3 Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
plt.show()

In [None]:
# BC1, item 1 of the pair: consensus per group, bit calls on the consensuses,
# then the per-group fraction of reads agreeing with each call.
bc1_r_reads = represented_reads_dict["BC1"][1]
consensus_seqs = get_group_consensus(bc1_r_reads)
bit_assignment = get_read_bit_assignment(consensus_seqs, bc1_r_map, bc_ref_map, True)
bc1_r_perc_match_dict = get_perc_matched(
    bc1_r_reads, bit_assignment, bc1_r_map, bc_ref_map, True
)

In [None]:
# Per-group agreement fractions for bit 5 of bc1_r_perc_match_dict.
fig, ax = plt.subplots()
ax.hist(bc1_r_perc_match_dict[5], range=(0.5, 1.0), bins=20)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit 5 Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
plt.show()

In [None]:
# Per-group agreement fractions for bit 9 of bc1_r_perc_match_dict.
fig, ax = plt.subplots()
ax.hist(bc1_r_perc_match_dict[9], range=(0.5, 1.0), bins=20)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit 9 Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
plt.show()

In [None]:
# BC2, item 0 of the pair: consensus per group, bit calls on the consensuses,
# then the per-group fraction of reads agreeing with each call.
bc2_f_reads = represented_reads_dict["BC2"][0]
consensus_seqs = get_group_consensus(bc2_f_reads)
bit_assignment = get_read_bit_assignment(consensus_seqs, bc2_f_map, bc_ref_map, False)
bc2_f_perc_match_dict = get_perc_matched(
    bc2_f_reads, bit_assignment, bc2_f_map, bc_ref_map, False
)

In [None]:
# Per-group agreement fractions for bit 0 of bc2_f_perc_match_dict.
fig, ax = plt.subplots()
ax.hist(bc2_f_perc_match_dict[0], range=(0.5, 1.0), bins=20)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit 0 Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
plt.show()

In [None]:
# Per-group agreement fractions for bit 3 of bc2_f_perc_match_dict.
fig, ax = plt.subplots()
ax.hist(bc2_f_perc_match_dict[3], range=(0.5, 1.0), bins=20)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit 3 Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
plt.show()

In [None]:
# BC2, item 1 of the pair: consensus per group, bit calls on the consensuses,
# then the per-group fraction of reads agreeing with each call.
bc2_r_reads = represented_reads_dict["BC2"][1]
consensus_seqs = get_group_consensus(bc2_r_reads)
bit_assignment = get_read_bit_assignment(consensus_seqs, bc2_r_map, bc_ref_map, True)
bc2_r_perc_match_dict = get_perc_matched(
    bc2_r_reads, bit_assignment, bc2_r_map, bc_ref_map, True
)

In [None]:
# Per-group agreement fractions for bit 25 of bc2_r_perc_match_dict.
fig, ax = plt.subplots()
ax.hist(bc2_r_perc_match_dict[25], range=(0.5, 1.0), bins=20)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit 25 Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
plt.show()

In [None]:
# Per-group agreement fractions for bit 29 of bc2_r_perc_match_dict.
fig, ax = plt.subplots()
ax.hist(bc2_r_perc_match_dict[29], range=(0.5, 1.0), bins=20)
ax.tick_params(axis="both", labelsize=16)
ax.set_xlabel("% Bit 29 Agreement", fontsize=20)
ax.set_ylabel("# of Barcodes", fontsize=20)
plt.show()

In [None]:
test_assign[1] == bit_assignment[1][0]

In [None]:
# Confidence of the bit-0 calls on the current consensus sequences: a call is
# "confident" when its best reference is fewer than 3 mismatches away.
# (A stray, body-less `def get_bit_assignment(...)` header used to sit on top
# of this cell; it was a syntax error and would have shadowed the working
# get_bit_assignment defined earlier.)
bit_arr = np.array([rev_comp(seq[bc1_f_map[0]]) for seq in consensus_seqs])

hamming_arr = compare_seqs(bit_arr, bc_ref_map[0])
assigned_bit = np.argmin(hamming_arr, axis=1)
dist_from_assigned = np.min(hamming_arr, axis=1)
good_assign = dist_from_assigned < 3
perc_confident_assignment = np.sum(good_assign) / good_assign.shape[0]

In [None]:
print(perc_confident_assignment)

In [None]:
plt.hist(dist_from_assigned, range=(1, 15))
plt.show()

In [None]:
hamming_arr.shape

In [None]:
bit_0_arr

In [None]:
grouped_reads_dict["BC1"][1][100]

In [None]:
bit_map

In [None]:
def plot_second_max(grouped_reads):
    """Plot, per position, the second-highest base frequency of a read group.

    ``grouped_reads`` is a list of equal-length reads. Frequencies come from a
    Biopython motif normalised with a small pseudocount. Groups that cannot be
    processed are skipped on purpose, so a long plotting loop keeps going: the
    motif may fail to build, and the reshape fails when a position has tied
    maxima.
    """
    try:
        m = motifs.create(grouped_reads).counts.normalize(pseudocounts=0.001)
        m = np.array([list(m[key]) for key in m.keys()]).T
        where_max = np.equal(m, np.max(m, axis=1)[:, np.newaxis])
        # Assumes a four-letter alphabet: three non-maximal entries per row.
        second_max = np.max(m[~where_max].reshape(m.shape[0], 3), axis=1)
        plt.plot(second_max, c="grey", alpha=0.3)
    except Exception:
        # Unlike a bare `except:`, this no longer swallows KeyboardInterrupt
        # or SystemExit, so the loop can still be interrupted.
        pass

In [None]:
grouped_reads_dict["BC1"][0][100]

In [None]:
def plot_second_max(grouped_reads):
    """Plot, per position, the second-highest base frequency of a read group.

    ``grouped_reads`` is a list of equal-length reads. Frequencies come from a
    Biopython motif normalised with a small pseudocount. Groups that cannot be
    processed are skipped on purpose, so a long plotting loop keeps going: the
    motif may fail to build, and the reshape fails when a position has tied
    maxima.
    """
    try:
        m = motifs.create(grouped_reads).counts.normalize(pseudocounts=0.001)
        m = np.array([list(m[key]) for key in m.keys()]).T
        where_max = np.equal(m, np.max(m, axis=1)[:, np.newaxis])
        # Assumes a four-letter alphabet: three non-maximal entries per row.
        second_max = np.max(m[~where_max].reshape(m.shape[0], 3), axis=1)
        plt.plot(second_max, c="grey", alpha=0.3)
    except Exception:
        # Unlike a bare `except:`, this no longer swallows KeyboardInterrupt
        # or SystemExit, so the loop can still be interrupted.
        pass


def get_over_thr_arr(grouped_reads):
    """Flag positions whose second-highest base frequency exceeds 0.1.

    Args:
        grouped_reads: Mapping group index -> list of equal-length reads.

    Returns:
        Array with one boolean row per group that could be processed. Groups
        whose motif cannot be built, or that have a position with tied maxima,
        are skipped, so row ``i`` does not necessarily belong to group ``i``.
    """
    over_thr_arr = []
    for _, val in grouped_reads.items():
        try:
            m = motifs.create(val).counts.normalize(pseudocounts=0.001)
            m = np.array([list(m[key]) for key in m.keys()]).T
            where_max = np.equal(m, np.max(m, axis=1)[:, np.newaxis])
            # Assumes a four-letter alphabet: three non-maximal entries per row.
            second_max = np.max(m[~where_max].reshape(m.shape[0], 3), axis=1)
            over_thr = second_max > 0.1
            over_thr_arr.append(over_thr)
        except Exception:
            # Best-effort skip. A bare `except:` would also trap
            # KeyboardInterrupt and SystemExit.
            pass
    over_thr_arr = np.array(over_thr_arr)
    return over_thr_arr

In [None]:
# Overlay the second-max profile of up to the first 1000 groups. The bound is
# capped at the number of groups so a smaller data set does not raise KeyError.
# This relies on grouped_reads being keyed 0..n-1, as in the cell that builds it.
for i in range(min(1000, len(grouped_reads))):
    plot_second_max(grouped_reads[i])
plt.show()

In [None]:
over_thr_arr = get_over_thr_arr(grouped_reads)

In [None]:
over_thr_arr.shape

In [None]:
# Fraction of positions per group whose second-highest base frequency is over
# the threshold.
frac_over_thr = np.sum(over_thr_arr, axis=1) / over_thr_arr.shape[1]
fig, ax = plt.subplots()
ax.hist(frac_over_thr, range=(0, 0.1), bins=30)
plt.show()

In [None]:
# Group the BC2 reads from rev_path by barcode, keeping a fixed-length window
# that starts at the handle sequence.
handle_seq = rev_comp("ATCACATTGCCATCAGTAAT")
key = "BC2"
window_len = 142  # reads that cannot supply a window of this length are dropped

grouped_reads = {i: [] for i in range(len(final_Nmer_idx))}

fwd_path = fwdread_paths[key]
rev_path = revread_paths[key]
Nmer_codebook = Nmer_codebooks[key]
for idx, record in enumerate(SeqIO.parse(rev_path, "fastq")):
    if Nmer_codebook[idx] is None:
        continue
    Nmer_start = record.seq.find(handle_seq)
    if Nmer_start < 0:
        # Handle not found. Reject explicitly rather than relying on the
        # slice from index -1 happening to be too short.
        continue
    window = record.seq[Nmer_start : Nmer_start + window_len]
    if len(window) == window_len:
        grouped_reads[Nmer_codebook[idx]].append(window)

In [None]:
str(grouped_reads[0][0])

In [None]:
# Overlay the second-max profile of up to the first 1000 groups. The bound is
# capped at the number of groups so a smaller data set does not raise KeyError.
# This relies on grouped_reads being keyed 0..n-1, as in the cell that builds it.
for i in range(min(1000, len(grouped_reads))):
    plot_second_max(grouped_reads[i])
plt.show()

In [None]:
over_thr_arr = get_over_thr_arr(grouped_reads)

In [None]:
# Fraction of positions per group whose second-highest base frequency is over
# the threshold.
frac_over_thr = np.sum(over_thr_arr, axis=1) / over_thr_arr.shape[1]
fig, ax = plt.subplots()
ax.hist(frac_over_thr, range=(0, 0.1), bins=30)
plt.show()

In [None]:
# Fraction of positions per group whose second-highest base frequency is over
# the threshold (same plot as the previous cell).
frac_over_thr = np.sum(over_thr_arr, axis=1) / over_thr_arr.shape[1]
fig, ax = plt.subplots()
ax.hist(frac_over_thr, range=(0, 0.1), bins=30)
plt.show()

In [None]:
fwdread_paths["BC1"]

In [None]:
plt.hist(counts, range=(5, 100))

In [None]:
plt.hist(final_Nmer_arr_counts, range=(0, 500))

In [None]:
len(filtered_unique)

In [None]:
filtered_unique

In [None]:
import ast
import csv

import numpy as np
from Bio import SeqIO
from matplotlib import pyplot as plt

# Load every read set and merge the records into one id -> SeqRecord dict.
# When an id occurs in more than one file, the record from the later file wins.
fastqlist = ["reads/readset_0.fastq", "reads/readset_1.fastq"]
print(fastqlist)
per_file_records = [
    SeqIO.to_dict(SeqIO.parse(fastqfile, "fastq")) for fastqfile in fastqlist
]
record_dict = {
    read_id: record
    for file_records in per_file_records
    for read_id, record in file_records.items()
}