In [None]:
import gzip
import shutil
import glob
import os

from Bio import SeqIO
from Bio.Seq import Seq
from Bio import motifs
from Bio.Alphabet import IUPAC
import numpy as np
from matplotlib import pyplot as plt

In [None]:
def gunzip_all(headpath):
    """Decompress every ``*.gz`` file found anywhere below ``headpath``.

    Each archive is written next to itself under the same name minus its last
    extension (``reads.fastq.gz`` -> ``reads.fastq``). The compressed file is
    left in place and an existing output file of that name is overwritten.

    Args:
        headpath: Directory that is searched recursively for ``*.gz`` files.
    """
    pattern = headpath + "/**/*.gz"
    for gz_path in glob.iglob(pattern, recursive=True):
        out_path, _ = os.path.splitext(gz_path)
        with gzip.open(gz_path, "rb") as compressed, open(out_path, "wb") as plain:
            # Stream the data so large archives never sit fully in memory.
            shutil.copyfileobj(compressed, plain)

In [None]:
# Root directory of the sequencing run; later cells search it recursively for
# .gz archives and .fastq files.
headpath = "/n/scratch3/users/d/de64/2020-08-22_lDE11_run"

In [None]:
"/n/scratch3/users/d/de64/2020-08-22_lDE11_run/BC2_L001-ds.c37aa7639b2b4ee188346cb6e15bac31/"

### Unzip All

In [None]:
# Decompress every .gz archive below headpath; the decompressed file is written
# next to each archive.
gunzip_all(headpath)

In [None]:
!head '/n/scratch3/users/d/de64/2020-08-22_lDE11_run/BC1_L001-ds.b282eb9272a34059a378b4aeae518283/BC1_S2_L001_R1_001.fastq'

### Binning by Nmer

In [None]:
# Path of the BC1 R1 FASTQ file.
# NOTE(review): `datapath` is not referenced by any other cell visible in this
# notebook — confirm whether it is still needed.
datapath = "/n/scratch3/users/d/de64/2020-08-22_lDE11_run/BC1_L001-ds.b282eb9272a34059a378b4aeae518283/BC1_S2_L001_R1_001.fastq"

In [None]:
# IUPAC DNA complement table. Only uppercase letters are listed because
# rev_comp() upper-cases its input before translating.
_DNA_COMPLEMENT = str.maketrans("ACGTMRWSYKVHDBXN", "TGCAKYWSRMBDHVXN")


def rev_comp(instr):
    """Return the reverse complement of a DNA string, in upper case.

    Implemented with ``str.translate`` instead of ``Bio.Seq`` plus
    ``Bio.Alphabet.IUPAC`` because ``Bio.Alphabet`` was removed in
    Biopython 1.78, so the alphabet-based version cannot run on current
    Biopython releases.

    Args:
        instr: DNA sequence as a string (any case). IUPAC ambiguity codes are
            complemented; any other character is passed through unchanged.

    Returns:
        The upper-cased reverse complement as a ``str``.
    """
    return instr.upper().translate(_DNA_COMPLEMENT)[::-1]


def get_fastq_paths(headpath):
    """Index the FASTQ files below ``headpath`` by dataset name and read direction.

    File names are split on ``_``: the first field is the dataset name and the
    second-to-last field is the read ID (for example
    ``BC1_S2_L001_R1_001.fastq`` -> dataset ``BC1``, read ID ``R1``).

    Args:
        headpath: Directory that is searched recursively for ``*.fastq`` files.

    Returns:
        A tuple ``(fwdread_paths, revread_paths)`` of dicts mapping dataset
        name to the path of its ``R1`` / ``R2`` file. If several files share a
        dataset name and read ID, the one visited last wins.
    """
    fwdread_paths = {}
    revread_paths = {}
    for path in glob.iglob(headpath + "/**/*.fastq", recursive=True):
        fields = os.path.basename(path).split("_")
        if len(fields) < 2:
            # Name has no "<dataset>_..._<read>_<chunk>" structure to parse.
            continue
        dset_name, read_ID = fields[0], fields[-2]
        if read_ID == "R1":
            fwdread_paths[dset_name] = path
        elif read_ID == "R2":
            revread_paths[dset_name] = path
        # Any other read ID (e.g. index reads) is ignored so that it cannot
        # overwrite the reverse-read entry of the same dataset.
    return fwdread_paths, revread_paths


def get_Nmer_list(
    fastqpath,
    handle_seq="ACGAACGTTAGCAGCACTAT",
    reverse_complement=False,
    Nmer_len=15,
):
    """Extract the Nmer next to the handle sequence from every read of a FASTQ file.

    Args:
        fastqpath: Path of the FASTQ file to parse.
        handle_seq: Constant sequence that flanks the Nmer.
        reverse_complement: If False, ``handle_seq`` is searched as given and
            the Nmer is the ``Nmer_len`` bases right after the match. If True,
            the reverse complement of ``handle_seq`` is searched, the Nmer is
            the ``Nmer_len`` bases right before the match, and it is
            reverse-complemented before being stored.
        Nmer_len: Number of bases in the Nmer.

    Returns:
        Dict mapping the 0-based index of each record to its Nmer string, or
        to None when the handle is absent or the read does not contain all
        ``Nmer_len`` bases on the relevant side of the handle.
    """
    if reverse_complement:
        handle_seq = rev_comp(handle_seq)

    handle_len = len(handle_seq)
    Nmer_dict = {}

    for idx, record in enumerate(SeqIO.parse(fastqpath, "fastq")):
        handle_start = record.seq.find(handle_seq)
        Nmer = None
        # find() returns -1 when the handle is absent; 0 is a valid match.
        if handle_start >= 0:
            if reverse_complement:
                start, stop = handle_start - Nmer_len, handle_start
            else:
                start = handle_start + handle_len
                stop = start + Nmer_len
            # A negative start would wrap around to the end of the read and a
            # stop past the end would truncate the Nmer, so both are rejected.
            if start >= 0 and stop <= len(record.seq):
                Nmer = str(record.seq[start:stop])
                if reverse_complement:
                    Nmer = rev_comp(Nmer)
        Nmer_dict[idx] = Nmer
    return Nmer_dict


def get_Nmer_codebook(Nmer_list, final_Nmer_idx):
    """Translate each read's Nmer into its integer index.

    Args:
        Nmer_list: Dict mapping read index to an Nmer string (or None).
        final_Nmer_idx: Dict mapping accepted Nmer strings to integer indices.

    Returns:
        Dict with the same keys as ``Nmer_list``; each value is the index of
        the read's Nmer in ``final_Nmer_idx``, or None when that Nmer is not
        one of its keys.
    """
    return {
        read_idx: (final_Nmer_idx[Nmer] if Nmer in final_Nmer_idx else None)
        for read_idx, Nmer in Nmer_list.items()
    }

In [None]:
# Locate the FASTQ files below headpath: forward-read and reverse-read paths,
# each keyed by dataset name.
fwdread_paths, revread_paths = get_fastq_paths(headpath)

In [None]:
# Dataset name -> result of get_Nmer_list for that dataset.
Nmer_list_dict = {}

In [None]:
# Extract the Nmer of every read, one dataset at a time. GFP is read from its
# reverse-read file with reverse_complement=True; BC1 and BC2 are read from
# their forward-read files.
for dset, paths, use_rc in (
    ("GFP", revread_paths, True),
    ("BC1", fwdread_paths, False),
    ("BC2", fwdread_paths, False),
):
    Nmer_list_dict[dset] = get_Nmer_list(
        paths[dset], handle_seq="ACGAACGTTAGCAGCACTAT", reverse_complement=use_rc
    )

In [None]:
# Pool the Nmers of all datasets, dropping reads for which none was found.
all_Nmers = [
    Nmer
    for Nmer_by_read in Nmer_list_dict.values()
    for Nmer in Nmer_by_read.values()
    if Nmer is not None
]
all_Nmer_arr = np.array(all_Nmers)

# Keep only Nmers seen more than 30 times and number them in the sorted order
# returned by np.unique.
unique, counts = np.unique(all_Nmer_arr, return_counts=True)
is_frequent = counts > 30
final_Nmer_arr = unique[is_frequent]
final_Nmer_arr_counts = counts[is_frequent]
final_Nmer_idx = {Nmer: i for i, Nmer in enumerate(final_Nmer_arr)}

In [None]:
# Display the Nmer -> integer index mapping.
final_Nmer_idx

In [None]:
# One codebook per dataset: read index -> Nmer index (None when the read's
# Nmer is not a key of final_Nmer_idx).
Nmer_codebooks = {
    dset: get_Nmer_codebook(Nmer_list_dict[dset], final_Nmer_idx)
    for dset in ("GFP", "BC1", "BC2")
}

### Make consensuses

In [None]:
# Number of distinct Nmers that were assigned an index.
len(final_Nmer_idx)

In [None]:
# Display the reverse complement of this 20-nt sequence (result is not stored).
rev_comp("AGAGGAAAGGAGAAAGGTGA")

In [None]:
# Display the reverse complement of this 20-nt sequence (result is not stored).
rev_comp("ATAGGAAATGGTGGTAGTGT")

In [None]:
# Group the forward reads of one dataset by Nmer index. From each read a
# fixed-length window starting at the handle is kept, so all sequences in a
# group have the same length and start at the same landmark.
handle_seq = "ACGAACGTTAGCAGCACTAT"
key = "BC2"
window_len = 146

grouped_reads = {i: [] for i in range(len(final_Nmer_idx))}

fwd_path = fwdread_paths[key]
rev_path = revread_paths[key]
Nmer_codebook = Nmer_codebooks[key]
for idx, record in enumerate(SeqIO.parse(fwd_path, "fastq")):
    Nmer_code = Nmer_codebook[idx]
    if Nmer_code is None:
        continue
    Nmer_start = record.seq.find(handle_seq)
    if Nmer_start < 0:
        # find() returns -1 when the handle is absent; slicing from -1 would
        # silently take bases from the end of the read instead.
        continue
    window = record.seq[Nmer_start : Nmer_start + window_len]
    # Reads too short to fill the whole window are dropped.
    if len(window) == window_len:
        grouped_reads[Nmer_code].append(window)

In [None]:
# Display the sequences collected for Nmer index 0.
grouped_reads[0]

In [None]:
def _second_max_profile(reads):
    """Return the per-position frequency of the second most common base.

    Best effort: returns None (after emitting a warning) when no frequency
    matrix can be built from ``reads``, and None silently when ``reads`` is
    empty.
    """
    import warnings

    if len(reads) == 0:
        return None
    try:
        freqs = motifs.create(reads).counts.normalize(pseudocounts=0.001)
        # Rows = positions, columns = letters of the motif alphabet.
        freqs = np.array([list(freqs[base]) for base in freqs.keys()]).T
        # After sorting each row, index -2 is the runner-up. Unlike masking
        # out the maximum and reshaping the rest, this also works when two
        # bases tie for the maximum.
        return np.sort(freqs, axis=1)[:, -2]
    except Exception as err:
        warnings.warn(
            "skipping group of {} reads: {!r}".format(len(reads), err)
        )
        return None


def plot_second_max(grouped_reads):
    """Plot the second-highest base frequency at each position of one read group.

    Args:
        grouped_reads: Sequence of equal-length reads belonging to one group.
            Nothing is drawn when the group is empty or its frequency matrix
            cannot be built.
    """
    second_max = _second_max_profile(grouped_reads)
    if second_max is not None:
        plt.plot(second_max, c="grey", alpha=0.3)


def get_over_thr_arr(grouped_reads, thr=0.1):
    """Flag the positions whose second-highest base frequency exceeds ``thr``.

    Args:
        grouped_reads: Dict mapping a group key to a sequence of equal-length
            reads.
        thr: Frequency threshold applied to the second-highest base.

    Returns:
        Array built from one boolean row per group that could be processed
        (empty or failing groups are skipped); True marks positions where the
        second-highest frequency is greater than ``thr``.
    """
    over_thr_arr = []
    for reads in grouped_reads.values():
        second_max = _second_max_profile(reads)
        if second_max is not None:
            over_thr_arr.append(second_max > thr)
    return np.array(over_thr_arr)

In [None]:
# Overlay the second-max profiles of the first 1000 groups. The upper bound is
# capped at the number of groups so a smaller run does not raise KeyError.
for i in range(min(1000, len(grouped_reads))):
    plot_second_max(grouped_reads[i])
plt.show()

In [None]:
# One boolean row per processed read group: True where the second-highest base
# frequency at that position is above the threshold.
over_thr_arr = get_over_thr_arr(grouped_reads)

In [None]:
# Display the dimensions of the flag array.
over_thr_arr.shape

In [None]:
# Histogram of the fraction of flagged positions per row of over_thr_arr,
# restricted to fractions between 0 and 0.1.
n_over_thr = np.sum(over_thr_arr, axis=1)
frac_over_thr = n_over_thr / over_thr_arr.shape[1]
plt.hist(frac_over_thr, range=(0, 0.1), bins=30)
plt.show()

In [None]:
# Group the reverse reads of one dataset by Nmer index. From each read a
# fixed-length window starting at the (reverse-complemented) handle is kept,
# so all sequences in a group have the same length and start at the same
# landmark.
handle_seq = rev_comp("ATCACATTGCCATCAGTAAT")
key = "BC2"
window_len = 142

grouped_reads = {i: [] for i in range(len(final_Nmer_idx))}

fwd_path = fwdread_paths[key]
rev_path = revread_paths[key]
Nmer_codebook = Nmer_codebooks[key]
for idx, record in enumerate(SeqIO.parse(rev_path, "fastq")):
    Nmer_code = Nmer_codebook[idx]
    if Nmer_code is None:
        continue
    Nmer_start = record.seq.find(handle_seq)
    if Nmer_start < 0:
        # find() returns -1 when the handle is absent; slicing from -1 would
        # silently take bases from the end of the read instead.
        continue
    window = record.seq[Nmer_start : Nmer_start + window_len]
    # Reads too short to fill the whole window are dropped.
    if len(window) == window_len:
        grouped_reads[Nmer_code].append(window)

In [None]:
# Display the first sequence collected for Nmer index 0 as a plain string.
str(grouped_reads[0][0])

In [None]:
# Overlay the second-max profiles of the first 1000 groups. The upper bound is
# capped at the number of groups so a smaller run does not raise KeyError.
for i in range(min(1000, len(grouped_reads))):
    plot_second_max(grouped_reads[i])
plt.show()

In [None]:
# Recompute the per-position flags for the current grouped_reads; this
# overwrites the over_thr_arr computed earlier in the notebook.
over_thr_arr = get_over_thr_arr(grouped_reads)

In [None]:
# Histogram of the fraction of flagged positions per row of over_thr_arr,
# restricted to fractions between 0 and 0.1.
n_over_thr = np.sum(over_thr_arr, axis=1)
frac_over_thr = n_over_thr / over_thr_arr.shape[1]
plt.hist(frac_over_thr, range=(0, 0.1), bins=30)
plt.show()

In [None]:
# NOTE(review): this cell repeats the histogram of the preceding cell with
# identical code — confirm whether it is still needed.
plt.hist(np.sum(over_thr_arr, axis=1) / over_thr_arr.shape[1], range=(0, 0.1), bins=30)
plt.show()

In [None]:
# Display the forward-read FASTQ path recorded for dataset BC1.
fwdread_paths["BC1"]

In [None]:
# Histogram of the occurrence count of every unique Nmer, restricted to counts
# between 5 and 100.
plt.hist(counts, range=(5, 100))

In [None]:
# Histogram of the occurrence counts of the Nmers that passed the count
# filter, restricted to counts between 0 and 500.
plt.hist(final_Nmer_arr_counts, range=(0, 500))

In [None]:
# `filtered_unique` is not defined in any cell of this notebook, so this cell
# raised NameError on a fresh kernel. `final_Nmer_arr` (the unique Nmers that
# passed the count filter) is used instead.
# NOTE(review): confirm this is the object that was meant.
len(final_Nmer_arr)

In [None]:
# `filtered_unique` is not defined in any cell of this notebook, so this cell
# raised NameError on a fresh kernel. `final_Nmer_arr` (the unique Nmers that
# passed the count filter) is displayed instead.
# NOTE(review): confirm this is the object that was meant.
final_Nmer_arr