In [None]:
import numpy as np
import dask.array as da
import trenchripper as tr
import trenchripper.cluster

In [None]:
def str_to_int(string):
    """Encode a DNA string as an array of integer base codes.

    Each character is mapped A -> 0, C -> 1, G -> 2, T -> 3.

    Parameters
    ----------
    string : iterable of str
        Sequence of upper-case bases, e.g. ``"ACGT"``.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding one code per base.

    Raises
    ------
    KeyError
        If a character other than A, C, G or T is encountered.
    """
    code = {"A": 0, "C": 1, "G": 2, "T": 3}
    # The explicit integer dtype keeps the result integral for empty input;
    # without it NumPy infers float64 from an empty list.
    return np.array([code[base] for base in string], dtype=int)


def int_to_str(integer):
    """Decode an iterable of base codes back into a DNA string.

    Codes index into ``["A", "C", "G", "T"]``, so 0 -> A, 1 -> C, 2 -> G and
    3 -> T. The decoded letters are concatenated in input order.
    """
    rev_code = ["A", "C", "G", "T"]
    letters = [rev_code[value] for value in integer]
    return "".join(letters)


# Try the encoder on a single 20-base example sequence.
conv_str = str_to_int("TGTACAATTCATCCATACCA")

In [None]:
# Restart the dask client held by the controller.
# NOTE(review): in the visible notebook `dask_controller` is only assigned in a
# later cell, so this cell fails on a fresh top-to-bottom run.
dask_controller.daskclient.restart()

In [None]:
# Shut the dask controller down.
# NOTE(review): appears before the cell that creates `dask_controller`; on a
# linear run this would execute before anything has been started.
dask_controller.shutdown()

In [None]:
# Display the controller's dask client object.
# NOTE(review): relies on `dask_controller`, which is assigned in a later cell.
dask_controller.daskclient

In [None]:
# Create the trenchripper dask controller with the settings below
# (non-local, 50 workers, 16GB memory, 04:00:00 walltime, ./dask as working
# directory) and then call its startdask() method.
# NOTE(review): the exact meaning of each option is defined in
# trenchripper.cluster — confirm there before changing values.
dask_controller = tr.cluster.dask_controller(
    walltime="04:00:00",
    local=False,
    n_workers=50,
    memory="16GB",
    working_directory="./dask",
)
dask_controller.startdask()

In [None]:
# Show the dask dashboard through the controller's helper.
dask_controller.displaydashboard()

In [None]:
# Two independent sets of 100000 random 20-base DNA sequences (arrays of strings).
# A locally seeded generator makes the example data reproducible across kernel
# restarts without touching NumPy's global random state.
rng = np.random.RandomState(0)
# choice() returns a contiguous (N, 20) array of single characters; viewing each
# row as one 20-character string joins the bases without a per-row Python call.
ex_1 = rng.choice(["A", "C", "G", "T"], size=(100000, 20)).view("U20").ravel()
ex_2 = rng.choice(["A", "C", "G", "T"], size=(100000, 20)).view("U20").ravel()

In [None]:
# Preview the first ten generated sequences.
ex_1[:10]

In [None]:
# Encode every sequence with str_to_int and stack the rows into one uint8
# matrix per set (one row per sequence, one column per base).
ex_1_int = np.array([str_to_int(seq) for seq in ex_1], dtype="uint8")
ex_2_int = np.array([str_to_int(seq) for seq in ex_2], dtype="uint8")

In [None]:
# Wrap the encoded matrices as dask arrays. A singleton axis is inserted so the
# two broadcast against each other: ex_1 becomes (N, 1, L) and ex_2 becomes
# (1, N, L), each chunked 2500 sequences at a time.
ex_1_dask = da.from_array(ex_1_int[:, np.newaxis, :], chunks=(2500, 1, 20))
ex_2_dask = da.from_array(ex_2_int[np.newaxis, :, :], chunks=(1, 2500, 20))

In [None]:
# In-memory NumPy counterpart of the reshape above: (N, L) -> (N, 1, L).
ex_1_reshaped = ex_1_int[:, np.newaxis, :]

In [None]:
import numpy as np

# Broadcasting demo: comparing a (4, 1) column with a (1, 3) row yields a
# (4, 3) boolean array of all pairwise comparisons.
np.array([[0], [1], [2], [3]]) == np.array([[0, 1, 2]])

In [None]:
# Display the dask array summary for ex_1.
ex_1_dask

In [None]:
# Elementwise comparison of the two dask arrays; broadcasting (N, 1, L) against
# (1, N, L) gives a per-position match array for every sequence pair.
output_arr = ex_1_dask == ex_2_dask

In [None]:
# Display the comparison array built in the previous cell.
output_arr

In [None]:
# Count matching positions for every sequence pair (sum over the last axis),
# keep the pairs with more than 9 matches, then collect the indices of those pairs.
output_arr = (
    np.sum((ex_1_dask == ex_2_dask).rechunk(), axis=2, dtype="uint8").rechunk() > 9
)
output_coords = np.argwhere(output_arr)

In [None]:
# Display the thresholded pairwise match array.
output_arr

In [None]:
# Submit the coordinate computation through the dask client.
# NOTE(review): this rebinds `output_coords` from the array built earlier to
# whatever `daskclient.compute` returns (the next cell calls `.result()` on it),
# so later cells that expect the original array receive the new object instead.
output_coords = dask_controller.daskclient.compute(output_coords)

In [None]:
# Fetch and display the computed coordinates.
output_coords.result()

In [None]:
# Persist the coordinates of the matching pairs on the cluster.
# The earlier `compute` cell rebinds `output_coords` to the object returned by
# `daskclient.compute`, which is not the array `persist` should receive. The
# coordinates are therefore rebuilt from `output_arr` with the same expression
# that originally defined `output_coords`, so this cell works regardless of
# whether the `compute` cell ran first.
output_arr_future = dask_controller.daskclient.persist(np.argwhere(output_arr))

In [None]:
# Write the persisted coordinates to a zarr store, replacing any existing one.
# NOTE(review): the coordinates come from argwhere, whose output length depends
# on the data; chunk sizes may be unknown or irregular here, in which case
# to_zarr may need compute_chunk_sizes()/rechunk first — confirm against dask docs.
output_arr_future.to_zarr("./dask_output.zarr", overwrite=True)

In [None]:
# Shape of the encoded ex_1 matrix.
print(ex_1_int.shape)

In [None]:
# Shape of the encoded ex_2 matrix.
print(ex_2_int.shape)

In [None]:
# Shape after inserting a singleton middle axis: (N, L) -> (N, 1, L).
print(ex_1_int[:, np.newaxis, :].shape)

In [None]:
# Shape after inserting a singleton leading axis: (N, L) -> (1, N, L).
# NOTE(review): the broadcast cells apply this form to ex_2_int, while ex_1_int
# is used here — confirm this is intended.
print(ex_1_int[np.newaxis, :, :].shape)

In [None]:
# In-memory NumPy version of the pairwise comparison: broadcast (N, 1, L)
# against (1, N, L), then count the matching positions for each pair.
test_arr = np.equal(ex_1_int[:, None, :], ex_2_int[None, :, :])
match_arr = test_arr.sum(axis=2)

In [None]:
# Shape of the pairwise match-count matrix.
match_arr.shape

In [None]:
# Number of rows of match_arr (ex_1 sequences) with at least one column
# (ex_2 sequence) sharing more than 13 matching positions.
np.sum(np.any(match_arr > 13, axis=1))

In [None]:
from Bio.Sequencing.Applications import (
    BwaAlignCommandline,
    BwaIndexCommandline,
    BwaSamseCommandline,
)
from Bio import SeqIO
from Bio.Seq import Seq
from Bio.SeqRecord import SeqRecord
from Bio.Alphabet import IUPAC
import subprocess
import numpy as np

In [None]:
# Two sets of 10000 random 20-base sequences, drawn from a locally seeded
# generator so the FASTA files written from them are reproducible.
rng = np.random.RandomState(0)
ex_1 = np.apply_along_axis(
    "".join, 1, rng.choice(["A", "C", "G", "T"], size=(10000, 20))
).tolist()
ex_2 = np.apply_along_axis(
    "".join, 1, rng.choice(["A", "C", "G", "T"], size=(10000, 20))
).tolist()

# Give every record a unique id and an empty description. Without them SeqRecord
# falls back to its "<unknown id>" / "<unknown description>" defaults, so every
# FASTA header would be identical and alignment hits could not be traced back
# to a specific read or reference sequence.
ex_1 = [
    SeqRecord(
        Seq(item, IUPAC.unambiguous_dna), id="ex_1_{}".format(idx), description=""
    )
    for idx, item in enumerate(ex_1)
]
ex_2 = [
    SeqRecord(
        Seq(item, IUPAC.unambiguous_dna), id="ex_2_{}".format(idx), description=""
    )
    for idx, item in enumerate(ex_2)
]

In [None]:
# Write both record lists to FASTA files in the current directory.
SeqIO.write(ex_1, "./ex_1.fasta", "fasta")
SeqIO.write(ex_2, "./ex_2.fasta", "fasta")

In [None]:
# Build the three BWA command lines through Biopython's wrappers, using
# ex_2.fasta as the reference and ex_1.fasta as the reads. Nothing is executed
# here; the commands are only constructed and printed for inspection.
reference_genome = "./ex_2.fasta"
read_file = "./ex_1.fasta"
output_sai_file = "./ex_1.sai"
output_sam_file = "./output.sam"
index_cmd = BwaIndexCommandline(infile=reference_genome, algorithm="bwtsw")
# NOTE(review): n/k/o/e/l/N presumably map to the corresponding `bwa aln`
# flags — confirm their meaning in the BWA manual before tuning.
align_cmd = BwaAlignCommandline(
    n=12, k=12, o=0, e=0, l=20, N=True, reference=reference_genome, read_file=read_file
)
samse_cmd = BwaSamseCommandline(
    reference=reference_genome, read_file=read_file, sai_file=output_sai_file
)

print(index_cmd)
print(align_cmd)
print(samse_cmd)

In [None]:
# Run the index command built above.
index_cmd()

In [None]:
# Run the alignment command, directing its stdout to the .sai file path.
align_cmd(stdout=output_sai_file)

In [None]:
# Run the samse command, directing its stdout to the SAM file path.
samse_cmd(stdout=output_sam_file)

In [None]:
def str_to_int(string):
    """Encode a DNA string as an array of integer base codes.

    Each character is mapped A -> 0, C -> 1, G -> 2, T -> 3. This repeats the
    helper of the same name defined earlier in the notebook so that this cell
    can run on its own.

    Parameters
    ----------
    string : iterable of str
        Sequence of upper-case bases, e.g. ``"ACGT"``.

    Returns
    -------
    numpy.ndarray
        1-D integer array holding one code per base.

    Raises
    ------
    KeyError
        If a character other than A, C, G or T is encountered.
    """
    code = {"A": 0, "C": 1, "G": 2, "T": 3}
    # The explicit integer dtype keeps the result integral for empty input;
    # without it NumPy infers float64 from an empty list.
    return np.array([code[base] for base in string], dtype=int)


# Example data: two arrays of N random sequence strings, each of length L.
ex_1 = np.apply_along_axis(
    "".join, 1, np.random.choice(["A", "C", "G", "T"], size=(100000, 20))
)
ex_2 = np.apply_along_axis(
    "".join, 1, np.random.choice(["A", "C", "G", "T"], size=(100000, 20))
)

# Encode each string into base codes, giving one (N, L) uint8 matrix per set.
ex_1_int = np.array([str_to_int(seq) for seq in ex_1], dtype="uint8")
ex_2_int = np.array([str_to_int(seq) for seq in ex_2], dtype="uint8")

# Insert a singleton axis (and copy) so the two sets broadcast pairwise:
# (N, L) -> (N, 1, L) for the first set and (N, L) -> (1, N, L) for the second.
ex_1_broadcast = ex_1_int[:, None, :].copy()
ex_2_broadcast = ex_2_int[None, :, :].copy()

# Pairwise per-position comparison, shape (N, N, L) ...
bool_arr = np.equal(ex_1_broadcast, ex_2_broadcast)
# ... reduced over L to the number of matching positions per pair, shape (N, N).
match_arr = bool_arr.sum(axis=2)