In [4]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import mode
import editdistance
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
from joblib import Parallel, delayed
import pybedtools

import modules

In [2]:
# Project directory layout. Relative paths resolve against the notebook's
# working directory.
ROOT = Path.cwd()
PREPROCESSING = Path("../preprocessing")
ASSEMBLING = Path("../reference_genomes")  # globbed for *.fasta references
DATA = Path("../data")

# Sub-directories of DATA.
CLADE_FASTA = DATA.joinpath("multifasta_for_clades")
ALIGNMENTS = DATA.joinpath("alignments")
CONSENSUSES = DATA.joinpath("consensuses")  # per-reference *consensus.npy files

In [3]:
def alignment_to_np(to_align):
    """Load a FASTA alignment into a 2-D ``int8`` array of byte codes.

    Every record parsed from ``to_align`` becomes one row: its sequence is
    UTF-8 encoded and the resulting bytes are viewed as ``np.int8``.
    The rows are combined with ``np.stack``, so the file must contain at
    least one record and all sequences must have the same length.
    """
    rows = []
    for seq_record in SeqIO.parse(to_align, "fasta"):
        raw_bytes = seq_record.seq.encode('utf-8')
        rows.append(np.frombuffer(raw_bytes, dtype=np.int8))
    return np.stack(rows)

In [4]:
def delete_insertions(align_np):
    """Drop every column in which the first row holds the gap character '-'.

    ``align_np`` is a 2-D array of byte codes (one sequence per row). The
    returned array keeps all rows and only those columns where row 0 is
    not a gap.
    """
    gap_codes = np.frombuffer("-".encode('utf-8'), dtype=np.int8)
    first_row_is_gap = np.isin(align_np[0], gap_codes)
    keep_columns = np.logical_not(first_row_is_gap)
    return align_np[:, keep_columns]

In [5]:
def delete_insertions(align_np):
    """Return ``align_np`` without the columns where row 0 is a gap ('-').

    NOTE(review): this cell re-defines the function from the preceding
    cell with the same behavior; one of the two cells can be removed.
    """
    dash_code = np.frombuffer("-".encode('utf-8'), dtype=np.int8)
    gap_in_row0 = np.isin(align_np[0], dash_code)
    return align_np[:, ~gap_in_row0]

In [6]:
def to_consensus(align_np):
    """Compute the column-wise mode over all rows except the first.

    Returns the SciPy mode result object, not a bare array: ``.mode`` holds
    the most frequent value of each column and ``.count`` how often it
    occurs. Row 0 of ``align_np`` is excluded from the vote.
    """
    rows_after_first = align_np[1:]
    return mode(rows_after_first, axis=0, keepdims=False)

In [7]:
def consensus_to_seq(consensus_np):
    """Decode an array of byte codes into a ``Seq``.

    ``consensus_np`` must be the plain array (callers pass
    ``to_consensus(...).mode`` or a loaded ``.npy`` array), not the mode
    result object itself. Its raw bytes are decoded as UTF-8.
    """
    raw_bytes = consensus_np.tobytes()
    return Seq(raw_bytes.decode("utf-8"))

In [25]:
def are_all_rows_unique(array):
    """Return True when no two rows of the 2-D ``array`` are identical."""
    total_rows = array.shape[0]
    distinct_rows = np.unique(array, axis=0).shape[0]
    return distinct_rows == total_rows

In [26]:
def create_distance_matrix(array):
    """Build the symmetric matrix of pairwise edit distances between rows.

    Parameters
    ----------
    array : 2-D array
        One sequence per row.

    Returns
    -------
    numpy.ndarray
        ``(num_rows, num_rows)`` matrix of dtype ``int64``. Entry ``[i, j]``
        is ``editdistance.eval(array[i], array[j])``; the diagonal is zero.
    """
    # Unpacking into two names enforces a 2-D input.
    num_rows, _ = array.shape

    # int64 storage: an int8 matrix cannot represent a distance above 127,
    # which rows longer than 127 elements can easily produce.
    distances = np.zeros((num_rows, num_rows), dtype=np.int64)

    # Compute only the upper triangle and mirror it, because the
    # Levenshtein distance is symmetric.
    for i in range(num_rows):
        for j in range(i + 1, num_rows):
            pair_distance = editdistance.eval(array[i], array[j])
            distances[i, j] = pair_distance
            distances[j, i] = pair_distance
    return distances

In [57]:
def check_region(array):
    """Return how many distinct rows the 2-D ``array`` contains.

    A result of 1 means every row of the region is identical.
    """
    _, row_counts = np.unique(array, axis=0, return_counts=True)
    return row_counts.shape[0]

In [59]:
# Walk one clade alignment through the whole pipeline:
# load -> drop gap columns -> column-wise consensus -> Seq.
to_align = Path("../data/alignments/NC_001348.1/clade_1-merge_align.fasta")
a = alignment_to_np(to_align)  # raw alignment as an int8 byte-code matrix
a_copy = a.copy()  # independent copy of the raw matrix, inspected in later cells
b = delete_insertions(a)  # columns where row 0 is '-' removed
c = to_consensus(b).mode  # most frequent value per column, row 0 excluded
d = consensus_to_seq(c)  # byte codes decoded into a Seq

In [60]:
# Raw alignment matrix (the copy taken before gap-column removal).
a_copy

array([[ 97, 103, 103, ...,  97, 103, 103],
       [103, 103,  99, ..., 103, 103, 103],
       [103, 103,  99, ..., 103,  45,  45],
       ...,
       [ 97, 103, 103, ..., 103, 103, 103],
       [103, 103,  99, ..., 103, 103, 103],
       [103, 103,  99, ..., 103, 103, 103]], dtype=int8)

In [61]:
# Boolean mask over the first alignment row: True where it holds the gap character '-'.
np.isin(a_copy[0], np.frombuffer("-".encode('utf-8'), dtype=np.int8))

array([False, False, False, ..., False, False, False])

In [62]:
# For every reference FASTA in ASSEMBLING, load the saved consensus arrays
# from CONSENSUSES/<reference stem>/ (in sorted file-name order), decode each
# to a Seq and stack them into one array keyed by the reference stem.
consensuses_by_ref = {}
for to_ref in ASSEMBLING.glob("*.fasta"):
    consensuses = np.stack(
        [
            consensus_to_seq(np.load(npy_path))
            for npy_path in sorted(CONSENSUSES.joinpath(to_ref.stem).glob("*consensus.npy"))
        ]
    )
    consensuses_by_ref[to_ref.stem] = consensuses

In [63]:
# Stacked consensus sequences for reference NC_001348.1.
primary_array = consensuses_by_ref['NC_001348.1']

In [67]:
# Shape of each consensus row.
[i.shape for i in primary_array]

[(124884,),
 (124884,),
 (124884,),
 (124884,),
 (124884,),
 (124884,),
 (124884,),
 (124884,)]

In [64]:
# Inspect a window of `window` columns beginning at column `start`; its first
# and last 20 columns are the flanking areas, each scored by the number of
# distinct rows it contains.
start, window = 20029, 930
left_area = primary_array[:, start:start + window][:, :20]
right_area = primary_array[:, start:start + window][:, -20:]
left_score = check_region(left_area)
right_score = check_region(right_area)

In [65]:
# The full window under inspection, one row per consensus.
primary_array[:, start:start+window]

array([['a', 'c', 'c', ..., 't', 't', 'c'],
       ['a', 'c', 'c', ..., 't', 't', 'c'],
       ['a', 'c', 'c', ..., 't', 't', 'c'],
       ...,
       ['a', 'c', 'c', ..., 't', 't', 'c'],
       ['a', 'c', 'c', ..., 't', 't', 'c'],
       ['a', 'c', 'c', ..., 't', 't', 'c']], dtype='<U1')

In [66]:
# First 20 columns of the window.
left_area

array([['a', 'c', 'c', 't', 't', 'g', 'a', 'a', 'g', 't', 'c', 'c', 'a',
        'c', 'c', 'g', 'a', 'a', 'c', 'a'],
       ['a', 'c', 'c', 't', 't', 'g', 'a', 'a', 'g', 't', 'c', 'c', 'a',
        'c', 'c', 'g', 'a', 'a', 'c', 'a'],
       ['a', 'c', 'c', 't', 't', 'g', 'a', 'a', 'g', 't', 'c', 'c', 'a',
        'c', 'c', 'g', 'a', 'a', 'c', 'a'],
       ['a', 'c', 'c', 't', 't', 'g', 'a', 'a', 'g', 't', 'c', 'c', 'a',
        'c', 'c', 'g', 'a', 'a', 'c', 'a'],
       ['a', 'c', 'c', 't', 't', 'g', 'a', 'a', 'g', 't', 'c', 'c', 'a',
        'c', 'c', 'g', 'a', 'a', 'c', 'a'],
       ['a', 'c', 'c', 't', 't', 'g', 'a', 'a', 'g', 't', 'c', 'c', 'a',
        'c', 'c', 'g', 'a', 'a', 'c', 'a'],
       ['a', 'c', 'c', 't', 't', 'g', 'a', 'a', 'g', 't', 'c', 'c', 'a',
        'c', 'c', 'g', 'a', 'a', 'c', 'a'],
       ['a', 'c', 'c', 't', 't', 'g', 'a', 'a', 'g', 't', 'c', 'c', 'a',
        'c', 'c', 'g', 'a', 'a', 'c', 'a']], dtype='<U1')

In [49]:
# Number of distinct rows in the left flank.
check_region(left_area)

1

In [46]:
# Last 20 columns of the window.
right_area

array([['c', 'a', 'g', 't', 'c', 'g', 'a', 'g', 'a', 'c', 'g', 'c', 'g',
        'g', 'a', 't', 'g', 'a', 'g', 'g'],
       ['c', 'a', 'g', 't', 'c', 'g', 'a', 'g', 'a', 'c', 'g', 'c', 'g',
        'g', 'a', 't', 'g', 'a', 'g', 'g'],
       ['c', 'a', 'g', 't', 'c', 'g', 'a', 'g', 'a', 'c', 'g', 'c', 'g',
        'g', 'a', 't', 'g', 'a', 'g', 'g'],
       ['c', 'a', 'g', 't', 'c', 'g', 'a', 'g', 'a', 'c', 'g', 'c', 'g',
        'g', 'a', 't', 'g', 'a', 'g', 'g'],
       ['c', 'a', 'g', 't', 'c', 'g', 'a', 'g', 'a', 'c', 'g', 'c', 'g',
        'g', 'a', 't', 'g', 'a', 'g', 'g'],
       ['c', 'a', 'g', 't', 'c', 'g', 'a', 'g', 'a', 'c', 'g', 'c', 'g',
        'g', 'a', 't', 'g', 'a', 'g', 'g'],
       ['c', 'a', 'g', 't', 'c', 'g', 'a', 'g', 'a', 'c', 'g', 'c', 'g',
        'g', 'a', 't', 'g', 'a', 'g', 'g'],
       ['c', 'a', 'g', 't', 'c', 'g', 'a', 'g', 'a', 'c', 'g', 'c', 'g',
        'g', 'a', 't', 'g', 'a', 'g', 'g']], dtype='<U1')

In [43]:
# Sanity check: both flanks must be exactly 20 columns wide.
assert left_area.shape[1] == right_area.shape[1] == 20

In [58]:
# Inline form of check_region(left_area): count of distinct rows in the left flank.
np.unique(left_area, axis=0, return_counts=True)[1].shape[0]

1