In [88]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import mode
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import pybedtools

In [8]:
# Directory the kernel was started in; the relative paths below resolve against it.
ROOT = Path.cwd()

# Top-level project directories, all siblings of the notebook's directory.
PREPROCESSING = Path("..", "preprocessing")
ASSEMBLING = Path("..", "reference_genomes")
DATA = Path("..", "data")

# Sub-directories of DATA.
CLADE_FASTA = DATA.joinpath("multifasta_for_clades")
ALIGNMENTS = DATA.joinpath("alignments")
CONSENSUSES = DATA.joinpath("consensuses")

# Destination of the per-reference insertion BED files.
BED = PREPROCESSING.joinpath("insertion_regions")

In [9]:
# Create every working directory up front (a no-op for those that already exist).
for directory in (PREPROCESSING, ASSEMBLING, DATA, CLADE_FASTA, ALIGNMENTS, CONSENSUSES, BED):
    directory.mkdir(parents=True, exist_ok=True)

In [62]:
def alignment_to_np(to_align):
    """Load a FASTA alignment into a 2-D array of character codes.

    Parameters
    ----------
    to_align : path or handle
        Passed unchanged to ``SeqIO.parse(to_align, "fasta")``.

    Returns
    -------
    numpy.ndarray
        ``int8`` array with one row per record (in file order) and one column
        per position; each value is the byte code of the character, so the
        gap symbol ``-`` is stored as ``45``.

    Raises
    ------
    ValueError
        If the file contains no records, or if the records differ in length
        (``np.stack`` requires equally sized rows).
    """
    # Seq.encode() is deprecated in newer Biopython; converting through str()
    # produces the same bytes on both old and new releases.
    rows = [
        np.frombuffer(str(seq_record.seq).encode('utf-8'), dtype=np.int8)
        for seq_record in SeqIO.parse(to_align, "fasta")
    ]
    if not rows:
        # np.stack([]) would raise a ValueError that does not name the file.
        raise ValueError(f"no FASTA records found in {to_align}")
    return np.stack(rows)

In [83]:
def insertion_coordinats_finder(array):
    """Find the real coordinates of insertions once they are cut out of the alignment.

    Every gap column (``-``, byte code 45) of ``array`` is mapped to the
    0-based coordinate it would occupy in the same sequence with all gap
    columns removed. Consecutive gap columns collapse onto one coordinate.

    Parameters
    ----------
    array : array_like
        One aligned sequence as a 1-D sequence of character codes.

    Returns
    -------
    numpy.ndarray
        Sorted, de-duplicated integer coordinates. The dtype is integer even
        when there are no gaps, so results from several alignments can be
        concatenated without being promoted to float.
    """
    gap_columns = np.where(np.asarray(array) == ord("-"))[0]
    # Each earlier gap column shifts the ungapped coordinate left by one, so
    # the k-th gap column (k counted from 0) ends up at ``column - k``.
    ungapped = gap_columns - np.arange(gap_columns.size)
    return np.unique(ungapped)

In [64]:
def delete_insertions(align_np):
    """Drop every alignment column in which the first row holds a gap.

    Parameters
    ----------
    align_np : numpy.ndarray
        2-D array of character codes, one row per sequence.

    Returns
    -------
    numpy.ndarray
        ``align_np`` restricted to the columns where row 0 is not ``-``.
    """
    gap_code = np.frombuffer(b"-", dtype=np.int8)
    first_row = align_np[0]
    # invert=True selects the columns to keep, i.e. those that are not gaps.
    keep_columns = np.isin(first_row, gap_code, invert=True)
    return align_np[:, keep_columns]

In [86]:
# For every reference genome, collect the insertion coordinates found in all
# of its alignments and write them as single-base intervals to a BED file.
# The references are visited in sorted order so the printed log is reproducible.
for to_ref in sorted(ASSEMBLING.glob("*.fasta")):
    print(to_ref.stem)
    to_bed = BED / f"insertion_regions-{to_ref.stem}.bed"
    to_bed.parent.mkdir(exist_ok=True, parents=True)

    # Row 0 of each alignment is used as the sequence whose gaps mark insertions
    # (presumably the reference - TODO confirm it is always listed first).
    per_alignment = [
        insertion_coordinats_finder(alignment_to_np(to_fasta)[0])
        for to_fasta in ALIGNMENTS.joinpath(to_ref.stem).glob("*.fasta")
    ]
    if not per_alignment:
        # np.concatenate([]) raises; report the gap in the data and move on.
        print(f"  no alignments found for {to_ref.stem}; BED file not written")
        continue

    # Cast to integer: an alignment without gaps may contribute an empty float
    # array, which would otherwise turn every coordinate into e.g. "10.0".
    unique_insertions = np.unique(np.concatenate(per_alignment)).astype(np.int64)

    # Open the file only once the data is ready, and let `with` close it even
    # if a write fails.
    with open(to_bed, 'w') as stream:
        for coord in unique_insertions:
            stream.write(f"{to_ref.stem}\t{coord}\t{coord+1}\n")

DQ008354.1
NC_001348.1


In [89]:
# Load the insertion BED written for the NC_001348.1 reference, sort it with
# BedTool.sort() and print the resulting intervals.
to_bed = BED / "insertion_regions-NC_001348.1.bed"
insertion_regions = pybedtools.BedTool(to_bed).sort()
print(insertion_regions)

NC_001348.1	10	11
NC_001348.1	25	26
NC_001348.1	32	33
NC_001348.1	52	53
NC_001348.1	96	97
NC_001348.1	108	109
NC_001348.1	1869	1870
NC_001348.1	13953	13954
NC_001348.1	13994	13995
NC_001348.1	14018	14019
NC_001348.1	14055	14056
NC_001348.1	14122	14123
NC_001348.1	14126	14127
NC_001348.1	14129	14130
NC_001348.1	14134	14135
NC_001348.1	14137	14138
NC_001348.1	14197	14198
NC_001348.1	14238	14239
NC_001348.1	20728	20729
NC_001348.1	20789	20790
NC_001348.1	20795	20796
NC_001348.1	20800	20801
NC_001348.1	20865	20866
NC_001348.1	20879	20880
NC_001348.1	20912	20913
NC_001348.1	20921	20922
NC_001348.1	20926	20927
NC_001348.1	20932	20933
NC_001348.1	20938	20939
NC_001348.1	40488	40489
NC_001348.1	41443	41444
NC_001348.1	41486	41487
NC_001348.1	41495	41496
NC_001348.1	41499	41500
NC_001348.1	41504	41505
NC_001348.1	42401	42402
NC_001348.1	59756	59757
NC_001348.1	60266	60267
NC_001348.1	60269	60270
NC_001348.1	71374	71375
NC_001348.1	74854	74855
NC_001348.1	78134	78135
NC_001348.1	78135	78136
NC_0