In [2]:
from pathlib import Path
import pandas as pd
import numpy as np
from scipy.stats import mode
from Bio.Seq import Seq
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord

In [3]:
# Filesystem layout. DATA is relative, so it resolves against the working directory.
ROOT = Path.cwd()
DATA = Path("../data")
CLADE_FASTA = DATA.joinpath("multifasta_for_clades")
# Directory the alignment FASTA files are read from.
ALIGNMENTS = DATA.joinpath("alignments")

In [4]:
# Read every record of the clade 1 alignment into a list and show how many there are.
to_fasta = ALIGNMENTS.joinpath("clade_1_merge_align.fasta")
seq_records = list(SeqIO.parse(to_fasta, "fasta"))
len(seq_records)

20

In [13]:
def alignment_to_np(to_align):
    """Read a FASTA alignment into a 2-D ``int8`` array of character codes.

    Parameters
    ----------
    to_align
        Path (or handle) of the FASTA alignment, passed straight to
        ``SeqIO.parse``.

    Returns
    -------
    numpy.ndarray
        One row per record, in file order. Each element is the byte value of
        the character at that alignment position (a gap ``-`` is 45).

    Raises
    ------
    ValueError
        Propagated from ``np.stack`` when the file yields no records or the
        records do not all have the same length.
    """
    encoded_rows = []
    for record in SeqIO.parse(to_align, "fasta"):
        raw_bytes = record.seq.encode("utf-8")
        # View the bytes as int8 codes; np.stack below produces the final array.
        encoded_rows.append(np.frombuffer(raw_bytes, dtype=np.int8))
    return np.stack(encoded_rows)

In [25]:
def delete_insertions(align_np):
    """Drop every alignment column in which the first row holds a gap.

    Parameters
    ----------
    align_np : numpy.ndarray
        2-D array of character codes, one row per sequence.

    Returns
    -------
    numpy.ndarray
        A new array with the same rows, keeping only the columns where row 0
        is not the gap character ``-``.
    """
    gap_code = np.frombuffer(b"-", dtype=np.int8)
    first_row_is_gap = np.isin(align_np[0], gap_code)
    columns_to_keep = np.logical_not(first_row_is_gap)
    return align_np[:, columns_to_keep]

In [26]:
def to_consensus(align_np):
    """Build a column-wise majority consensus from an encoded alignment.

    Row 0 is left out of the vote; only rows 1..n contribute. Gap characters
    are counted like any other symbol, so a column whose most frequent
    character is ``-`` yields ``-`` in the consensus. Ties are resolved by
    ``scipy.stats.mode``.

    Parameters
    ----------
    align_np : numpy.ndarray
        2-D array of character codes, one row per sequence.

    Returns
    -------
    Bio.Seq.Seq
        The consensus, one character per alignment column.

    Raises
    ------
    ValueError
        If the alignment has fewer than two rows, because nothing is left to
        vote on once row 0 is skipped.
    """
    # Without this guard the mode would be taken over an empty slice and the
    # result could not be decoded into a meaningful sequence.
    if len(align_np) < 2:
        raise ValueError(
            "to_consensus needs at least two rows: row 0 is skipped and the "
            "remaining rows vote on each column"
        )
    consensus_np = mode(align_np[1:], axis=0, keepdims=False)
    # The modal codes are single bytes, so their raw buffer decodes directly to text.
    consensus = Seq(consensus_np.mode.tobytes().decode("utf-8"))
    return consensus

In [30]:
%%time
# Consensus pipeline for the clade 4 alignment: encode the FASTA as character
# codes, drop the columns where the first sequence has a gap, then take the
# per-column mode of the remaining rows (row 0 does not vote).
align_crie_np = alignment_to_np(ALIGNMENTS / "clade_4_merge_align.fasta")
align_crie_np_modif = delete_insertions(align_crie_np)
consensus = to_consensus(align_crie_np_modif)
consensus

CPU times: user 3.27 s, sys: 58.2 ms, total: 3.32 s
Wall time: 3.28 s


Seq('agcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...ag-')

In [17]:
# Inspect the encoded alignment currently bound to align_crie_np
# (rows are sequences, values are character codes).
align_crie_np

array([[103, 103,  99, ..., 103, 103, 103],
       [103, 103,  99, ..., 103, 103,  45],
       [ 97, 103,  99, ...,  97, 103,  99]], dtype=int8)

In [8]:
def alignment_to_consensus(to_align):
    """Read a FASTA alignment and return its column-wise modal sequence.

    The alignment is encoded with ``alignment_to_np``; row 0 is excluded and
    the most frequent character code of the remaining rows is taken for every
    column. No columns are removed before voting.

    Parameters
    ----------
    to_align
        Path (or handle) of the FASTA alignment.

    Returns
    -------
    Bio.Seq.Seq
        The consensus, one character per alignment column.
    """
    encoded = alignment_to_np(to_align)
    voting_rows = encoded[1:]
    modal_codes = mode(voting_rows, axis=0, keepdims=False).mode
    return Seq(modal_codes.tobytes().decode("utf-8"))

In [7]:
# Encode the clade 4 alignment as a matrix of character codes.
align_crie_np = alignment_to_np(ALIGNMENTS / "clade_4_merge_align.fasta")

In [279]:
%%time
# Inline equivalent of alignment_to_np for the clade 1 alignment: encode each
# record as int8 character codes, then stack the rows into one 2-D array.
align_crie_np = []
to_fasta = ALIGNMENTS / "clade_1_merge_align.fasta"
for seq_record in SeqIO.parse(to_fasta, "fasta"):
    align_crie_np.append(np.frombuffer(seq_record.seq.encode('utf-8'), dtype=np.int8))
# Rebinds the name from the list of rows to the stacked ndarray.
align_crie_np = np.stack(align_crie_np)

CPU times: user 30.6 ms, sys: 7.19 ms, total: 37.8 ms
Wall time: 35.7 ms


In [280]:
# Inspect the stacked array built in the previous cell.
align_crie_np

array([[103, 103,  99, ..., 103, 103, 103],
       [103, 103,  99, ..., 103, 103,  45],
       [103, 103,  99, ...,  45,  45,  45],
       ...,
       [ 97, 103, 103, ..., 103, 103,  45],
       [103, 103,  99, ..., 103, 103,  45],
       [103, 103,  99, ..., 103, 103,  45]], dtype=int8)

In [276]:
%%time
# Collect the raw Seq objects of the clade 1 alignment (no numeric encoding)
# and display them.
# NOTE(review): this rebinds align_crie_np to a plain Python list, so cells
# that rely on array operations (.shape, .max(), [:, mask]) need the encoded
# array rebuilt first.
align_crie_np = []
to_fasta = ALIGNMENTS / "clade_1_merge_align.fasta"
for seq_record in SeqIO.parse(to_fasta, "fasta"):
    align_crie_np.append(seq_record.seq)
align_crie_np

CPU times: user 28.9 ms, sys: 4.29 ms, total: 33.2 ms
Wall time: 31.3 ms


[Seq('ggcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...ggg'),
 Seq('ggcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('ggcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...---'),
 Seq('aggccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('ggcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('ggcccagccctctcgcggccccctcgagagag-aaaaaaaaagcgaccccacct...gg-'),
 Seq('aggccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('aggccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('aggccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('ggcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('ggcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('aggccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('-ggccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('ggcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-'),
 Seq('aggccagccctctc

In [251]:
# Dimensions of the encoded alignment: (number of sequences, alignment length).
align_crie_np.shape

(20, 127611)

In [247]:
# Largest character code present in the encoded alignment.
align_crie_np.max()

121

In [258]:
# Most frequent character code per column among rows 1..n (row 0 does not vote).
consensus_np = mode(align_crie_np[1:], axis=0, keepdims=False)

In [259]:
# The modal character codes, one per alignment column.
consensus_np.mode

array([103, 103,  99, ..., 103, 103,  45], dtype=int8)

In [270]:
# Turn the modal character codes back into text and wrap the result in a Seq.
consensus = Seq(consensus_np.mode.tobytes().decode("utf-8"))

In [271]:
# Display the consensus sequence.
consensus

Seq('ggcccagccctctcgcggccccctcgagagagaaaaaaaaaagcgaccccacct...gg-')

In [8]:
# Boolean mask over alignment columns: True where the first sequence has a gap ("-").
mask = np.isin(align_crie_np[0], np.frombuffer("-".encode('utf-8'), dtype=np.int8))
mask

array([False, False, False, ..., False, False, False])

In [24]:
# Keep only the columns where the first sequence is not a gap.
align_crie_np[:, ~mask]

array([[103, 103,  99, ..., 103, 103, 103],
       [103, 103,  99, ..., 103, 103,  45],
       [ 97, 103,  99, ...,  97, 103,  99]], dtype=int8)

In [23]:
# Character code of the gap symbol "-" as a one-element int8 array
# (the same expression is used to build the gap mask).
np.frombuffer("-".encode('utf-8'), dtype=np.int8)

array([45], dtype=int8)