Extract Ebola reads and introduce random mutations. How many mutations can we handle? Are we more robust to mutations than Kraken?

In [5]:
# Directory for the mutated FASTA files: used as the mutation-simulator output
# location (-o) and read back when the number of changed bases is verified.
out = "/home/laura/projects/virus-watch-data/benchmarking/ebola_benchmark/mutated_files"

### Simulate SNPs
https://pypi.org/project/Mutation-Simulator/ (Mutation-Simulator only works with Python >=3.10; activate environment with 'conda activate mutation_simulator')

Glossary:  
In substitution mutations, transitions are defined as the interchange of the purine-based A↔G or pyrimidine-based C↔T. Transversions are defined as the interchange between two-ring purine nucleobases and one-ring pyrimidine bases.

In [6]:
# Target SNP rates to simulate: 0.02 up to and including 0.30 in steps of 0.01.
# For orientation, 0.05 corresponds to approx. 1 SNP every 20th bp.
# Integer / 100 is correctly rounded, so every entry is exactly the float that
# the literals 0.02, 0.03, ..., 0.3 denote (str(0.1) stays "0.1", not "0.10").
snp_rates = [percent / 100 for percent in range(2, 31)]

In [7]:
import numpy as np

In [8]:
# Run mutation-simulator once per (round, SNP rate) combination:
# 10 rounds x len(snp_rates) rates, each run writing its own output.
for mut_round in np.arange(10):
    for snp_rate in snp_rates:
        # Output prefix "<round>_ebov_snp_<digits after the decimal point>",
        # e.g. 0.02 -> "0_ebov_snp_02" and 0.1 -> "0_ebov_snp_1".
        # The verification step later reads "<prefix>_ms.fa" from `out`.
        new_out = f"{mut_round}_ebov_snp_{str(snp_rate).split('.')[-1]}"

        # IPython shell escape: $out, $new_out and $snp_rate are expanded from
        # the notebook's Python variables before the command runs.
        # NOTE(review): flag meanings (-q, -o, "args" mode, -sn = SNP rate) are
        # taken as given here -- confirm against the Mutation-Simulator docs.
        # Keep comments off the "!" line itself; it is handed to the shell.
        !mutation-simulator -q /home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698539_2_extracted_u10.fa -o $out/$new_out args -sn $snp_rate

Double-check number of variable bases:

In [9]:
from Bio import SeqIO

# Compare every mutated FASTA against the unmutated reference reads and record
# the observed mutation rate (in percent) for each (round, SNP rate) pair.
# `mutation_rates` is filled in the same order the files were generated:
# outer loop over rounds, inner loop over `snp_rates`.

# The reference is identical for every round and SNP rate, so parse it once
# instead of once per iteration.
ref_path = "/home/laura/projects/virus-watch-data/PRJNA665227/raw/SRR12698539_2_extracted_u10.fa"
ref = {record.id: record.seq for record in SeqIO.parse(ref_path, "fasta")}

# Total number of reference bases; the denominator of every observed rate.
length = sum(len(rseq) for rseq in ref.values())

mutation_rates = []
for mut_round in np.arange(10):
    for snp_rate in snp_rates:
        que_path = f"{out}/{mut_round}_ebov_snp_{str(snp_rate).split('.')[-1]}_ms.fa"
        que = {record.id: record.seq for record in SeqIO.parse(que_path, "fasta")}

        # Number of differing positions per read, in reference order.
        diff_per_seq = []
        for record, rseq in ref.items():
            qseq = que[record]
            # The position-wise comparison is only meaningful when both reads
            # have the same length; fail loudly instead of raising a bare
            # IndexError or silently under-counting.
            if len(qseq) != len(rseq):
                raise ValueError(
                    f"Length mismatch for {record} in {que_path}: "
                    f"reference {len(rseq)} bp, mutated {len(qseq)} bp"
                )
            diff_per_seq.append(
                sum(rbase != qbase for rbase, qbase in zip(rseq, qseq))
            )

        diff = sum(diff_per_seq)

        # Save actual mutation rate
        actual_mut_rate = round(diff * 100 / length, 2)
        mutation_rates.append(actual_mut_rate)

        # round() keeps float noise such as 7.000000000000001 out of the report.
        print(f"Target mutation rate: {round(snp_rate * 100, 2)} %\nTotel length of all sequences: {length}; Total number of differing nucleotides: {diff} ({actual_mut_rate} %)\nDifferences per sequence: {diff_per_seq}\n")

Target mutation rate: 2.0 %
Totel length of all sequences: 59488; Total number of differing nucleotides: 676 (1.14 %)
Differences per sequence: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 