<a href="https://colab.research.google.com/github/pachterlab/LSCHWCP_2023/blob/main/Notebooks/Figure_2/Figure_2c/1_create_ebov_mutations.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Introduce random mutation to reads that aligned to the ZEBOV RdRP

In [1]:
import numpy as np

In [2]:
# Get fasta file with reads that aligned to the ZEBOV RdRP
!wget https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Figure_2/Figure_2c/SRR12698539_2_extracted_u10.fa

--2023-12-07 21:33:31--  https://raw.githubusercontent.com/pachterlab/LSCHWCP_2023/main/Notebooks/Figure_2/Figure_2c/SRR12698539_2_extracted_u10.fa
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.109.133, 185.199.110.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 75549 (74K) [text/plain]
Saving to: ‘SRR12698539_2_extracted_u10.fa’


2023-12-07 21:33:31 (11.5 MB/s) - ‘SRR12698539_2_extracted_u10.fa’ saved [75549/75549]



In [3]:
# Define folder to save mutated sequences in
out = "mutated_files"
!mkdir $out

### Simulate SNPs
https://pypi.org/project/Mutation-Simulator/ (Mutation-Simulator only works with Python >=3.10)

Glossary:  
In substitution mutations, transitions are defined as the interchange of the purine-based A↔G or pryimidine-based C↔T. Transversions are defined as the interchange between two-ring purine nucleobases and one-ring pyrimidine bases.

In [4]:
!pip install Mutation-Simulator

Collecting Mutation-Simulator
  Downloading mutation_simulator-3.0.1-py3-none-any.whl (43 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/44.0 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m44.0/44.0 kB[0m [31m1.5 MB/s[0m eta [36m0:00:00[0m
Collecting pyfaidx (from Mutation-Simulator)
  Downloading pyfaidx-0.7.2.2-py3-none-any.whl (28 kB)
Installing collected packages: pyfaidx, Mutation-Simulator
Successfully installed Mutation-Simulator-3.0.1 pyfaidx-0.7.2.2


In [5]:
snp_rates = [
    0.02,
    0.03,
    0.04,
    0.05,     # Approx 1 SNP every 20th bp
    0.06,
    0.07,
    0.08,
    0.09,
    0.1,
    0.11,
    0.12,
    0.13,
    0.14,
    0.15,
    0.16,
    0.17,
    0.18,
    0.19,
    0.2,
    0.21,
    0.22,
    0.23,
    0.24,
    0.25,
    0.26,
    0.27,
    0.28,
    0.29,
    0.3
    ]

In [6]:
for mut_round in np.arange(10):
    for snp_rate in snp_rates:
        new_out = f"{mut_round}_ebov_snp_{str(snp_rate).split('.')[-1]}"

        !mutation-simulator -q SRR12698539_2_extracted_u10.fa -o $out/$new_out args -sn $snp_rate

Double-check number of 'mutated' bases:

In [8]:
!pip install biopython

Collecting biopython
  Downloading biopython-1.81-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.1 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.1/3.1 MB[0m [31m30.2 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: biopython
Successfully installed biopython-1.81


In [9]:
from Bio import SeqIO

mutation_rates = []
for mut_round in np.arange(10):
    for snp_rate in snp_rates:
        ref = f"SRR12698539_2_extracted_u10.fa"
        ref = {record.id: record.seq for record in SeqIO.parse(ref, 'fasta')}

        que = f"{out}/{mut_round}_ebov_snp_{str(snp_rate).split('.')[-1]}_ms.fa"
        que = {record.id: record.seq for record in SeqIO.parse(que, 'fasta')}

        diff = length = 0
        diff_per_seq = []

        for record, rseq in ref.items():
            seq_diff = 0
            for position, rbase in enumerate(rseq):
                if rbase != que[record][position]:
                    diff += 1
                    seq_diff += 1
                length += 1

            diff_per_seq.append(seq_diff)


        # Save actual mutation rate
        actual_mut_rate = round(diff * 100 / length, 2)
        mutation_rates.append(actual_mut_rate)

        print (f"Target mutation rate: {snp_rate*100} %\nTotel length of all sequences: {length}; Total number of differing nucleotides: {diff} ({actual_mut_rate} %)\nDifferences per sequence: {diff_per_seq}\n")

Target mutation rate: 2.0 %
Totel length of all sequences: 59488; Total number of differing nucleotides: 676 (1.14 %)
Differences per sequence: [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 