In [107]:
from Bio.codonalign.codonseq import CodonSeq
from Bio.codonalign.codonseq import cal_dn_ds
import random
from comp_dnds import dnds
import time
import pandas as pd
from plotnine import *
import numpy as np

In [110]:
# Display the installed Biopython version so the benchmark numbers below can be
# tied to a specific release.
from Bio import __version__ as biopython_version
biopython_version

'1.81'

In [108]:
def biopython_dnds(ref_seq, obs_seq):
    """Run Biopython's ``cal_dn_ds`` on a pair of nucleotide sequences.

    Each sequence is wrapped in a ``CodonSeq`` and the pair is compared with
    ``method='NG86'``.

    Args:
        ref_seq: Reference nucleotide sequence.
        obs_seq: Observed nucleotide sequence to compare with the reference.

    Returns:
        The result of ``cal_dn_ds``; elsewhere in this notebook it is
        unpacked as a ``(dN, dS)`` pair.
    """
    return cal_dn_ds(CodonSeq(ref_seq), CodonSeq(obs_seq), method='NG86')

In [29]:
# One shared comp_dnds calculator, constructed once and reused by every call
# to run_comp_dnds so that construction is not part of the per-call cost.
d = dnds()


def run_comp_dnds(ref_seq, obs_seq):
    """Run comp_dnds on a pair of nucleotide sequences.

    Args:
        ref_seq: Reference nucleotide sequence.
        obs_seq: Observed nucleotide sequence to compare with the reference.

    Returns:
        The result of ``d.compute``; elsewhere in this notebook it is
        unpacked as a ``(dN, dS)`` pair.
    """
    result = d.compute(ref_seq, obs_seq)
    return result

In [6]:
def seq_gen(length):
    """Return a random nucleotide string of ``length`` characters.

    Every position is drawn independently with ``random.choice`` from the
    alphabet ``'ATGC'``. A ``length`` of 0 yields an empty string.
    """
    bases = []
    for _ in range(length):
        bases.append(random.choice('ATGC'))
    return ''.join(bases)

In [11]:
def mutate_sequence(seq, prob_mutation):
    """Return a copy of ``seq`` in which positions are randomly re-drawn.

    For each position a uniform random number is compared with
    ``prob_mutation``; when it is smaller, the base is replaced by a fresh
    ``random.choice('ATGC')``, otherwise the original character is kept.

    Note that the replacement is drawn from all four bases, so it can equal
    the original one; the fraction of positions that actually change can
    therefore be lower than ``prob_mutation``.

    Args:
        seq: Sequence to mutate.
        prob_mutation: Per-position probability of re-drawing the base.

    Returns:
        A string with the same length as ``seq``.
    """
    mutated = []
    for base in seq:
        if random.random() < prob_mutation:
            mutated.append(random.choice('ATGC'))
        else:
            mutated.append(base)
    return ''.join(mutated)

In [28]:
# Benchmark data set. The stdlib RNG is seeded first so that a fresh
# "Restart Kernel -> Run All" regenerates exactly the same sequences; seq_gen
# and mutate_sequence both draw from the global `random` generator.
SEED = 42
N_SEQUENCES = 1000        # number of (reference, observed) pairs
SEQ_LENGTH = 999          # nucleotides per sequence = 333 whole codons
MUTATION_PROB = 0.01      # per-position probability passed to mutate_sequence

random.seed(SEED)
# Random reference sequences, and one mutated ("observed") copy of each.
reference = [seq_gen(SEQ_LENGTH) for _ in range(N_SEQUENCES)]
observed = [mutate_sequence(seq, MUTATION_PROB) for seq in reference]

Execution time benchmark

In [31]:
# Time one call of each implementation on every (reference, observed) pair.
# Durations are in seconds and stored in pair order, so index i of both lists
# refers to the same sequence pair.
#
# time.perf_counter() is used rather than time.time(): it is the clock meant
# for measuring short intervals (monotonic, highest available resolution),
# whereas time.time() is wall-clock time, which can have coarse resolution and
# can jump if the system clock is adjusted.
biopython_durations = []
comp_dnds_durations = []
for r, o in zip(reference, observed):
    # biopython cal_dn_ds
    start = time.perf_counter()
    biopython_dnds(r, o)
    biopython_elapsed = time.perf_counter() - start
    # comp_dnds
    start = time.perf_counter()
    run_comp_dnds(r, o)
    comp_dnds_elapsed = time.perf_counter() - start
    # Record both timings only after both measurements are finished, so the
    # list appends never fall inside a timed region.
    biopython_durations.append(biopython_elapsed)
    comp_dnds_durations.append(comp_dnds_elapsed)

In [42]:
# Wide table: one row per sequence pair, one column of timings (seconds) per
# implementation.
df = pd.DataFrame(
    {
        'biopython': biopython_durations,
        'comp_dnds': comp_dnds_durations,
    }
)
# Long form: one row per (method, duration) measurement, convenient for
# grouping by method.
df_melt = df.melt(var_name='method', value_name='duration')


In [86]:
# Summary statistics of the durations for each method; reset_index() turns
# the 'method' group key back into an ordinary column.
df_stats = (
    df_melt
    .groupby('method')
    .describe()
    .reset_index()
)

In [87]:
# describe() on the grouped frame produces two-level column labels
# (value column, statistic); keep only the statistic level.
# The nlevels guard makes this cell safe to re-run: once the columns are
# already flat, dropping a level again would raise instead of being a no-op.
if df_stats.columns.nlevels > 1:
    df_stats.columns = df_stats.columns.droplevel(0)

In [88]:
# Show the per-method summary table. Judging by the recorded output, the
# group-key column has no label at this point.
df_stats

Unnamed: 0,Unnamed: 1,count,mean,std,min,25%,50%,75%,max
0,biopython,1000.0,0.020846,0.004049,0.019148,0.019862,0.020147,0.020553,0.115118
1,comp_dnds,1000.0,0.000652,0.000119,0.000589,0.000605,0.000613,0.000662,0.002213


In [89]:
# Label the first (group-key) column 'method' and leave the remaining
# statistic labels exactly as they are.
stat_labels = df_stats.columns[1:].to_list()
df_stats.columns = ['method', *stat_labels]

In [90]:
# Show the summary table again, now that its first column is labeled 'method'.
df_stats

Unnamed: 0,method,count,mean,std,min,25%,50%,75%,max
0,biopython,1000.0,0.020846,0.004049,0.019148,0.019862,0.020147,0.020553,0.115118
1,comp_dnds,1000.0,0.000652,0.000119,0.000589,0.000605,0.000613,0.000662,0.002213


In [112]:
# Bar chart of the mean per-call duration of each method, with error bars
# spanning one standard deviation on either side of the mean.
g = ggplot(df_stats, aes(x='method', y='mean', fill='method'))
# stat='identity' plots the precomputed 'mean' column as the bar height.
g = g + geom_bar(stat='identity')
g = g + geom_errorbar(aes(ymin="mean-std", ymax="mean+std"))
g = g + theme_classic()
g = g + labs(y='mean duration (s)', x='method')
# Write the figure to disk (relative to the notebook's working directory).
g.save("../plots/biopython_benchmark.png", dpi=72)



Calculating speedup

In [97]:
# Per-pair speedup factor (Biopython duration divided by comp_dnds duration),
# summarised over all sequence pairs.
speedup = df['biopython'] / df['comp_dnds']
speedup.describe()

count    1000.000000
mean       32.269249
std         4.285556
min         9.605567
25%        30.835351
50%        32.746612
75%        33.419786
max        75.994136
dtype: float64

Checking consistency of the results with Biopython's `cal_dn_ds`

In [121]:
# For every sequence pair, compute dN and dS with both implementations and
# record whether BOTH values match within an absolute tolerance.
# (A leftover, unused `start = time.time()` from the timing loop was removed.)
TOLERANCE = 1e-3  # maximum allowed absolute difference for dN and for dS
agree = []
for r, o in zip(reference, observed):
    # biopython cal_dn_ds
    biodn, biods = biopython_dnds(r, o)
    # comp_dnds
    comp_dn, comp_ds = run_comp_dnds(r, o)
    agree.append(abs(biodn - comp_dn) < TOLERANCE and abs(biods - comp_ds) < TOLERANCE)
    

In [123]:
# Fraction of sequence pairs for which both implementations agreed.
n_agreeing = agree.count(True)
n_agreeing / len(agree)

0.998

The calculated dN and dS values agree with those calculated by Biopython: for 99.8% of the sequence pairs, both dN and dS differ by less than 10⁻³ (`1e-3`) in absolute value.