In [1]:
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import seaborn as sns
from pymutspec.annotation import CodonAnnotation
from pymutspec.constants import possible_sbs192, possible_codons 

# Configure seaborn's plot style; no style name is passed in this call.
# NOTE(review): no plotting code is visible in this part of the notebook - confirm the call is still needed.
sns.set_style()

# Explore how many synonymous mutations can be observed in each trinucleotide context under a given genetic code

## Gencode 2: vert mtDNA

In [2]:
# CodonAnnotation is the project's helper class: it collects expected substitution
# counts from sequences (among other things); here it is used to translate codons.
# Genetic code 2 is the vertebrate mtDNA code analysed in this section.
VERT_MTDNA_GENCODE = 2
coda = CodonAnnotation(VERT_MTDNA_GENCODE)

In [3]:
# there are only 64 possible trinucleotides (codons)
len(possible_codons)

64

In [5]:
# collect all possible single-nucleotide mutations of every codon, at any codon position
def collect_codon_sbs(coda, codons):
    """Enumerate every single-nucleotide substitution of the given codons.

    Each codon position (0, 1, 2) is replaced in turn by each of the other
    three nucleotides; the reference and the mutated codon are both translated
    with ``coda.translate_codon``.

    Parameters
    ----------
    coda : object
        Anything exposing ``translate_codon(codon)``; it defines the genetic code.
    codons : iterable of str
        Three-letter codons (upper-case ``ACGT``) to mutate.

    Returns
    -------
    pd.DataFrame
        One row per substitution, ordered by codon, then alternative
        nucleotide, then position, with columns:

        - ``cdn`` / ``cdn_alt``: reference and mutated codon
        - ``aa`` / ``aa_alt``: their translations
        - ``pic``: 0-based position of the mutation in the codon
        - ``sbs``: substitution with its trinucleotide context; ``N`` marks the
          flanking nucleotide that belongs to the neighbouring codon
        - ``is_syn``: True when ``aa == aa_alt``
    """
    rows = []
    for cdn in codons:
        aa = coda.translate_codon(cdn)

        # nucleotide in the outer loop, position in the inner one: this fixes the row order
        for nucl in 'ACGT':
            for pic in range(3):
                # not a mutation if the nucleotide is already there
                if cdn[pic] == nucl:
                    continue

                cdn_alt = cdn[:pic] + nucl + cdn[pic + 1:]
                # the alternative amino acid may equal `aa` (synonymous substitution)
                aa_alt = coda.translate_codon(cdn_alt)

                # the context is fully known only for the middle position;
                # for the 1st/3rd position one flank lies in the neighbouring codon
                if pic == 0:
                    sbs = f'N[{cdn[0]}>{nucl}]{cdn[1]}'
                elif pic == 1:
                    sbs = f'{cdn[0]}[{cdn[1]}>{nucl}]{cdn[2]}'
                else:
                    sbs = f'{cdn[1]}[{cdn[2]}>{nucl}]N'

                rows.append([cdn, cdn_alt, aa, aa_alt, pic, sbs])

    sbs_table = pd.DataFrame(rows, columns=['cdn', 'cdn_alt', 'aa', 'aa_alt', 'pic', 'sbs'])
    sbs_table['is_syn'] = sbs_table['aa'] == sbs_table['aa_alt']
    return sbs_table


# table of all mutations under the genetic code currently held in `coda`
df_all_sbs = collect_codon_sbs(coda, possible_codons)
df_all_sbs

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
0,AAA,CAA,K,Q,0,N[A>C]A,False
1,AAA,ACA,K,T,1,A[A>C]A,False
2,AAA,AAC,K,N,2,A[A>C]N,False
3,AAA,GAA,K,E,0,N[A>G]A,False
4,AAA,AGA,K,*,1,A[A>G]A,False
...,...,...,...,...,...,...,...
571,TTT,TCT,F,S,1,T[T>C]T,False
572,TTT,TTC,F,F,2,T[T>C]N,True
573,TTT,GTT,F,V,0,N[T>G]T,False
574,TTT,TGT,F,C,1,T[T>G]T,False


In total, we can observe 576 distinct codon mutations (64 codons × 3 positions × 3 alternative nucleotides) under gencode 2, each labelled with its trinucleotide context.

In [6]:
# of these, only 132 are synonymous, i.e. the translation is unchanged
# (stop -> stop changes such as TAA -> TAG are counted too)
syn_mask = df_all_sbs['is_syn']
df_all_sbs.loc[syn_mask]

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
5,AAA,AAG,K,K,2,A[A>G]N,True
17,AAC,AAT,N,N,2,A[C>T]N,True
18,AAG,AAA,K,K,2,A[G>A]N,True
30,AAT,AAC,N,N,2,A[T>C]N,True
38,ACA,ACC,T,T,2,C[A>C]N,True
...,...,...,...,...,...,...,...
547,TTA,TTG,L,L,2,T[A>G]N,True
557,TTC,TTT,F,F,2,T[C>T]N,True
560,TTG,TTA,L,L,2,T[G>A]N,True
561,TTG,CTG,L,L,0,N[T>C]T,True


In [10]:
# count, for every substitution type (with context), how many synonymous codon changes produce it
syn_sbs = df_all_sbs.loc[df_all_sbs['is_syn'], 'sbs']
cxt2nmut = syn_sbs.value_counts()
cxt2nmut

A[A>G]N    4
C[T>A]N    4
T[T>C]N    4
T[G>A]N    4
A[C>T]N    4
T[A>G]N    4
G[T>C]N    4
G[G>A]N    4
G[C>T]N    4
G[A>G]N    4
C[T>G]N    4
C[T>C]N    4
T[C>T]N    4
C[G>T]N    4
C[A>T]N    4
C[G>C]N    4
A[G>A]N    4
C[A>C]N    4
C[A>G]N    4
A[T>C]N    4
C[C>A]N    4
C[C>G]N    4
C[C>T]N    4
C[G>A]N    4
N[C>T]T    2
T[T>G]N    2
T[T>A]N    2
T[G>T]N    2
T[G>C]N    2
T[C>G]N    2
T[C>A]N    2
T[A>T]N    2
G[C>G]N    2
T[A>C]N    2
G[T>G]N    2
G[T>A]N    2
G[G>T]N    2
G[G>C]N    2
G[C>A]N    2
G[A>T]N    2
G[A>C]N    2
N[T>C]T    2
Name: sbs, dtype: int64

In [11]:
# for example, the synonymous A[A>G]N mutation can be observed in 4 different codons:
# three of them encode amino acids (K, Q, E) and one is a stop codon (TAA -> TAG)
syn_mask = df_all_sbs['is_syn']
type_mask = df_all_sbs['sbs'] == 'A[A>G]N'
df_all_sbs.loc[syn_mask & type_mask]

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
5,AAA,AAG,K,K,2,A[A>G]N,True
149,CAA,CAG,Q,Q,2,A[A>G]N,True
293,GAA,GAG,E,E,2,A[A>G]N,True
438,TAA,TAG,*,*,2,A[A>G]N,True


In [13]:
# whereas the synonymous G[A>C]N mutation can be observed in only 2 different codons,
# which encode 2 amino acids (R and G)
syn_mask = df_all_sbs['is_syn']
type_mask = df_all_sbs['sbs'] == 'G[A>C]N'
df_all_sbs.loc[syn_mask & type_mask]

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
219,CGA,CGC,R,R,2,G[A>C]N,True
364,GGA,GGC,G,G,2,G[A>C]N,True


In [15]:
# Mutations at the 1st or 3rd codon position have one flanking nucleotide (N)
# that comes from the neighbouring codon and can therefore be any of A, C, G, T.
# Expanding N gives the full set of substitution types (with trinucleotide
# context) that can ever show up in a synonymous spectrum.

syn_types = df_all_sbs.loc[df_all_sbs['is_syn'], 'sbs'].unique()
# types without N (middle-position changes) are added unchanged, once
sset = {sbs_type.replace('N', flank) for sbs_type in syn_types for flank in 'ACGT'}

print(len(sset), 'out of 192 possible mutation types (with contexts)\ncan be observed and '
      'have non-zero values in synonymous spectra')

160 out of 192 possible mutation types (with contexts)
can be observed and have non-zero values in synonymous spectra


In [19]:
# list the substitution types that can never be observed in a synonymous spectrum
missing_types = sorted(set(possible_sbs192) - sset)
unseen_sbs = pd.DataFrame({'SBS': missing_types})
# trinucleotide context = 5' flank + reference nucleotide + 3' flank, e.g. 'A[A>C]G' -> 'AAG'
sbs_chars = unseen_sbs['SBS'].str
unseen_sbs['Context'] = sbs_chars.get(0) + sbs_chars.get(2) + sbs_chars.get(-1)
# number the rows from 1 for display
unseen_sbs.index = unseen_sbs.index + 1
unseen_sbs

Unnamed: 0,SBS,Context
1,A[A>C]A,AAA
2,A[A>C]C,AAC
3,A[A>C]G,AAG
4,A[A>C]T,AAT
5,A[A>T]A,AAA
6,A[A>T]C,AAC
7,A[A>T]G,AAG
8,A[A>T]T,AAT
9,A[C>A]A,ACA
10,A[C>A]C,ACC


On the other hand, gencode 1 has a different structure:
## Gencode 1

In [20]:
# Re-create the project's CodonAnnotation helper (it collects expected substitution
# counts from sequences, among other things) - this time for genetic code 1.
# From here on `coda` translates codons with gencode 1 instead of gencode 2.
GENCODE_1 = 1
coda = CodonAnnotation(GENCODE_1)

In [21]:
# still only 64 possible trinucleotides (codons)
len(possible_codons)

64

In [23]:
# Enumerate every single-nucleotide mutation of every codon, at any codon position,
# translating with the genetic code currently held in `coda`.
# NOTE(review): the gencode-2 section runs this same enumeration; only `coda` differs,
# so the two cells are candidates for one shared helper.
data = []
for codon in possible_codons:
    ref_aa = coda.translate_codon(codon)

    # alternative nucleotide in the outer loop, codon position in the inner one
    for alt_nuc in 'ACGT':
        for pos in range(3):
            if codon[pos] != alt_nuc:  # skip "mutations" that change nothing
                alt_codon = codon[:pos] + alt_nuc + codon[pos + 1:]
                # may equal ref_aa - that is a synonymous substitution
                alt_aa = coda.translate_codon(alt_codon)

                # substitution label for each possible position; N stands for the
                # flanking nucleotide that belongs to the neighbouring codon
                sbs_by_pos = (
                    f'N[{codon[0]}>{alt_nuc}]{codon[1]}',
                    f'{codon[0]}[{codon[1]}>{alt_nuc}]{codon[-1]}',
                    f'{codon[1]}[{codon[2]}>{alt_nuc}]N',
                )
                data.append([codon, alt_codon, ref_aa, alt_aa, pos, sbs_by_pos[pos]])

# one row per mutation, flagged as synonymous when the translation does not change
df_all_sbs = pd.DataFrame(
    data,
    columns=['cdn', 'cdn_alt', 'aa', 'aa_alt', 'pic', 'sbs'],
).assign(is_syn=lambda tbl: tbl['aa'] == tbl['aa_alt'])
df_all_sbs

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
0,AAA,CAA,K,Q,0,N[A>C]A,False
1,AAA,ACA,K,T,1,A[A>C]A,False
2,AAA,AAC,K,N,2,A[A>C]N,False
3,AAA,GAA,K,E,0,N[A>G]A,False
4,AAA,AGA,K,R,1,A[A>G]A,False
...,...,...,...,...,...,...,...
571,TTT,TCT,F,S,1,T[T>C]T,False
572,TTT,TTC,F,F,2,T[T>C]N,True
573,TTT,GTT,F,V,0,N[T>G]T,False
574,TTT,TGT,F,C,1,T[T>G]T,False


In total, we can again observe 576 distinct codon mutations (64 codons × 3 positions × 3 alternative nucleotides), this time under gencode 1, each labelled with its trinucleotide context.

In [24]:
# of these, only 138 are synonymous under gencode 1, i.e. the translation is unchanged
# (stop -> stop changes such as TAA -> TAG are counted too)
syn_mask = df_all_sbs['is_syn']
df_all_sbs.loc[syn_mask]

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
5,AAA,AAG,K,K,2,A[A>G]N,True
17,AAC,AAT,N,N,2,A[C>T]N,True
18,AAG,AAA,K,K,2,A[G>A]N,True
30,AAT,AAC,N,N,2,A[T>C]N,True
38,ACA,ACC,T,T,2,C[A>C]N,True
...,...,...,...,...,...,...,...
547,TTA,TTG,L,L,2,T[A>G]N,True
557,TTC,TTT,F,F,2,T[C>T]N,True
560,TTG,TTA,L,L,2,T[G>A]N,True
561,TTG,CTG,L,L,0,N[T>C]T,True


In [25]:
# count, for every substitution type (with context), how many synonymous codon changes produce it
syn_sbs = df_all_sbs.loc[df_all_sbs['is_syn'], 'sbs']
cxt2nmut = syn_sbs.value_counts()
cxt2nmut

A[A>G]N    4
C[G>A]N    4
G[T>C]N    4
T[T>C]N    4
G[C>T]N    4
C[T>G]N    4
C[T>C]N    4
C[T>A]N    4
C[G>T]N    4
A[C>T]N    4
C[G>C]N    4
C[C>T]N    4
C[C>G]N    4
C[C>A]N    4
C[A>T]N    4
C[A>G]N    4
C[A>C]N    4
A[T>C]N    4
A[G>A]N    4
T[C>T]N    4
T[G>A]N    3
T[A>G]N    3
T[T>A]N    3
T[C>A]N    3
T[A>T]N    3
T[A>C]N    3
G[G>A]N    3
G[A>G]N    3
T[T>G]N    2
T[G>T]N    2
T[G>C]N    2
N[A>C]G    2
T[C>G]N    2
N[C>T]T    2
G[T>G]N    2
G[T>A]N    2
G[G>T]N    2
G[G>C]N    2
G[C>G]N    2
G[C>A]N    2
G[A>T]N    2
G[A>C]N    2
N[C>A]G    2
N[T>C]T    2
T[A>G]A    1
T[G>A]A    1
Name: sbs, dtype: int64

In [26]:
# for example, the synonymous A[A>G]N mutation can be observed in 4 different codons:
# three of them encode amino acids (K, Q, E) and one is a stop codon (TAA -> TAG)
syn_mask = df_all_sbs['is_syn']
type_mask = df_all_sbs['sbs'] == 'A[A>G]N'
df_all_sbs.loc[syn_mask & type_mask]

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
5,AAA,AAG,K,K,2,A[A>G]N,True
149,CAA,CAG,Q,Q,2,A[A>G]N,True
293,GAA,GAG,E,E,2,A[A>G]N,True
438,TAA,TAG,*,*,2,A[A>G]N,True


In [27]:
# whereas the synonymous G[A>C]N mutation can be observed in only 2 different codons,
# which encode 2 amino acids (R and G)
syn_mask = df_all_sbs['is_syn']
type_mask = df_all_sbs['sbs'] == 'G[A>C]N'
df_all_sbs.loc[syn_mask & type_mask]

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
219,CGA,CGC,R,R,2,G[A>C]N,True
364,GGA,GGC,G,G,2,G[A>C]N,True


In [28]:
# Mutations at the 1st or 3rd codon position have one flanking nucleotide (N)
# that comes from the neighbouring codon and can therefore be any of A, C, G, T.
# Expanding N gives the full set of substitution types (with trinucleotide
# context) that can ever show up in a synonymous spectrum.

syn_types = df_all_sbs.loc[df_all_sbs['is_syn'], 'sbs'].unique()
# types without N (middle-position changes) are added unchanged, once
sset = {sbs_type.replace('N', flank) for sbs_type in syn_types for flank in 'ACGT'}

print(len(sset), 'out of 192 possible mutation types (with contexts)\ncan be observed and '
      'have non-zero values in synonymous spectra')

162 out of 192 possible mutation types (with contexts)
can be observed and have non-zero values in synonymous spectra


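Look! Gencode 1 allows 2 more substitution types than gencode 2 (162 vs. 160). The extra ones are A[A>C]G and A[C>A]G: they come from the first-position changes AGA/AGG ↔ CGA/CGG, which are synonymous (R ↔ R) in gencode 1, whereas AGA and AGG are stop codons in gencode 2.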