In [1]:
# Suppress every warning for the whole session; the filter is installed
# before the remaining imports run, so warnings raised by them are hidden too.
import warnings
warnings.filterwarnings("ignore")

import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from pymutspec.annotation import CodonAnnotation
from pymutspec.constants import possible_sbs192, possible_codons 

# NOTE(review): called without a style name — presumably this leaves the
# current seaborn style unchanged; confirm whether a specific style
# (or sns.set_theme()) was intended.
sns.set_style()

## Genetic code 2: vertebrate mitochondrial

In [2]:
# Codon annotator used for all translations below.
# The argument 2 is presumably the genetic code table number announced in the
# section header (vertebrate mitochondrial) — confirm against pymutspec docs.
coda = CodonAnnotation(2)

In [3]:
# Sanity check: how many codons will be enumerated in the next cell.
len(possible_codons)

64

In [4]:
# Enumerate every single-nucleotide substitution of every codon.
# Each record holds: reference codon, mutated codon, both translations,
# the 0-based position of the change inside the codon, and the substitution
# written with its in-codon flanking bases. A flank that falls outside the
# codon is unknown at this point and is written as 'N'.
data = []
for cdn in possible_codons:
    aa = coda.translate_codon(cdn)
    for nucl in 'ACGT':
        for pic in range(3):
            ref = cdn[pic]
            if ref == nucl:
                # The position already carries this base: not a substitution.
                continue
            cdn_alt = cdn[:pic] + nucl + cdn[pic + 1:]
            aa_alt = coda.translate_codon(cdn_alt)
            upstream = cdn[pic - 1] if pic > 0 else 'N'
            downstream = cdn[pic + 1] if pic < 2 else 'N'
            sbs = f'{upstream}[{ref}>{nucl}]{downstream}'
            data.append([cdn, cdn_alt, aa, aa_alt, pic, sbs])

# A substitution is synonymous when both codons translate to the same symbol.
df_all_sbs = pd.DataFrame(
    data,
    columns=['cdn', 'cdn_alt', 'aa', 'aa_alt', 'pic', 'sbs'],
).assign(is_syn=lambda d: d['aa'] == d['aa_alt'])
df_all_sbs

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
0,AAA,CAA,K,Q,0,N[A>C]A,False
1,AAA,ACA,K,T,1,A[A>C]A,False
2,AAA,AAC,K,N,2,A[A>C]N,False
3,AAA,GAA,K,E,0,N[A>G]A,False
4,AAA,AGA,K,*,1,A[A>G]A,False
...,...,...,...,...,...,...,...
571,TTT,TCT,F,S,1,T[T>C]T,False
572,TTT,TTC,F,F,2,T[T>C]N,True
573,TTT,GTT,F,V,0,N[T>G]T,False
574,TTT,TGT,F,C,1,T[T>G]T,False


In [5]:
# Keep only the synonymous substitutions.
df_all_sbs.loc[df_all_sbs['is_syn']]

Unnamed: 0,cdn,cdn_alt,aa,aa_alt,pic,sbs,is_syn
5,AAA,AAG,K,K,2,A[A>G]N,True
17,AAC,AAT,N,N,2,A[C>T]N,True
18,AAG,AAA,K,K,2,A[G>A]N,True
30,AAT,AAC,N,N,2,A[T>C]N,True
38,ACA,ACC,T,T,2,C[A>C]N,True
...,...,...,...,...,...,...,...
547,TTA,TTG,L,L,2,T[A>G]N,True
557,TTC,TTT,F,F,2,T[C>T]N,True
560,TTG,TTA,L,L,2,T[G>A]N,True
561,TTG,CTG,L,L,0,N[T>C]T,True


In [6]:
# How many codon changes produce each synonymous substitution pattern.
df_all_sbs.loc[df_all_sbs['is_syn'], 'sbs'].value_counts()

A[A>G]N    4
C[T>A]N    4
T[T>C]N    4
T[G>A]N    4
A[C>T]N    4
T[A>G]N    4
G[T>C]N    4
G[G>A]N    4
G[C>T]N    4
G[A>G]N    4
C[T>G]N    4
C[T>C]N    4
T[C>T]N    4
C[G>T]N    4
C[A>T]N    4
C[G>C]N    4
A[G>A]N    4
C[A>C]N    4
C[A>G]N    4
A[T>C]N    4
C[C>A]N    4
C[C>G]N    4
C[C>T]N    4
C[G>A]N    4
N[C>T]T    2
T[T>G]N    2
T[T>A]N    2
T[G>T]N    2
T[G>C]N    2
T[C>G]N    2
T[C>A]N    2
T[A>T]N    2
G[C>G]N    2
T[A>C]N    2
G[T>G]N    2
G[T>A]N    2
G[G>T]N    2
G[G>C]N    2
G[C>A]N    2
G[A>T]N    2
G[A>C]N    2
N[T>C]T    2
Name: sbs, dtype: int64

In [7]:
# Only 160 of the 192 possible SBS contexts can arise from synonymous
# substitutions. The unknown flank 'N' of each synonymous pattern is expanded
# to every concrete nucleotide to obtain the full set of reachable contexts.
syn_patterns = df_all_sbs.loc[df_all_sbs['is_syn'], 'sbs'].unique()
sset = {
    pattern.replace('N', base)
    for pattern in syn_patterns
    for base in 'ACGT'
}

len(sset)

160

In [17]:
# Substitutions from the full 192-context list that no synonymous change
# can produce, listed in sorted order.
missing_sbs = sorted(set(possible_sbs192) - sset)
unseen_sbs = pd.DataFrame({'SBS': missing_sbs})

# Trinucleotide context = 5' flank + reference base + 3' flank,
# i.e. characters 0, 2 and the last one of a label such as 'A[A>C]A'.
sbs_chars = unseen_sbs['SBS'].str
unseen_sbs['Context'] = sbs_chars[0] + sbs_chars[2] + sbs_chars[-1]

# Number the rows from 1 instead of 0 for display.
unseen_sbs.index = unseen_sbs.index + 1
unseen_sbs.head(16)

Unnamed: 0,SBS,Context
1,A[A>C]A,AAA
2,A[A>C]C,AAC
3,A[A>C]G,AAG
4,A[A>C]T,AAT
5,A[A>T]A,AAA
6,A[A>T]C,AAC
7,A[A>T]G,AAG
8,A[A>T]T,AAT
9,A[C>A]A,ACA
10,A[C>A]C,ACC


In [18]:
# Last 16 rows of the table of unreachable substitutions.
unseen_sbs.iloc[-16:]

Unnamed: 0,SBS,Context
17,A[G>C]A,AGA
18,A[G>C]C,AGC
19,A[G>C]G,AGG
20,A[G>C]T,AGT
21,A[G>T]A,AGA
22,A[G>T]C,AGC
23,A[G>T]G,AGG
24,A[G>T]T,AGT
25,A[T>A]A,ATA
26,A[T>A]C,ATC
