In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pymutspec.annotation import CodonAnnotation
from pymutspec.constants import possible_codons

In [2]:
# Codon annotator whose translate_codon() is used below to map codons to amino acids.
# NOTE(review): the argument 1 is presumably the genetic-code table id — confirm
# against the pymutspec documentation.
coda = CodonAnnotation(1)
# Nucleotide order; fixes the row/column order of the nucleotide rate matrix and
# the order in which equilibrium frequencies are labelled.
alphabet = 'ACGT'

In [3]:

def collect_possible_changes():
    '''
    enumerate every single-nucleotide substitution of every codon

    For each codon in `possible_codons`, each of its three positions and each
    nucleotide of `alphabet` that differs from the current one, a row is emitted.

    Returns a DataFrame with columns:
        pic    - position in codon (0, 1 or 2) that was substituted
        cdn1   - source codon
        cdn2   - codon after the substitution
        aa1    - coda.translate_codon(cdn1)
        aa2    - coda.translate_codon(cdn2)
        is_syn - True when aa1 == aa2
        sbs    - substitution label formatted as 'X>Y'
    '''
    header = ['pic', 'cdn1', 'cdn2', 'aa1', 'aa2', 'is_syn', 'sbs']
    records = []
    for source_cdn in possible_codons:
        source_aa = coda.translate_codon(source_cdn)
        for pos in range(3):
            ref_nuc = source_cdn[pos]
            for alt_nuc in alphabet:
                if alt_nuc != ref_nuc:
                    # swap one nucleotide, leaving the other two positions untouched
                    target_cdn = source_cdn[:pos] + alt_nuc + source_cdn[pos + 1:]
                    target_aa = coda.translate_codon(target_cdn)
                    records.append((
                        pos, source_cdn, target_cdn, source_aa, target_aa,
                        source_aa == target_aa, f'{ref_nuc}>{alt_nuc}',
                    ))

    return pd.DataFrame(records, columns=header)


def nuc_spectrum_to_matrix(spec):
    '''
    build a 4x4 nucleotide rate matrix from a mapping of substitution rates

    `spec` is indexed with keys of the form 'X>Y' for every ordered pair of
    distinct nucleotides in `alphabet`. The rate of X>Y is stored in the row of
    Y and the column of X, so each column describes the flow out of one
    nucleotide.
    '''
    rates = np.zeros((4, 4))
    for col, src in enumerate(alphabet):
        for row, dst in enumerate(alphabet):
            if src == dst:
                continue
            rates[row, col] = spec[f"{src}>{dst}"]
    # rescale so the off-diagonal rates sum to 1 (just for standardization, doesn't affect the results)
    rates = rates / rates.sum()
    # fill the diagonal with the 'outflow' term so every column sums to zero
    # (conservation of probability)
    outflow = rates.sum(axis=0)
    np.fill_diagonal(rates, -outflow)

    return rates


def cdn_spectrum_to_matrix(cdn_sbs):
    '''
    build a codon-by-codon rate matrix from per-pair substitution rates

    `cdn_sbs` must expose an `.index` supporting `(cdn1, cdn2) in index` and be
    indexable with that tuple (e.g. a Series keyed by (cdn1, cdn2)). Pairs that
    are absent from the index get rate 0. The rate of cdn1 -> cdn2 is stored in
    the row of cdn2 and the column of cdn1; codon order follows `possible_codons`.
    '''
    size = len(possible_codons)
    rates = np.zeros((size, size))
    known_pairs = cdn_sbs.index
    for col, src in enumerate(possible_codons):
        for row, dst in enumerate(possible_codons):
            if src == dst:
                continue
            pair = (src, dst)
            rates[row, col] = cdn_sbs[pair] if pair in known_pairs else 0.
    # rescale so the off-diagonal rates sum to 1 (just for standardization, doesn't affect the results)
    rates = rates / rates.sum()
    # fill the diagonal with the 'outflow' term so every column sums to zero
    # (conservation of probability)
    outflow = rates.sum(axis=0)
    np.fill_diagonal(rates, -outflow)
    return rates


def get_equilibrium_probabilities(M):
    '''
    return the equilibrium distribution of a rate matrix as a real vector

    The equilibrium distribution is the eigenvector of `M` belonging to the
    eigenvalue closest to zero, rescaled so that its entries sum to 1.

    M : square array; expected to have a zero eigenvalue (true when every
        column sums to zero).

    Returns a 1-D real-valued array with one entry per row of `M`.
    Raises AssertionError when no eigenvalue is within 1e-10 of zero.
    '''
    evals, evecs = np.linalg.eig(M)
    # find zero eigenvalue
    ii = np.argmin(np.abs(evals))
    assert np.abs(evals[ii]) < 1e-10, 'rate matrix has no (numerically) zero eigenvalue'
    # pull out corresponding eigenvector and normalize to sum_i p_i = 1;
    # dividing by the sum also cancels any common scale factor of the eigenvector
    p = evecs[:, ii]
    p = p / p.sum()
    # np.linalg.eig returns complex arrays as soon as any eigenvalue of M is
    # complex, which is common for non-symmetric rate matrices. Return only the
    # real part so callers do not hit a complex-to-float cast warning.
    return np.real(p)

In [4]:
def get_equilibrium_data(ms12, taxid):
    '''
    compute equilibrium nucleotide and codon frequencies for one spectrum

    ms12  : DataFrame with a 'Mut' column (substitution labels such as 'A>C')
            and a 'MutSpec' column (the rate of that substitution).
    taxid : value written into the 'taxid' column of both result tables.

    Returns (nucl_eq, eq_freqs):
        nucl_eq  - DataFrame with columns nucl, freq, taxid
                   (one row per nucleotide of `alphabet`)
        eq_freqs - DataFrame with columns cdn, freq, aa, taxid
                   (one row per codon of `possible_codons`)
    '''
    sbs2rate = ms12.set_index('Mut')['MutSpec'].to_dict()

    # nucleotide level: equilibrium of the 4x4 substitution rate matrix
    M = nuc_spectrum_to_matrix(sbs2rate)
    # np.real drops the (zero) imaginary part of a complex eigenvector without
    # the warning that a direct complex -> float cast emits
    eq_prob = np.real(get_equilibrium_probabilities(M)).astype(float)
    nucl_eq = pd.Series(dict(zip(alphabet, eq_prob)))
    nucl_eq.name = 'freq'
    nucl_eq.index.name = 'nucl'
    nucl_eq = nucl_eq.reset_index()
    nucl_eq['taxid'] = taxid

    # codon level: every single-nucleotide codon change inherits the rate of
    # its nucleotide substitution (the table of changes is built once)
    df_changes = collect_possible_changes()
    df_changes['rate'] = df_changes['sbs'].map(sbs2rate)
    cdn_sbs = df_changes.groupby(['cdn1', 'cdn2'])['rate'].sum()
    M = cdn_spectrum_to_matrix(cdn_sbs)
    eq_prob = np.real(get_equilibrium_probabilities(M)).astype(float)

    eq_freqs = pd.Series(dict(zip(possible_codons, eq_prob)))
    eq_freqs.name = 'freq'
    eq_freqs.index.name = 'cdn'
    eq_freqs = eq_freqs.reset_index()
    eq_freqs['aa'] = eq_freqs['cdn'].map(coda.translate_codon)
    eq_freqs['taxid'] = taxid
    return nucl_eq, eq_freqs

In [6]:
# Load the spectra table for all viruses; the first CSV column becomes the index.
dfms12syn = pd.read_csv('ms12syn_all_virus.csv', index_col=0)

In [7]:
# Preview the loaded spectra table.
dfms12syn

Unnamed: 0,Mut,ObsNum,ExpNum,MutSpec,MutSpec_q05,MutSpec_median,MutSpec_q95,taxid
0,A>C,36.70270,202.2300,0.034784,0.022643,0.035047,0.044282,10990
1,A>G,198.66800,439.5780,0.086621,0.075908,0.086631,0.096035,10990
2,A>T,22.77440,161.0210,0.027108,0.016305,0.026067,0.034970,10990
3,C>A,17.76760,76.0393,0.044784,0.028092,0.046021,0.060705,10990
4,C>G,1.99772,55.9279,0.006846,0.000000,0.006849,0.014763,10990
...,...,...,...,...,...,...,...,...
7,G>C,2.13421,72.8211,0.003805,0.000000,0.003394,0.007554,77763
8,G>T,50.06880,72.8211,0.089263,0.066487,0.086102,0.114044,77763
9,T>A,49.02400,198.8740,0.032003,0.025325,0.032111,0.040197,77763
10,T>C,478.02300,482.0980,0.128729,0.111404,0.130994,0.142362,77763


In [11]:
# dfms12covid = pd.read_csv('SARS-CoV-2-MutSpec/data/spectra_verification/spectra12.csv')
# dfms12covid['taxid'] = 2697049
# covid_df = dfms12covid.pivot(columns='Mut', values='MutSpec_syn', index='taxid').reset_index()

In [8]:
# Per-taxid equilibrium tables, collected for concatenation in the next cells.
nucl_eq_lst = []
eq_freqs_lst = []

for taxid in dfms12syn['taxid'].unique():
    # Call get_equilibrium_data once per taxid and unpack both tables, rather
    # than repeating the whole computation to pick each element separately.
    nucl_eq, eq_freqs = get_equilibrium_data(dfms12syn[dfms12syn['taxid'] == taxid], taxid)
    nucl_eq_lst.append(nucl_eq)
    eq_freqs_lst.append(eq_freqs)

  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)


In [9]:
# Stack the per-taxid nucleotide tables into one frame with a fresh 0..n-1 index.
df_nucl_eq_all = pd.concat(nucl_eq_lst, ignore_index=True)

In [10]:
# Preview the combined nucleotide equilibrium frequencies.
df_nucl_eq_all

Unnamed: 0,nucl,freq,taxid
0,A,0.220359,10990
1,C,0.152095,10990
2,G,0.087429,10990
3,T,0.540117,10990
4,A,0.276123,11082
...,...,...,...
115,T,0.490928,694014
116,A,0.283073,77763
117,C,0.173994,77763
118,G,0.162405,77763


In [11]:
# Stack the per-taxid codon tables into one frame with a fresh 0..n-1 index.
df_eq_freqs_all = pd.concat(eq_freqs_lst, ignore_index=True)

In [12]:
# Sanity check: number of codon rows per amino-acid label across all taxids.
df_eq_freqs_all.aa.value_counts()

L    180
R    180
S    180
A    120
T    120
P    120
V    120
G    120
I     90
*     90
C     60
Y     60
K     60
D     60
E     60
N     60
H     60
Q     60
F     60
M     30
W     30
Name: aa, dtype: int64

In [13]:
# Sum codon frequencies per (taxid, amino acid), spread the amino acids into
# columns (one row per taxid) and discard the '*' column.
df_aa_eq_freq_all = (
    df_eq_freqs_all
    .groupby(['taxid', 'aa'])
    .agg({'freq': 'sum'})
    .reset_index()
    .pivot(index='taxid', columns='aa', values='freq')
    .drop(columns='*')
)

In [14]:
# Rescale every row (one taxid) so its amino-acid frequencies sum to 1, then
# move taxid from the index into a regular column.
# NOTE(review): this cell overwrites its own input and is therefore not
# idempotent — after it runs the frame contains a 'taxid' column, so run it
# exactly once after the pivot cell.
df_aa_eq_freq_all = df_aa_eq_freq_all.div(df_aa_eq_freq_all.sum(axis=1), axis=0).reset_index()

In [15]:
# Preview the row-normalized amino-acid equilibrium frequencies per taxid.
df_aa_eq_freq_all

aa,taxid,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,10990,0.013954,0.034301,0.013994,0.006222,0.211905,0.008021,0.024345,0.113975,0.015683,...,0.010919,0.035272,0.024275,0.010825,0.020176,0.100198,0.03517,0.049553,0.004332,0.086453
1,11082,0.049888,0.03613,0.034785,0.03847,0.041609,0.066067,0.026266,0.063355,0.042655,...,0.021009,0.038569,0.03767,0.029049,0.088358,0.092238,0.055315,0.076087,0.018948,0.04006
2,11320_1,0.024011,0.018362,0.031522,0.0492,0.028642,0.030145,0.025108,0.105047,0.131746,...,0.020867,0.08441,0.019125,0.039188,0.073211,0.068976,0.064296,0.047022,0.007793,0.04917
3,11320_2,0.028012,0.011864,0.030102,0.055831,0.013933,0.028842,0.029237,0.084439,0.166349,...,0.016476,0.089689,0.027207,0.054226,0.083844,0.062998,0.083463,0.03387,0.00553,0.03535
4,1133363,0.017771,0.016706,0.017881,0.014777,0.07037,0.007244,0.043867,0.126259,0.066617,...,0.0113,0.080612,0.043598,0.036251,0.032548,0.092736,0.080118,0.030513,0.002506,0.075317
5,1157337,0.05159,0.031605,0.024551,0.01656,0.056918,0.029388,0.043098,0.061605,0.023167,...,0.012432,0.034344,0.090565,0.029071,0.06815,0.117459,0.07217,0.052925,0.008887,0.044213
6,118655,0.025078,0.020685,0.033687,0.053506,0.029393,0.03768,0.022421,0.100955,0.123816,...,0.022941,0.077954,0.016691,0.035612,0.078585,0.069322,0.058032,0.053541,0.009914,0.047867
7,11983,0.052789,0.03174,0.028102,0.022459,0.048635,0.037268,0.039806,0.062872,0.030469,...,0.014603,0.038125,0.074775,0.031813,0.075248,0.108991,0.071618,0.057106,0.010764,0.04306
8,12110,0.093382,0.032256,0.027538,0.022514,0.025477,0.074226,0.034644,0.028912,0.015181,...,0.010621,0.018569,0.11748,0.028324,0.115896,0.101295,0.062968,0.058628,0.015751,0.021751
9,12162,0.063806,0.052981,0.02681,0.021218,0.06481,0.077587,0.022048,0.04272,0.013134,...,0.016032,0.016595,0.052473,0.017449,0.085024,0.104863,0.039497,0.094911,0.025899,0.032796


In [29]:
# Write the per-taxid amino-acid equilibrium frequencies to CSV (row index omitted).
df_aa_eq_freq_all.to_csv('aminoacid_eq_freq_all_virus.csv', index=False)

In [30]:
# Write the per-taxid nucleotide equilibrium frequencies to CSV (row index omitted).
df_nucl_eq_all.to_csv('nucl_eq_freq_all_virus.csv', index=False)

## Add SARS-CoV-2 (equilibrium frequencies computed earlier)

In [20]:
# Load previously saved codon equilibrium frequencies from disk.
df_eq_freq_sarscov2 = pd.read_csv('../data/equilibrium_freqs_20A.csv')

In [21]:
# Preview the loaded codon frequency table.
df_eq_freq_sarscov2

Unnamed: 0,cdn,freq,aa
0,AAA,0.004267,K
1,AAC,0.001897,N
2,AAG,0.000726,K
3,AAT,0.019418,N
4,ACA,0.001897,T
...,...,...,...
59,TGT,0.015035,C
60,TTA,0.088360,L
61,TTC,0.039289,F
62,TTG,0.015035,L


In [22]:
# Sum codon frequencies per amino acid, turn the result into a single row with
# one column per amino acid, and discard the '*' column.
df_aa_eq_freq_sarscov2 = (
    df_eq_freq_sarscov2
    .groupby(['aa'])
    .agg({'freq': 'sum'})
    .T
    .drop(columns='*')
    .reset_index(drop=True)
)

In [23]:
# Rescale the row so the remaining amino-acid frequencies sum to 1.
df_aa_eq_freq_sarscov2 = df_aa_eq_freq_sarscov2.div(df_aa_eq_freq_sarscov2.sum(axis=1), axis=0)

In [24]:
# Prepend a 'taxid' column so the row has the same layout as the all-virus table.
# NOTE(review): 2697049 is presumably the taxonomy id used for SARS-CoV-2 — confirm.
# insert() modifies the frame in place, so this cell must be run only once.
df_aa_eq_freq_sarscov2.insert(0, 'taxid', 2697049)

In [25]:
# Preview the single-row amino-acid equilibrium table.
df_aa_eq_freq_sarscov2

aa,taxid,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,2697049,0.002044,0.016945,0.003724,0.000872,0.453163,0.000782,0.009731,0.119522,0.005127,...,0.003392,0.021885,0.005341,0.00228,0.002916,0.058378,0.012011,0.020914,0.000577,0.099586


In [31]:
# Write the single-row amino-acid equilibrium table to CSV (row index omitted).
df_aa_eq_freq_sarscov2.to_csv('aminoacid_eq_freq_sars_cov2.csv', index=False)