In [5]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pymutspec.annotation import CodonAnnotation
from pymutspec.constants import possible_codons

In [2]:
coda = CodonAnnotation(2)
alphabet = 'ACGT'

In [3]:

def collect_possible_changes():
    nucls = alphabet
    i = 1
    data = []
    for cdn1 in possible_codons:
        aa1 = coda.translate_codon(cdn1)
        for pic in range(3):
            nuc1 = cdn1[pic]
            for nuc2 in nucls:
                if nuc1 == nuc2:
                    continue
                cdn2 = list(cdn1)
                cdn2[pic] = nuc2
                cdn2 = ''.join(cdn2)
                aa2 = coda.translate_codon(cdn2)
                is_syn = aa1 == aa2
                sbs = f'{nuc1}>{nuc2}'
                data.append((pic, cdn1, cdn2, aa1, aa2, is_syn, sbs))
                i += 1

    df_changes = pd.DataFrame(data, columns=['pic', 'cdn1', 'cdn2', 'aa1', 'aa2', 'is_syn', 'sbs'])
    return df_changes


def nuc_spectrum_to_matrix(spec):
    '''
    convert dictionary of mutation counts to mutation matrix
    '''
    M = np.zeros((4,4))
    for i1,n1 in enumerate(alphabet):
        for i2,n2 in enumerate(alphabet):
            if n1!=n2:
                M[i2,i1] = spec[f"{n1}>{n2}"]
    # normalize off-diagonal rates (just for standardization, doesn't affect the results)
    M /= M.sum()
    # # will the diagonal with 'outflow' term to guarantee conservation of probability
    d = M.sum(axis=0)
    np.fill_diagonal(M,-d)

    return M


def cdn_spectrum_to_matrix(cdn_sbs):
    '''
    convert dictionary of mutation counts to mutation matrix
    '''
    n = len(possible_codons)
    M = np.zeros((n, n))
    for i1,cdn1 in enumerate(possible_codons):
        for i2,cdn2 in enumerate(possible_codons):
            if cdn1!=cdn2:
                val = cdn_sbs[(cdn1, cdn2)] if (cdn1, cdn2) in cdn_sbs.index else 0.
                M[i2,i1] = val
    # normalize off-diagonal rates (just for standardization, doesn't affect the results)
    M /= M.sum()
    # will the diagonal with 'outflow' term to guarantee conservation of probability
    d = M.sum(axis=0)
    np.fill_diagonal(M,-d)
    return M


def get_equilibrium_probabilities(M):
    evals, evecs = np.linalg.eig(M)
    # find zero eigenvalue
    ii = np.argmin(np.abs(evals))
    assert np.abs(evals[ii])<1e-10
    # pull out corresponding eigenvector, return normalized to sum_i p_i = 1
    p = evecs[:,ii]
    return p/p.sum()

In [4]:
def get_equilibrium_data(ms12, taxid):
    
    # derive all possible changes in the gencode 
    df_changes = collect_possible_changes()
    
    # # load Kelley Harris spectra
    # ms12 = pd.read_csv('./data/external/rates_by_clade.csv')
    # ms12 = ms12[ms12.clade == '20A'].copy()
    # ms12['mut_type'] = ms12['mut_type'].str.replace('to', '>')
    
    # sbs2rate = ms12.set_index('mut_type')['rate'].to_dict()

    sbs2rate = ms12.set_index('Mut')['MutSpec'].to_dict()
    
    M = nuc_spectrum_to_matrix(sbs2rate)
    eq_prob = get_equilibrium_probabilities(M).astype(float)
    nucl_eq = pd.Series(dict(zip(alphabet, eq_prob)))
    nucl_eq.name = 'freq'
    nucl_eq.index.name = 'nucl'
    nucl_eq = nucl_eq.reset_index()
    nucl_eq['taxid'] = taxid
    # print(nucl_eq)
    
    
    df_changes = collect_possible_changes()
    
    df_changes['rate'] = df_changes['sbs'].map(sbs2rate)
    cdn_sbs = df_changes.groupby(['cdn1', 'cdn2'])['rate'].sum()
    M = cdn_spectrum_to_matrix(cdn_sbs)
    eq_prob = get_equilibrium_probabilities(M).astype(float)
    
    eq_freqs = pd.Series(dict(zip(possible_codons, eq_prob)))
    eq_freqs.name = 'freq'
    eq_freqs.index.name = 'cdn'
    eq_freqs = eq_freqs.reset_index()
    eq_freqs['aa'] = eq_freqs['cdn'].map(coda.translate_codon)
    eq_freqs['taxid'] = taxid
    # print(eq_freqs)
    # eq_freqs.to_csv('../data/equilibrium_freqs_20A.csv', index=False)
    return nucl_eq, eq_freqs

In [6]:
dfms12syn = pd.read_csv('SARS-CoV-2-MutSpec/viral_spectra/ms12syn_all_virus.csv', index_col='Unnamed: 0')

In [7]:
dfms12syn

Unnamed: 0,Mut,ObsNum,ExpNum,MutSpec,MutSpec_q05,MutSpec_median,MutSpec_q95,taxid
0,A>C,36.70270,202.2300,0.034784,0.022643,0.035047,0.044282,10990
1,A>G,198.66800,439.5780,0.086621,0.075908,0.086631,0.096035,10990
2,A>T,22.77440,161.0210,0.027108,0.016305,0.026067,0.034970,10990
3,C>A,17.76760,76.0393,0.044784,0.028092,0.046021,0.060705,10990
4,C>G,1.99772,55.9279,0.006846,0.000000,0.006849,0.014763,10990
...,...,...,...,...,...,...,...,...
7,G>C,2.13421,72.8211,0.003805,0.000000,0.003394,0.007554,77763
8,G>T,50.06880,72.8211,0.089263,0.066487,0.086102,0.114044,77763
9,T>A,49.02400,198.8740,0.032003,0.025325,0.032111,0.040197,77763
10,T>C,478.02300,482.0980,0.128729,0.111404,0.130994,0.142362,77763


In [11]:
# dfms12covid = pd.read_csv('SARS-CoV-2-MutSpec/data/spectra_verification/spectra12.csv')
# dfms12covid['taxid'] = 2697049
# covid_df = dfms12covid.pivot(columns='Mut', values='MutSpec_syn', index='taxid').reset_index()

In [19]:
nucl_eq_lst = []
eq_freqs_lst = []

for taxid in dfms12syn['taxid'].unique():
    nucl_eq_lst.append(get_equilibrium_data(dfms12syn[dfms12syn['taxid'] == taxid], taxid)[0]) 
    eq_freqs_lst.append(get_equilibrium_data(dfms12syn[dfms12syn['taxid'] == taxid], taxid)[1])

  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)


In [28]:
df_nucl_eq_all = pd.concat(nucl_eq_lst).reset_index(drop=True)

In [102]:
df_nucl_eq_all

Unnamed: 0,nucl,freq,taxid
0,A,0.220359,10990
1,C,0.152095,10990
2,G,0.087429,10990
3,T,0.540117,10990
4,A,0.276123,11082
...,...,...,...
115,T,0.490928,694014
116,A,0.283073,77763
117,C,0.173994,77763
118,G,0.162405,77763


In [29]:
df_eq_freqs_all = pd.concat(eq_freqs_lst).reset_index(drop=True)

In [33]:
df_eq_freqs_all.aa.value_counts()

aa
S    180
L    180
R    120
T    120
*    120
V    120
P    120
G    120
A    120
D     60
C     60
W     60
Y     60
K     60
E     60
N     60
H     60
Q     60
I     60
M     60
F     60
Name: count, dtype: int64

In [97]:
df_aa_eq_freq_all = df_eq_freqs_all.groupby(['taxid', 'aa']) \
                                .agg({'freq':'sum'}) \
                                .reset_index() \
                                .pivot(index='taxid', columns='aa', values='freq') \
                                .drop('*', axis=1)

In [98]:
df_aa_eq_freq_all = df_aa_eq_freq_all.div(df_aa_eq_freq_all.sum(axis=1), axis=0).reset_index()

In [99]:
df_aa_eq_freq_all

aa,taxid,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,10990,0.013889,0.034141,0.013929,0.006193,0.210914,0.007984,0.024231,0.086049,0.01561,...,0.038261,0.035107,0.024161,0.010774,0.013889,0.09973,0.035005,0.049321,0.01518,0.086049
1,11082,0.050774,0.036772,0.035403,0.039154,0.042349,0.067241,0.026733,0.040772,0.043413,...,0.045092,0.039254,0.03834,0.029565,0.050774,0.093878,0.056298,0.077439,0.040668,0.040772
2,11320_1,0.024711,0.018898,0.032442,0.050634,0.029478,0.031024,0.02584,0.050604,0.135588,...,0.078982,0.086871,0.019683,0.040331,0.024711,0.070987,0.066171,0.048393,0.029495,0.050604
3,11320_2,0.02916,0.01235,0.031335,0.058119,0.014504,0.030023,0.030435,0.036798,0.173164,...,0.068251,0.093363,0.028322,0.056448,0.02916,0.065579,0.086882,0.035257,0.022907,0.036798
4,1133363,0.017833,0.016765,0.017943,0.014828,0.070616,0.007269,0.04402,0.07558,0.06685,...,0.062459,0.080893,0.04375,0.036378,0.017833,0.09306,0.080397,0.030619,0.013854,0.07558
5,1157337,0.051804,0.031736,0.024652,0.016629,0.057154,0.02951,0.043277,0.044397,0.023263,...,0.029947,0.034487,0.09094,0.029192,0.051804,0.117946,0.072469,0.053144,0.021408,0.044397
6,118655,0.025869,0.021338,0.034749,0.055193,0.030319,0.038868,0.023128,0.049376,0.12772,...,0.078426,0.080411,0.017217,0.036734,0.025869,0.071507,0.059862,0.055229,0.033891,0.049376
7,11983,0.053207,0.031991,0.028324,0.022637,0.04902,0.037563,0.040121,0.043401,0.030711,...,0.034686,0.038427,0.075367,0.032064,0.053207,0.109854,0.072185,0.057558,0.025567,0.043401
8,12110,0.094506,0.032644,0.027869,0.022785,0.025784,0.07512,0.035061,0.022012,0.015364,...,0.017997,0.018793,0.118894,0.028665,0.094506,0.102514,0.063726,0.059333,0.026689,0.022012
9,12162,0.064139,0.053257,0.026949,0.021329,0.065148,0.077992,0.022163,0.032967,0.013203,...,0.026091,0.016682,0.052747,0.01754,0.064139,0.10541,0.039703,0.095406,0.042149,0.032967


In [101]:
# df_aa_eq_freq_all.to_csv('SARS-CoV-2-MutSpec/viral_spectra/aminoacid_eq_freq_all_virus.csv')

In [104]:
# df_nucl_eq_all.to_csv('SARS-CoV-2-MutSpec/viral_spectra/nucl_eq_freq_all_virus.csv')

## Add Sars-cov-2 (count early)

In [14]:
df_eq_freq_sarscov2 = pd.read_csv('SARS-CoV-2-MutSpec/data/equilibrium_freqs_20A.csv')

In [15]:
df_eq_freq_sarscov2

Unnamed: 0,cdn,freq,aa
0,AAA,0.004267,K
1,AAC,0.001897,N
2,AAG,0.000726,K
3,AAT,0.019418,N
4,ACA,0.001897,T
...,...,...,...
59,TGT,0.015035,C
60,TTA,0.088360,L
61,TTC,0.039289,F
62,TTG,0.015035,L


In [32]:
df_aa_eq_freq_sarscov2 = df_eq_freq_sarscov2.groupby(['aa']) \
                                .agg({'freq':'sum'}) \
                                .T\
                                .drop('*', axis=1)\
                                .reset_index(drop=True)

In [36]:
df_aa_eq_freq_sarscov2 = df_aa_eq_freq_sarscov2.div(df_aa_eq_freq_sarscov2.sum(axis=1), axis=0)

In [37]:
df_aa_eq_freq_sarscov2.insert(0, 'taxid', 2697049)

In [38]:
df_aa_eq_freq_sarscov2

aa,taxid,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,2697049,0.002038,0.016902,0.003714,0.00087,0.452024,0.00078,0.009707,0.099336,0.005114,...,0.02327,0.02183,0.005327,0.002274,0.002038,0.058231,0.01198,0.020861,0.003959,0.099336


In [40]:
# df_aa_eq_freq_sarscov2.to_csv('SARS-CoV-2-MutSpec/viral_spectra/aminoacid_eq_freq_sars_cov2.csv')