In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from pymutspec.annotation import CodonAnnotation
from pymutspec.constants import possible_codons

In [2]:
# Codon annotation helper from pymutspec. The argument 1 is presumably the
# genetic-code table id (standard code) — TODO confirm against pymutspec docs.
coda = CodonAnnotation(1)
# Nucleotide order used by get_equilibrium_data to label the equilibrium
# probabilities positionally.
alphabet = 'ACGT'

In [22]:
from utils import (
    collect_possible_changes, nuc_spectrum_to_matrix, 
    cdn_spectrum_to_matrix, get_equilibrium_probabilities,
    get_equilibrium_freqs,
)

In [None]:
def get_equilibrium_data(ms12, taxid):
    """Build equilibrium nucleotide and codon frequency tables for one spectrum.

    Parameters
    ----------
    ms12 : pandas.DataFrame
        Spectrum table with a 'Mut' column (substitution labels such as
        'A>C') and a 'MutSpec' column holding the rate of each substitution.
        The same frame is also forwarded unchanged to
        ``get_equilibrium_freqs``.
    taxid : scalar
        Label written to the 'taxid' column of both returned tables. The
        notebook passes either a taxonomy id or a clade name.

    Returns
    -------
    nucl_eq : pandas.DataFrame
        One row per nucleotide of the module-level ``alphabet`` with the
        columns 'nucl', 'freq' and 'taxid'.
    eq_freqs : pandas.DataFrame
        First element returned by ``get_equilibrium_freqs(ms12)`` with a
        'taxid' column added.
    """
    sbs2rate = ms12.set_index('Mut')['MutSpec'].to_dict()

    M = nuc_spectrum_to_matrix(sbs2rate)
    # NOTE(review): the recorded notebook output shows a warning raised on this
    # line (possibly a complex-to-real cast) — confirm the discarded part is
    # negligible.
    eq_prob = get_equilibrium_probabilities(M).astype(float)

    # Probabilities are paired with `alphabet` positionally, so the helper is
    # assumed to return them in A, C, G, T order — TODO confirm.
    nucl_eq = (
        pd.Series(dict(zip(alphabet, eq_prob)), name='freq')
        .rename_axis('nucl')
        .reset_index()
    )
    nucl_eq['taxid'] = taxid

    # Only the first returned element is used; the second one is discarded.
    eq_freqs, _ = get_equilibrium_freqs(ms12)
    eq_freqs['taxid'] = taxid
    return nucl_eq, eq_freqs

In [None]:
# TODO replace calculation with this table
# Long-format view of the viral spectra dataset: the five descriptor columns
# are kept as identifiers and every other column becomes a (Mut, MutSpec) row.
spectra = pd.read_csv('./data/viral_spectra_dataset.csv').melt(
    id_vars=['Type', 'taxname', 'df', 'taxid', 'virusname'],
    var_name='Mut',
    value_name='MutSpec',
)
spectra

Unnamed: 0,Type,taxname,df,taxid,virusname,Mut,MutSpec
0,ds,Hepatitis B virus,nemu,10407,HBV,A>C,0.109127
1,ds,Rice black streaked dwarf virus,nemu,10990,RBSDV,A>C,0.034784
2,+,West Nile virus,nemu,11082,WNV,A>C,0.012922
3,+,Norwalk virus,nemu,11983,norw,A>C,0.009244
4,+,Hepatovirus A,nemu,12092,HAV,A>C,0.008840
...,...,...,...,...,...,...,...
439,-,Influenza B Victoria,Bloom_etal,-,flu_vic,T>G,0.010400
440,-,RSV-A,Bloom_etal,-,rsv-a,T>G,0.008174
441,-,RSV-B,Bloom_etal,-,rsv-b,T>G,0.007654
442,+,SARS-CoV-2 20A,Bloom_etal,-,CoV20A,T>G,0.006130


In [None]:
# Load the spectra table for all viruses. The first CSV column is read as the
# index and then turned back into a regular column by reset_index().
# Later cells rely on its 'taxid', 'Mut' and 'MutSpec' columns.
dfms12syn = pd.read_csv('./data/ms12syn_all_virus.csv', index_col=0).reset_index()
dfms12syn

In [None]:
dfms12syn['taxid'].unique()

array([  10407,   10990,   11082,   11320, 1133363, 1157337,  118655,
         11983,   12092,   12110,   12162,   12637,  138948,  138950,
        138951,  162145, 1678143, 1933178,  198112,   28295,   28344,
       3052230, 3052493, 3052763,  351073,   38170,   40054,   54290,
         57482,  693997,  694014,   77763])

In [8]:
# dfms12covid = pd.read_csv('SARS-CoV-2-MutSpec/data/spectra_verification/spectra12.csv')
# dfms12covid['taxid'] = 2697049
# covid_df = dfms12covid.pivot(columns='Mut', values='MutSpec_syn', index='taxid').reset_index()

In [26]:
# Collect the equilibrium nucleotide table and the equilibrium codon table for
# every taxid present in dfms12syn.
nucl_eq_lst = []
eq_freqs_lst = []

for taxid in dfms12syn['taxid'].unique():
    ms12_taxon = dfms12syn[dfms12syn['taxid'] == taxid]
    # Run the computation once per taxid and unpack both tables, instead of
    # calling it twice and throwing half of each result away.
    nucl_eq, eq_freqs = get_equilibrium_data(ms12_taxon, taxid)
    nucl_eq_lst.append(nucl_eq)
    eq_freqs_lst.append(eq_freqs)

In [27]:
df_nucl_eq_all = pd.concat(nucl_eq_lst).reset_index(drop=True)

In [28]:
df_nucl_eq_all

Unnamed: 0,nucl,freq,taxid
0,A,0.256787,10407
1,C,0.199648,10407
2,G,0.312785,10407
3,T,0.230780,10407
4,A,0.177305,10990
...,...,...,...
123,T,0.093360,694014
124,A,0.198467,77763
125,C,0.328246,77763
126,G,0.328644,77763


In [31]:
df_eq_freqs_all = pd.concat(eq_freqs_lst).reset_index(drop=True)
df_eq_freqs_all

Unnamed: 0,cdn,eq_freq,aa,taxid
0,AAA,0.013528,Lys,10407
1,AAC,0.016650,Asn,10407
2,AAG,0.011135,Lys,10407
3,AAT,0.015695,Asn,10407
4,ACA,0.016650,Thr,10407
...,...,...,...,...
2043,TGT,0.022595,Cys,77763
2044,TTA,0.033151,Leu,77763
2045,TTC,0.024570,Phe,77763
2046,TTG,0.022595,Leu,77763


In [None]:
df_eq_freqs_all.aa.value_counts()

aa
Ser    192
Arg    192
Leu    192
Pro    128
Thr    128
Val    128
Ala    128
Gly    128
*       96
Ile     96
Lys     64
Asn     64
Gln     64
Tyr     64
Asp     64
His     64
Glu     64
Phe     64
Cys     64
Met     32
Trp     32
Name: count, dtype: int64

In [33]:
# Sum the codon equilibrium frequencies per amino acid within each taxid,
# spread the amino acids into columns (one row per taxid) and discard the '*'
# column (presumably stop codons).
df_aa_eq_freq_all = (
    df_eq_freqs_all
    .groupby(['taxid', 'aa'])['eq_freq']
    .sum()
    .unstack('aa')
    .drop(columns='*')
)

In [34]:
# Normalise each row so that the amino-acid frequencies of a taxid sum to 1.
# After the first run 'taxid' is a regular column; move it back to the index
# first so that re-running this cell never sums or divides the ids themselves.
_aa_freq_by_taxid = (
    df_aa_eq_freq_all.set_index('taxid')
    if 'taxid' in df_aa_eq_freq_all.columns
    else df_aa_eq_freq_all
)
df_aa_eq_freq_all = (
    _aa_freq_by_taxid
    .div(_aa_freq_by_taxid.sum(axis=1), axis=0)
    .reset_index()
)

In [35]:
df_aa_eq_freq_all

aa,taxid,Ala,Arg,Asn,Asp,Cys,Gln,Glu,Gly,His,...,Leu,Lys,Met,Phe,Pro,Ser,Thr,Trp,Tyr,Val
0,10407,0.05921,0.080345,0.033741,0.027613,0.031618,0.031575,0.021135,0.039762,0.04118,...,0.117514,0.025728,0.013406,0.044461,0.088078,0.110885,0.072755,0.010922,0.038877,0.055946
1,10990,0.022529,0.030072,0.029765,0.016467,0.038834,0.01414,0.007543,0.012196,0.030936,...,0.181103,0.013792,0.011718,0.159932,0.041474,0.12311,0.045076,0.006091,0.069103,0.056643
2,11082,0.055157,0.079762,0.033672,0.030789,0.038733,0.028378,0.024606,0.05076,0.032109,...,0.119644,0.026766,0.017244,0.051881,0.054356,0.104354,0.060487,0.015782,0.042462,0.071759
3,11320,0.03233,0.091079,0.06853,0.036377,0.021084,0.039971,0.058749,0.045838,0.024603,...,0.074964,0.112241,0.022854,0.022921,0.022421,0.073775,0.064574,0.011201,0.039634,0.055139
4,11983,0.054221,0.075625,0.035377,0.027691,0.033093,0.030276,0.021404,0.038329,0.03917,...,0.12233,0.027288,0.014397,0.050699,0.076703,0.110759,0.069446,0.011231,0.042423,0.058721
5,12092,0.029037,0.047685,0.039339,0.026068,0.042966,0.019651,0.018649,0.029356,0.02651,...,0.151541,0.027795,0.01846,0.106732,0.024914,0.10229,0.046161,0.012263,0.064799,0.07369
6,12110,0.081325,0.100855,0.02011,0.026538,0.03572,0.026546,0.01953,0.059465,0.035803,...,0.11017,0.01498,0.011496,0.034913,0.110508,0.110169,0.062349,0.01517,0.026681,0.062386
7,12162,0.062533,0.084618,0.020647,0.028035,0.047233,0.020517,0.022085,0.06557,0.02603,...,0.127701,0.016277,0.01618,0.060648,0.058506,0.107985,0.046547,0.020999,0.035746,0.084412
8,12637,0.04993,0.09644,0.057605,0.036272,0.021556,0.046021,0.04651,0.052505,0.034188,...,0.076362,0.070893,0.018462,0.020033,0.046389,0.083875,0.080209,0.011476,0.035192,0.051494
9,28295,0.029724,0.036246,0.022908,0.015574,0.03965,0.013971,0.006522,0.01454,0.032639,...,0.181007,0.009306,0.010069,0.152767,0.059247,0.132797,0.046611,0.006697,0.059433,0.056417


In [36]:
df_aa_eq_freq_all.to_csv('./data/aminoacid_eq_freq_all_virus.csv', index=False)

In [37]:
df_nucl_eq_all.to_csv('./data/nucl_eq_freq_all_virus.csv', index=False)

## Add SARS-CoV-2 (computed earlier)

In [17]:
# Precomputed codon equilibrium frequencies (file name refers to clade 20A).
# NOTE(review): this path starts with '../data/' while every other file in the
# notebook is read from './data/' — confirm the location is intentional.
df_eq_freq_sarscov2 = pd.read_csv('../data/equilibrium_freqs_20A.csv')

In [18]:
df_eq_freq_sarscov2

Unnamed: 0,cdn,freq,aa
0,AAA,0.004267,K
1,AAC,0.001897,N
2,AAG,0.000726,K
3,AAT,0.019418,N
4,ACA,0.001897,T
...,...,...,...
59,TGT,0.015035,C
60,TTA,0.088360,L
61,TTC,0.039289,F
62,TTG,0.015035,L


In [19]:
# Sum codon frequencies per amino acid, lay the result out as a single row
# with one column per amino acid, and discard the '*' column (presumably
# stop codons).
df_aa_eq_freq_sarscov2 = (
    df_eq_freq_sarscov2
    .groupby(['aa'])[['freq']]
    .sum()
    .transpose()
    .drop(columns='*')
    .reset_index(drop=True)
)

In [20]:
df_aa_eq_freq_sarscov2 = df_aa_eq_freq_sarscov2.div(df_aa_eq_freq_sarscov2.sum(axis=1), axis=0)

In [21]:
df_aa_eq_freq_sarscov2.insert(0, 'taxid', 2697049)

In [22]:
df_aa_eq_freq_sarscov2

aa,taxid,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,2697049,0.002044,0.016945,0.003724,0.000872,0.453163,0.000782,0.009731,0.119522,0.005127,...,0.003392,0.021885,0.005341,0.00228,0.002916,0.058378,0.012011,0.020914,0.000577,0.099586


In [22]:
# df_aa_eq_freq_sarscov2.to_csv('./data/aminoacid_eq_freq_sars_cov2.csv', index=False)

____

In [23]:
df_rates = pd.read_csv('./data/Bloom_etal/rates_by_clade.csv')

In [24]:
# Keep only clades 20A and 22C, rewrite mutation labels by replacing 'to' with
# '>' (the displayed result shows labels such as 'A>C'), and rename the
# columns to the 'Mut' / 'MutSpec' names that get_equilibrium_data reads.
df_rates = (
    df_rates[df_rates['clade'].isin(['20A', '22C'])]
    .assign(mut_type=lambda rates: rates['mut_type'].str.replace('to', '>'))
    .rename(columns={'mut_type': 'Mut', 'rate': 'MutSpec'})
)

In [25]:
df_rates

Unnamed: 0,clade,Mut,count,total_count,fraction,parent_nt,parent_nt_frac,MutSpec
0,20A,A>C,286,17202,0.016626,A,0.28962,0.057407
1,20A,A>G,1617,17202,0.094001,A,0.28962,0.32457
2,20A,A>T,515,17202,0.029938,A,0.28962,0.10337
3,20A,C>A,339,17202,0.019707,C,0.13704,0.14381
4,20A,C>G,130,17202,0.007557,C,0.13704,0.055147
5,20A,C>T,7113,17202,0.4135,C,0.13704,3.0174
6,20A,G>A,837,17202,0.048657,G,0.064987,0.74872
7,20A,G>C,137,17202,0.007964,G,0.064987,0.12255
8,20A,G>T,2670,17202,0.15521,G,0.064987,2.3884
9,20A,T>A,565,17202,0.032845,T,0.50836,0.06461


In [26]:
# Collect the equilibrium nucleotide table and the equilibrium codon table for
# every clade in df_rates. The clade label is passed as the `taxid` argument,
# so it ends up in the 'taxid' column of both tables.
nucl_eq_lst_sars = []
eq_freqs_lst_sars = []

for clade in df_rates['clade'].unique():
    rates_clade = df_rates[df_rates['clade'] == clade]
    # Run the computation once per clade and unpack both tables, instead of
    # calling it twice and throwing half of each result away.
    nucl_eq_clade, eq_freqs_clade = get_equilibrium_data(rates_clade, clade)
    nucl_eq_lst_sars.append(nucl_eq_clade)
    eq_freqs_lst_sars.append(eq_freqs_clade)

  eq_prob = get_equilibrium_probabilities(M).astype(float)
  eq_prob = get_equilibrium_probabilities(M).astype(float)


In [27]:
df_nucl_eq_sars = pd.concat(nucl_eq_lst_sars).reset_index(drop=True)

In [28]:
df_nucl_eq_sars

Unnamed: 0,nucl,freq,taxid
0,A,0.162199,20A
1,C,0.072122,20A
2,G,0.027598,20A
3,T,0.738081,20A
4,A,0.200377,22C
5,C,0.077208,22C
6,G,0.050313,22C
7,T,0.672101,22C


In [30]:
# df_nucl_eq_sars.to_csv('./data/nucl_eq_freq_sars_cov2.csv', index=False)

In [15]:
df_eq_freqs_sars = pd.concat(eq_freqs_lst_sars).reset_index(drop=True)

In [16]:
df_eq_freqs_sars

Unnamed: 0,cdn,freq,aa,taxid
0,AAA,0.004267,K,20A
1,AAC,0.001897,N,20A
2,AAG,0.000726,K,20A
3,AAT,0.019418,N,20A
4,ACA,0.001897,T,20A
...,...,...,...,...
123,TGT,0.022727,C,22C
124,TTA,0.090514,L,22C
125,TTC,0.034877,F,22C
126,TTG,0.022727,L,22C


In [17]:
df_eq_freqs_sars.aa.value_counts()

aa
L    12
R    12
S    12
A     8
T     8
P     8
V     8
G     8
I     6
*     6
C     4
Y     4
K     4
D     4
E     4
N     4
H     4
Q     4
F     4
M     2
W     2
Name: count, dtype: int64

In [25]:
# Sum the codon frequencies per amino acid within each clade (stored in the
# 'taxid' column), spread the amino acids into columns and discard the '*'
# column (presumably stop codons).
df_aa_eq_freq_sars = (
    df_eq_freqs_sars
    .groupby(['taxid', 'aa'])['freq']
    .sum()
    .unstack('aa')
    .drop(columns='*')
)

In [26]:
# Normalise each row so that the amino-acid frequencies of a clade sum to 1.
# After the first run 'taxid' is a regular column; move it back to the index
# first so that re-running this cell never tries to sum or divide the labels.
_aa_freq_by_clade = (
    df_aa_eq_freq_sars.set_index('taxid')
    if 'taxid' in df_aa_eq_freq_sars.columns
    else df_aa_eq_freq_sars
)
df_aa_eq_freq_sars = (
    _aa_freq_by_clade
    .div(_aa_freq_by_clade.sum(axis=1), axis=0)
    .reset_index()
)

In [28]:
# df_aa_eq_freq_sars = df_aa_eq_freq_sars.rename(columns={'taxid':'clade'})
# df_aa_eq_freq_sars['taxid'] = '2697049'

In [27]:
df_aa_eq_freq_sars

aa,taxid,A,C,D,E,F,G,H,I,K,...,M,N,P,Q,R,S,T,V,W,Y
0,20A,0.002044,0.016945,0.003724,0.000872,0.453163,0.000782,0.009731,0.119522,0.005127,...,0.003392,0.021885,0.005341,0.00228,0.002916,0.058378,0.012011,0.020914,0.000577,0.099586
1,22C,0.004049,0.026409,0.007873,0.002634,0.352779,0.002638,0.012082,0.133302,0.010491,...,0.007062,0.031357,0.006213,0.004042,0.006683,0.061958,0.016124,0.035244,0.001773,0.105176


In [30]:
# df_aa_eq_freq_sars.to_csv('./data/aminoacid_eq_freq_sars_cov2.csv', index=False)