In [1]:
import pandas as pd
import numpy as np

In [2]:
codons = pd.read_csv('codon-table-grouped.csv')
codons

Unnamed: 0,aminoacid,codon
0,A,GCA
1,A,GCC
2,A,GCG
3,A,GCT
4,C,TGC
...,...,...
59,V,GTG
60,V,GTT
61,W,TGG
62,Y,TAC


In [4]:
codons.aminoacid.unique()

array(['A', 'C', 'D', 'E', 'F', 'G', 'H', 'I', 'K', 'L', 'M', 'N', 'P',
       'Q', 'R', '*', 'S', 'T', 'V', 'W', 'Y'], dtype=object)

In [5]:
codon_usage = pd.read_excel('codon_usage_Human.xlsx')
codon_usage

Unnamed: 0,Triplet,Amino_acid,Fraction,Frequency_Thousand,Number
0,TTT,F,0.45,16.9,336562
1,TTC,F,0.55,20.4,406571
2,TTA,L,0.07,7.2,143715
3,TTG,L,0.13,12.6,249879
4,TAT,Y,0.43,12.0,239268
...,...,...,...,...,...
59,GCG,A,0.11,7.6,150708
60,GGT,G,0.16,10.8,215544
61,GGC,G,0.34,22.8,453917
62,GGA,G,0.25,16.3,325243


In [22]:
def find_aminoacid(codon='ATG'):
    return codons[codons.codon == codon].aminoacid.values[0]

find_aminoacid('ATG')

'M'

In [23]:
def most_frequent(aminoacid='L'):
    return codon_usage[codon_usage.Amino_acid == aminoacid].sort_values(by='Fraction', ascending=False).iloc[0,0]

most_frequent('L')

'CTG'

In [6]:
virac = pd.read_csv('side-by-side.csv')
virac

Unnamed: 0,abspos,codonOrig,codonVaccine
0,0,ATG,ATG
1,3,TTT,TTC
2,6,GTT,GTG
3,9,TTT,TTC
4,12,CTT,CTG
...,...,...,...
1269,3807,TTA,CTG
1270,3810,CAT,CAC
1271,3813,TAC,TAC
1272,3816,ACA,ACA


In [24]:
original_seq = list(virac.codonOrig.values)
vaccine_seq = list(virac.codonVaccine.values)
original_aminoacid_seq = [find_aminoacid(i) for i in original_seq]
vaccine_aminoacid_seq = [find_aminoacid(i) for i in vaccine_seq]

In [31]:
for index, i in enumerate(zip(original_aminoacid_seq, vaccine_aminoacid_seq)):
    if i[0]!=i[1]:
        print(index, i[0], i[1])

985 K P
986 V P


In [44]:
pred_seq = []

for i in virac.values:
    aa = find_aminoacid(i[1])
    pred = most_frequent(aa)
    pred_seq.append(pred)
    
pred_aminoacid_seq = [find_aminoacid(i) for i in pred_seq]

In [50]:
def performance(pred, vaccine):
    full = len(vaccine)
    diff = 0
    for index, i in enumerate(zip(pred_seq, vaccine_seq)):
        if i[0]!=i[1]:
            diff += 1
            
    return (full-diff)/full


print('AA %', performance(pred_aminoacid_seq, vaccine_aminoacid_seq))
print('Codon %', performance(pred_seq, vaccine_seq))
print('DNA %', performance(''.join(pred_seq), ''.join(vaccine_seq)))

AA % 0.7810047095761381
Codon % 0.7810047095761381
DNA % 0.9270015698587127


In [43]:
for index, i in enumerate(zip(pred_seq, vaccine_seq)):
    if i[0]!=i[1]:
        print(index, i[0], i[1])

8 CCC CCT
11 AGC TCC
14 TGC TGT
20 CGG AGA
21 ACC ACA
24 CCC CCT
25 CCC CCA
31 TTC TTT
33 CGG AGA
43 CGG AGA
44 AGC TCC
49 AGC TCT
56 CCC CCT
70 AGC TCC
73 AAC AAT
77 CGG AGA
88 GGC GGG
91 TTC TTT
97 AGC TCC
101 CGG AGA
108 ACC ACA
126 GTG GTC
128 AAG AAA
142 GTG GTC
153 GAG GAA
171 AGC TCC
173 CCC CCT
179 GAG GAA
189 CGG CGC
193 TTC TTT
208 CCC CCT
211 CTG CTC
214 GAC GAT
216 CCC CCT
220 AGC TCT
221 GCC GCT
223 GAG GAA
227 GAC GAT
237 TTC TTT
239 ACC ACA
245 CGG AGA
249 ACC ACA
250 CCC CCT
252 GAC GAT
256 GGC GGA
258 ACC ACA
259 GCC GCT
260 GGC GGT
263 GCC GCT
265 TAC TAT
271 CCC CCT
272 CGG AGA
289 GAC GAT
290 TGC TGT
291 GCC GCT
293 GAC GAT
294 CCC CCT
298 ACC ACA
304 AGC TCC
308 GAG GAA
323 GAG GAA
324 AGC TCC
330 AAC AAT
333 AAC AAT
342 AAC AAT
345 CGG AGA
348 AGC TCT
359 AAC AAT
365 AGC TCC
370 AGC TCC
382 AGC TCC
383 CCC CCT
392 ACC ACA
403 GGC GGA
404 GAC GAT
405 GAG GAA
409 ATC ATT
411 CCC CCT
412 GGC GGA
414 ACC ACA
431 TGC TGT
433 ATC ATT
442 AGC TCC
443 AAG AAA
444 GTG GTC
