This script mines data from a Nextstrain exported dataset.<br />
This dataset contains the entropies of all residues in all SARS-CoV-2 genes.<br />
We are particularly interested in the spike residues.<br />

To obtain the dataset, these steps were performed (14/04/2021):
1. Go to the [Nextstrain web page](https://nextstrain.org/ncov/global?c=gt-S_570)
2. At the very bottom of the web page, click on 'download data'
3. Download the 'genetic diversity data (tsv)' dataset
    
The reference genome for this data is here https://www.ncbi.nlm.nih.gov/nuccore/MN908947<br />
The sequence of the spike protein residues was manually copied

In [10]:
# Residue sequence of the spike protein, one letter per residue.
# It is written as adjacent string literals, which Python joins into a single string.
sequence = ('MFVFLVLLPLVSSQCVNLTTRTQLPPAYTNSFTRGVYYPDKVFR'
            'SSVLHSTQDLFLPFFSNVTWFHAIHVSGTNGTKRFDNPVLPFNDGVYFASTEKSNIIR'
            'GWIFGTTLDSKTQSLLIVNNATNVVIKVCEFQFCNDPFLGVYYHKNNKSWMESEFRVY'
            'SSANNCTFEYVSQPFLMDLEGKQGNFKNLREFVFKNIDGYFKIYSKHTPINLVRDLPQ'
            'GFSALEPLVDLPIGINITRFQTLLALHRSYLTPGDSSSGWTAGAAAYYVGYLQPRTFL'
            'LKYNENGTITDAVDCALDPLSETKCTLKSFTVEKGIYQTSNFRVQPTESIVRFPNITN'
            'LCPFGEVFNATRFASVYAWNRKRISNCVADYSVLYNSASFSTFKCYGVSPTKLNDLCF'
            'TNVYADSFVIRGDEVRQIAPGQTGKIADYNYKLPDDFTGCVIAWNSNNLDSKVGGNYN'
            'YLYRLFRKSNLKPFERDISTEIYQAGSTPCNGVEGFNCYFPLQSYGFQPTNGVGYQPY'
            'RVVVLSFELLHAPATVCGPKKSTNLVKNKCVNFNFNGLTGTGVLTESNKKFLPFQQFG'
            'RDIADTTDAVRDPQTLEILDITPCSFGGVSVITPGTNTSNQVAVLYQDVNCTEVPVAI'
            'HADQLTPTWRVYSTGSNVFQTRAGCLIGAEHVNNSYECDIPIGAGICASYQTQTNSPR'
            'RARSVASQSIIAYTMSLGAENSVAYSNNSIAIPTNFTISVTTEILPVSMTKTSVDCTM'
            'YICGDSTECSNLLLQYGSFCTQLNRALTGIAVEQDKNTQEVFAQVKQIYKTPPIKDFG'
            'GFNFSQILPDPSKPSKRSFIEDLLFNKVTLADAGFIKQYGDCLGDIAARDLICAQKFN'
            'GLTVLPPLLTDEMIAQYTSALLAGTITSGWTFGAGAALQIPFAMQMAYRFNGIGVTQN'
            'VLYENQKLIANQFNSAIGKIQDSLSSTASALGKLQDVVNQNAQALNTLVKQLSSNFGA'
            'ISSVLNDILSRLDKVEAEVQIDRLITGRLQSLQTYVTQQLIRAAEIRASANLAATKMS'
            'ECVLGQSKRVDFCGKGYHLMSFPQSAPHGVVFLHVTYVPAQEKNFTTAPAICHDGKAH'
            'FPREGVFVSNGTHWFVTQRNFYEPQIITTDNTFVSGNCDVVIGIVNNTVYDPLQPELD'
            'SFKEELDKYFKNHTSPDVDLGDISGINASVVNIQKEIDRLNEVAKNLNESLIDLQELG'
            'KYEQYIKWPWYIWLGFIAGLIAIVMVTIMLCCMTSCCSCLKGCCSCGSCCKFDEDDSE'
            'PVLKGVKLHYT')

# Print the number of residues in the sequence
print(len(sequence))

1273


In [12]:
from pandas import read_csv

# Directory where the Nextstrain export was downloaded
path = '/home/dbeltran_local/downloads/'
# IPython magic (not plain Python): change the working directory to 'path'
# so the dataset can be opened below by its bare file name
%cd $path

# The dataset format is tsv (tab-separated values)
mutations_dataset = 'nextstrain_ncov_global_diversity.tsv'

# Then we read the file using the tab character as separator
df = read_csv(mutations_dataset, sep='\t')

# The original dataset contains data about many other covid related genes
# Get only spike mutations: keep the rows whose 'gene' column equals 'S'
df = df.loc[df['gene'] == 'S']

# Harvest all the entropies from the dataset
# Use the reference protein sequence as a mold: the result holds exactly one
# value per residue of 'sequence', in sequence order
#
# Index the dataset once by residue position instead of filtering the whole
# DataFrame again for every residue. When a position appears in several rows
# only the first one is kept, i.e. the same row a per-residue lookup would pick
first_rows = df.drop_duplicates(subset='position', keep='first')
entropy_by_position = dict(zip(first_rows['position'], first_rows['entropy']))

# WARNING: Residue numbering in the dataset goes from 1 to n, while list
# indices go from 0 to n-1
# Positions missing from the dataset get an entropy of 0
# Values found are rounded to the third decimal and converted to plain Python
# floats, so the printed list contains bare numbers that can be pasted as is
entropies = [
    round(float(entropy_by_position[position]) * 1000) / 1000
    if position in entropy_by_position else 0
    for position in range(1, len(sequence) + 1)
]
print(len(entropies))

# Now print entropies and update the database manually
# The whole array may not fit in the mongo shell buffer, so you may have to do it in pieces
print(entropies)

/home/dbeltran_local/downloads
1273
[0.002, 0.005, 0.011, 0, 0.1, 0.026, 0, 0.006, 0.01, 0, 0, 0.043, 0.079, 0.011, 0.002, 0.019, 0, 0.213, 0.029, 0.103, 0.017, 0.016, 0.016, 0, 0.002, 0.097, 0.03, 0.004, 0.013, 0, 0, 0, 0.007, 0, 0.016, 0, 0, 0, 0, 0, 0.002, 0.004, 0, 0, 0, 0, 0, 0, 0.014, 0, 0.006, 0.04, 0, 0.023, 0, 0, 0, 0, 0, 0, 0.002, 0, 0.006, 0.013, 0.006, 0.006, 0.057, 0.012, 0.46, 0.459, 0.014, 0.019, 0.01, 0.008, 0.044, 0.031, 0.009, 0.013, 0, 0.233, 0, 0.002, 0, 0, 0.002, 0, 0, 0, 0, 0.002, 0, 0, 0, 0.01, 0.072, 0.022, 0, 0.047, 0, 0, 0.002, 0.032, 0, 0, 0.002, 0, 0.004, 0, 0, 0, 0.004, 0.002, 0.002, 0, 0, 0.002, 0, 0.002, 0.004, 0, 0, 0, 0, 0, 0, 0, 0, 0.002, 0, 0.002, 0, 0.002, 0.002, 0, 0, 0.002, 0.002, 0.131, 0.002, 0.006, 0.045, 0.062, 0.049, 0.446, 0.009, 0.027, 0.002, 0.005, 0, 0.005, 0.01, 0.141, 0.072, 0.023, 0.007, 0.008, 0.091, 0.008, 0.002, 0.004, 0.004, 0.002, 0, 0.002, 0, 0, 0, 0, 0, 0.002, 0.002, 0, 0.008, 0.007, 0, 0.015, 0.005, 0.026, 0, 0.007, 0.013, 0, 0,