The purpose of this calculation is to reproduce the results from [Asuncion et al., Philippine Journal of Science, 2019](https://github.com/pgniewko/conotoxins/tree/master/papers/ML/ASA.PJS.2019.pdf). The data used in this work was downloaded from the [ConoServer](http://www.conoserver.org/?page=download) (access date: 11/18/19). Instead of using the [protr](https://cran.r-project.org/web/packages/protr/vignettes/protr.html) package for peptide featurization, we use a Python implementation, [pydpi](https://pypi.org/project/pydpi/#files). Other tools could also be used, e.g. [iFeature](https://github.com/Superzchen/iFeature/) by [Chen et al., Bioinformatics 2018](https://github.com/pgniewko/conotoxins/tree/master/papers/ML/iFeature.2018.pdf).

In [2]:
import sys
import xml.etree.ElementTree as ET

from toxin import Toxin

In [3]:
def _entry_text(entry, tag):
    """Return the text of the first ``tag`` child of ``entry``, or "None" if absent."""
    node = entry.find(tag)
    # Compare against None explicitly: an Element without children is falsy,
    # so a plain truthiness test would treat existing leaf elements as missing.
    return node.text if node is not None else "None"


# Load the ConoServer protein XML file and build one Toxin per <entry> element.
db_file = "../data/conoserver/conoserver_protein.xml"
tree = ET.parse(db_file)
root = tree.getroot()

toxins = []
for entry in root.iter('entry'):
    # A child element that is missing yields the placeholder string "None".
    # NOTE(review): 'cysteineFramewrok' and 'isoelecticPoint' look misspelled;
    # they are kept exactly as looked up here -- confirm against the XML schema
    # before changing them.
    seq = _entry_text(entry, 'sequence')
    name = _entry_text(entry, 'name')
    toxin_class = _entry_text(entry, 'class')
    organism = _entry_text(entry, 'organismLatin')
    geneSuperfamily = _entry_text(entry, 'geneSuperfamily')
    cysteineFramewrok = _entry_text(entry, 'cysteineFramewrok')
    pharmacologicalFamily = _entry_text(entry, 'pharmacologicalFamily')
    isoelecticPoint = _entry_text(entry, 'isoelecticPoint')

    # Arguments are passed positionally, in the same order as before.
    toxin = Toxin(
        seq,
        name,
        toxin_class,
        organism,
        geneSuperfamily,
        cysteineFramewrok,
        pharmacologicalFamily,
        isoelecticPoint,
    )
    toxins.append(toxin)
    


In [4]:
# Create a PyPro object from pydpi and hand it a hard-coded sequence string.
# The resulting `cds` object is not used again in the cells visible here.
from pydpi.pypro import PyPro
protein="ADGCGVGEGTGQGPMCNCMCMKWVYADEDAADLESDSFADEDASLESDSFPWSNQRVFCSFADEDAS"
cds = PyPro()
# NOTE(review): presumably this stores the sequence on `cds` for later
# descriptor calculations -- confirm against the pydpi documentation.
cds.ReadProteinSequence(protein)

In [5]:
# Print the string form of the entry at index 4 of the `toxins` list.
print(toxins[4])

FDGRNAAANDKASDLVALTVRGCCSHPACSVNHPELCG


In [6]:
# Print whatever get_features() returns for the entry at index 4 of `toxins`.
print(toxins[4].get_features())

[15.789, 5.263, 7.895, 7.895, 10.526, 2.632, 0.0, 7.895, 5.263, 0.0, 7.895, 2.632, 0.0, 2.632, 5.263, 7.895, 2.632, 0.0, 0.0, 7.895, 2.7, 0.0, 2.7, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 2.7, 2.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 5.41, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 2.7, 0.0, 0.0, 2.7, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0