The purpose of this calculation is to reproduce the results from [Asuncion et al., Philippine Journal of Science, 2019](https://github.com/pgniewko/conotoxins/tree/master/papers/ML/ASA.PJS.2019.pdf). The data used in this work was downloaded from the [ConoServer](http://www.conoserver.org/?page=download) (access date: 11/18/19). Instead of using the [protr](https://cran.r-project.org/web/packages/protr/vignettes/protr.html) package for peptide featurization, we use a Python implementation, [pydpi](https://pypi.org/project/pydpi/#files). Other tools could also be used, e.g. [iFeature](https://github.com/Superzchen/iFeature/) by [Chen et al., Bioinformatics 2018](https://github.com/pgniewko/conotoxins/tree/master/papers/ML/iFeature.2018.pdf).

In [1]:
import sys
import xml.etree.ElementTree as ET

from toxin import Toxin
from experiment import Experiment

In [2]:
# Parse the ConoServer protein XML dump and build one Toxin per <entry> element.
db_file = "../data/conoserver/conoserver_protein.xml"
tree = ET.parse(db_file)
root = tree.getroot()


def _child_text(node, tag, missing="None"):
    """Return the text of the first ``tag`` child of ``node``.

    ``missing`` is returned only when the child element is absent; a child
    that is present but empty still yields its own ``.text`` unchanged.
    """
    child = node.find(tag)
    if child is None:
        return missing
    return child.text


toxins = []
for entry in root.iter('entry'):
    # The id falls back to a real None; every other field falls back to the
    # string "None".
    pid = _child_text(entry, 'id', missing=None)
    seq = _child_text(entry, 'sequence')
    name = _child_text(entry, 'name')
    toxin_class = _child_text(entry, 'class')
    organism = _child_text(entry, 'organismLatin')
    geneSuperfamily = _child_text(entry, 'geneSuperfamily')
    # NOTE(review): the next tag names are kept exactly as spelled in the
    # lookups; presumably they mirror the XML schema -- verify before "fixing".
    cysteineFramewrok = _child_text(entry, 'cysteineFramewrok')
    pharmacologicalFamily = _child_text(entry, 'pharmacologicalFamily')
    isoelecticPoint = _child_text(entry, 'isoelecticPoint')

    # Positional order must match the Toxin constructor.
    toxin = Toxin(
        pid,
        seq,
        name,
        toxin_class,
        organism,
        geneSuperfamily,
        cysteineFramewrok,
        pharmacologicalFamily,
        isoelecticPoint,
    )
    toxins.append(toxin)
    


In [3]:
# Quick check that pydpi is importable and accepts a sequence: a hard-coded
# amino-acid string is handed to a fresh PyPro object.
# NOTE(review): `cds` is not used again in the visible cells.
from pydpi.pypro import PyPro
protein="ADGCGVGEGTGQGPMCNCMCMKWVYADEDAADLESDSFADEDASLESDSFPWSNQRVFCSFADEDAS"
cds = PyPro()
cds.ReadProteinSequence(protein)
# Show the fifth parsed entry (index 4) as a sanity check of the XML parsing.
print(toxins[4])

P00005
conotoxin
A superfamily
Conus pergrandis
FDGRNAAANDKASDLVALTVRGCCSHPACSVNHPELCG
None


In [4]:
# Build an Experiment from all parsed toxins (min_val=1) and let it prepare the
# data. `data` is consumed below as a mapping: family name -> collection of toxins.
# NOTE(review): the lines printed by this cell (all "synthetic construct") look
# like entries reported by prepare_experiment() -- confirm in experiment.py.
experiment = Experiment(toxins, min_val=1)
data = experiment.prepare_experiment()

P00002 synthetic construct alpha conotoxin conotoxin
P00003 synthetic construct alpha conotoxin conotoxin
P00004 synthetic construct alpha conotoxin conotoxin
P00033 synthetic construct alpha conotoxin conotoxin
P00034 synthetic construct alpha conotoxin conotoxin
P00035 synthetic construct alpha conotoxin conotoxin
P00075 synthetic construct alpha conotoxin conotoxin
P00112 synthetic construct alpha conotoxin conotoxin
P00408 synthetic construct None conotoxin
P00505 synthetic construct alpha conotoxin conotoxin
P00516 synthetic construct alpha conotoxin conotoxin
P00517 synthetic construct alpha conotoxin conotoxin
P01262 synthetic construct None contryphan
P01355 synthetic construct None contryphan
P01541 synthetic construct delta conotoxin conotoxin
P01570 synthetic construct mu conotoxin conotoxin
P01612 synthetic construct alpha conotoxin conotoxin
P01657 synthetic construct omega conotoxin conotoxin
P01658 synthetic construct omega conotoxin conotoxin
P01661 synthetic construct 

In [8]:
# NOTE(review): list_ is not used in any of the visible cells.
list_ = []

# Locate the entry by its id string, then print it.
# Presumably Toxin compares equal to a plain id string (custom __eq__ in
# toxin.py) -- that is what makes list.index() work here; confirm there.
position = toxins.index('P00022')
print(toxins[position])

P00022
conotoxin
A superfamily
Conus geographus
ECCNPACGRHYSCGK
alpha conotoxin


In [5]:
# Report how many toxins ended up under each key of `data`.
for family, members in data.items():
    print(family, len(members))

alpha conotoxin 55
iota conotoxin 2
mu conotoxin 23
kappa conotoxin 8
chi conotoxin 4
omega conotoxin 31
gamma conotoxin 4
delta conotoxin 18
epsilon conotoxin 1
rho conotoxin 1
sigma conotoxin 1


In [6]:
# Dump every toxin filed under 'alpha conotoxin', each followed by a
# '----' separator line.
for member in data['alpha conotoxin']:
    print(member, '----', sep='\n')

P00001
conotoxin
A superfamily
Conus striatus
ICCNPACGPKYSCX
alpha conotoxin
----
P00006
conotoxin
A superfamily
Conus omaria
GCCSHPACNVNNPHICGX
alpha conotoxin
----
P00010
conotoxin
A superfamily
Conus imperialis
GCCSDPRCAWRCX
alpha conotoxin
----
P00015
conotoxin
None
Conus anemone
GGCCSHPACAANNQDYCX
alpha conotoxin
----
P00022
conotoxin
A superfamily
Conus geographus
ECCNPACGRHYSCGK
alpha conotoxin
----
P00023
conotoxin
A superfamily
Conus magus
GRCCHPACGKNYSCX
alpha conotoxin
----
P00024
conotoxin
A superfamily
Conus geographus
ECCHPACGKHFSCX
alpha conotoxin
----
P00025
conotoxin
A superfamily
Conus striatus
YCCHPACGKNFDCX
alpha conotoxin
----
P00026
conotoxin
A superfamily
Conus bullatus
GCCSTPPCAVLYCX
alpha conotoxin
----
P00028
conotoxin
A superfamily
Conus regius
GCCSDPRCKHQCX
alpha conotoxin
----
P00029
conotoxin
A superfamily
Conus regius
GCCSDPRCKHECX
alpha conotoxin
----
P00030
conotoxin
A superfamily
Conus regius
GCCSDPRCRYRCX
alpha conotoxin
----
P00031
conotoxin
A superf

In [7]:
# Ca-channel total: epsilon (1) + omega (31), taken from the per-family counts
# printed by the earlier `data.items()` loop.
1+31 # Ca-channels,epsilon+omega

32

In [8]:
# K-channel total: kappa conotoxin only (8 in the per-family counts printed
# by the earlier `data.items()` loop).
8 # K-channel, kappa

8

In [9]:
# Number of Toxin objects built from the XML <entry> elements.
len(toxins)

6260

In [10]:
# Membership test of a plain id string against the list of Toxin objects.
# NOTE(review): this only works if Toxin compares equal to a str (presumably
# a custom __eq__ in toxin.py) -- confirm there.
'P00001' in toxins

True

P00025
conotoxin
Conus striatus
YCCHPACGKNFDCX
alpha conotoxin


In [10]:
# Fetch the Toxin whose id matches 'P00022' via list.index().
# NOTE(review): relies on Toxin comparing equal to a str id -- confirm in
# toxin.py. list.index() raises ValueError if no entry matches.
toxins[toxins.index('P00022')]