In [None]:
# download the file model_trained_on_original_dataset.h5 from https://github.com/nicolagulmini/spaan or
!git clone git://github.com/nicolagulmini/spaan
!pip install Bio # biopython to handle fasta sequences

# tensorflow is assumed to be installed already (e.g. on Colab); this only imports Keras from it
from tensorflow import keras

from Bio import SeqIO
from spaan.data_processing import *
import numpy as np

Cloning into 'spaan'...
remote: Enumerating objects: 167, done.[K
remote: Counting objects: 100% (167/167), done.[K
remote: Compressing objects: 100% (133/133), done.[K
remote: Total 167 (delta 92), reused 81 (delta 31), pack-reused 0[K
Receiving objects: 100% (167/167), 5.63 MiB | 12.66 MiB/s, done.
Resolving deltas: 100% (92/92), done.
Collecting Bio
  Downloading bio-0.7.4-py3-none-any.whl (80 kB)
[K     |████████████████████████████████| 80 kB 4.8 MB/s 
Collecting biopython>=1.79
  Downloading biopython-1.79-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.whl (2.3 MB)
[K     |████████████████████████████████| 2.3 MB 22.4 MB/s 
Installing collected packages: biopython, Bio
Successfully installed Bio-0.7.4 biopython-1.79


In [None]:
# Location of the pretrained network inside the cloned repository;
# point MODEL_PATH elsewhere if the .h5 file was downloaded manually.
MODEL_PATH = './spaan/model_trained_on_original_dataset.h5'
model = keras.models.load_model(MODEL_PATH)

In [None]:
def fit_in_nn(x, n_inputs=7):
    """Regroup per-sample feature lists into per-input arrays for the multi-input network.

    Each element of ``x`` is an indexable holding (at least) ``n_inputs`` feature
    groups. The function transposes that layout: the k-th returned array stacks
    the k-th feature group of every sample, in the order the samples appear.

    Args:
        x: iterable of samples; every sample must support ``sample[k]`` for
            ``k`` in ``range(n_inputs)``.
        n_inputs: number of feature groups to extract per sample. Defaults to 7,
            the number of groups this notebook builds for each protein.

    Returns:
        A tuple of ``n_inputs`` numpy arrays. For empty ``x`` every array is empty.

    Raises:
        IndexError: if a sample holds fewer than ``n_inputs`` feature groups.
    """
    columns = [[] for _ in range(n_inputs)]
    for sample in x:
        # Index explicitly (rather than zip) so a short sample fails loudly
        # instead of being silently truncated.
        for k in range(n_inputs):
            columns[k].append(sample[k])
    return tuple(np.array(column) for column in columns)

In [None]:
# Parse the tab-separated SPAAN results table into two parallel lists:
#   results[k] - the float found in column 1 of the k-th data row
#   id_list[k] - the identifier taken from column 2 of the same row
id_list = []
results = []
with open("spaan/data/results") as f:  # context manager closes the file handle
    next(f, None)  # the first line is skipped (presumably a column header)
    for line in f:
        if not line.strip():
            continue  # tolerate blank lines, e.g. a trailing newline at end of file
        fields = line.split('\t')
        results.append(float(fields[1]))
        # Keep the first space-delimited token and drop its leading character
        # (presumably the '>' of a FASTA header - confirm against the file).
        id_list.append(fields[2].split(' ')[0][1:])

815


In [None]:
positive_ds = list(SeqIO.parse("spaan/data/adh.fasta", "fasta"))
negative_ds = list(SeqIO.parse("spaan/data/not_adh.fasta", "fasta"))


def _feature_groups(sequence):
    """Return the seven feature groups computed for one protein sequence.

    The order matters: it is the order in which the groups are later fed to
    the multi-input network.
    """
    return [
        aminoacids_frequencies(sequence),
        multiplet_frequencies(sequence, 3),
        multiplet_frequencies(sequence, 4),
        multiplet_frequencies(sequence, 5),
        dipeptide_frequencies(sequence),
        charge_composition(sequence),
        hydrophobic_composition(sequence),
    ]


# Set lookup instead of scanning the id list once per protein.
wanted_ids = set(id_list)

# x holds one list of feature groups per selected protein, y the matching label
# (1 for records from adh.fasta, 0 for records from not_adh.fasta). Positives
# are appended first, then negatives, each in FASTA file order.
x, y = [], []
for dataset, label in ((positive_ds, 1), (negative_ds, 0)):
    for protein in dataset:
        if protein.id in wanted_ids:
            x.append(_feature_groups(protein.seq))
            y.append(label)
print(len(x))

815


In [None]:
# evaluate() returns a list of values; the entry at index 1 is the one reported
# here as accuracy (presumably the metric the model was compiled with).
evaluation = model.evaluate(x=fit_in_nn(x), y=np.array(y), verbose=0)
print("Test accuracy: " + str(evaluation[1]))

Test accuracy: 0.8098159432411194


In [None]:
def _confusion_counts(labels, scores):
    """Tally confusion-matrix counts for binary labels against probability scores.

    A score is turned into a predicted class with ``round(float(score))``.

    Args:
        labels: iterable of true labels (1 = positive, anything else = negative).
        scores: iterable of scores, aligned element-wise with ``labels``.

    Returns:
        ``(true_positive, true_negative, false_positive, false_negative)`` counts.
    """
    tp = tn = fp = fn = 0
    for label, score in zip(labels, scores):
        predicted = round(float(score))
        if label == 1:
            if predicted == label:
                tp += 1
            else:
                fn += 1  # a real positive that was missed is a false NEGATIVE
        elif predicted == label:
            tn += 1
        else:
            fp += 1  # a real negative flagged as positive is a false POSITIVE
    return tp, tn, fp, fn


# results[j] is compared against y[j], so both lists must at least have equal length.
# NOTE(review): this also assumes the results file lists proteins in the same
# order in which x / y were built - confirm against the data files.
if len(results) != len(x):
    raise ValueError(
        "SPAAN results (%d) and dataset (%d) differ in length" % (len(results), len(x))
    )

# One batched forward pass instead of one predict() call per protein.
if x:
    model_scores = np.asarray(model.predict(list(fit_in_nn(x)), verbose=0)).reshape(-1)
else:
    model_scores = []

true_positive, true_negative, false_positive, false_negative = _confusion_counts(
    y, model_scores
)

(
    spaan_true_positive,
    spaan_true_negative,
    spaan_false_positive,
    spaan_false_negative,
) = _confusion_counts(y, results)

In [None]:
# Convert the raw counts into percentages of the dataset size, rounded to two
# decimals. The names are overwritten in place, so run this cell only once per
# tally: a second run would rescale values that are already percentages.
true_positive, true_negative, false_positive, false_negative = (
    round(count / len(x) * 100, 2)
    for count in (true_positive, true_negative, false_positive, false_negative)
)

(
    spaan_true_positive,
    spaan_true_negative,
    spaan_false_positive,
    spaan_false_negative,
) = (
    round(count / len(x) * 100, 2)
    for count in (
        spaan_true_positive,
        spaan_true_negative,
        spaan_false_positive,
        spaan_false_negative,
    )
)

In [None]:
# Report for the neural network: one "<name> <value> %" line per outcome.
model_report = "\n".join([
    f"true positive {true_positive} %",
    f"true negative {true_negative} %",
    f"false positive {false_positive} %",
    f"false negative {false_negative} %.",
])
print(model_report)

# Same report for the SPAAN scores, preceded by an empty line.
spaan_report = "\n".join([
    f"\nspaan_true positive {spaan_true_positive} %",
    f"spaan_true negative {spaan_true_negative} %",
    f"spaan_false positive {spaan_false_positive} %",
    f"spaan_false negative {spaan_false_negative} %.",
])
print(spaan_report)

true positive 33.87 %
true negative 47.12 %
false positive 12.64 %
false negative 6.38 %.

spaan_true positive 34.23 %
spaan_true negative 48.83 %
spaan_false positive 12.27 %
spaan_false negative 4.66 %.
