In [61]:
import argparse
import os
import numpy as np
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import GridSearchCV
from sklearn import metrics

## Create input vectors

In [4]:
# Input locations, given relative to the notebook's working directory.
ppi_file = "../data/ppi_data.fasta"
vectors_file = "../output/sp_words-n3-w5-s100-k10-1.w2v"
# Display whether each input exists on disk (vectors file first, then PPI file).
tuple(os.path.isfile(path) for path in (vectors_file, ppi_file))

In [5]:
names = []
seqs = []
ppis = []
# The file is consumed as repeating three-line records:
#   line 0 -> name, line 1 -> sequence, line 2 -> per-position labels.
record_fields = (names, seqs, ppis)
with open(ppi_file) as handle:
    for line_no, raw in enumerate(handle):
        slot = line_no % 3
        # Name lines lose their first character (the record marker) before stripping.
        text = raw[1:] if slot == 0 else raw
        record_fields[slot].append(text.strip())
len(names)

540

In [6]:
# Load the vector file into a {token: feature array} lookup.
# Every non-empty line is split on whitespace: the first field becomes the key
# and the remaining fields are parsed as floats.
# NOTE(review): if the file begins with a "<count> <dim>" header line, that line
# is stored as an ordinary entry here — confirm against the file.
vectors = {}
with open(vectors_file) as file:
    for line in file:
        aa_vec = line.split()
        if not aa_vec:
            # Blank line: nothing to index (this used to raise IndexError).
            continue
        vectors[aa_vec[0]] = np.array([float(x) for x in aa_vec[1:]])
len(vectors)

9617

In [7]:
def convert_seq_gram(long_gram, vectors, offset=3):
    """Sum the feature vectors of every short gram contained in ``long_gram``.

    Args:
        long_gram: String window to decompose.
        vectors: Mapping from a short-gram string to its feature vector.
        offset: Length of each short gram (default 3).

    Returns:
        The element-wise sum of the vectors of all ``offset``-length
        substrings of ``long_gram``, taken at every start position.

    Raises:
        ValueError: If ``offset`` is not positive or ``long_gram`` is shorter
            than ``offset`` (no complete short gram exists).
        KeyError: If a short gram is missing from ``vectors``.
    """
    # Derive the gram count from the window itself. The former hard-coded
    # ``2*offset-1`` only matches this count when offset == 3; for other sizes
    # it either skipped grams or produced truncated keys.
    n_grams = len(long_gram) - offset + 1
    if offset < 1 or n_grams < 1:
        raise ValueError(
            "long_gram must contain at least one gram of length offset"
        )
    seq_gram_strings = [long_gram[i:i + offset] for i in range(n_grams)]  # short grams within long gram
    seq_gram_vectors = [vectors[gram] for gram in seq_gram_strings]  # convert into feature vectors
    return np.array(seq_gram_vectors).sum(axis=0)  # sum up short grams

def max_ppi_label(ppi_string):
    """Return the majority binary label of a label string.

    Each character is mapped to 0 when it is "-" and to 1 otherwise; the
    value that occurs most often is returned. When both values occur equally
    often, the result follows the iteration order of the set of values.
    """
    flags = [0 if symbol == "-" else 1 for symbol in ppi_string]
    tally = {}
    for value in set(flags):
        tally[value] = flags.count(value)
    return max(tally, key=tally.get)

In [9]:
offset = 3
window = 2 * offset + 1  # length of each sliding window
features = []
labels = []
for idx in range(len(names)):
    seq, ppi = seqs[idx], ppis[idx]
    # Slide a full-length window over the sequence, one position at a time.
    for start in range(len(seq) - 2 * offset):
        stop = start + window
        # summed gram vectors for the residues in the window
        features.append(convert_seq_gram(seq[start:stop], vectors, offset))
        # majority PPI label over the same window
        labels.append(max_ppi_label(ppi[start:stop]))
len(features), len(labels)

(89541, 89541)

## Train Model

In [51]:
# Train on the first 100 windows only.
# NOTE(review): presumably a quick-run subset — confirm before drawing conclusions.
subset = slice(0, 100)
X, y = features[subset], labels[subset]

In [52]:
# Grid explored by the search: 3 x 2 x 3 = 18 parameter combinations.
# NOTE(review): the estimator keeps its default max_iter, so n_iter_no_change
# values at or above it can presumably never trigger — confirm that max_iter
# was not the intended knob.
parameter_space = dict(
    hidden_layer_sizes=[25, 50, 100],
    learning_rate_init=[0.001, 0.01],
    n_iter_no_change=[100, 200, 500],
)
mlp = MLPClassifier(random_state=42)

In [53]:
# Ten stratified folds without shuffling, so the splits are deterministic.
# random_state is deliberately not passed: it has no effect unless shuffle=True,
# and recent scikit-learn releases raise ValueError for that combination.
skf = StratifiedKFold(n_splits=10)
# Exhaustive search over parameter_space on all cores; .fit returns the fitted search object.
clf = GridSearchCV(mlp, parameter_space, n_jobs=-1, cv=skf).fit(X, y)



In [72]:
# Display the full cross-validation results of the grid search (one entry per parameter combination).
clf.cv_results_



{'mean_fit_time': array([0.33171282, 0.33590779, 0.33900571, 0.27934034, 0.32501683,
        0.40866849, 0.42415822, 0.39637182, 0.38607955, 0.30812349,
        0.37918332, 0.37728591, 0.85321479, 0.7892489 , 0.74207389,
        0.48612316, 0.60425568, 0.63124022]),
 'std_fit_time': array([0.03231202, 0.02453539, 0.0290133 , 0.02288382, 0.02427545,
        0.0568056 , 0.01157816, 0.01127419, 0.00978036, 0.0328819 ,
        0.01708476, 0.01806633, 0.21717413, 0.16388218, 0.02375772,
        0.0405654 , 0.0539802 , 0.04344804]),
 'mean_score_time': array([0.00209713, 0.00339706, 0.00149899, 0.00109959, 0.00129733,
        0.00240045, 0.00279901, 0.00160153, 0.00129986, 0.001     ,
        0.00260086, 0.00179925, 0.00419755, 0.00240016, 0.00389843,
        0.0009989 , 0.00159936, 0.00289824]),
 'std_score_time': array([1.75698995e-03, 3.52571258e-03, 9.22847788e-04, 2.99931577e-04,
        7.79005081e-04, 1.68643693e-03, 3.70690634e-03, 1.02107786e-03,
        6.40673018e-04, 8.05346763e-

## Metrics
Accuracy, Precision, Recall, AUC (ROC)

In [65]:
# NOTE(review): predictions are made on the same X the search was fitted on,
# so the scores computed from y_pred are training-set scores.
y_pred = clf.predict(X)
metrics.accuracy_score(y_true=y, y_pred=y_pred)

1.0

In [66]:
# Precision of the predictions against the labels y.
metrics.precision_score(y_true=y, y_pred=y_pred)

1.0

In [67]:
# Recall of the predictions against the labels y.
metrics.recall_score(y_true=y, y_pred=y_pred)

1.0

In [69]:
# Show the fitted search object's configuration.
clf

GridSearchCV(cv=StratifiedKFold(n_splits=10, random_state=42, shuffle=False),
       error_score='raise-deprecating',
       estimator=MLPClassifier(activation='relu', alpha=0.0001, batch_size='auto', beta_1=0.9,
       beta_2=0.999, early_stopping=False, epsilon=1e-08,
       hidden_layer_sizes=(100,), learning_rate='constant',
       learning_rate_init=0.001, max_iter=200, momentum=0.9,
       n_iter_no_change=10, nesterovs_momentum=True, power_t=0.5,
       random_state=42, shuffle=True, solver='adam', tol=0.0001,
       validation_fraction=0.1, verbose=False, warm_start=False),
       fit_params=None, iid='warn', n_jobs=-1,
       param_grid={'hidden_layer_sizes': [25, 50, 100], 'learning_rate_init': [0.001, 0.01], 'n_iter_no_change': [100, 200, 500]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)