In [3]:
#!/usr/bin/env python
# coding: utf-8
import numpy as np
import pandas as pd
import pickle as pk
import math
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
import lightgbm as lgb
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import roc_curve, roc_auc_score, accuracy_score, precision_score, confusion_matrix, recall_score, f1_score, auc, matthews_corrcoef
from imblearn.pipeline import make_pipeline
from mlxtend.classifier import StackingCVClassifier
from lightgbm import LGBMClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.linear_model import LogisticRegression
import joblib

# Read the training data file.
covalent = pd.read_excel('D:/JCIM-write/HyperCys/data/train_dataset.xlsx')

# Sequence-based feature columns, in the order used for training:
# helix, strand, coil, accessible_surface_area, PSSM_1..PSSM_20, monogram,
# bigram_1..bigram_20, PSEE.
feature_columns = (
    ['helix', 'strand', 'coil', 'accessible_surface_area']
    + ['PSSM_{}'.format(i) for i in range(1, 21)]
    + ['monogram']
    + ['bigram_{}'.format(i) for i in range(1, 21)]
    + ['PSEE']
)

# Keep only the target column and the selected features.
covalent = covalent.loc[:, ['label'] + feature_columns]
X = covalent.loc[:, covalent.columns != 'label']
y = covalent['label']

# Standardize the features on the training set. `scaler` stays fitted on the
# training data and `clf` below is fitted on the scaled matrix, so any data
# passed to `clf` later must first go through `scaler.transform`.
scaler = StandardScaler()
X = scaler.fit_transform(X)
print('data scaled')

# Six different base classifiers.
# The stochastic learners share the seed already used by the SVC so that the
# saved model can be reproduced from one run to the next.
classifier1 = SVC(C=100, gamma=0.001, kernel="sigmoid", probability=True, random_state=42)

classifier2 = KNeighborsClassifier(9)

classifier3 = LogisticRegression(C=0.1)

classifier4 = lgb.LGBMClassifier(colsample_bytree=0.3, max_depth=15, n_estimators=100,
                                 num_leaves=50, random_state=42)

classifier5 = MLPClassifier(alpha=10, hidden_layer_sizes=(50, 50, 50),
                            learning_rate='adaptive', solver='adam', random_state=42)

classifier6 = RandomForestClassifier(criterion="gini", max_depth=3000, min_samples_split=4,
                                     n_estimators=80, random_state=42)

# Stacked classifier: every base learner, and the logistic-regression meta
# learner, is wrapped in a pipeline that starts with a StandardScaler step.
base_pipelines = [
    make_pipeline(scaler, base_classifier)
    for base_classifier in (classifier1, classifier2, classifier3,
                            classifier4, classifier5, classifier6)
]
clf = StackingCVClassifier(
    classifiers=base_pipelines,
    shuffle=False,                    # 10 folds taken in the original row order
    use_probas=True,                  # meta features are predicted probabilities
    cv=10,
    verbose=2,
    n_jobs=-1,
    store_train_meta_features=True,
    use_features_in_secondary=True,   # meta learner also sees the input features
    meta_classifier=make_pipeline(scaler, LogisticRegression(C=0.1)),
)


clf.fit(X, y)
print('model file created')
print('saving model file in disk')
# Save the fitted stacked model to disk.
modelfile = 'D:/JCIM-write/HyperCys/model/sequence_based.model'
joblib.dump(clf, modelfile)
print('model file saved')



# Read the test data and keep the same feature columns, in the same order,
# as the training selection (every training column except 'label').
X_test = pd.read_excel('D:/JCIM-write/HyperCys/data/test_dataset.xlsx')
feature_columns = (
    ['helix', 'strand', 'coil', 'accessible_surface_area']
    + ['PSSM_{}'.format(i) for i in range(1, 21)]
    + ['monogram']
    + ['bigram_{}'.format(i) for i in range(1, 21)]
    + ['PSEE']
)
X_test = X_test.loc[:, feature_columns]


# The scaler instance fitted on the training set is reused here so the test
# data is transformed the same way the training data was.
X_test = scaler.transform(X_test)

# One row per test sample: predicted class followed by the class probabilities
# (the output format below expects exactly two probability columns).
y_pred = clf.predict(X_test)
y_pred_prob = clf.predict_proba(X_test)
y_pred = np.column_stack([y_pred, y_pred_prob])

# Save the prediction output of the test data.
# np.savetxt opens and closes the file itself when given a path, so no
# separate file handle is opened here (the previous extra open() call was
# never used or closed and leaked the handle).
output_file = 'D:/JCIM-write/HyperCys/model/sequence_based_prediction.predict'
np.savetxt(fname=output_file, X=y_pred, fmt='%d %0.4f %0.4f', header='predClass, probNonCov, probCov', comments='')


# In[ ]:






data scaled
Fitting 6 classifiers...
Fitting classifier1: pipeline (1/6)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc',
                 SVC(C=100, gamma=0.001, kernel='sigmoid', probability=True,
                     random_state=42))])


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    4.7s remaining:    1.9s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    4.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.0s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting classifier2: pipeline (2/6)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('kneighborsclassifier', KNeighborsClassifier(n_neighbors=9))])
Fitting classifier3: pipeline (3/6)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('logisticregression', LogisticRegression(C=0.1))])
Fitting classifier4: pipeline (4/6)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('lgbmclassifier',
                 LGBMClassifier(colsample_bytree=0.3, max_depth=15,
                                num_leaves=50))])


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    3.0s remaining:    1.2s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    3.0s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting classifier5: pipeline (5/6)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('mlpclassifier',
                 MLPClassifier(alpha=10, hidden_layer_sizes=(50, 50, 50),
                               learning_rate='adaptive'))])


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    1.4s remaining:    0.5s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    1.8s finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting classifier6: pipeline (6/6)
Pipeline(steps=[('standardscaler', StandardScaler()),
                ('randomforestclassifier',
                 RandomForestClassifier(max_depth=3000, min_samples_split=4,
                                        n_estimators=80))])


[Parallel(n_jobs=-1)]: Done   7 out of  10 | elapsed:    0.2s remaining:    0.0s
[Parallel(n_jobs=-1)]: Done  10 out of  10 | elapsed:    0.3s finished


model file created
saving model file in disk
model file saved
