In [1]:
# The star import stays first so the explicit imports below win on any name clash.
from Bio.PDB import *
import configparser
import math
import os
import random
import time
import urllib.request
from math import sqrt

import numpy as np
import pandas as pd
import pymysql
from sklearn.model_selection import cross_validate


In [2]:
# Database connection settings are read from an INI file; the functions below
# look up the "office_mysql" section of `config`.
CONFIG_PATH = "/home/yuan/Documents/config.ini"
config = configparser.ConfigParser()
# ConfigParser.read() silently skips unreadable files and returns the list of
# files it actually parsed, so an empty result means nothing was loaded.
if not config.read(CONFIG_PATH):
    raise FileNotFoundError("Could not read configuration file: " + CONFIG_PATH)
config.sections()

['office_ssh', 'office_mysql']

In [3]:
# Load the MCSA_EC3.4 peptidases table and keep only the rows whose fifth
# column (position 4) equals "residue"; the index is renumbered afterwards.
peptidases_table = pd.read_csv("/home/yuan/Documents/MCSA_EC3.4_peptidases.csv") #for big machine
#peptidases_table = pd.read_csv("/Volumes/Lab_Public/Yuan/temp/MCSA_EC3.4_peptidases.csv") #for mac
is_residue_row = peptidases_table.iloc[:,4] == "residue"
peptidasesList = peptidases_table[is_residue_row].reset_index(drop=True)

In [4]:
# Columns that together identify one annotated residue position.
site_key_columns = ["PDB","chain/kegg compound","resid/chebi id","PDB code"]
group_by = peptidasesList.groupby(site_key_columns)

# One row per distinct (PDB, chain, residue number, residue type) combination,
# with short column names and an integer residue number.
unique_pos = peptidasesList.loc[:, site_key_columns].drop_duplicates().reset_index(drop=True)
unique_pos.columns = ["PDB","CHAIN","site","res_type"]
unique_pos["site"] = unique_pos["site"].astype(int)

In [5]:
# Nested lookup {PDB id: {chain: [residue numbers]}} built from unique_pos,
# preserving the row order of unique_pos inside every list.
catalytic_site_dic = {}
for row_idx in range(len(unique_pos)):
    pdb_id = unique_pos.loc[row_idx,"PDB"]
    chain_id = unique_pos.loc[row_idx,"CHAIN"]
    site_number = unique_pos.loc[row_idx,"site"]
    # setdefault creates the per-PDB dict / per-chain list on first sight
    catalytic_site_dic.setdefault(pdb_id, {}).setdefault(chain_id, []).append(site_number)

In [6]:
def n_nearest_neighbour(PDB_ID,Chain,res_ID,n):
    """Return the n closest rows (by ``Distance``) recorded for one residue.

    Selects every row of pdbdb.`Distance_angle2.0` whose ``pdbID``, ``chain``
    and ``ID_1`` match the arguments, collapses rows sharing the same ``ID``
    with ``groupby(["ID"]).first()``, sorts by ``Distance`` and slices the
    ranking.

    Parameters
    ----------
    PDB_ID : value compared against ``pdbID``
    Chain : value compared against ``chain``
    res_ID : value compared against ``ID_1`` (sent as a string, as before)
    n : int
        Number of ranked rows to return.

    Returns
    -------
    pandas.DataFrame or None
        Rows 2 .. n+1 of the distance ranking, indexed by ``ID``. ``None`` when
        the query or the post-processing raised; the error is printed.
    """
    conn = pymysql.connect(host=config["office_mysql"]["sql_host"],user=config["office_mysql"]["sql_username"],
                           passwd=config["office_mysql"]["sql_password"],
                           db=config["office_mysql"]["sql_main_database"],port=3306)
    try:
        # Placeholders let the driver quote the values instead of splicing them
        # into the SQL text.
        sql = ("SELECT da.* FROM pdbdb.`Distance_angle2.0` da "
               "WHERE da.pdbID = %s AND da.chain = %s AND da.ID_1 = %s;")
        # str() keeps the quoted comparison of the original query and avoids
        # handing numpy scalars to the driver.
        params = (str(PDB_ID), str(Chain), str(res_ID))
        data = pd.read_sql_query(sql, conn, params=params)
        ranked = data.groupby(["ID"]).first().sort_values("Distance")
        # Original note: "from 2 to exclude self vs self".
        # NOTE(review): confirm why two leading rows are skipped rather than one.
        return ranked.iloc[2:(n+2),]
    except Exception as e: # best effort: report and signal failure with None
        print("~~~~~~~~~~~~~~")
        print(e)
        return None
    finally:
        # read_sql_query works on the connection itself, so no cursor context
        # manager is needed; the connection is closed exactly once here.
        conn.close()

In [7]:
def n_nearest_neighbour_by_seq(PDB_ID,Chain,res_ID,n):
    """Return the n rows before and the n rows after position ``res_ID``.

    Runs the same query as ``n_nearest_neighbour`` (rows of
    pdbdb.`Distance_angle2.0` matching ``pdbID``, ``chain`` and ``ID_1``),
    collapses rows sharing the same ``ID`` and then slices the result by
    *row position*.

    Parameters
    ----------
    PDB_ID, Chain, res_ID : values compared against ``pdbID``, ``chain``, ``ID_1``
    n : int
        Number of rows taken on each side.

    Returns
    -------
    list, pandas.DataFrame or None
        ``[]`` when ``n >= res_ID`` (no query is made); otherwise the
        concatenation of rows ``res_ID-n-1 .. res_ID-2`` and
        ``res_ID .. res_ID+n-1`` (0-based positions) of the per-``ID`` table.
        ``None`` when the query or the post-processing raised; the error is
        printed.
    """
    if n >= res_ID:
        return []
    conn = pymysql.connect(host=config["office_mysql"]["sql_host"],user=config["office_mysql"]["sql_username"],
                       passwd=config["office_mysql"]["sql_password"],
                       db=config["office_mysql"]["sql_main_database"],port=3306)
    try:
        # Placeholders let the driver quote the values.
        sql = ("SELECT da.* FROM pdbdb.`Distance_angle2.0` da "
               "WHERE da.pdbID = %s AND da.chain = %s AND da.ID_1 = %s;")
        params = (str(PDB_ID), str(Chain), str(res_ID))
        data = pd.read_sql_query(sql, conn, params=params)
        per_id = data.groupby(["ID"]).first()  # computed once, sliced twice
        # NOTE(review): positional slicing presumably assumes one row per
        # sequence position, ordered and starting at 1 - confirm against the table.
        head_part = per_id.iloc[(res_ID-n-1):(res_ID-1),]
        tail_part = per_id.iloc[res_ID:(res_ID+n),]
        return pd.concat([head_part,tail_part])
    except Exception as e: # best effort: report and signal failure with None
        print("~~~~~~~~~~~~~~")
        print(e)
        return None
    finally:
        # The connection is used directly and closed exactly once.
        conn.close()

In [8]:
def One_residue_retrieval(residue_1,PDB_ID,Chain):
    """Fetch all pdbdb.`Distance_angle2.0` rows recorded for one residue.

    Parameters
    ----------
    residue_1 : value compared numerically against ``ID_1``
    PDB_ID : value compared against ``pdbID``
    Chain : value compared against ``chain``

    Returns
    -------
    pandas.DataFrame or None
        The unmodified query result, or ``None`` when the query raised (the
        error is printed).
    """
    data = None
    conn = pymysql.connect(host=config["office_mysql"]["sql_host"],user=config["office_mysql"]["sql_username"],
                           passwd=config["office_mysql"]["sql_password"],
                           db=config["office_mysql"]["sql_main_database"],port=3306)
    try:
        # Placeholders let the driver quote the values; int() keeps the
        # unquoted numeric comparison of the original query and converts
        # numpy integers to a type the driver knows.
        sql = ("SELECT da.* FROM pdbdb.`Distance_angle2.0` da "
               "WHERE da.pdbID = %s AND da.chain = %s AND da.ID_1 = %s;")
        params = (str(PDB_ID), str(Chain), int(residue_1))
        data = pd.read_sql_query(sql, conn, params=params)
    except Exception as e: # best effort: report and fall through with data = None
        print("~~~~~~~~~~~~~~")
        print(e)
    finally:
        conn.close()
    # Returned after (not inside) the finally block so that KeyboardInterrupt
    # and other non-Exception errors are not silently discarded.
    return data

In [9]:
def Get_FEATURE(PDBID,Chain,Seq):
    """Fetch the "C"-component rows of pdbdb.`FEATURE` for one residue.

    Parameters
    ----------
    PDBID : value compared against ``PDBID``
    Chain : value compared against ``Chain``
    Seq : value compared numerically against ``Seq``

    Returns
    -------
    pandas.DataFrame or None
        Rows of the query result whose ``Component`` column equals ``"C"``
        (possibly empty), or ``None`` when the query or the filter raised
        (the error is printed).
    """
    data = None
    conn = pymysql.connect(host=config["office_mysql"]["sql_host"],user=config["office_mysql"]["sql_username"],
                           passwd=config["office_mysql"]["sql_password"],
                           db=config["office_mysql"]["sql_main_database"],port=3306)
    try:
        # Placeholders let the driver quote the values; int() converts numpy
        # integers to a type the driver knows.
        sql = ("SELECT da.* FROM pdbdb.`FEATURE` da "
               "WHERE da.PDBID = %s AND da.Chain = %s AND da.Seq = %s;")
        params = (str(PDBID), str(Chain), int(Seq))
        all_components = pd.read_sql_query(sql, conn, params=params)
        data = all_components[all_components["Component"]=="C"] # currently only get C component
    except Exception as e: # best effort: report and fall through with data = None
        print("~~~~~~~~~~~~~~")
        print(e)
    finally:
        conn.close()
    # Returned after (not inside) the finally block so that KeyboardInterrupt
    # and other non-Exception errors are not silently discarded.
    return data

In [10]:
def Get_PSSM(PDBID,Chain,Seq):
    """Fetch pdbdb.`PSSM_uniref50` rows for a set of sequence positions.

    Parameters
    ----------
    PDBID : value compared against ``PDBID``
    Chain : value compared against ``Chain``
    Seq : str or iterable of int
        Positions for the ``IN`` clause. Either a string such as
        ``"(1,2,3)"`` (the form built in ``Get_Neigh_PSSM``) or any iterable
        of integers.

    Returns
    -------
    pandas.DataFrame or None
        The query result, or ``None`` when the positions could not be parsed
        or the query raised (the error is printed). An empty position list
        produces an invalid ``IN ()`` clause and therefore ``None``.
    """
    data = None
    conn = pymysql.connect(host=config["office_mysql"]["sql_host"],user=config["office_mysql"]["sql_username"],
                           passwd=config["office_mysql"]["sql_password"],
                           db=config["office_mysql"]["sql_main_database"],port=3306)
    try:
        if isinstance(Seq, str):
            # Accept the textual "(a,b,c)" form that used to be spliced into the SQL.
            tokens = Seq.strip().strip("()").split(",")
            seq_ids = [int(tok) for tok in tokens if tok.strip()]
        else:
            seq_ids = [int(pos) for pos in Seq]
        # One placeholder per position; values are quoted by the driver.
        placeholders = ",".join(["%s"] * len(seq_ids))
        sql = ("SELECT da.* FROM pdbdb.`PSSM_uniref50` da "
               "WHERE da.PDBID = %s AND da.Chain = %s AND da.Seq IN ({});").format(placeholders)
        params = [str(PDBID), str(Chain)] + seq_ids
        data = pd.read_sql_query(sql, conn, params=params)
    except Exception as e: # best effort: report and fall through with data = None
        print("~~~~~~~~~~~~~~")
        print(e)
    finally:
        conn.close()
    # Returned after (not inside) the finally block so that KeyboardInterrupt
    # and other non-Exception errors are not silently discarded.
    return data

In [11]:
def Get_Neigh_PSSM(PDBID,Chain,Seq,num_neigh):
    '''
    Retrieve the PSSM rows of a residue and of its sequence neighbours.

    The window covers positions ``Seq-num_neigh .. Seq+num_neigh`` of
    pdbdb.`PSSM_uniref50` for the given ``PDBID`` and ``Chain``.

    Returns
    -------
    pandas.DataFrame or None
        * the window rows when at least ``2*num_neigh + 1`` rows were found;
        * an empty DataFrame (columns only) when ``Seq <= num_neigh`` or when
          fewer rows than that were found, i.e. missing or incomplete records;
        * ``None`` when a query raised (the error is printed).
    '''
    data = None
    conn = pymysql.connect(host=config["office_mysql"]["sql_host"],user=config["office_mysql"]["sql_username"],
                           passwd=config["office_mysql"]["sql_password"],
                           db=config["office_mysql"]["sql_main_database"],port=3306)
    try:
        if Seq <= num_neigh:
            # The window would start before position 1. Only the column layout
            # is needed, so ask for zero rows instead of querying a sentinel
            # PDB id that is assumed not to exist.
            data = pd.read_sql_query("SELECT da.* FROM pdbdb.`PSSM_uniref50` da LIMIT 0;", conn)
        else:
            window = [int(pos) for pos in range(Seq-num_neigh,Seq+num_neigh+1)]
            # One placeholder per position; values are quoted by the driver.
            placeholders = ",".join(["%s"] * len(window))
            sql = ("SELECT da.* FROM pdbdb.`PSSM_uniref50` da "
                   "WHERE da.PDBID = %s AND da.Chain = %s AND da.Seq IN ({});").format(placeholders)
            params = [str(PDBID), str(Chain)] + window
            window_rows = pd.read_sql_query(sql, conn, params=params)
            if window_rows.shape[0] < 2*num_neigh + 1:
                # Incomplete window: keep the columns, drop every row.
                data = window_rows.iloc[0:0]
            else:
                data = window_rows
    except Exception as e: # best effort: report and fall through with data = None
        print("~~~~~~~~~~~~~~")
        print(e)
    finally:
        conn.close()
    # Returned after (not inside) the finally block so that KeyboardInterrupt
    # and other non-Exception errors are not silently discarded.
    return data

In [12]:
import pandas as pd
import os
import numpy as np
from datetime import datetime

from sklearn.metrics import classification_report, roc_auc_score, roc_curve, make_scorer, confusion_matrix, \
    recall_score, precision_score, f1_score
from sklearn.model_selection import StratifiedKFold, StratifiedShuffleSplit, GroupKFold, cross_val_score
from sklearn.metrics import auc, precision_recall_curve,accuracy_score
from sklearn.utils import shuffle
from sklearn import tree, svm, naive_bayes, neighbors
from sklearn.ensemble import BaggingClassifier, AdaBoostClassifier, RandomForestClassifier, GradientBoostingClassifier
from sklearn.preprocessing import LabelBinarizer, OneHotEncoder,LabelEncoder
from scipy import interp
import matplotlib.pyplot as plt
from xgboost import XGBClassifier

In [13]:
# Candidate classifiers keyed by the name printed in the evaluation loops;
# keyword order fixes the iteration (and therefore the report) order.
clfs = dict(
    decision_tree=tree.DecisionTreeClassifier(),
    naive_gaussian=naive_bayes.GaussianNB(),
    K_neighbor=neighbors.KNeighborsClassifier(),
    bagging_knn=BaggingClassifier(neighbors.KNeighborsClassifier(), max_samples=0.5, max_features=0.5),
    bagging_tree=BaggingClassifier(tree.DecisionTreeClassifier(), max_samples=0.5, max_features=0.5),
    random_forest=RandomForestClassifier(n_estimators=100),
    adaboost=AdaBoostClassifier(n_estimators=200),
    gradient_boost=GradientBoostingClassifier(n_estimators=50, learning_rate=1.0, max_depth=1, random_state=0),
    svm=svm.SVC(C=0.9,kernel='rbf',gamma='auto'),
    xgboost=XGBClassifier(),
)
#'svm': svm.SVC(probability=True)

In [14]:
# Three-letter residue codes; the list position of a code is what gets
# one-hot encoded later (via aminoAcidCodes.index).
aminoAcidCodes = [
    "ALA","ARG","ASN","ASP","CYS","GLN","GLY","GLU","HIS","ILE","LEU",
    "LYS","MET","PHE","PRO","PYL","SER","SEC","THR","TRP","TYR","VAL",
]

# Integer-encode the codes, then fit a dense one-hot encoder on the integer
# column so that every value 0..21 is a known category.
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(aminoAcidCodes).reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse=False)
# NOTE: despite its name, onehot_encoded is the fitted encoder object.
onehot_encoded = onehot_encoder.fit(integer_encoded)
print(onehot_encoded)

OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)


In [208]:
# Sanity check: one-hot vector for the list position of "VAL" (index 21).
# NOTE(review): this needs the 22-category encoder fitted in the cell above;
# a later cell rebinds `onehot_encoded` to a 6-category encoder created with
# handle_unknown='error', so re-running this afterwards is expected to fail.
onehot_encoded.transform([[aminoAcidCodes.index("VAL")]])

array([[0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
        0., 0., 0., 0., 0., 1.]])

In [15]:
# Residue code -> property class 1..6 (presumably physico-chemical groups -
# TODO confirm the intended meaning of each class). "PYL" and "SEC" have no
# entry here, so looking them up raises KeyError.
amino_acid_property_by_res = {
    "ALA":1,"ILE":1,"LEU":1,"MET":1,"VAL":1,
    "PHE":2,"TRP":2,"TYR":2,
    "ASN":3,"CYS":3,"GLN":3,"SER":3,"THR":3,
    "ASP":4,"GLU":4,
    "ARG":5,"HIS":5,"LYS":5,
    "GLY":6,"PRO":6,
}

# Rebinds label_encoder / onehot_encoder / onehot_encoded: from here on the
# one-hot encoder knows six categories (encoded class values 0..5).
label_encoder = LabelEncoder()
integer_encoded = label_encoder.fit_transform(list(range(1, 7))).reshape(-1, 1)
onehot_encoder = OneHotEncoder(sparse=False)
# NOTE: despite its name, onehot_encoded is the fitted encoder object.
onehot_encoded = onehot_encoder.fit(integer_encoded)
print(onehot_encoded)


OneHotEncoder(categories='auto', drop=None, dtype=<class 'numpy.float64'>,
              handle_unknown='error', sparse=False)


In [16]:
# For every annotated catalytic residue, keep the first row returned by
# n_nearest_neighbour(..., 1). Rows are collected in a list and turned into a
# DataFrame once: growing a frame with DataFrame.append inside the loop is
# quadratic and the method no longer exists in pandas 2.x.
positive_rows = []
for key, value in catalytic_site_dic.items():
    for chain, sites in value.items():
        for site in sites:
            neigh = n_nearest_neighbour(key,chain,site,1)
            # None means the lookup failed (the function already printed why).
            if neigh is not None and len(neigh) > 0:
                positive_rows.append(neigh.iloc[0,:])
# infer_objects() restores numeric dtypes for the object rows stacked here.
pos_sample_and_neig = pd.DataFrame(positive_rows).infer_objects()

In [17]:
# Replace the row labels carried over from the lookups with a 0..n-1 index;
# a later cell addresses rows with .loc[i, ...] and relies on this.
pos_sample_and_neig = pos_sample_and_neig.reset_index(drop=True)

In [18]:
# Put the columns in the order of the lookup result and make the residue
# numbers integers.
# NOTE(review): `neigh` is the loop variable left over from the cell that
# built pos_sample_and_neig, i.e. the result of its last lookup - this line
# fails if that last lookup did not return a DataFrame.
pos_sample_and_neig = pos_sample_and_neig[neigh.columns]
pos_sample_and_neig = pos_sample_and_neig.astype({"ID_1":int,"ID_2":int})

In [19]:
# Preview the positive samples; later cells rely on this column order
# (positions 0-3 = pdbID, chain, ID_1, Res_1).
pos_sample_and_neig.head()

Unnamed: 0,pdbID,chain,ID_1,Res_1,ID_2,Res_2,Distance,Angle
0,1lam,A,262,LYS,257,GLY,4.262492,124.883418
1,1lam,A,336,ARG,432,ALA,4.115262,145.134467
2,1lok,A,97,HIS,118,ASP,4.34173,120.108316
3,1lok,A,117,ASP,180,MET,4.621907,106.551924
4,1lok,A,151,GLU,152,GLU,4.572974,122.672384


In [20]:
# For every positive sample, draw one negative: a residue listed in the same
# chain's distance table that has the same residue type but is not annotated
# as catalytic.
negative_rng = np.random.RandomState(0)  # fixed seed -> reproducible negatives
negative_rows = []
for i in range(len(pos_sample_and_neig)):
    pdb_id = pos_sample_and_neig.iloc[i,0]
    chain_id = pos_sample_and_neig.iloc[i,1]
    result_step1 = One_residue_retrieval(pos_sample_and_neig.iloc[i,2],pdb_id,chain_id)
    if result_step1 is None:
        # lookup failed; One_residue_retrieval already printed the error
        continue
    result_step2 = result_step1.groupby(["ID"]).first() #duplicate discard
    catalytic_ids = catalytic_site_dic[pdb_id][chain_id]
    result_step3 = result_step2[~result_step2.ID_2.isin(catalytic_ids)] # activate discard
    # positional access, consistent with the .iloc lookups above
    result_step4 = result_step3[result_step3["Res_2"]==pos_sample_and_neig["Res_1"].iloc[i]]
    if result_step4.shape[0] > 0:
        negative_rows.append(result_step4.sample(1,replace = False,random_state=negative_rng))

# Concatenate once; pd.concat rejects an empty list, hence the fallback.
neg_sample_and_neig = pd.concat(negative_rows) if negative_rows else pd.DataFrame()


In [21]:
# Positive samples reduced to the first four columns, renamed to the shared
# sample layout and tagged with type "Pos".
pos_sample_and_neig_pruned = pos_sample_and_neig.iloc[:, :4].copy(deep=True)
pos_sample_and_neig_pruned.columns = ['pdbID', 'chain', 'ID', 'Res']
pos_sample_and_neig_pruned["type"] = "Pos"

In [22]:
# Negative samples: columns 0, 1, 4 and 5 of the sampled rows, renamed to the
# shared sample layout and tagged with type "Neg".
negative_column_positions = [0,1,4,5]
neg_sample_and_neig_pruned = neg_sample_and_neig.iloc[:, negative_column_positions].copy(deep=True)
neg_sample_and_neig_pruned.columns = ['pdbID', 'chain', 'ID', 'Res']
neg_sample_and_neig_pruned["type"] = "Neg"

In [23]:
# Stack positives first, then negatives. The training cells build their
# label vectors as "num_pos positives followed by num_neg negatives", so this
# order must be kept. The index is not reset here (row labels may repeat).
pos_neg_sample = pd.concat([pos_sample_and_neig_pruned,neg_sample_and_neig_pruned])

In [24]:
# Build one feature row per sample from its 20 nearest-neighbour rows:
# the sample's own residue name followed by the flattened columns 5..7 of
# every neighbour row. num_pos / num_neg count the samples actually kept.
neighbour_information_list = []
num_pos = 0
num_neg = 0
for i in range(0,len(pos_neg_sample)):
    features = n_nearest_neighbour(pos_neg_sample.iloc[i,0],pos_neg_sample.iloc[i,1],
                                   pos_neg_sample.iloc[i,2],20)
    # Skip failed lookups (None), list results and empty tables.
    if features is None or isinstance(features,list) or features.shape[0] == 0:
        continue
    if pos_neg_sample.iloc[i,4] == "Pos":
        num_pos = num_pos + 1
    else:
        num_neg = num_neg + 1
    neighbour_information_list.append(
        np.insert(features.iloc[:,5:8].values.flatten(),0,pos_neg_sample.iloc[i,3]))

In [28]:
# Persist the raw nearest-neighbour feature rows (one row per kept sample).
pd.DataFrame(neighbour_information_list).to_csv('/home/yuan/Documents/Catalytic_NN20.csv')

In [627]:
# --- Assemble the numeric feature matrix -----------------------------------
# Residue names (strings) are dropped; only the numeric entries are kept.
train_dataset = [[element for element in each_res if not isinstance(element,str)]
                 for each_res in neighbour_information_list]

# The label vector below is positional, so the counters must describe exactly
# the rows collected in neighbour_information_list.
assert len(train_dataset) == num_pos + num_neg, "num_pos/num_neg do not match the collected rows"

# Rows shorter than the longest row are incomplete and are left out. (The
# reference used to be row 0, which breaks when row 0 itself is short.)
expected_len = max((len(row) for row in train_dataset), default=0)
wrong_sample = [i for i in range(0,len(train_dataset)) if len(train_dataset[i]) < expected_len]
excluded = set(wrong_sample)
kept_rows = [i for i in range(0,num_neg+num_pos) if i not in excluded]

# Positives (label 1) come first, negatives (label 2) second.
y_label = [1 for x in range(0,num_pos)]
y_label.extend([2 for i in range(0,num_neg)])

final_y_label = np.asarray([y_label[i] for i in kept_rows])
final_train_dataset = np.vstack([train_dataset[i] for i in kept_rows])

# Shuffle features and labels in one call so they cannot get out of step.
shuffle_train_dataset, shuffle_y_label = shuffle(final_train_dataset,final_y_label,random_state=0)

# --- 10-fold cross-validation ------------------------------------------------
# cross_validate fits each fold once and scores all metrics on that same fit.
cv_metrics = ['accuracy','roc_auc','precision','recall']
for clf in clfs:
    print(clf)
    cv_scores = cross_validate(clfs[clf],shuffle_train_dataset,shuffle_y_label,
                               scoring=cv_metrics,n_jobs=5,cv=10)
    print('accuracy:',"{:.4f}".format(cv_scores['test_accuracy'].mean()),
          'roc:',"{:.4f}".format(cv_scores['test_roc_auc'].mean()),
          'precision:',"{:.4f}".format(cv_scores['test_precision'].mean()),
          'recall:',"{:.4f}".format(cv_scores['test_recall'].mean()))


decision_tree
accuracy: 0.6466 roc: 0.6310 precision: 0.6475 recall: 0.6500
naive_gaussian
accuracy: 0.6833 roc: 0.7485 precision: 0.6317 recall: 0.8923
K_neighbor
accuracy: 0.5655 roc: 0.5862 precision: 0.5676 recall: 0.5846
bagging_knn
accuracy: 0.5966 roc: 0.6323 precision: 0.5639 recall: 0.6500
bagging_tree
accuracy: 0.6796 roc: 0.7333 precision: 0.6717 recall: 0.7846
random_forest
accuracy: 0.7123 roc: 0.7892 precision: 0.7075 recall: 0.8231
adaboost
accuracy: 0.6159 roc: 0.6595 precision: 0.6175 recall: 0.6308
gradient_boost
accuracy: 0.6505 roc: 0.6748 precision: 0.6472 recall: 0.6731
svm
accuracy: 0.5309 roc: 0.5291 precision: 0.5170 recall: 1.0000
xgboost
accuracy: 0.7163 roc: 0.7522 precision: 0.7040 recall: 0.7577


In [628]:
# Persist the filtered (unshuffled) feature matrix and its label vector.
pd.DataFrame(final_train_dataset).to_csv("/mnt/HD1/download/Cata_20_data.txt")
pd.DataFrame(final_y_label).to_csv("/mnt/HD1/download/Cata_20_label.txt")

## PSSM training & testing

In [555]:
# Build one feature row per sample from the PSSM rows of the residue and its
# 12 sequence neighbours on each side (columns 5..24 of every row, flattened).
# num_pos / num_neg count the samples actually kept.
pos_neg_sample = pos_neg_sample.reset_index(drop=True)
neighbour_information_list = []
num_pos = 0
num_neg = 0
for i in range(0,len(pos_neg_sample)):
    features = Get_Neigh_PSSM(pos_neg_sample.iloc[i,0],pos_neg_sample.iloc[i,1],
                              pos_neg_sample.iloc[i,2],12)
    # Skip failed lookups (None) and residues without a complete window (empty).
    if features is None or features.shape[0] == 0:
        continue
    if pos_neg_sample.iloc[i,4] == "Pos":
        num_pos = num_pos + 1
    else:
        num_neg = num_neg + 1
    neighbour_information_list.append(list(features.iloc[:,5:25].values.flatten()))

In [556]:
# --- Assemble the PSSM feature matrix ----------------------------------------
train_dataset = neighbour_information_list

# The label vector below is positional, so the counters must describe exactly
# the rows collected in neighbour_information_list.
assert len(train_dataset) == num_pos + num_neg, "num_pos/num_neg do not match the collected rows"

# Rows with fewer than 15 values are treated as incomplete.
# NOTE(review): 15 is far below the row width produced by the PSSM cell -
# confirm the intended threshold.
wrong_sample = [i for i in range(0,len(train_dataset)) if len(train_dataset[i]) < 15]
excluded = set(wrong_sample)
kept_rows = [i for i in range(0,num_neg+num_pos) if i not in excluded]

# Positives (label 1) come first, negatives (label 2) second.
y_label = [1 for x in range(0,num_pos)]
y_label.extend([2 for i in range(0,num_neg)])

final_y_label = np.asarray([y_label[i] for i in kept_rows])
final_train_dataset = np.vstack([train_dataset[i] for i in kept_rows])

# Shuffle features and labels in one call so they cannot get out of step.
shuffle_train_dataset, shuffle_y_label = shuffle(final_train_dataset,final_y_label,random_state=0)

# --- 10-fold cross-validation ------------------------------------------------
# cross_validate fits each fold once and scores all metrics on that same fit.
cv_metrics = ['accuracy','roc_auc','precision','recall']
for clf in clfs:
    print(clf)
    cv_scores = cross_validate(clfs[clf],shuffle_train_dataset,shuffle_y_label,
                               scoring=cv_metrics,n_jobs=5,cv=10)
    print('accuracy:',"{:.4f}".format(cv_scores['test_accuracy'].mean()),
          'roc:',"{:.4f}".format(cv_scores['test_roc_auc'].mean()),
          'precision:',"{:.4f}".format(cv_scores['test_precision'].mean()),
          'recall:',"{:.4f}".format(cv_scores['test_recall'].mean()))

decision_tree
accuracy: 0.6181 roc: 0.6117 precision: 0.6293 recall: 0.6372
naive_gaussian
accuracy: 0.6245 roc: 0.6830 precision: 0.7207 recall: 0.4470
K_neighbor
accuracy: 0.5850 roc: 0.6138 precision: 0.6893 recall: 0.3623
bagging_knn
accuracy: 0.5875 roc: 0.6598 precision: 0.6979 recall: 0.3540
bagging_tree
accuracy: 0.6271 roc: 0.7006 precision: 0.6755 recall: 0.8071
random_forest
accuracy: 0.7475 roc: 0.7923 precision: 0.7662 recall: 0.7534
adaboost
accuracy: 0.6781 roc: 0.7596 precision: 0.7010 recall: 0.6642
gradient_boost
accuracy: 0.6827 roc: 0.7375 precision: 0.6982 recall: 0.6814
svm
accuracy: 0.5737 roc: 0.6678 precision: 0.5572 recall: 0.8646
xgboost
accuracy: 0.7453 roc: 0.8172 precision: 0.7777 recall: 0.7172


## FEATURE training

In [403]:
# Rebuild the combined sample table (positives first, then negatives) for the
# FEATURE experiment; the index is not reset here.
pos_neg_sample = pd.concat([pos_sample_and_neig_pruned,neg_sample_and_neig_pruned])

In [568]:
# Build one feature row per sample from the comma-separated "feature" string
# of its first FEATURE record. num_pos / num_neg count the samples kept.
neighbour_information_list = []
num_pos = 0
num_neg = 0
for i in range(0,len(pos_neg_sample)):
    features = Get_FEATURE(pos_neg_sample.iloc[i,0],pos_neg_sample.iloc[i,1],
                           pos_neg_sample.iloc[i,2])
    # Skip failed lookups (None) and residues without a record (empty).
    if features is None or features.shape[0] == 0:
        continue
    if pos_neg_sample.iloc[i,4] == "Pos":
        num_pos = num_pos + 1
    else:
        num_neg = num_neg + 1
    neighbour_information_list.append([float(x) for x in features["feature"].values[0].split(",")])

  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result = self._query(query)
  result =

In [562]:
# --- Assemble the FEATURE matrix -----------------------------------------------
train_dataset = neighbour_information_list

# The label vector below is positional, so the counters must describe exactly
# the rows collected in neighbour_information_list.
assert len(train_dataset) == num_pos + num_neg, "num_pos/num_neg do not match the collected rows"

# Rows with fewer than 470 values are treated as incomplete.
# NOTE(review): presumably the expected FEATURE vector width - confirm.
wrong_sample = [i for i in range(0,len(train_dataset)) if len(train_dataset[i]) < 470]
excluded = set(wrong_sample)
kept_rows = [i for i in range(0,num_neg+num_pos) if i not in excluded]

# Positives (label 1) come first, negatives (label 2) second.
y_label = [1 for x in range(0,num_pos)]
y_label.extend([2 for i in range(0,num_neg)])

final_y_label = np.asarray([y_label[i] for i in kept_rows])
final_train_dataset = np.vstack([train_dataset[i] for i in kept_rows])

# Shuffle features and labels in one call so they cannot get out of step.
shuffle_train_dataset, shuffle_y_label = shuffle(final_train_dataset,final_y_label,random_state=0)

# --- 10-fold cross-validation ------------------------------------------------
# cross_validate fits each fold once and scores all metrics on that same fit.
cv_metrics = ['accuracy','roc_auc','precision','recall']
for clf in clfs:
    print(clf)
    cv_scores = cross_validate(clfs[clf],shuffle_train_dataset,shuffle_y_label,
                               scoring=cv_metrics,n_jobs=5,cv=10)
    print('accuracy:',"{:.4f}".format(cv_scores['test_accuracy'].mean()),
          'roc:',"{:.4f}".format(cv_scores['test_roc_auc'].mean()),
          'precision:',"{:.4f}".format(cv_scores['test_precision'].mean()),
          'recall:',"{:.4f}".format(cv_scores['test_recall'].mean()))

decision_tree
accuracy: 0.6758 roc: 0.6640 precision: 0.6658 recall: 0.6577
naive_gaussian
accuracy: 0.6892 roc: 0.7287 precision: 0.7480 recall: 0.5846
K_neighbor
accuracy: 0.6931 roc: 0.7141 precision: 0.7077 recall: 0.6692
bagging_knn
accuracy: 0.7182 roc: 0.7610 precision: 0.7376 recall: 0.7077
bagging_tree
accuracy: 0.7375 roc: 0.7941 precision: 0.7145 recall: 0.7731
random_forest
accuracy: 0.7704 roc: 0.8428 precision: 0.7798 recall: 0.7846
adaboost
accuracy: 0.7106 roc: 0.7742 precision: 0.7167 recall: 0.7192
gradient_boost
accuracy: 0.7066 roc: 0.7698 precision: 0.7283 recall: 0.6846
svm
accuracy: 0.5309 roc: 0.5357 precision: 0.5170 recall: 1.0000
xgboost
accuracy: 0.7472 roc: 0.8313 precision: 0.7674 recall: 0.7231


In [150]:
# Number of samples that survived the incomplete-row filter.
len(final_train_dataset)

518

## Deep Learning test

In [566]:
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import Dropout

In [368]:
# Stratified 10-fold splitter for the neural-network experiment. The seed
# makes the shuffled fold assignment repeatable between runs.
skf = StratifiedKFold(n_splits=10,shuffle=True,random_state=0)
skf.get_n_splits(shuffle_train_dataset,shuffle_y_label)

10

In [586]:
# --- Assemble the neural-network input ----------------------------------------
# Strings (residue names) are replaced by the one-hot vector of their property
# class; every numeric entry is replaced by its natural logarithm.
# NOTE(review): math.log raises ValueError for values <= 0 - confirm that the
# feature list loaded in neighbour_information_list is strictly positive.
train_dataset = []
for each_res in neighbour_information_list:
    one_res_data = []
    for element in each_res:
        if isinstance(element,str):
            one_res_data.extend(onehot_encoded.transform([[amino_acid_property_by_res[element]-1]])[0].tolist())
        else:
            one_res_data.append(math.log(element))
    train_dataset.append(one_res_data)

# The label vector below is positional, so the counters must describe exactly
# the rows collected in neighbour_information_list.
assert len(train_dataset) == num_pos + num_neg, "num_pos/num_neg do not match the collected rows"

# Rows shorter than the longest row are incomplete and are left out. (The
# reference used to be row 0, which breaks when row 0 itself is short.)
expected_len = max((len(row) for row in train_dataset), default=0)
wrong_sample = [i for i in range(0,len(train_dataset)) if len(train_dataset[i]) < expected_len]
excluded = set(wrong_sample)
kept_rows = [i for i in range(0,num_neg+num_pos) if i not in excluded]

# Binary targets for the sigmoid output: positives are 0 and negatives are 1
# here (the scikit-learn cells use 1 and 2 instead).
y_label = [0 for x in range(0,num_pos)]
y_label.extend([1 for i in range(0,num_neg)])

final_y_label = np.asarray([y_label[i] for i in kept_rows])
final_train_dataset = np.vstack([train_dataset[i] for i in kept_rows])

# Shuffle features and labels in one call so they cannot get out of step.
shuffle_train_dataset, shuffle_y_label = shuffle(final_train_dataset,final_y_label,random_state=0)

In [587]:
def _build_dense_classifier(input_dim):
    """Return a compiled 4x1000-unit ReLU network with a sigmoid output."""
    model = Sequential()
    model.add(Dense(1000, input_dim=input_dim, activation='relu'))
    model.add(Dropout(0.2))
    for _layer in range(3):
        model.add(Dense(1000, activation='relu'))
        model.add(Dropout(0.2))
    model.add(Dense(1, activation='sigmoid'))
    # compile the keras model
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model


# Train a fresh network on every fold and average the held-out accuracy.
fold_accuracies = []
# The input width comes from the data instead of a hard-coded 502.
n_features = shuffle_train_dataset.shape[1]
for train_index, test_index in skf.split(shuffle_train_dataset,shuffle_y_label):
    model = _build_dense_classifier(n_features)
    model.fit(shuffle_train_dataset[train_index],
              shuffle_y_label[train_index], epochs=100, batch_size=20,verbose=0)
    loss, accuracy = model.evaluate(shuffle_train_dataset[test_index], shuffle_y_label[test_index],verbose=0)
    fold_accuracies.append(accuracy)
cum_accuracy = sum(fold_accuracies)
# Divide by the number of folds actually run rather than a literal 10.
print('Accuracy: %.2f' % (cum_accuracy/len(fold_accuracies)))

Accuracy: 0.65
