<a href="https://colab.research.google.com/github/oii-nasif/GlyStruct/blob/master/Glycation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [30]:
# Remove Warnings
import warnings
warnings.filterwarnings(action='ignore',)

# Core:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

# Machine Learning Algorithms:
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.ensemble import (AdaBoostClassifier, RandomForestClassifier)
from xgboost import XGBClassifier

# Dataset Handle:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict
from sklearn.preprocessing import StandardScaler
from sklearn.utils import shuffle

# Performance:
from sklearn.metrics import (confusion_matrix, accuracy_score, matthews_corrcoef, precision_score, classification_report)

from sklearn.metrics import confusion_matrix
from sklearn.metrics import matthews_corrcoef
from sklearn.metrics import precision_score

In [2]:
# Dataset Link: https://raw.githubusercontent.com/oii-nasif/GlyStruct/master/Glycation.elm
# Absolute path of the Glycation.elm dataset file that the parsing cell opens.
# NOTE(review): the path lives under /content/drive/My Drive, so it presumably
# requires Google Drive to be mounted in Colab — confirm before running elsewhere.
inputFile = '/content/drive/My Drive/Projects/Glycation/DataSet/Glycation.elm'

In [3]:
# Parse the dataset file: every row after the header contributes its fifth
# whitespace-separated field, which is collected as the sequence string.
# The `with` block guarantees the handle is closed even if parsing fails
# (the previous version left the file open for the rest of the session).
with open(file=inputFile, mode='r', encoding="utf-8") as file:
    next(file, None)  # skip the header row; the default keeps an empty file from raising
    # A row with fewer than five fields raises IndexError on purpose, so a
    # malformed dataset is noticed instead of silently shrinking the sample set.
    Sequences = [line.split()[4] for line in file]

In [4]:
# Sanity check: number of sequences collected from the dataset file.
len(Sequences)

6591

In [5]:
# X = []
# for seq in Sequences:
#     X.append([seq.count('A'), seq.count('C'), seq.count('D'), seq.count('E'), seq.count('F'), seq.count('G'), seq.count('H'), seq.count('I'), seq.count('K'), seq.count('L'), seq.count('M'), seq.count('N'), seq.count('P'), seq.count('Q'), seq.count('R'), seq.count('S'), seq.count('T'), seq.count('V'), seq.count('W'), seq.count('Y')])

# X = np.array(X)

In [6]:
# X.shape

In [7]:
# def kmers(x, k):
#     v = []
#     for i in range(len(x) - k + 1):
#         v.append(x[i:i + k])
#     return v

In [8]:
# import itertools
# import numpy as np

# def monoMonoGgap(x, g):  

#     t = []
#     m = list(itertools.product('ACDEFGHIKLMNPQRSTVWY', repeat=2))
#     for i in range(1, g + 1, 1):
#         V = kmers(x, i + 2)

#         for gGap in m:

#             C = 0
#             for v in V:
#                 if v[0] == gGap[0] and v[-1] == gGap[1]:
#                     C += 1

#             t.append(C)
#     return t

# T = []
# for x in Sequences:
#     T.append(monoMonoGgap(x, 10))

# T = np.array(T)
# T

In [9]:

# T.shape

In [10]:
# np.save(file = "gapped10mer", arr = T)

In [11]:
# Load the precomputed feature matrix from gapped10mer.npy.
# NOTE(review): presumably the array written by the commented-out
# monoMonoGgap / np.save cells above — confirm. The path is a hard-coded
# Drive location.
X = np.load('/content/drive/My Drive/Projects/Glycation/DataSet/gapped10mer.npy')

In [12]:
# Dimensions of the loaded feature matrix.
X.shape

(6591, 4000)

In [13]:
# Build the binary label vector: a leading run of 1s followed by 0s.
# NOTE(review): this assumes the dataset file lists every positive sample
# before any negative one, and that 0.1301 is the positive fraction — confirm
# against the data source.
n_samples = len(Sequences)
n_positive = int(n_samples * 0.1301)
# Derive the negative count as the remainder instead of truncating a second,
# independent fraction (0.87): the two truncated counts only summed to
# n_samples by coincidence, whereas this always yields one label per sequence.
n_negative = n_samples - n_positive
Y = np.concatenate([np.ones(n_positive, dtype=int), np.zeros(n_negative, dtype=int)])
print(Y.shape)

(6591,)


In [14]:
# Display the label vector built in the previous cell.
Y

array([1, 1, 1, ..., 0, 0, 0])

In [15]:
# Shuffle features and labels together with a fixed seed, then show both shapes.
# X and Y are overwritten in place, so re-running this cell shuffles the
# already-shuffled arrays again.
X, Y = shuffle(X, Y, random_state=42)

print(X.shape, Y.shape, sep='\n')

(6591, 4000)
(6591,)


In [16]:
# Hold out 30% of the samples as a test set; the fixed random_state keeps the
# split repeatable.
# NOTE(review): the split is not stratified although the labels are imbalanced
# (roughly 13% positives) — consider passing stratify=Y.
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.30, random_state=101)

In [17]:
# Show the shape of each split, one per line: train X, train Y, test X, test Y.
print(Xtrain.shape, Ytrain.shape, Xtest.shape, Ytest.shape, sep='\n')

(4613, 4000)
(4613,)
(1978, 4000)
(1978,)


In [18]:
# Standardise the features. The scaler is fitted on the training split only
# and the same fitted transform is then applied to both splits.
# Xtrain / Xtest are overwritten, so re-running this cell rescales data that
# has already been scaled.
scaler = StandardScaler()
scaler.fit(Xtrain)

Xtrain = scaler.transform(Xtrain)
Xtest = scaler.transform(Xtest)

In [20]:
# Fit a default-configured XGBoost classifier on the training split; the next
# cell reads its feature_importances_ to rank the feature columns.
model = XGBClassifier()
model.fit(Xtrain, Ytrain)

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0,
              learning_rate=0.1, max_delta_step=0, max_depth=3,
              min_child_weight=1, missing=None, n_estimators=100, n_jobs=1,
              nthread=None, objective='binary:logistic', random_state=0,
              reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
              silent=None, subsample=1, verbosity=1)

In [21]:
# Rank the feature columns by the fitted model's importance scores and keep
# the indices of the top 40%, most important first.
importantFeatures = model.feature_importances_
n_selected = int(len(importantFeatures) * 0.4)
selectedFeatures = np.argsort(importantFeatures)[::-1][:n_selected]

In [22]:
# Column indices of the selected features, ordered by decreasing importance.
selectedFeatures

array([2252, 3982, 2216, ...,  155,  153,  152])

In [42]:
# Candidate models compared by the evaluation cells below. All use library
# defaults except where an argument is given explicitly.
# NOTE(review): GaussianNB is imported but not part of this list — confirm
# whether that is intentional.
classifiers = [
    LogisticRegression(),
    KNeighborsClassifier(n_neighbors=5),
    DecisionTreeClassifier(),
    # NOTE(review): probability=True is set, but the visible evaluation cells
    # only call predict / cross_val_score — confirm it is still needed.
    SVC(kernel='rbf', probability=True),
    RandomForestClassifier(),
    AdaBoostClassifier(),
    XGBClassifier()
]


In [43]:
# Hold-out evaluation: fit every candidate on the training split (selected
# feature columns only) and report its metrics on the test split.
for classifier in classifiers:
    model = classifier
    model.fit(Xtrain[:, selectedFeatures], Ytrain)
    Yp = model.predict(Xtest[:, selectedFeatures])

    # The flattened confusion matrix is unpacked as TN, FP, FN, TP.
    CM = confusion_matrix(y_true=Ytest, y_pred=Yp)
    TN, FP, FN, TP = CM.ravel()

    accuracy = accuracy_score(y_true=Ytest, y_pred=Yp)

    # One-element lists rebuilt for each classifier, so the np.sum() calls
    # below simply unwrap the single value.
    Sensitivity = [TP / (TP + FN)]
    Specificity = [TN / (TN + FP)]
    Precision = [precision_score(y_true=Ytest, y_pred=Yp)]
    MCC = [matthews_corrcoef(y_true=Ytest, y_pred=Yp)]

    print('Classifier: {}'.format(classifier.__class__.__name__))
    print('Accuracy: {:0.2f}'.format(accuracy*100.0))
    print('Sensitivity: {0:.4f}'.format(np.sum(Sensitivity)*100.0))
    print('Specificity: {0:.4f}'.format(np.sum(Specificity)*100.0))
    print('MCC: {0:.4f}'.format(np.sum(MCC)))
    print('Precision: {0:.4f}'.format(np.sum(Precision)*100.0))
    print('______________________________________________________________')

Classifier: LogisticRegression
Accuracy: 92.52
Sensitivity: 71.2803
Specificity: 96.1516
MCC: 0.6927
Precision: 76.0148
______________________________________________________________
Classifier: KNeighborsClassifier
Accuracy: 90.14
Sensitivity: 46.3668
Specificity: 97.6317
MCC: 0.5487
Precision: 77.0115
______________________________________________________________
Classifier: DecisionTreeClassifier
Accuracy: 91.66
Sensitivity: 74.3945
Specificity: 94.6122
MCC: 0.6740
Precision: 70.2614
______________________________________________________________
Classifier: SVC
Accuracy: 90.04
Sensitivity: 31.8339
Specificity: 100.0000
MCC: 0.5339
Precision: 100.0000
______________________________________________________________
Classifier: RandomForestClassifier
Accuracy: 95.75
Sensitivity: 71.2803
Specificity: 99.9408
MCC: 0.8218
Precision: 99.5169
______________________________________________________________
Classifier: AdaBoostClassifier
Accuracy: 89.89
Sensitivity: 41.1765
Specificity: 98.2238

In [31]:
# 10-fold cross-validated evaluation on the selected feature columns.
#
# The previous version cross-validated accuracy only and then printed
# Sensitivity / Specificity / MCC / Precision from globals left behind by the
# hold-out loop, so every classifier showed the same stale numbers. All
# metrics are now computed from this cell's own out-of-fold predictions.
#
# NOTE(review): X is the unscaled matrix (the scaler was applied only to
# Xtrain / Xtest), and selectedFeatures was derived from a model fitted on the
# training split, so these scores are not fully leakage-free — confirm whether
# scaling and feature selection should move inside the CV folds.
for classifier in classifiers:
    model = classifier

    # One prediction per sample, each produced while that sample's fold was
    # held out. Metrics are pooled over all folds, not averaged per fold.
    Ycv = cross_val_predict(model, X[:, selectedFeatures], Y, cv=10)

    # labels=[0, 1] pins the matrix to 2x2 so the unpacking below is always valid.
    TN, FP, FN, TP = confusion_matrix(y_true=Y, y_pred=Ycv, labels=[0, 1]).ravel()

    accuracy = accuracy_score(y_true=Y, y_pred=Ycv)
    cv_sensitivity = TP / (TP + FN)
    cv_specificity = TN / (TN + FP)
    cv_mcc = matthews_corrcoef(y_true=Y, y_pred=Ycv)
    cv_precision = precision_score(y_true=Y, y_pred=Ycv)

    print('Classifier: {}'.format(classifier.__class__.__name__))
    print('Accuracy: {:0.2f}'.format(accuracy*100.0))
    print('Sensitivity: {0:.4f}'.format(cv_sensitivity*100.0))
    print('Specificity: {0:.4f}'.format(cv_specificity*100.0))
    print('MCC: {0:.4f}'.format(cv_mcc))
    print('Precision: {0:.4f}'.format(cv_precision*100.0))
    print('______________________________________________________________')

Classifier: LogisticRegression, Accuracy: 0.94
Classifier: KNeighborsClassifier, Accuracy: 0.91
Classifier: DecisionTreeClassifier, Accuracy: 0.94
Classifier: SVC, Accuracy: 0.89
Classifier: RandomForestClassifier, Accuracy: 0.97
Classifier: AdaBoostClassifier, Accuracy: 0.91
Classifier: XGBClassifier, Accuracy: 0.93
