In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.svm import SVC
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
import xgboost as xgb
from xgboost import XGBClassifier

In [2]:
from sklearn.model_selection import train_test_split
from sklearn import metrics
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
from sklearn.preprocessing import scale
import os

import codecs


from sklearn.feature_selection import SelectKBest, chi2

In [None]:
# Set seed value
random_seed= 42

# 1. Set `PYTHONHASHSEED` environment variable at a fixed value
import os
os.environ['PYTHONHASHSEED']=str(random_seed)
# 2. Set `python` built-in pseudo-random generator at a fixed value
import random
random.seed(random_seed)
# 3. Set `numpy` pseudo-random generator at a fixed value
import numpy as np
np.random.seed(random_seed)

In [3]:
## PAAC train
paac_train_N = pd.read_csv('C:/Users/nchandra/OneDrive - National University of Singapore/FYP/BioStatsFYP/PredProtein/311020 FYP/data/PAAC_train_N.csv')

paac_train_P = pd.read_csv('C:/Users/nchandra/OneDrive - National University of Singapore/FYP/BioStatsFYP/PredProtein/311020 FYP/data/PAAC_train_P.csv')

paac_train = pd.concat([paac_train_P.iloc[:,1:], paac_train_N.iloc[:,1:]], axis = 0)

paac_train.reset_index(drop = True, inplace = True)
paac_train['target'] = paac_train['target'].astype('category')

#Train test split
paac_train_class = paac_train['target']
paac_X = paac_train.drop('target', axis = 1)
paac_y = paac_train['target']


In [4]:
# PAAC independent test data

paac_test_N = pd.read_csv('C:/Users/nchandra/OneDrive - National University of Singapore/FYP/BioStatsFYP/PredProtein/311020 FYP/data/PAAC_ind_N.csv')

paac_test_P = pd.read_csv('C:/Users/nchandra/OneDrive - National University of Singapore/FYP/BioStatsFYP/PredProtein/311020 FYP/data/PAAC_ind_P.csv')

paac_test = pd.concat([paac_test_P.iloc[:,1:], paac_test_N.iloc[:,1:]], axis = 0)
paac_test.reset_index(drop = True, inplace = True)
paac_test['target'] = paac_test['target'].astype('category')

paac_indX_test = paac_test.drop('target', axis = 1)
paac_indy_test = paac_test['target']

### Stack PAAC

In [5]:
from sklearn.preprocessing import MinMaxScaler
minmax = MinMaxScaler()

In [6]:
# training data
X_train = minmax.fit_transform(paac_X)
X_train = pd.DataFrame(X_train, index  = paac_X.index)
y_train = paac_y

In [7]:

X_test = minmax.fit_transform(paac_indX_test)
X_test = pd.DataFrame(X_test, index = paac_indX_test.index)
y_test = paac_indy_test

In [8]:
def feat_select(X_train, y_train): 
    
    from sklearn.feature_selection import SelectFromModel
    from numpy import sort

    xgb = XGBClassifier()
    xgb.fit(X_train, y_train)

    thresholds = np.sort(xgb.feature_importances_)[::-1]

    feature_thresh = pd.DataFrame(columns = ['Thresh', 'n', 'Accuracy'])
    columns = list(feature_thresh)
    data = []

    thresh_limit = 1

    for thresh in thresholds:

        #to Stop the loop if the threshold no longer improves

        if thresh >= thresh_limit:
            break

        #select features using threshold
        selection =  SelectFromModel(xgb, threshold = thresh, prefit = True)
        select_X_train = selection.transform(X_train)

        #train model
        selection_model = XGBClassifier()
        selection_model.fit(select_X_train, y_train)

        #evaluate model
        select_X_test = selection.transform(X_test)
        predictions = selection_model.predict(select_X_test)
        accuracy = accuracy_score(y_test, predictions)


        #print("Thresh = %.3f, n = %d, Accuracy = %.2f%%" % (thresh, select_X_train.shape[1], accuracy*100.0))

        thresh_limit = thresh

        values = [thresh, select_X_train.shape[1], accuracy*100]
        zipped = zip(columns, values)
        values_dict = dict(zipped)
        data.append(values_dict)

    feature_thresh = feature_thresh.append(data, True)

    best_accuracy = feature_thresh[feature_thresh['Accuracy'] == feature_thresh['Accuracy'].max()]
    best_threshold = best_accuracy.Thresh.min()

    # transform training data based on the selected feature importances
    best_thresh = SelectFromModel(xgb, threshold = best_threshold, prefit = True)
    
    X_train_best = pd.DataFrame(best_thresh.transform(X_train))
    X_test_best = pd.DataFrame(best_thresh.transform(X_test))

    return X_train_best, X_test_best


In [9]:
#X_train_best, X_test_best = feat_select(X_train, y_train)

In [10]:
#if X_train.shape[1] > 100:
#    X_train, X_test = feat_select(X_train, y_train)

## Begin Stack

In [11]:
# get a stacking ensemble of models
def get_stacking():
    # define the base models
    level0 = list()
    #level0.append(('lr', LogisticRegression()))
    level0.append(('knn', KNeighborsClassifier()))
    #level0.append(('cart', DecisionTreeClassifier()))
    level0.append(('lsvm', LinearSVC()))
    level0.append(('svm', SVC()))
    level0.append(('et', ExtraTreesClassifier()))
    level0.append(('rf', RandomForestClassifier()))
    #level0.append(('bayes', GaussianNB()))
    level0.append(('xgb', XGBClassifier()))
    
    # define meta learner model
    level1 = LogisticRegression()
    # define the stacking ensemble
    model = StackingClassifier(estimators=level0, final_estimator=level1, cv=5)
    return model

In [12]:
def get_models():
    models = dict()
    #models['lr'] = LogisticRegression()
    models['knn'] = KNeighborsClassifier()
    #models['cart'] = DecisionTreeClassifier()
    models['lsvm'] = LinearSVC(C=15.0, dual=False, 
                               loss='squared_hinge', 
                               penalty='l2', tol=0.01)
    models['et'] = ExtraTreesClassifier(bootstrap=False, 
                     criterion = 'entropy', 
                     max_features=0.35000000000000003, 
                     min_samples_leaf=18, 
                     min_samples_split=9, 
                     n_estimators=100)
    
    models['rf'] = RandomForestClassifier()
    models['svm'] = SVC()
    #models['bayes'] = GaussianNB()
    models['xgb'] = XGBClassifier()
    models['stacking'] = get_stacking()
    return models

In [13]:
# evaluate a given model using cross-validation
def evaluate_model(model, X, y):
    cv = RepeatedStratifiedKFold(n_splits=10, n_repeats=3, random_state=1)
    scores = cross_val_score(model, X, y, 
                             scoring='roc_auc', 
                             cv=cv, 
                             n_jobs=-1, 
                             error_score='raise')
    return scores

In [14]:
# get the models to evaluate
models = get_models()

In [15]:
results, names = list(), list()
for name, model in models.items():
    scores = evaluate_model(model, X_train, y_train)
    results.append(scores)
    names.append(name)
    print('>%s %.3f (%.3f)' % (name, mean(scores), std(scores)))

>knn 0.896 (0.050)
>lsvm 0.927 (0.041)
>et 0.905 (0.048)
>rf 0.918 (0.041)
>svm 0.920 (0.046)
>xgb 0.919 (0.036)
>stacking 0.932 (0.040)


In [16]:
# Result on Test Set
results, names = list(), list()
for name, model in models.items():
    mod = model.fit(X_train, y_train)
    y_pred = mod.predict(X_test)
    
    fpr,tpr,thresholds = metrics.roc_curve(y_test, 
                                           y_pred, 
                                           pos_label = 1)
    auc = metrics.auc(fpr,tpr)
    
    results.append(auc)
    names.append(name)
    print('>%s %.4f' % (name, auc))

>knn 0.6324
>lsvm 0.7500
>et 0.5000
>rf 0.5882
>svm 0.5735
>xgb 0.5882
>stacking 0.6618


In [17]:
# Apply Stack model
stackmodel = models['stacking'].fit(X_train,y_train)

In [18]:
y_pred = stackmodel.predict(X_test)

In [19]:
fpr, tpr, thresholds  = metrics.roc_curve(y_test, y_pred, pos_label=1)
print("AUC:",metrics.auc(fpr, tpr) )

AUC: 0.6617647058823529
