# Prediction of breast cancer peptides with an MLP classifier using 5-fold and 10-fold cross-validation

In [1]:
# (re)load the autoreload extension and enable mode 2, so that edits to
# imported local modules are picked up without restarting the kernel
%reload_ext autoreload
%autoreload 2
# render matplotlib figures inside the notebook output
%matplotlib inline

# silence DeprecationWarning messages only; other warning categories still show
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [2]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import LinearSVC

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel, SelectPercentile, f_classif

import seaborn as sns; sns.set() # data visualization library 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from imblearn.over_sampling import SMOTE

In [3]:
# NOTE(review): this star import hides where names come from. The helpers used
# in later cells (getDataFromDataFrame, set_weights) and `joblib` are not
# imported anywhere else in the visible cells, so they presumably enter the
# namespace here - confirm before replacing this with explicit imports.
from myFunctions import *

# define output variables
# presumably the name of the target column; not referenced again in the visible cells
outVar = 'Class'

# define a label for output files
# not referenced again in the visible cells
label = 'Outer'

# random seed passed to both the StratifiedKFold splitter and the MLPClassifier
seed = 74

## Individual ML and preprocessing

In [4]:
# Load the dataset prepared for the best model
# (already normalized, feature-selected and balanced)
sFile = './best_classifier/Mix_BreastCancer.ds_bal.csv'

print('\n-> Read dataset', sFile)
df = pd.read_csv(sFile)
# report how many columns the loaded frame has
print(df.shape[1])


-> Read dataset ./best_classifier/Mix_BreastCancer.ds_bal.csv
301


In [5]:
# sanity check: (rows, columns) of the loaded frame
df.shape

(466, 301)

### ML

In [6]:
# get ds for ML: unpack the three values returned by the helper (per its printed
# log: X data, Y data and the features list); output variable = Class.
# getDataFromDataFrame is not defined in this notebook - presumably it comes
# from myFunctions; confirm how it selects the target column.
Xdata, Ydata, Features = getDataFromDataFrame(df)


-> Get X & Y data, Features list
Shape (466, 301)
Shape X data: (466, 300)
Shape Y data: (466,)
Done!


In [7]:
# Calculate class weights
# set_weights is not defined in this notebook - presumably it comes from myFunctions.
# NOTE(review): class_weights is only printed here; the MLPClassifier built in
# the visible cross-validation cell does not receive it.
class_weights = set_weights(Ydata)
print("Class weights = ", class_weights)

Class weights =  {0: 1.0, 1: 1.0}


In [9]:
# Stratified k-fold evaluation of an MLP classifier for each k in `nfolds`.
# Per fold: fit a fresh model on the training split, persist it as a pickle,
# then report AUROC and accuracy on the held-out split.
# Per k: print the mean and standard deviation of both metrics over the folds.
#
# NOTE(review): the input file is described as already normalized,
# feature-selected and balanced. If any of those steps were fitted on the full
# dataset, the fold scores below are optimistically biased - confirm upstream.
# NOTE(review): `joblib` is not imported explicitly in this notebook; it
# presumably arrives through `from myFunctions import *` - confirm.

# define no of folds
nfolds = [5, 10]

# MLP settings shared by every fold
HIDDEN_LAYER_SIZES = (20,)  # explicit 1-tuple: a single hidden layer of 20 units
MAX_ITER = 50000
# filled with (number of folds, fold index) when a fitted model is saved
MODEL_PATH = './best_classifier/MLP_model-{}-foldCV-{}.pkl'

# for each fold CV create a ML classifier
for nfold in nfolds:

    # create stratified folds (shuffled once, reproducibly, with the shared seed)
    outer_cv = StratifiedKFold(n_splits=nfold, shuffle=True, random_state=seed)

    # per-fold results collected for this value of nfold
    ACCs = []
    AUROCs = []

    print("*** MLP results "+str(nfold)+"-folds CV:")
    # count folds from 1 so the index matches the printed and saved labels
    for ifold, (train_index, test_index) in enumerate(outer_cv.split(Xdata, Ydata), start=1):
        print("Fold =", ifold)
        # monotonic clock: elapsed time is immune to system clock adjustments
        start = time.perf_counter()

        # positional selection of the rows belonging to this fold
        X_train, X_test = Xdata[train_index], Xdata[test_index]
        y_train, y_test = Ydata[train_index], Ydata[test_index]

        # a new, identically configured model is trained for every fold
        clf = MLPClassifier(hidden_layer_sizes=HIDDEN_LAYER_SIZES,
                            random_state=seed,
                            max_iter=MAX_ITER, shuffle=False)
        clf.fit(X_train, y_train)

        # keep the fitted model of each fold on disk
        joblib.dump(clf, MODEL_PATH.format(nfold, ifold), compress=1)

        # AUROC uses the probability of the second class (column 1)
        y_pred = clf.predict_proba(X_test)
        AUROC = roc_auc_score(y_test, y_pred[:, 1])
        AUROCs.append(AUROC)

        # mean accuracy on the held-out split
        ACC = clf.score(X_test, y_test)
        ACCs.append(ACC)

        print("AUROC=", AUROC, "ACC=", ACC, (time.perf_counter() - start)/60, "mins")

    # np.std defaults to the population standard deviation (ddof=0)
    print("-> Mean AUROC =", np.mean(AUROCs), "SD AUROC =", np.std(AUROCs))
    print("-> Mean ACC =", np.mean(ACCs), "SD ACC =", np.std(ACCs))

*** MLP results 5-folds CV:
Fold = 1
AUROC= 0.9624264373019467 ACC= 0.9361702127659575 0.010976902643839518 mins
Fold = 2
AUROC= 0.9949121184088807 ACC= 0.967741935483871 0.012082143624623617 mins
Fold = 3
AUROC= 0.9875115633672525 ACC= 0.9354838709677419 0.013023630777994791 mins
Fold = 4
AUROC= 0.9967622571692877 ACC= 0.956989247311828 0.013416580359141032 mins
Fold = 5
AUROC= 0.9953746530989824 ACC= 0.9354838709677419 0.01189046303431193 mins
-> Mean AUROC = 0.9873974058692699 SD AUROC = 0.012894728542415898
-> Mean ACC = 0.9463738274994281 SD ACC = 0.013495030468890643
*** MLP results 10-folds CV:
Fold = 1
AUROC= 0.9601449275362319 ACC= 0.9148936170212766 0.015934797128041585 mins
Fold = 2
AUROC= 0.9510869565217391 ACC= 0.9148936170212766 0.014011184374491373 mins
Fold = 3
AUROC= 1.0 ACC= 0.9787234042553191 0.014663596947987875 mins
Fold = 4
AUROC= 0.9873188405797101 ACC= 0.9361702127659575 0.015037699540456136 mins
Fold = 5
AUROC= 0.9728260869565218 ACC= 0.9148936170212766 0.01489

Have fun with ML! @muntisa