# Prediction of Breast Cancer peptides with the best MLP classifier

### SMOTE after data split (in each CV fold)

In [29]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

# remove warnings
import warnings
warnings.filterwarnings("ignore", category=DeprecationWarning) 

In [30]:
import numpy as np
import pandas as pd
import time
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.model_selection import cross_val_score, GridSearchCV, StratifiedKFold
from sklearn.metrics import confusion_matrix,accuracy_score, roc_auc_score,f1_score, recall_score, precision_score
from sklearn.utils import class_weight

from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression, LassoCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, ExtraTreesClassifier
from xgboost import XGBClassifier
from sklearn.svm import SVC
from sklearn.gaussian_process.kernels import RBF
from sklearn.svm import LinearSVC

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.feature_selection import RFECV, VarianceThreshold, SelectKBest, chi2
from sklearn.feature_selection import SelectFromModel, SelectPercentile, f_classif

import seaborn as sns; sns.set() # data visualization library 
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import GradientBoostingClassifier, BaggingClassifier, AdaBoostClassifier
from sklearn.naive_bayes import BernoulliNB, GaussianNB
from imblearn.over_sampling import SMOTE

In [31]:
from myFunctions import *

# define output variables
outVar = 'Class'

# define list of folds
nfold = 3

# define a label for output files
label = 'Outer'

# seed obtained with getResultsMultiSeeds.ipynb
# the seed with the mean AUROC of 3-fold CV closest to the general average of all the seeds (1-50)
seed = 74 

## Individual ML and preprocessing

In [55]:
# read dataset for the best model (it will generate: Mix-Best300)
sFile = './best_classifier/Mix_BreastCancer.csv'

print('\n-> Read dataset', sFile)
df = pd.read_csv(sFile)
print(len(df.columns))


-> Read dataset ./best_classifier/Mix_BreastCancer.csv
8744


### Preprocessing

In [56]:
# Clean columns (remove all extra columns but keep ProtID!)
df = ClearDatasets(df)
print(len(df.columns))

# drop ProtID to have only descriptors + Class (raw dataset)
print('\n-> Drop ProtID column')
df= df.drop(['ProtID'],axis = 1)
print('Done!')
print(len(df.columns))

# Check dataset
DataCheckings(df)

# Dataset preprocessing
df = DataPreprocessing(df)
print(len(df.columns))

# Remove zero variance columns
df = Remove0VarCols(df)


-> Modify dataset
Done!
8742

-> Drop ProtID column
Done!
8741

-> Checking dataset

Data points = 376

Columns (output + features)= 8741

Data types = [dtype('float64') dtype('int64')]


Column Names:
 Index(['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
       ...
       'DAYM780201.lag22', 'DAYM780201.lag23', 'DAYM780201.lag24',
       'DAYM780201.lag25', 'DAYM780201.lag26', 'DAYM780201.lag27',
       'DAYM780201.lag28', 'DAYM780201.lag29', 'DAYM780201.lag30', 'Class'],
      dtype='object', length=8741)

Categorical features: []

Columns with NaN:  0  /  8741

No of data points with NaN: 0  /  376
Done!

-> Dataset preprocessing
Inicial shape: (376, 8741)
Data points = 376
Columns (output + features)= 8741
Data types = [dtype('float64') dtype('int64')]


Column Names:
 Index(['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
       ...
       'DAYM780201.lag22', 'DAYM780201.lag23', 'DAYM780201.lag24',
       'DAYM780201.lag25', 'DAYM780201.lag26', 'DAYM780201.lag27',
       'D

In [57]:
# Save initial ds
rawFile = sFile[:-4]+'.ds_raw.csv'
print('\n-> Save raw dataset:',rawFile)
df.to_csv(rawFile, index=False)
print('Done!')


-> Save raw dataset: ./best_classifier/Mix_BreastCancer.ds_raw.csv
Done!


In [59]:
# read the prediction file with the same descriptors (all from Mix)
# read dataset for the best model (it will generate: Mix-Best300)
sFilep = './best_classifier/Screening_3_RBPs.csv' 
# Screening_1_Metastasis.csv, Screening_2_Cancer_Immunotherapy_Genes.csv, Screening_3_RBPs.csv

print('\n-> Read dataset', sFilep)
dfp = pd.read_csv(sFilep)
print(len(dfp.columns))


-> Read dataset ./best_classifier/Screening_3_RBPs.csv
8741


In [60]:
# Preprocessing of prediction file
# Clean columns (remove all extra columns but keep ProtID!)
#df = ClearDatasets(df)
#print(len(df.columns))

# drop ProtID to have only descriptors + Class (raw dataset)
#print('\n-> Drop ProtID column')
#df= df.drop(['ProtID'],axis = 1)
#print('Done!')
#print(len(df.columns))

# Check dataset
print(DataCheckings(dfp))


-> Checking dataset

Data points = 1369

Columns (output + features)= 8741

Data types = [dtype('float64') dtype('int64')]


Column Names:
 Index(['A', 'R', 'N', 'D', 'C', 'E', 'Q', 'G', 'H', 'I',
       ...
       'Pc2.Hydrophilicity.26', 'Pc2.Hydrophobicity.27',
       'Pc2.Hydrophilicity.27', 'Pc2.Hydrophobicity.28',
       'Pc2.Hydrophilicity.28', 'Pc2.Hydrophobicity.29',
       'Pc2.Hydrophilicity.29', 'Pc2.Hydrophobicity.30',
       'Pc2.Hydrophilicity.30', 'Class'],
      dtype='object', length=8741)

Categorical features: []

Columns with NaN:  0  /  8741

No of data points with NaN: 0  /  1369
Done!
0


In [61]:
# Select the same features as the dataset
dfp = dfp[list(df.columns)]
dfp.shape

(1369, 8709)

In [62]:
# Scale raw dataframe
# scale dataframe, save scaled file, save scaler
Xdata, Ydata, Features = getDataFromDataFrame(df)# out var = Class
Xdatap, Ydatap, Featuresp = getDataFromDataFrame(dfp)# prediction data

# Normalize dataset & prediction set
scaler = MinMaxScaler()
Xdata = scaler.fit_transform(Xdata)
Xdatap = scaler.transform(Xdatap) # scaler prediction data with the same scaler

df = pd.DataFrame(Xdata,columns=Features)
df['Class'] = Ydata # add class column

dfp = pd.DataFrame(Xdatap,columns=Featuresp)
dfp['Class'] = Ydatap # add class column

scalerFile = sFile[:-4]+'.scaler_Std.pkl'
print('* Save scaler:', scalerFile)
joblib.dump(scaler, scalerFile) 

# Save initial ds
scaledFile = sFile[:-4]+'.ds_std.csv'
print('* Save scaled dataset:', scaledFile)
df.to_csv(scaledFile, index=False)

# Save initial ds for predictions
scaledFilep = sFilep[:-4]+'.ds_std.csv'
print('* Save scaled dataset:', scaledFilep)
dfp.to_csv(scaledFilep, index=False)

print('Done!')


-> Get X & Y data, Features list
Shape (376, 8709)
Shape X data: (376, 8708)
Shape Y data: (376,)
Done!

-> Get X & Y data, Features list
Shape (1369, 8709)
Shape X data: (1369, 8708)
Shape Y data: (1369,)
Done!
* Save scaler: ./best_classifier/Mix_BreastCancer.scaler_Std.pkl
* Save scaled dataset: ./best_classifier/Mix_BreastCancer.ds_std.csv
* Save scaled dataset: ./best_classifier/Screening_3_RBPs.ds_std.csv
Done!


In [63]:
nFeats = 300

Xdata, Ydata, Features = getDataFromDataFrame(df)# out var = Class

print('\n-> Univariate Feature selection')
selector= SelectKBest(chi2, k=nFeats)
Xdata = selector.fit_transform(Xdata, Ydata)
    
selectorFile = sFile[:-4]+'.featSelector_Univariate'+str(nFeats)+'.pkl'
print('* Save selector:', selectorFile)
joblib.dump(selector, selectorFile) 
    
# Selected features
SelFeatures = []
for i in selector.get_support(indices=True):
    SelFeatures.append(Features[i])
        
# create the resulted dataframe
df = pd.DataFrame(Xdata,columns=SelFeatures)
df['Class'] = Ydata # add class column
print('Final columns:', list(df.columns))
    
# Save selected feature ds
selectFile = sFile[:-4]+'.ds_sel.csv'
print('* Save selected features dataset:', selectFile)
df.to_csv(selectFile, index=False)

# create the resulted dataframe for predictions
dfp = dfp[list(df.columns)]
    
# Save selected feature ds
selectFilep = sFilep[:-4]+'.ds_sel.csv'
print('* Save selected features dataset:', selectFilep)
dfp.to_csv(selectFilep, index=False)
    
print('Done!')


-> Get X & Y data, Features list
Shape (376, 8709)
Shape X data: (376, 8708)
Shape Y data: (376,)
Done!

-> Univariate Feature selection
* Save selector: ./best_classifier/Mix_BreastCancer.featSelector_Univariate300.pkl
Final columns: ['MN', 'LG', 'QI', 'NK', 'EM', 'QM', 'MM', 'EY', 'FAA', 'FNA', 'PNA', 'MDA', 'YHA', 'YKA', 'WFA', 'GPA', 'NTA', 'EYA', 'PAR', 'QDR', 'KER', 'SQR', 'QGR', 'LLR', 'HKR', 'TKR', 'TMR', 'YMR', 'MFR', 'EAN', 'HAN', 'MRN', 'SNN', 'EDN', 'QCN', 'QQN', 'GQN', 'PGN', 'IHN', 'NKN', 'HKN', 'LKN', 'AMN', 'TMN', 'VMN', 'MPN', 'PSN', 'YTN', 'KWN', 'PWN', 'EYN', 'PYN', 'LVN', 'PVN', 'SVN', 'VAD', 'HRD', 'IND', 'PDD', 'IQD', 'NHD', 'YHD', 'NID', 'HFD', 'ITD', 'RYD', 'IYD', 'QRC', 'DNC', 'SNC', 'MDC', 'AQC', 'CGC', 'MGC', 'VHC', 'CKC', 'IKC', 'SKC', 'MMC', 'PFC', 'MPC', 'MVC', 'FVC', 'FDE', 'YDE', 'SQE', 'TQE', 'RHE', 'MHE', 'HIE', 'FKE', 'EME', 'QME', 'LME', 'MME', 'VME', 'SFE', 'DAQ', 'TNQ', 'IDQ', 'DCQ', 'KCQ', 'GLQ', 'FKQ', 'AMQ', 'CMQ', 'VPQ', 'PSQ', 'IWQ', 'YWQ', '

In [64]:
df.shape

(376, 301)

In [65]:
dfp.shape

(1369, 301)

In [66]:
# Balancing dataframe using SMOTE
# df = SMOTEdf(df,sFile,seed)
# df = UndersampleDF(df, sFile, seed=seed)

In [67]:
df.shape

(376, 301)

### ML

In [68]:
# get ds for ML
Xdata, Ydata, Features = getDataFromDataFrame(df)# out var = Class 


-> Get X & Y data, Features list
Shape (376, 301)
Shape X data: (376, 300)
Shape Y data: (376,)
Done!


In [69]:
# Calculate class weights
class_weights = set_weights(Ydata)
print("Class weights = ", class_weights)

Class weights =  {0: 0.8068669527896996, 1: 1.3146853146853146}


In [46]:
outer_cv = StratifiedKFold(n_splits=3,shuffle=True,random_state=seed)

In [20]:
ifold = 0
ACCs  =[]
AUROCs=[]
models =[]
SelectedFeatures =[]

for train_index, test_index in outer_cv.split(Xdata, Ydata):
    ifold +=1
    
    print("Fold =",ifold)
    start = time.time()
    
    #print("TRAIN:", train_index, "TEST:", test_index)
    X_train, X_test = Xdata[train_index], Xdata[test_index]
    y_train, y_test = Ydata[train_index], Ydata[test_index]
    
    # Apply SMOTE to the training data
    smote_tr = SMOTE(random_state=seed)
    X_train_resampled, y_train_resampled = smote_tr.fit_resample(X_train, y_train)

    #scaler.transform(X_test)
    clf = MLPClassifier(hidden_layer_sizes= (20),
                        random_state = seed,
                        max_iter=50000, shuffle=False)
    clf.fit(X_train_resampled, y_train_resampled)
    
    joblib.dump(clf, './best_classifier/MLP_model_smote'+str(ifold)+'.pkl', compress = 1)
    models.append(clf)
    
    # SMOTE the testing data too!!!!!!
    smote_ts = SMOTE(random_state=seed)
    X_test_resampled, y_test_resampled = smote_ts.fit_resample(X_test, y_test)
    
    # Use decision_function instead of predict_proba
    if hasattr(clf, 'decision_function'):
        y_scores = clf.decision_function(X_test_resampled)
    elif hasattr(clf, 'predict_proba'):
        y_scores = clf.predict_proba(X_test_resampled)[:, 1]
    else:
        raise AttributeError("Classifier has no decision_function or predict_proba method")
    
    # Calculate ROC AUC score
    AUROC = roc_auc_score(y_test_resampled, y_scores)
        
    #y_pred = clf.predict_proba(X_test)
    #AUROC = roc_auc_score(y_test, y_pred[:, 1])
    AUROCs.append(AUROC)
    
    ACC = clf.score(X_test_resampled,y_test_resampled)
    ACCs.append(ACC)
   
    print("AUROC=",AUROC,"ACC=",ACC, (time.time() - start)/60,"mins")

Fold = 1
AUROC= 0.9682774490466799 ACC= 0.9102564102564102 0.008986107508341471 mins
Fold = 2
AUROC= 0.9811097992916173 ACC= 0.9155844155844156 0.006299865245819092 mins
Fold = 3
AUROC= 0.9534845496383958 ACC= 0.8653846153846154 0.005884281794230143 mins


In [21]:
print(np.mean(AUROCs),np.std(AUROCs))

0.9676239326588977 0.011287424192511159


In [22]:
print(np.mean(ACCs),np.std(ACCs))

0.8970751470751471 0.022513910652679565


In [23]:
dfp.shape

(1903, 301)

In [70]:
# get ds for ML
Xdatap, Ydatap, Featuresp = getDataFromDataFrame(dfp)# out var = Class


-> Get X & Y data, Features list
Shape (1369, 301)
Shape X data: (1369, 300)
Shape Y data: (1369,)
Done!


In [71]:
Xdatap.shape

(1369, 300)

In [72]:
# load the saved model 2 from disk: AUROC= 0.9747340425531915 ACC= 0.9157894736842105 (close to general mean AUROC for 50 seeds)
clf = joblib.load('./best_classifier/MLP_model_smote'+str(2)+'.pkl')

# predictions with the model
Ydatap = clf.predict(Xdatap)
        
# add probabilities (n_samples X n_classes; class 0, class 1)
Ydatapprob = clf.predict_proba(Xdatap)
        
# save predictions for list 1
dffp = pd.DataFrame(Xdatap,columns=Featuresp)
dffp['Class'] = Ydatap
dffp['Prob0'] = Ydatapprob[:,0]
dffp['Prob1'] = Ydatapprob[:,1]

In [73]:
dffp

Unnamed: 0,MN,LG,QI,NK,EM,QM,MM,EY,FAA,FNA,...,GHV,GLV,MMV,VFV,IYV,Pc1.N,Pc1.M,Class,Prob0,Prob1
0,0.218033,0.000000,0.241530,0.000000,0.305738,0.345902,0.000000,0.000000,0.000000,0.0,...,0.0,0.532895,0.000000,0.0,0.976974,0.037229,0.035432,0,0.996625,0.003375
1,0.258755,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.014129,0.014227,1,0.203024,0.796976
2,0.000000,0.448454,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.022057,0.034894,1,0.004913,0.995087
3,0.000000,0.277828,0.067214,0.000000,0.085082,0.770073,0.070712,0.171533,0.000000,0.0,...,0.0,0.000000,0.307763,0.0,0.000000,0.225567,0.371813,1,0.020595,0.979405
4,0.096377,0.567391,0.213527,0.342029,0.000000,0.152899,0.000000,0.181643,0.433962,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.136595,0.146389,0,0.902327,0.097673
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1364,0.000000,0.152364,0.129013,0.551080,0.000000,0.000000,0.000000,0.109749,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.131570,0.076788,0,0.908360,0.091640
1365,0.154651,0.202326,0.171318,0.000000,0.216860,0.000000,0.180233,0.145736,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.088178,0.096063,1,0.054835,0.945165
1366,0.000000,0.202797,0.171717,0.183372,0.217366,0.245921,0.090326,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.103424,0.182230,1,0.037346,0.962654
1367,0.000000,0.273585,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.0,...,0.0,0.000000,0.000000,0.0,0.000000,0.023453,0.001518,0,0.891812,0.108188


In [74]:
# merge with protein information from other file
# AC.Screening_1_Metastasis.csv, AC.Screening_2_Cancer_Immunotherapy_Genes.csv, AC.Screening_3_RBPs.csv
result = pd.concat([dffp, pd.read_csv('./best_classifier/AC.Screening_3_RBPs.csv')], axis=1)

In [75]:
# creat new order of columns in final results
newHeader=['Class','Prob1','Prob0','V1','V2']
result = result[newHeader]
result = result.sort_values(by=['Prob1'], ascending=False)
result.to_csv(sFilep[:-4]+'_predictions_smote.csv', index=True)