In [None]:
import sys
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import math

from sklearn.metrics import accuracy_score, f1_score, roc_curve, roc_auc_score
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score, cross_val_predict
from sklearn.metrics import classification_report,confusion_matrix

from sklearn.utils.class_weight import compute_sample_weight

In [None]:
from sklearn.model_selection import StratifiedKFold

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.svm import SVC
from sklearn import metrics

from sklearn.model_selection import GridSearchCV

In [None]:
# Feature matrices for the DCR outcome; the first CSV column becomes the row index.
# Note: the "test" split used in this notebook is read from the *_val.csv files.
X_train = pd.read_csv('data/classification/DCR/X_train.csv', index_col=0)
X_test = pd.read_csv('data/classification/DCR/X_val.csv', index_col=0)

# Targets: squeeze() is applied right after loading (it collapses a single-column
# frame into a Series).
y_train = pd.read_csv('data/classification/DCR/y_train.csv', index_col=0).squeeze()
y_test = pd.read_csv('data/classification/DCR/y_val.csv', index_col=0).squeeze()


# 10-fold stratified cross-validation.
# shuffle=True draws a random fold assignment, so random_state is fixed to make the
# folds identical on every "Restart & Run All"; without it each run uses different folds.
from sklearn.model_selection import StratifiedKFold
cv = StratifiedKFold(n_splits=10, shuffle=True, random_state=42)


In [None]:
# import functions to fit the models and print scores
from ipynb.fs.full.functions import fit_model, fit_model_MLP, print_report, print_scores

In [None]:
# Short model labels (used e.g. for the saved-model filename), indexed like `classifiers`.
names = [
    'XGB',
    'LR',
    'RF',
    'MLP',
    'SVM',
    'AB',
    'ET',
    'LGBM',
]

In [None]:
# Base estimators to tune, indexed like `names` and `parameters`.
# The scikit-learn estimators that draw random numbers while fitting (bootstrap /
# feature sampling, weight initialisation, SVC's internal probability calibration)
# get a fixed random_state so repeated runs of the notebook give the same models.
# XGB, LR and LGBM are left at their library defaults.
classifiers = [
          XGBClassifier(),
          LogisticRegression(),
          RandomForestClassifier(random_state=42),
          MLPClassifier(random_state=42),
          SVC(probability = True, random_state=42),
          AdaBoostClassifier(random_state=42),
          ExtraTreesClassifier(random_state=42),
          LGBMClassifier()
            ]

In [None]:
# LOAD THE FEATURES SELECTED BY EACH METHOD
def _columns_as_lists(frame):
    """Return one list per column of `frame`, containing that column's non-NaN values."""
    return [list(column.dropna()) for _, column in frame.items()]


# Each CSV is read with its first column as index and then transposed; every column of
# the transposed frame yields one list of feature names (missing entries dropped).
selMRMR = pd.read_csv('MRMRfeats.csv', index_col=0).T
selected_features_MRMR = _columns_as_lists(selMRMR)

selFFS = pd.read_csv('FFSfeats.csv', index_col=0).T
selected_features_FFS = _columns_as_lists(selFFS)

selBFS = pd.read_csv('BFSfeats.csv', index_col=0).T
selected_features_BFS = _columns_as_lists(selBFS)

selRFE = pd.read_csv('RFEfeats.csv', index_col=0).T
selected_features_RFE = _columns_as_lists(selRFE)

## ------------------------------------------------------------------------------------------------------------

## CLASSIFICATION

In [None]:
# Class-balanced sample weights, computed separately for the training and test labels
sw_train, sw_test = (
    compute_sample_weight('balanced', labels) for labels in (y_train, y_test)
)

# Name of the classification outcome
outcome = 'DCR'


In [None]:
# Name of the feature selector whose scores are being computed; change it to evaluate
# a different selector.
selection = 'RFE'

# [outcome, selection] pair, later passed to print_report to set the path where the
# results are saved
path_params = [outcome, selection]
path_params

In [None]:
# Hyperparameter grids for model tuning: one dict per model, in the same order as
# `classifiers`. Add any parameter you want to tune, or change the candidate values.
# NOTE(review): scikit-learn >= 1.2 renames AdaBoost's `base_estimator` to `estimator`
# (the old name is removed in 1.4) -- confirm against the installed version.

parameters = [
    # XGB
    {
        'max_depth': [2, 3],
        'eta': [0.01, 0.03, 0.3],
        'n_estimators': [30, 50, 100],
        'lambda': [1, 3, 8],
    },
    # LR
    {"C": [1e-4, 1e-3, 1e-2, 0.1, 1, 10]},
    # RF
    {
        'max_depth': [2, 3],
        'min_samples_leaf': [2, 3, 4],
        'min_samples_split': [2, 3, 4],
        'n_estimators': [50, 100],
    },
    # MLP
    {
        "hidden_layer_sizes": [10],
        "alpha": [0.001, 0.01, 0.1, 1],
        'max_iter': [2000],
    },
    # SVM
    {
        "C": [1e-3, 0.01, 0.1, 1],
        'kernel': ['rbf', 'linear'],
        'gamma': [0.01, 0.1, 1, 10, 100],
    },
    # AB (base trees of depth 2 and 3)
    {
        'learning_rate': [0.001, 0.01, 0.1],
        'base_estimator': [DecisionTreeClassifier(max_depth=depth) for depth in (2, 3)],
        'n_estimators': [30, 50, 100],
    },
    # ET
    {
        'max_depth': [2, 3],
        'min_samples_leaf': [3, 4, 5],
        'min_samples_split': [2, 3, 4],
        'n_estimators': [50, 100],
    },
    # LGBM
    {
        'learning_rate': [0.001, 0.01, 0.1, 1],
        'max_depth': [2, 3],
        'num_leaves': [5, 10, 20, 31],
        'n_estimators': [30, 50, 100],
    },
]

In [None]:
# Index of the model to train: position in `classifiers` / `parameters` (0 -> first entry).
model_idx = 0

# Feature lists keyed by selector name, so the columns used for training always follow
# `selection` -- the same variable that decides where the results are saved.
features_by_selector = {
    'MRMR': selected_features_MRMR,
    'FFS': selected_features_FFS,
    'BFS': selected_features_BFS,
    'RFE': selected_features_RFE,
}
# NOTE(review): assumes every selector file holds one feature list per model, in the
# same order as `classifiers` -- confirm (in particular for MRMR).
model_features = features_by_selector[selection][model_idx]

# UNCOMMENT THESE LINES AND COMMENT THE OTHER TWO IF YOU WANT TO TRAIN WITH ALL THE FEATURES
#X_tr = X_train.copy()
#X_t = X_test.copy()

# Restrict the training and test matrices to the selected features
X_tr = X_train.loc[:, model_features]
X_t = X_test.loc[:, model_features]

print(X_tr.shape)
scores = fit_model(classifiers[model_idx], parameters[model_idx], X_tr, y_train, X_t, y_test, sw_train, sw_test)
# fit_model_MLP is a separate function because MLP doesn't have sample weights;
# use it (with model_idx = 3) instead of fit_model when training the MLP:
# scores = fit_model_MLP(classifiers[model_idx], parameters[model_idx], X_tr, y_train, X_t, y_test, sw_train, sw_test)

In [None]:
# Print F1, accuracy and the hyperparameters selected by the grid search
print_scores(scores)

In [None]:
# Print the classification report, confusion matrix and ROC curve.
# The model label is taken from `names` (the same entry used for the saved-model
# filename) rather than a hard-coded 'XGB', keeping the report label and the
# pickle name in sync.
print_report(scores[7], scores[6], names[0], X_t, y_test, sw_test, path_params)

In [None]:
# Save the trained model (scores[7]) with pickle so it can be tested on other data later.
import pickle
from pathlib import Path

path = 'results/classification/DCR/{fs}/MODELS/{n}.pkl'.format(fs=selection, n=names[0])
# Create the target folder if needed; open() would otherwise raise FileNotFoundError.
Path(path).parent.mkdir(parents=True, exist_ok=True)
# The context manager closes (and flushes) the file even if pickling raises;
# the original left the handle from open() unclosed.
with open(path, 'wb') as model_file:
    pickle.dump(scores[7], model_file)