In [None]:
import numpy as np

## FEATURE SELECTION

The best model is selected with regard to its cross-validated weighted F1 score.

In [None]:
# MINIMUM REDUNDANCY - MAXIMUM RELEVANCE FUNCTION
def MRMR(X, y, mod):
    """Pick the mRMR feature subset with the best cross-validated weighted F1.

    For every subset size K from 1 to the number of columns of ``X``, the K
    features returned by ``mrmr_classif`` are used to cross-validate ``mod``
    (10-fold stratified CV, ``f1_weighted`` scoring). The subset with the
    highest mean score is returned; on ties the larger subset wins.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature matrix; columns are selected by label with ``.loc``.
    y : array-like
        Target labels, forwarded to ``mrmr_classif`` and ``cross_val_score``.
    mod : estimator
        Estimator forwarded to ``cross_val_score``.

    Returns
    -------
    list
        Labels of the selected columns (empty if no subset produced a valid,
        non-NaN score).
    """
    from mrmr.pandas import mrmr_classif
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    cv = StratifiedKFold(n_splits=10)

    best_score = None
    selected_features = []
    for k in range(1, X.shape[1] + 1):
        features = list(mrmr_classif(X=X, y=y, K=k))

        # Score the model on the selected columns only. Scoring the full X
        # here would give every subset size the same score and make the
        # selection meaningless.
        score = cross_val_score(
            mod, X.loc[:, features], y, cv=cv, scoring='f1_weighted'
        ).mean()

        if np.isnan(score):
            continue
        # ">=" keeps the tie rule of the running-maximum comparison:
        # a later (larger) subset replaces an equally good earlier one.
        if best_score is None or score >= best_score:
            best_score = score
            selected_features = features

    return selected_features

In [None]:
# RECURSIVE FEATURE ELIMINATION FUNCTION

def recursive_feature_elimination(X_train, y_train, model):
    """Run RFECV and return the names of the features it keeps.

    Features are eliminated one at a time (``step=1``) with 10-fold stratified
    cross-validation and ``f1_weighted`` scoring. The best mean CV score and
    the number of retained features are printed.

    Parameters
    ----------
    X_train : pandas.DataFrame or array-like
        Training features.
    y_train : array-like
        Training labels.
    model : estimator
        Estimator handed to ``RFECV``.

    Returns
    -------
    list
        Names of the selected features, as reported by
        ``RFECV.get_feature_names_out()``.
    """
    # No estimator-specific import here: the estimator arrives through
    # `model`, so the function must not require catboost to be installed.
    from sklearn.feature_selection import RFECV
    from sklearn.model_selection import StratifiedKFold

    cv = StratifiedKFold(n_splits=10)

    rfecv = RFECV(
        estimator=model,
        step=1,
        cv=cv,
        scoring="f1_weighted",
        min_features_to_select=1,
        n_jobs=-1,
    )
    rfecv.fit(X_train, y_train)

    print('Score: {s}'.format(s=max(rfecv.cv_results_['mean_test_score'])))
    print('Number of features: {n}'.format(n=rfecv.n_features_))

    # Equivalent to feature_names_in_[ranking_ == 1]: the retained features
    # are exactly those with rank 1.
    return list(rfecv.get_feature_names_out())

In [None]:
# RFE for Multilayer Perceptron, with eli5 package for permutation importance

def recursive_feature_elimination_MLP(X_train, y_train, model):
    """Backward feature elimination driven by eli5 permutation importance.

    Starting from all columns of ``X_train``, the feature with the lowest
    permutation importance is removed, one per round, until a single feature
    is left. Every candidate set (including the full one) is scored with
    5-fold stratified CV and ``f1_weighted``; the best-scoring set is
    returned, preferring the smaller set on ties.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training labels.
    model : estimator
        Estimator used for both permutation importance and CV scoring.

    Returns
    -------
    list
        Column labels of the best-scoring feature set.
    """
    from eli5.sklearn import PermutationImportance
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    cv = StratifiedKFold(n_splits=5)

    def _cv_f1(features):
        return cross_val_score(
            model, features, y_train, cv=cv, scoring='f1_weighted'
        ).mean()

    current = X_train.copy()
    candidates = [current]
    scores = [_cv_f1(current)]

    while current.shape[1] > 1:
        perm = PermutationImportance(model, cv=cv, scoring='f1_weighted', refit=True)
        perm.fit(current, y_train)

        # Drop exactly one column per round, also when several features share
        # the minimum importance, so the frame can never be emptied.
        worst = int(np.argmin(perm.feature_importances_))
        keep = np.arange(current.shape[1]) != worst
        current = current.iloc[:, keep]

        candidates.append(current)
        # Score the reduced set, so scores[i] always belongs to candidates[i].
        scores.append(_cv_f1(current))

    # Later entries hold fewer features; ">=" prefers the smaller set on ties.
    best = 0
    for pos in range(1, len(scores)):
        if scores[pos] >= scores[best]:
            best = pos

    feats = list(candidates[best].columns)

    print('Number of features selected: {f}'.format(f=len(feats)))
    print('Score: {s} '.format(s=scores[best]))

    return feats

In [None]:
# Forward/Backward Feature Selection function


def fw_bw_feature_selection(X_train,y_train,model , option='forward'):
    """Sequential (forward or backward) feature selection over all subset sizes.

    For each size k from 1 to n-1 a ``SequentialFeatureSelector`` picks k
    features; the full feature set is evaluated as the last candidate. Every
    candidate is scored with 5-fold stratified CV (``f1_weighted``) and the
    best-scoring set is returned; on ties the larger set wins. The number of
    selected features and the best score are printed.

    Parameters
    ----------
    X_train : pandas.DataFrame
        Training features.
    y_train : array-like
        Training labels.
    model : estimator
        Estimator used by the selector and for CV scoring.
    option : str, default 'forward'
        Passed as ``direction`` to ``SequentialFeatureSelector``.

    Returns
    -------
    list
        Column labels of the best-scoring feature set.
    """
    from sklearn.feature_selection import SequentialFeatureSelector
    from sklearn.model_selection import StratifiedKFold, cross_val_score

    cv = StratifiedKFold(n_splits=5)

    n_total = len(X_train.columns)
    best_score = None
    selected_features = []
    scores = []

    for k in range(1, n_total + 1):
        if k < n_total:
            selector = SequentialFeatureSelector(
                model, n_features_to_select=k, direction=option,
                scoring='f1_weighted', cv=cv,
            )
            selector.fit(X_train, y_train)
            mask = selector.get_support()
            candidate = list(X_train.columns[mask])
            X_candidate = X_train.loc[:, mask]
        else:
            # The selector is only used for proper subsets here; the full
            # set is scored directly and must be reported as *all* columns,
            # not as the previous iteration's selection.
            candidate = list(X_train.columns)
            X_candidate = X_train

        new_score = cross_val_score(
            model, X_candidate, y_train, cv=cv, scoring='f1_weighted'
        ).mean()
        scores.append(new_score)

        # ">=" keeps the tie rule: a later (larger) set replaces an equal one.
        if best_score is None or new_score >= best_score:
            best_score = new_score
            selected_features = candidate

    print('{m} features selected'.format(m=len(selected_features)))
    print('Best score: {s}\n----------------------'.format(s=best_score))

    return selected_features

## MODEL FIT

In [None]:
def fit_model(classifier,parameters, X_train, y_train, X_test, y_test, sw_train, sw_test):
    """Tune a classifier with a sample-weighted grid search and evaluate it.

    The grid search uses 10-fold stratified CV, selects on ``f1_weighted`` and
    also records ``balanced_accuracy``. ``sw_train`` is passed to the
    estimator's ``fit`` as ``sample_weight`` for every CV fit and for the
    final refit on the whole training set.

    The returned CV scores are read from the grid search itself, so they
    describe the same sample-weighted training procedure that produced the
    returned model, and no second fit or extra cross-validation run is needed.

    Parameters
    ----------
    classifier : estimator
        Estimator whose ``fit`` accepts ``sample_weight``.
    parameters : dict or list of dict
        Parameter grid for ``GridSearchCV``.
    X_train, y_train : array-like
        Training data.
    X_test, y_test : array-like
        Hold-out data.
    sw_train, sw_test : array-like
        Sample weights for the training and test samples; also used to weight
        the train/test F1 and balanced-accuracy metrics.

    Returns
    -------
    list
        ``[cv_f1, F1_train, F1_test, cv_acc, acc_train, acc_test, AUC,
        best_model, best_params]``. ``AUC`` is computed from the second
        column of ``predict_proba`` and is not sample-weighted.
    """
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score

    cv = StratifiedKFold(n_splits=10)

    gs = GridSearchCV(
        classifier,
        parameters,
        cv=cv,
        scoring={'f1_weighted': 'f1_weighted', 'balanced_accuracy': 'balanced_accuracy'},
        refit='f1_weighted',
        n_jobs=-1,
        return_train_score=True,
        verbose=10,
    )

    # Parameters given to the fit method for sample weighting.
    gs.fit(X_train, y_train, sample_weight=sw_train)

    # With refit enabled, best_estimator_ is already fitted on the full
    # training set with the sample weights.
    best_model = gs.best_estimator_
    best_params = gs.best_params_

    # For the evaluation, weighted F1 and balanced accuracy are used.
    gs_f1 = gs.best_score_
    cv_f1 = gs_f1
    cv_acc = gs.cv_results_['mean_test_balanced_accuracy'][gs.best_index_]

    y_pred = best_model.predict(X_test)
    y_pred_train = best_model.predict(X_train)

    F1_train = f1_score(y_train, y_pred_train, average="weighted", sample_weight=sw_train)
    F1_test = f1_score(y_test, y_pred, average="weighted", sample_weight=sw_test)
    acc_train = balanced_accuracy_score(y_train, y_pred_train, sample_weight=sw_train)
    acc_test = balanced_accuracy_score(y_test, y_pred, sample_weight=sw_test)

    y_probs = best_model.predict_proba(X_test)
    AUC = roc_auc_score(y_test, y_probs[:,1])

    scores = [cv_f1, F1_train, F1_test, cv_acc, acc_train, acc_test, AUC, best_model, best_params]

    print('\n\n\n GridSearch Result: ', gs_f1)

    return scores


In [None]:
# Same as fit_model, but without fit parameters (sample weights), because the MLP classifier doesn't support them

def fit_model_MLP(classifier,parameters, X_train, y_train, X_test, y_test, sw_train, sw_test):
    """Tune a classifier with a grid search (no fit parameters) and evaluate it.

    The grid search uses 10-fold stratified CV, selects on ``f1_weighted`` and
    also records ``balanced_accuracy``. No sample weights are passed to
    ``fit``; ``sw_train`` and ``sw_test`` are only used to weight the
    train/test F1 and balanced-accuracy metrics.

    The returned CV scores are read from the grid search itself, so no second
    fit or extra cross-validation run is needed.

    Parameters
    ----------
    classifier : estimator
        Estimator to tune.
    parameters : dict or list of dict
        Parameter grid for ``GridSearchCV``.
    X_train, y_train : array-like
        Training data.
    X_test, y_test : array-like
        Hold-out data.
    sw_train, sw_test : array-like
        Sample weights used for the train/test metrics only.

    Returns
    -------
    list
        ``[cv_f1, F1_train, F1_test, cv_acc, acc_train, acc_test, AUC,
        best_model, best_params]``. ``AUC`` is computed from the second
        column of ``predict_proba`` and is not sample-weighted.
    """
    from sklearn.model_selection import GridSearchCV, StratifiedKFold
    from sklearn.metrics import f1_score, balanced_accuracy_score, roc_auc_score

    cv = StratifiedKFold(n_splits=10)

    gs = GridSearchCV(
        classifier,
        parameters,
        cv=cv,
        scoring={'f1_weighted': 'f1_weighted', 'balanced_accuracy': 'balanced_accuracy'},
        refit='f1_weighted',
        n_jobs=-1,
        return_train_score=True,
        verbose=10,
    )
    gs.fit(X_train, y_train)

    # With refit enabled, best_estimator_ is already fitted on the full
    # training set.
    best_model = gs.best_estimator_
    best_params = gs.best_params_

    gs_f1 = gs.best_score_
    cv_f1 = gs_f1
    cv_acc = gs.cv_results_['mean_test_balanced_accuracy'][gs.best_index_]

    y_pred = best_model.predict(X_test)
    y_pred_train = best_model.predict(X_train)

    F1_train = f1_score(y_train, y_pred_train, average="weighted", sample_weight=sw_train)
    F1_test = f1_score(y_test, y_pred, average="weighted", sample_weight=sw_test)
    acc_train = balanced_accuracy_score(y_train, y_pred_train, sample_weight=sw_train)
    acc_test = balanced_accuracy_score(y_test, y_pred, sample_weight=sw_test)

    y_probs = best_model.predict_proba(X_test)
    AUC = roc_auc_score(y_test, y_probs[:,1])

    scores = [cv_f1, F1_train, F1_test, cv_acc, acc_train, acc_test, AUC, best_model, best_params]

    print('\n\n\n GridSearch Result: ', gs_f1)

    return scores


In [None]:
# function that prints the F1 and accuracy on the training, CV and test sets, as well as the hyperparameters chosen by the grid search

def print_scores(scores):
    """Print F1 and accuracy for the training, CV and test sets, then the parameters.

    ``scores`` is indexed positionally: F1 is read from positions 1 (training),
    0 (CV) and 2 (test); accuracy from positions 4, 3 and 5; the parameters
    from position 8.
    """
    divider = '\n---------------------------------\n'
    # (section title, index of the F1 value, index of the accuracy value)
    layout = (
        ('TRAINING SET', 1, 4),
        ('CV SET', 0, 3),
        ('TEST SET', 2, 5),
    )

    for title, f1_pos, acc_pos in layout:
        print(title + '\n')
        print(f'F1: {scores[f1_pos]}')
        print(f'Accuracy: {scores[acc_pos]}' + divider)

    print(f'Parameters: {scores[8]}')

In [None]:
# function that prints and saves the classification report, the confusion matrix and the ROC curve

def print_report(best_model, AUC, model_name, X_test, y_test, sw_test, path_params):
    """Print and save the classification report, confusion matrix and ROC curve.

    Files are written to ``results/classification/<path_params[0]>/<path_params[1]>/``
    (created if missing) as ``Report_<model_name>.xlsx``,
    ``Matrix_<model_name>.png`` and ``ROC_<model_name>.png``.

    Parameters
    ----------
    best_model : fitted classifier
        Must provide ``predict`` and ``predict_proba``.
    AUC : float
        Value shown in the ROC legend; it is not recomputed here.
    model_name : str
        Used in plot titles and file names.
    X_test, y_test : array-like
        Hold-out data.
    sw_test : array-like
        Sample weights applied to the classification report only; the
        confusion matrix and the ROC curve are computed without weights.
    path_params : sequence
        Its first two items name the two output sub-directories.

    Returns
    -------
    None
    """
    from pathlib import Path

    import pandas as pd
    from sklearn.metrics import roc_curve, classification_report,confusion_matrix
    import seaborn as sns
    import matplotlib.pyplot as plt

    y_pred = best_model.predict(X_test)
    y_probs = best_model.predict_proba(X_test)

    # Build the output location portably instead of hard-coding backslashes
    # (which are also invalid escape sequences in a normal string literal).
    out_dir = Path('results', 'classification', str(path_params[0]), str(path_params[1]))
    out_dir.mkdir(parents=True, exist_ok=True)

    print()
    rep = classification_report(y_test, y_pred, output_dict=True, sample_weight=sw_test)
    print(classification_report(y_test, y_pred, sample_weight=sw_test))
    rep = pd.DataFrame(rep).transpose()
    rep.to_excel(out_dir / 'Report_{m}.xlsx'.format(m=model_name))

    # Plot confusion matrix on its own figure, so repeated calls never draw
    # over a figure left open by a previous plot.
    fig_cm, ax_cm = plt.subplots()
    sns.heatmap(confusion_matrix(y_test, y_pred), annot=True, fmt='d', cmap="Blues", ax=ax_cm)
    ax_cm.set_title("Confusion Matrix of "+model_name)
    fig_cm.savefig(out_dir / 'Matrix_{m}.png'.format(m=model_name), format="png")

    # Plot ROC curve
    fig_roc, ax_roc = plt.subplots()
    fpr, tpr, _thresholds = roc_curve(y_test, y_probs[:,1])
    ax_roc.plot(fpr, tpr, label='AUC = %.2f '%AUC)
    ax_roc.plot([0, 1], [0, 1], color='darkblue', linestyle='--')
    ax_roc.set_xlabel('False Positive Rate')
    ax_roc.set_ylabel('True Positive Rate')
    ax_roc.set_title('ROC Curve of '+model_name)
    ax_roc.legend(loc="lower right")
    fig_roc.savefig(out_dir / 'ROC_{m}.png'.format(m=model_name), format="png")