# Machine Learning approach to analyze 3-Year prognostication for patients with brain arteriovenous malformation (bAVM) after stereotactic radiosurgery (SRS): a study for a small and heterogeneous group in Peru.

> [Utils]
---

## Main functions:

In [None]:
import os

import joblib
import pandas as pd
from sklearn.metrics import confusion_matrix
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

# Functions
def trainAndGetBestModel(function, X, y, bal, the_metric, the_model_name, the_hyper_params, nfolds, random_seed, the_date_time, model_folder):
    """Grid-search an estimator inside each outer stratified fold, then save and evaluate the winner.

    For every outer fold a ``GridSearchCV`` is fitted on the training part,
    its best estimator is dumped to ``<model_folder>/cv/`` and then used to
    predict the validation part; the resulting confusion matrix is handed to
    ``getMetrics``.

    Args:
        function: Estimator passed to ``GridSearchCV`` as ``estimator``.
        X: Feature table; indexed with ``.iloc``, so a pandas object.
        y: Target values; indexed with ``.iloc``, so a pandas object.
        bal: When equal to ``True`` the saved file names get a ``bal_`` prefix.
        the_metric: Scoring name; it is lower-cased before being passed on.
        the_model_name: Model label used in log lines and file names.
        the_hyper_params: Parameter grid passed to ``GridSearchCV``.
        nfolds: Number of splits for both the outer and the inner CV.
        random_seed: ``random_state`` of the (shuffled) outer split.
        the_date_time: String embedded in the saved file names.
        model_folder: Base folder; models are written to its ``cv`` sub-folder.

    Returns:
        A list with one ``getMetrics`` report row per outer fold.
    """
    # `== True` is kept on purpose: only values equal to True enable the prefix.
    balanced_prefix = "bal_" if bal==True else ""
    print('\nTrain via Cross-Validation and Grid-Search for (%s) with Scoring (%s): \n' % (the_model_name, the_metric))
    skf = StratifiedKFold(n_splits=nfolds, shuffle=True, random_state=random_seed)

    # Loop-invariant pieces are built once instead of once per fold.
    scoring = the_metric.lower()
    internal_cv = StratifiedKFold(n_splits=nfolds)

    # joblib.dump does not create missing directories, so make sure the
    # target folder exists before the first fold finishes training.
    cv_folder = "/".join([model_folder, 'cv'])
    os.makedirs(cv_folder, exist_ok=True)

    # Fix the label set from the full target so every fold yields a confusion
    # matrix of the same shape, even if one class is absent from a fold's
    # validation part and predictions.
    class_labels = sorted(pd.unique(y.to_numpy().ravel()))

    fold_reports = []

    for i_fold, (train_idx, val_idx) in enumerate(skf.split(X, y)):
        X_train, X_val = X.iloc[train_idx], X.iloc[val_idx]
        Y_train, Y_val = y.iloc[train_idx], y.iloc[val_idx]
        fold_id = str(i_fold + 1)

        print('\n\nFold k: %s' % (fold_id))
        print('Training set size: %d' % len(Y_train))
        print('Validanting set size: %d' % len(Y_val))

        # Grid search (GridSearchCV works on clones of the estimator it is given)
        print("\nTraining Set: Looking for the best model ...")
        grid_cv = GridSearchCV(estimator=function,
                               param_grid=the_hyper_params,
                               cv=internal_cv,
                               scoring=scoring,
                               verbose=0)
        grid_result = grid_cv.fit(X_train, Y_train)
        results = grid_result.cv_results_
        for mean, stdev, param in zip(results['mean_test_score'],
                                      results['std_test_score'],
                                      results['params']):
            print('\t%s (train) | mean: %f, std: %f --> %r' % (scoring, mean, stdev, param))

        # Best model
        print('\t> Best %s: %f for params %s' % (scoring, grid_result.best_score_, grid_result.best_params_))
        best_clf = grid_cv.best_estimator_
        print('\t> Best model: %s' % best_clf)

        # Save best model
        model_file_name = "".join([balanced_prefix, the_model_name, '_', fold_id, '_', the_date_time, '.joblib'])
        joblib.dump(best_clf, "/".join([cv_folder, model_file_name]))

        # Best model prediction (val)
        print("\nValidating Set: Making Predictions ...")
        Y_val_predicted = best_clf.predict(X_val)
        cm = confusion_matrix(Y_val, Y_val_predicted, labels=class_labels)
        # Keep the report row instead of discarding it so callers can tabulate folds.
        fold_reports.append(getMetrics(cm, model_file_name, ''))

    return fold_reports
        

def crossValidation(X, y, bal, the_metric, the_model_name, nfolds, the_date_time, model_folder):
    """Re-score every saved per-fold model with cross-validation and pick the best one.

    For each fold id in ``1..nfolds`` the model file
    ``<model_folder>/cv/[bal_]<the_model_name>_<id>_<the_date_time>.joblib``
    is loaded and evaluated with ``cross_val_score`` on ``X``/``y``.

    Args:
        X: Features passed to ``cross_val_score``.
        y: Targets passed to ``cross_val_score``.
        bal: When equal to ``True`` the ``bal_`` file-name prefix is used.
        the_metric: Scoring passed to ``cross_val_score``. String names are
            lower-cased first, matching what ``trainAndGetBestModel`` does.
        the_model_name: Model label used in log lines and file names.
        nfolds: Number of saved models to load, and the ``cv`` value used
            when scoring each of them.
        the_date_time: String embedded in the saved file names.
        model_folder: Base folder; models are read from its ``cv`` sub-folder.

    Returns:
        A one-row DataFrame (columns ``ModelID``, ``ModelName``, ``Mean``,
        ``Variance``) for the model with the highest mean score. Note that
        the ``Variance`` column holds the standard deviation of the scores
        (``scores.std()``); the column name is kept for existing callers.
    """
    # `== True` is kept on purpose: only values equal to True enable the prefix.
    balanced_prefix = "bal_" if bal==True else ""
    print('\nCross-Validation for (%s) with Scoring (%s): \n' % (the_model_name, the_metric))
    final_report_cv = []
    final_report_cv_title = ['ModelID', 'ModelName', 'Mean', 'Variance']

    # The training routine lower-cases the metric name before using it as a
    # scorer; do the same here so one argument value works for both functions.
    # Non-string scorers (e.g. callables) are passed through untouched.
    scoring = the_metric.lower() if isinstance(the_metric, str) else the_metric

    for i in range(nfolds):
        # Load model
        model_id = str(i + 1)
        model_name = balanced_prefix + the_model_name + '_%s_%s.joblib' % (model_id, the_date_time)
        model = joblib.load("/".join([model_folder, 'cv', model_name]))
        scores = cross_val_score(estimator=model, X=X, y=y, scoring=scoring, cv=nfolds)
        final_report_cv.append([model_id, model_name, scores.mean(), scores.std()])

    models = pd.DataFrame(final_report_cv, columns=final_report_cv_title)
    # Double brackets keep the result a one-row DataFrame rather than a Series.
    best_model = models.loc[[models['Mean'].idxmax()]]
    print(models)
    print('\nBest model: ', best_model['ModelName'].values[0])
    return best_model

def plot_roc_curve(fpr, tpr, auc_score, model_name):
    """Draw a ROC curve with a dashed red diagonal and show the figure.

    Args:
        fpr: X values of the curve (labelled 'False Positive Rate').
        tpr: Y values of the curve (labelled 'True Positive Rate').
        auc_score: Number shown in the legend as ``area = <score>``.
        model_name: String used as the legend label prefix.

    NOTE(review): this function uses ``plt`` but no import of
    ``matplotlib.pyplot`` is visible in this part of the file; it has to be
    bound in the module namespace before calling - confirm.
    """
    # Format only the score fragment: applying `%` to the whole joined label
    # would fail for any model name that itself contains a '%' character.
    curve_label = "".join([model_name, " (area = %0.2f)" % auc_score])

    plt.figure()
    plt.plot(fpr, tpr, marker='.', label=curve_label)
    plt.plot([0, 1], [0, 1],'r--')
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Receiver Operating Characteristic')
    plt.legend(loc="lower right")
    plt.show()

def _safe_ratio(numerator, denominator):
    """Return numerator / denominator, or NaN when the denominator is zero."""
    if denominator == 0:
        return float("nan")
    return numerator / denominator


def getMetrics(cm, model_name, info):
    """Print and return classification metrics derived from a 2x2 confusion matrix.

    Args:
        cm: Array whose ``ravel()`` yields exactly four counts in the order
            TN, FP, FN, TP.
        model_name: Identifier stored as the first element of the report row.
        info: Free-form value stored as the second element of the report row.

    Returns:
        A list ``[model_name, info, accuracy, sensitivity, specificity, PPV,
        NPV, balanced accuracy, F1, 'AUC', TN, FP, FN, TP]`` where the seven
        metrics are strings formatted with four decimals and ``'AUC'`` is a
        literal placeholder string.

    Raises:
        ValueError: If ``cm.ravel()`` does not yield exactly four values.

    A metric whose denominator is zero (for example sensitivity when there
    are no positive samples) is reported as NaN instead of emitting a
    division warning or raising ``ZeroDivisionError``.
    """
    TN, FP, FN, TP = cm.ravel()
    # Accuracy
    accuracy = _safe_ratio(TP + TN, TP + TN + FP + FN)
    # Sensitivity/Recall
    sensitivity = _safe_ratio(TP, TP + FN)
    # Specificity
    specificity = _safe_ratio(TN, TN + FP)
    # Positive Predictive Value (PPV)/ Precision
    PPV = _safe_ratio(TP, TP + FP)
    # Negative Predictive Value (NPV)
    NPV = _safe_ratio(TN, TN + FN)
    # BalancedAccuracy (NaN propagates if either component is undefined)
    BA = (sensitivity + specificity)/2
    # F1Score
    F1 = 2*_safe_ratio(PPV * sensitivity, PPV + sensitivity)

    print("Accuracy:           ","({:.2%})".format(accuracy))
    print("Sensitivity/Recall: ","({:.2%})".format(sensitivity))
    print("Specificity:        ","({:.2%})".format(specificity))
    print("PPV:                ","({:.2%})".format(PPV))
    print("NPV:                ","({:.2%})".format(NPV))
    print("Balanced Accuracy:  ","({:.2%})".format(BA))
    print("F1 Score:           ","({:.2%})".format(F1))

    report_metrics = [model_name, info,"%.4f" % accuracy, "%.4f" % sensitivity, "%.4f" % specificity, "%.4f" % PPV, "%.4f" % NPV,"%.4f" % BA,
                      "%.4f" % F1, 'AUC', TN, FP, FN, TP]

    return report_metrics

def saveFile(object_to_save, scaler_filename):
    """Persist ``object_to_save`` to ``scaler_filename`` with joblib.

    Nothing is returned; the value returned by ``joblib.dump`` is ignored.
    """
    joblib.dump(value=object_to_save, filename=scaler_filename)

def loadFile(scaler_filename):
    """Load and return the object stored in ``scaler_filename`` with joblib.

    NOTE: joblib files are pickle-based, so only load files from a trusted
    source.
    """
    loaded_object = joblib.load(filename=scaler_filename)
    return loaded_object