In [1]:
import pandas as pd
import numpy as np
from numpy import where
#from matplotlip import pyplot
import matplotlib.pyplot as plt
from collections import Counter
import seaborn as sns

from scipy.stats import chi2_contingency

from sklearn.neighbors import KNeighborsClassifier 
import sklearn.model_selection as model_selection
from sklearn.model_selection import cross_val_score
from sklearn import preprocessing
from sklearn.datasets import make_blobs
from sklearn.model_selection import GridSearchCV 
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from decimal import *


In [2]:
#great function found on kaggle: https://www.kaggle.com/grfiv4/displaying-the-results-of-a-grid-search

def GridSearch_table_plot(grid_clf, param_name,
                          num_results=15,
                          negative=True,
                          graph=True,
                          display_all_params=True,
                          filename='tree_grid_accuracy.png'):

    '''Summarise a fitted grid search: print the best result, show the top
    rows of ``cv_results_`` and optionally plot score against one parameter.

    Arguments
    ---------

    grid_clf           the fitted search object; must expose best_estimator_,
                       best_params_, best_score_, best_index_ and cv_results_
                       for example: grid_clf = GridSearchCV( ...

    param_name         a string with the name of the parameter being tested
                       (the column 'param_' + param_name of cv_results_)

    num_results        an integer indicating the number of results to display
                       Default: 15

    negative           boolean: should the sign of the score be reversed?
                       scoring = 'neg_log_loss', for instance
                       Default: True

    graph              boolean: should a graph be produced?
                       non-numeric parameters (True/False, None) don't graph well
                       Default: True

    display_all_params boolean: should we print out all of the parameters, not just the ones searched for?
                       Default: True

    filename           path the graph is saved to when graph is True
                       Default: 'tree_grid_accuracy.png'

    Returns
    -------

    None. Output is printed, displayed and (when graph is True) saved/shown.

    Raises
    ------

    KeyError           if cv_results_ has no 'param_' + param_name column

    Usage
    -----

    GridSearch_table_plot(grid_clf, "min_samples_leaf")

                          '''
    # Imported locally so the helper can be pasted into another notebook as-is.
    import pprint
    from matplotlib      import pyplot as plt
    from IPython.display import display
    import pandas as pd

    sign = -1 if negative else 1
    cv_results = grid_clf.cv_results_
    clf_score = sign * grid_clf.best_score_
    clf_stdev = cv_results['std_test_score'][grid_clf.best_index_]

    print("best parameters: {}".format(grid_clf.best_params_))
    print("best score:      {:0.5f} (+/-{:0.5f})".format(clf_score, clf_stdev))
    if display_all_params:
        pprint.pprint(grid_clf.best_estimator_.get_params())

    # rank-ordered results: row 0 is the best candidate
    # =================================================
    param_col = 'param_' + param_name
    scores_df = pd.DataFrame(cv_results).sort_values(by='rank_test_score')
    if param_col not in scores_df.columns:
        raise KeyError(param_col)

    # display the top 'num_results' results
    # =====================================
    display(scores_df.head(num_results))

    # Everything below is only needed for the graph. In particular the sort on
    # the parameter column is skipped when graph=False, because parameter
    # values of mixed types (e.g. 'sqrt' and 2) cannot be ordered.
    if not graph:
        return

    best_row = scores_df.iloc[0, :]
    best_mean = sign * best_row['mean_test_score']
    best_stdev = best_row['std_test_score']
    best_param = best_row[param_col]

    # plot the results, ordered along the x axis by parameter value
    # =============================================================
    plot_df = scores_df.sort_values(by=param_col)
    means = sign * plot_df['mean_test_score']
    stds = plot_df['std_test_score']
    params = plot_df[param_col]

    plt.figure(figsize=(8, 8))
    plt.errorbar(params, means, yerr=stds)

    # red band: one standard deviation either side of the best mean score
    plt.axhline(y=best_mean + best_stdev, color='red')
    plt.axhline(y=best_mean - best_stdev, color='red')
    plt.plot(best_param, best_mean, 'or')

    plt.title(param_name + " vs accuracy\nbest accuracy {:0.5f}".format(clf_score))
    plt.xlabel(param_name)
    plt.ylabel('accuracy')
    # save before show(): some backends discard the figure once it is shown
    plt.savefig(filename, bbox_inches='tight', dpi=300)
    plt.show()


In [3]:
## Helper: plot a confusion matrix (optionally row-normalized)
def plot_confusion_matrix(cm,
                          target_names,
                          title='Confusion matrix',
                          cmap=None,
                          normalize=True):
    """
    given a sklearn confusion matrix (cm), make a nice plot

    The figure is saved to ``title + '.png'`` and then shown.

    Arguments
    ---------
    cm:           confusion matrix from sklearn.metrics.confusion_matrix
                  (any 2-D array-like of counts; rows are summed to normalize)

    target_names: given classification classes such as [0, 1, 2]
                  the class names, for example: ['high', 'medium', 'low']
                  pass None to leave the default numeric ticks

    title:        the text to display at the top of the matrix
                  (also used as the stem of the saved file name)

    cmap:         the gradient of the values displayed from matplotlib.pyplot.cm
                  see http://matplotlib.org/examples/color/colormaps_reference.html
                  plt.get_cmap('jet') or plt.cm.Blues

    normalize:    If False, plot the raw numbers
                  If True, plot the proportions (each row divided by its sum;
                  a row whose sum is 0 is shown as all zeros)

    Returns
    -------
    None

    Usage
    -----
    plot_confusion_matrix(cm           = cm,                  # confusion matrix created by
                                                              # sklearn.metrics.confusion_matrix
                          normalize    = True,                # show proportions
                          target_names = y_labels_vals,       # list of names of the classes
                          title        = best_estimator_name) # title of graph

    Citation
    --------
    http://scikit-learn.org/stable/auto_examples/model_selection/plot_confusion_matrix.html

    """
    # Imported locally so the helper can be pasted into another notebook as-is.
    import matplotlib.pyplot as plt
    import numpy as np
    import itertools

    cm = np.asarray(cm)

    if cmap is None:
        cmap = plt.get_cmap('Blues')

    # Normalize BEFORE drawing so the cell colours, the printed numbers and the
    # text-colour threshold all refer to the same values.
    if normalize:
        row_totals = cm.sum(axis=1, keepdims=True)
        # rows with no samples stay 0 instead of becoming NaN
        cm = np.divide(cm.astype('float'), row_totals,
                       out=np.zeros(cm.shape, dtype=float),
                       where=row_totals != 0)

    plt.figure(figsize=(8, 6))
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()

    if target_names is not None:
        tick_marks = np.arange(len(target_names))
        plt.xticks(tick_marks, target_names, rotation=45)
        plt.yticks(tick_marks, target_names)

    # cells above the threshold are dark, so they get white text
    thresh = cm.max() / 1.5 if normalize else cm.max() / 2
    cell_format = "{:0.4f}" if normalize else "{:,}"
    for i, j in itertools.product(range(cm.shape[0]), range(cm.shape[1])):
        plt.text(j, i, cell_format.format(cm[i, j]),
                 horizontalalignment="center",
                 color="white" if cm[i, j] > thresh else "black")

    plt.tight_layout()
    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    # save before show(): some backends discard the figure once it is shown
    plt.savefig(title + '.png',bbox_inches='tight', dpi=300)
    plt.show()


In [8]:
def evaluate_model(model,x, y, cm_title, cm_filename):
    """Cross-validate ``model`` on (x, y) with 5 folds and print its metrics.

    Prints the model, accuracy, sensitivity, specificity, false positive
    rate, precision, the sklearn classification report and the AUC. When
    ``cm_title`` is given, the confusion matrix is also plotted and printed.

    Arguments
    ---------
    model:       estimator passed to cross_val_predict; it must support
                 method='predict_proba' for the AUC step
    x, y:        features and true labels
                 NOTE(review): the code indexes the confusion matrix as 2x2 and
                 takes probability column 1, so y is presumably binary with the
                 positive class second - confirm against the callers
    cm_title:    title for the confusion-matrix plot; None skips the plot and
                 the printed matrix
    cm_filename: path the confusion-matrix figure is saved to (only used when
                 cm_title is not None)

    Returns
    -------
    None
    """
    y_pred = cross_val_predict(model, x, y, cv=5)

    print(model)

    # EVALUATION
    cm = confusion_matrix(y, y_pred)
    # sklearn layout: rows = true label, columns = predicted label
    TN = cm[0, 0]
    FP = cm[0, 1]

    if cm_title is not None:
        plot_confusion_matrix(cm = cm,
                              normalize    = True,
                              target_names = ['benign', 'malignant'],
                              title        = cm_title)
        # plot_confusion_matrix ends with plt.show(); on backends where show()
        # closes the figure, an unconditional savefig here would create and
        # save a new blank figure. Only save when a figure is still open.
        if plt.get_fignums():
            plt.savefig(cm_filename,bbox_inches='tight', dpi=300)
        else:
            print("Confusion matrix figure already closed; not writing {} "
                  "(plot_confusion_matrix saved it as {})".format(
                      cm_filename, cm_title + '.png'))
        print("Confusion Matrix")
        print(cm)

    #Classification Accuracy
    acc = metrics.accuracy_score(y, y_pred)
    print("Accuracy: {}".format(acc))

    #Sensitivity
    recall = metrics.recall_score(y, y_pred)
    print("Sensitivity: {}".format(recall))

    #Specificity
    negatives = float(TN + FP)
    specificity = TN / negatives
    print("Specificity: {}".format(specificity))

    #False Positive Rate
    false_positive_rate = FP / negatives
    print("False Positive Rate: {}".format(false_positive_rate))

    #Precision
    pr = metrics.precision_score(y, y_pred)
    print("Preciscion: {}".format(pr))

    report = metrics.classification_report(y, y_pred)
    print(report)

    #AUC Score (no ROC plotting here)
    # cross-validated predicted probabilities for class 1
    y_pred_prob = cross_val_predict(model, x, y, cv=5,method='predict_proba')[:, 1]
    auc = metrics.roc_auc_score(y, y_pred_prob)
    print("AUC: {}".format(auc))
    

In [5]:
#ROC Curve plotting
def roc_curve(y, y_pred_prob, title, filename):
    """Plot the ROC curve for the given scores and save it to ``filename``.

    Arguments
    ---------
    y:            true labels, passed to sklearn's metrics.roc_curve
    y_pred_prob:  predicted scores/probabilities for the positive class
    title:        title drawn above the plot
    filename:     path the figure is written to

    Returns
    -------
    None. The current figure is closed after saving, so consecutive calls do
    not draw on top of each other.
    """
    fpr, tpr, _thresholds = metrics.roc_curve(y, y_pred_prob)
    try:
        plt.plot(fpr, tpr)
        plt.xlim([0.0, 1.0])
        plt.ylim([0.0, 1.0])
        plt.title(title)
        plt.xlabel('False Positive Rate (1 - Specificity)')
        plt.ylabel('True Positive Rate (Sensitivity)')
        plt.grid(True)
        plt.savefig(filename, bbox_inches='tight')
    finally:
        # The original read `plt.close` without parentheses, which is a no-op:
        # the figure stayed open and later curves piled up on the same axes.
        plt.close()
