# ROC and Precision-Recall for Gaussian mixture models

In [None]:
from sklearn.datasets import make_blobs
import matplotlib.pyplot as plt
from matplotlib import cm
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import precision_recall_curve, auc, f1_score
from sklearn.metrics import roc_curve, roc_auc_score
from sklearn.naive_bayes import GaussianNB
from sklearn.inspection import DecisionBoundaryDisplay
import math

In [None]:
# Settings used as the default arguments of make_data.
n_samples = 10_000  # total number of points requested from make_blobs
n_bins = 2  # accepted by make_data, but not read by any code in this notebook section
mean = 2  # x-coordinate of the second blob centre, i.e. (mean, 0)
scale = 0.2  # factor applied to the second coordinate of the label-1 points

def make_data(n_samples=n_samples, mean=mean, n_bins=n_bins, scale=scale,
              test_size=0.8, random_state=42):
    """Generate a two-blob binary dataset and split it into train and test parts.

    Two blobs are drawn with ``make_blobs`` around the centres ``(0, 0)``
    (label 0) and ``(mean, 0)`` (label 1). The second coordinate of every
    label-1 point is then multiplied by ``scale``; label-0 points are left
    unchanged.

    Parameters
    ----------
    n_samples : int
        Total number of points passed to ``make_blobs``.
    mean : float
        x-coordinate of the second blob centre.
    n_bins : int
        Unused. Kept only so existing calls that pass it keep working.
    scale : float
        Multiplier for the second coordinate of the label-1 points.
    test_size : float or int
        Forwarded to ``train_test_split``. The default of 0.8 holds out 80%
        of the points for testing, so the models are fitted on the
        remaining 20%.
    random_state : int
        Seed forwarded to both ``make_blobs`` and ``train_test_split``.

    Returns
    -------
    tuple
        ``(X_train, X_test, y_train, y_test)``.
    """
    centers = [(0, 0), (mean, 0)]
    X, y = make_blobs(n_samples=n_samples, centers=centers, shuffle=False,
                      random_state=random_state)
    # (scale - 1) * y + 1 is 1 where y == 0 and `scale` where y == 1, so only
    # the label-1 points get their second coordinate rescaled.
    X[:, 1] *= (scale - 1) * y + 1
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=test_size, random_state=random_state
    )
    return X_train, X_test, y_train, y_test


X_train, X_test, y_train, y_test = make_data()

In [None]:
# Scatter plot of the training split, one colour per class label.
plt.figure()
y_unique = np.unique(y_train)
# One evenly spaced rainbow colour per distinct label.
colors = cm.rainbow(np.linspace(0.0, 1.0, y_unique.size))
marker_style = dict(alpha=0.5, edgecolor="k")
for this_y, color in zip(y_unique, colors):
    this_X = X_train[y_train == this_y]
    first_coord, second_coord = this_X[:, 0], this_X[:, 1]
    # The colour is passed as a 1x4 array so it is read as a single RGBA
    # colour rather than as four per-point values.
    plt.scatter(
        first_coord,
        second_coord,
        c=color.reshape(1, -1),
        label="Class %s" % this_y,
        **marker_style,
    )
plt.legend(loc="best")
plt.title("Data");

In [None]:
def compute_roc(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Fit ``model`` on the training data and build its ROC curve on the test data.

    The model is fitted in place, so it can be reused for prediction
    afterwards. Scores are taken from column 1 of ``predict_proba``.

    Returns
    -------
    tuple
        ``(fpr, tpr, scores)``: the false-positive and true-positive rates
        from ``roc_curve`` and the per-sample test scores they were computed
        from.
    """
    model.fit(X_train, y_train)
    scores = model.predict_proba(X_test)[:, 1]
    fpr, tpr, _thresholds = roc_curve(y_test, scores)
    return fpr, tpr, scores

# Fit both classifiers (compute_roc fits in place) and collect their ROC curves.
model_lg = LogisticRegression(solver='lbfgs')
model_nb = GaussianNB()
lr_fpr_lg, lr_tpr_lg, lr_probs_lg = compute_roc(model_lg)
lr_fpr_nb, lr_tpr_nb, lr_probs_nb = compute_roc(model_nb)

# Reference curve: the same constant score for every test sample.
ns_probs = [0] * len(y_test)
ns_fpr, ns_tpr, _ = roc_curve(y_test, ns_probs)

for curve_fpr, curve_tpr, curve_label in (
    (lr_fpr_lg, lr_tpr_lg, 'Logistic'),
    (lr_fpr_nb, lr_tpr_nb, 'Naive Bayes'),
):
    plt.plot(curve_fpr, curve_tpr, marker='.', label=curve_label)
plt.plot(ns_fpr, ns_tpr, linestyle='--', label='No Skill')
plt.xlabel('False Positive Rate')
plt.ylabel('True Positive Rate')
plt.legend()

In [None]:
# Area under the ROC curve on the test split, from the scores returned by compute_roc.
lr_auc_lg = roc_auc_score(y_test, lr_probs_lg)
lr_auc_nb = roc_auc_score(y_test, lr_probs_nb)
print('Logistic: ROC AUC=%.3f' % lr_auc_lg)
print('NB: ROC AUC=%.3f' % lr_auc_nb)

In [None]:
def compute_rocauc(model, X_train=X_train, y_train=y_train, X_test=X_test, y_test=y_test):
    """Fit ``model`` via ``compute_roc`` and return its ROC AUC on the test data.

    Only the scores returned by ``compute_roc`` are used; the curve
    coordinates are discarded.
    """
    roc_result = compute_roc(
        model,
        X_train=X_train,
        y_train=y_train,
        X_test=X_test,
        y_test=y_test,
    )
    test_scores = roc_result[2]
    return roc_auc_score(y_test, test_scores)

In [None]:
# Precision-recall curves on the test split for both classifiers.
lr_precision_lg, lr_recall_lg, _ = precision_recall_curve(y_test, lr_probs_lg)
lr_precision_nb, lr_recall_nb, _ = precision_recall_curve(y_test, lr_probs_nb)
# Sum of the test labels over their count: the positive fraction for 0/1
# labels, drawn below as a horizontal reference line.
no_skill = np.sum(y_test) / len(y_test)

for curve_recall, curve_precision, curve_label in (
    (lr_recall_lg, lr_precision_lg, 'Logistic'),
    (lr_recall_nb, lr_precision_nb, 'NB'),
):
    plt.plot(curve_recall, curve_precision, marker='.', label=curve_label)
plt.plot([0, 1], [no_skill, no_skill], linestyle='--', label='No Skill')
plt.xlabel('Recall')
plt.ylabel('Precision')
plt.legend();

In [None]:
# F1 score of the hard predictions and area under the precision-recall curve
# for each classifier. Both models were already fitted by compute_roc.
#
# The PR AUC is stored under pr_auc_* rather than lr_auc_*: the lr_auc_* names
# already hold the ROC AUC computed in an earlier cell, and rebinding them here
# would silently replace those values in the notebook state.
y_predict_lg = model_lg.predict(X_test)
lr_f1_lg = f1_score(y_test, y_predict_lg)
pr_auc_lg = auc(lr_recall_lg, lr_precision_lg)
print('Logistic: F1=%.3f PR AUC=%.3f' % (lr_f1_lg, pr_auc_lg))

y_predict_nb = model_nb.predict(X_test)
lr_f1_nb = f1_score(y_test, y_predict_nb)
pr_auc_nb = auc(lr_recall_nb, lr_precision_nb)
print('NB: F1=%.3f PR AUC=%.3f' % (lr_f1_nb, pr_auc_nb))