Here, I plot ROC curves for models with zero predictive power (random guessing, constant predictions, etc.) on balanced and imbalanced classification data sets to see what they look like. I'm just trying to build more intuition about ROC curves by playing around with them :-)

In [None]:
import numpy as np
from matplotlib import pyplot as plt
from typing import List

In [None]:
def plot_roc_curve(false_positive_rate: List[float], true_positive_rate: List[float]) -> None:
    """Scatter-plot ROC points in a new figure and display it.

    Args:
        false_positive_rate: x-coordinates, one false positive rate per
            threshold. Any array-like of numbers is accepted.
        true_positive_rate: y-coordinates, one true positive rate per
            threshold, in the same order as ``false_positive_rate``.
    """
    plt.figure()
    plt.xlabel("False Positive")
    plt.ylabel("True Positive")
    # asarray avoids copying inputs that are already NumPy arrays.
    plt.scatter(np.asarray(false_positive_rate), np.asarray(true_positive_rate))
    plt.show()


In [None]:
def calculate_roc_and_plot(labels: np.ndarray, predictions: np.ndarray) -> None:
    """Compute ROC points over a fixed threshold grid and plot them.

    A sample is predicted positive when its score is ``>= threshold``. A
    label equal to 1 counts as a positive; any other label counts as a
    negative. Thresholds start at 0.001 and advance in steps of 0.001 up to
    (but excluding) 1.

    Args:
        labels: Ground-truth labels.
        predictions: Scores, same shape as ``labels``.

    Raises:
        ValueError: If ``labels`` and ``predictions`` differ in shape.
    """
    labels = np.asarray(labels)
    predictions = np.asarray(predictions)
    if labels.shape != predictions.shape:
        raise ValueError(
            f"labels and predictions must have the same shape, "
            f"got {labels.shape} and {predictions.shape}"
        )

    # Class totals do not depend on the threshold, so count them once.
    is_positive = labels == 1
    all_positives = np.count_nonzero(is_positive)
    all_negatives = is_positive.size - all_positives

    thresholds = np.arange(0.001, 1, 0.001)
    # A rate is undefined when its class is absent from the labels; it stays
    # NaN in that case instead of raising ZeroDivisionError.
    false_positive_rate = np.full(thresholds.size, np.nan)
    true_positive_rate = np.full(thresholds.size, np.nan)

    for j, threshold in enumerate(thresholds):
        predicted_positive = predictions >= threshold
        true_positives = np.count_nonzero(predicted_positive & is_positive)
        # Every predicted positive that is not a true positive is a false one.
        false_positives = np.count_nonzero(predicted_positive) - true_positives
        if all_negatives:
            false_positive_rate[j] = false_positives / all_negatives
        if all_positives:
            true_positive_rate[j] = true_positives / all_positives

    plot_roc_curve(false_positive_rate, true_positive_rate)

In [None]:
# Balanced case: labels are fair coin flips, scores are random and independent
# of the labels.
# randint's upper bound is exclusive, so (0, 2) draws from {0, 1}; it replaces
# the deprecated np.random.random_integers(0, 1, ...).
balanced_labels = np.random.randint(0, 2, 1000)
balanced_distributed_predictions = np.random.random(1000)
calculate_roc_and_plot(balanced_labels, balanced_distributed_predictions)

In [None]:
# Imbalanced (more negative) labels, balanced predictions
# 500 coin-flip labels plus 500 guaranteed negatives, then shuffled.
# randint(0, 2, ...) replaces the deprecated random_integers(0, 1, ...);
# randint's upper bound is exclusive.
labels = np.append(np.random.randint(0, 2, 500), np.zeros(500))
np.random.shuffle(labels)
imbalanced_labels = labels
balanced_distributed_predictions = np.random.random(1000)
calculate_roc_and_plot(imbalanced_labels, balanced_distributed_predictions)

In [None]:
# Imbalanced (more positive) labels, balanced predictions
# 100 coin-flip labels plus 900 guaranteed positives, then shuffled.
# randint(0, 2, ...) replaces the deprecated random_integers(0, 1, ...);
# randint's upper bound is exclusive.
labels = np.append(np.random.randint(0, 2, 100), np.ones(900))
np.random.shuffle(labels)
imbalanced_labels = labels
balanced_distributed_predictions = np.random.random(1000)
calculate_roc_and_plot(imbalanced_labels, balanced_distributed_predictions)

In [None]:
# Imbalanced (more positive) labels, all predictions positive
# Every score is 1.0, so every sample is predicted positive at every threshold.
# randint(0, 2, ...) replaces the deprecated random_integers(0, 1, ...);
# randint's upper bound is exclusive.
labels = np.append(np.random.randint(0, 2, 500), np.ones(500))
np.random.shuffle(labels)
all_positive_predictions = np.ones(1000)
calculate_roc_and_plot(labels, all_positive_predictions)

In [None]:
# Imbalanced (more positive) labels, all predictions negative
# Every score is 0.0, so no sample is predicted positive at any threshold.
# randint(0, 2, ...) replaces the deprecated random_integers(0, 1, ...);
# randint's upper bound is exclusive.
labels = np.append(np.random.randint(0, 2, 500), np.ones(500))
np.random.shuffle(labels)
all_negative_predictions = np.zeros(1000)
calculate_roc_and_plot(labels, all_negative_predictions)

In [None]:
# Imbalanced (more positive) labels, imbalanced predictions (in the same direction)
# Note that, given the predictions are weighted toward the dominant class, I don't think you can call this
# "having no predictive power" anymore -- it's a crude model that has learned predicting the dominant class helps
# NOTE(review): labels and predictions are shuffled independently below, so an
# individual score is not tied to its own label -- confirm this is intended.
# randint(0, 2, ...) replaces the deprecated random_integers(0, 1, ...);
# randint's upper bound is exclusive.
labels = np.append(np.random.randint(0, 2, 100), np.ones(900))
np.random.shuffle(labels)
print(labels.mean())
# 100 unscaled random scores plus 900 scores shifted into the 0.75-1 range.
positively_weighted_predictions = np.append(np.random.random(100), (np.random.random(900)/4 + 0.75))
print(positively_weighted_predictions.mean())
np.random.shuffle(positively_weighted_predictions)
calculate_roc_and_plot(labels, positively_weighted_predictions)