# Background check for semi-supervised learning

In [1]:
%matplotlib notebook
import numpy as np
import matplotlib.pyplot as plt
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn.base import clone

plt.style.use('ggplot')
plt.rcParams["savefig.format"] = 'svg'
plt.rcParams['image.cmap'] = 'viridis'

np.random.seed(42)

#x, y = make_classification(n_samples=10000, n_features=2, n_informative=5, n_redundant=0, n_clusters_per_class=4)

n_samples=1000
n_features=2
centers=10
x, blob_id = make_blobs(n_samples=n_samples, n_features=n_features,
                        centers=centers, shuffle=False, cluster_std=2)
y = blob_id%2

fig = plt.figure()
ax = fig.add_subplot()
ax.scatter(x[:,0], x[:,1], c=y, alpha=0.5, edgecolor='grey')
dataset_name = 'blobs_s_{}_f_{}_c_{}'.format(n_samples, n_features, centers)
fig.savefig('dataset_{}'.format(dataset_name))

<IPython.core.display.Javascript object>

In [2]:
classes, counts = np.unique(y, return_counts=True)
fig = plt.figure()
ax = fig.add_subplot()
ax.bar(classes, counts)

<IPython.core.display.Javascript object>

<BarContainer object of 2 artists>

In [3]:
# generation of a grid for later model exploration
# define bounds of the domain
min1, max1 = x[:, 0].min()-1, x[:, 0].max()+1
min2, max2 = x[:, 1].min()-1, x[:, 1].max()+1
# define the x and y scale
x1grid = np.arange(min1, max1, 0.1)
x2grid = np.arange(min2, max2, 0.1)
# create all of the lines and rows of the grid
xx, yy = np.meshgrid(x1grid, x2grid)
# flatten each grid to a vector
r1, r2 = xx.flatten(), yy.flatten()
r1, r2 = r1.reshape((len(r1), 1)), r2.reshape((len(r2), 1))
# horizontal stack vectors to create x1,x2 input for the model
grid = np.hstack((r1,r2))

# Labelled and unlabelled data

We can generate unlabelled data by selecting a subset of the original data and dropping its labels. We will generate the unlabelled data from two different forms:

1. IID: Uniformly random from the labelled distribution
2. OOD: Non-uniformly random from the labelled distribution

In this example, given that the samples are sorted by blobs, we can split the data into two parts for option (2), or shuffle the data and split for option (1).

1. IID: shuffle=True
2. OOD: shuffle=False

In [4]:
shuffle=False
dataset_name += '_iid' if shuffle else '_ood'

x_label, x_unlabel, y_label, y_unlabel = train_test_split(x, y, test_size=0.7, shuffle=shuffle)

fig = plt.figure()
ax = fig.add_subplot()
ax.scatter(x_label[:,0], x_label[:,1], c=y_label, edgecolor='grey', alpha=0.5)
ax.scatter(x_unlabel[:,0], x_unlabel[:,1], c='white', edgecolors='black', alpha=0.5)
fig.savefig('semi_{}'.format(dataset_name))

<IPython.core.display.Javascript object>

In [5]:
classes, counts = np.unique(y_label, return_counts=True)
fig = plt.figure()
ax = fig.add_subplot()
ax.bar(classes, counts)

<IPython.core.display.Javascript object>

<BarContainer object of 2 artists>

## Supervised model

In the subset of labeled data we can train a probabilistic classifier, and check the prediction space.

In [6]:
from sklearn.svm import SVC

classifier_dict = {'svm_rbf': SVC(kernel='rbf', probability=True)}

clf_name = 'svm_rbf'

# TODO: Check what happens when I do not clone.
#clf = classifier_dict[clf_name]
clf = clone(classifier_dict[clf_name])

clf.fit(x_label, y_label)

# make predictions for the grid
yhat = clf.predict_proba(grid)[:,1]
# reshape the predictions back into a grid
zz = yhat.reshape(xx.shape)
# plot the grid of x, y and z values as a surface
fig = plt.figure()
ax = fig.add_subplot()
im = ax.contourf(xx, yy, zz)
fig.colorbar(im, label='$p(y=1|x)$')
ax.scatter(x_label[:,0], x_label[:,1], c=y_label, edgecolors='grey',
            alpha=0.5)
ax.scatter(x_unlabel[:,0], x_unlabel[:,1], c='white', edgecolors='black',
            alpha=0.5)
ax.set_title('Accuarcy = {:0.3f}'.format(accuracy_score(y_label, clf.predict(x_label))))
fig.savefig('{}_clf_{}'.format(dataset_name, clf_name))

<IPython.core.display.Javascript object>

We define a function to compute a given metric function to the rejected, and accepted samples from a full range of an arbitrary sorting value.

In [7]:
def function_vs_reject(ground_truth, predictions, thresholds, func=accuracy_score):
    thresholds_sort_idx = np.argsort(-1*thresholds)
    predictions_sorted = predictions[thresholds_sort_idx]
    ground_truth_sorted = ground_truth[thresholds_sort_idx]
    accepted = np.zeros(thresholds.shape[0])
    rejected = np.zeros(thresholds.shape[0])
    samples = np.zeros(thresholds.shape[0])
    for i, threshold_id in enumerate(thresholds_sort_idx):
        threshold = thresholds[threshold_id]
        rejected[i] = func(ground_truth_sorted[i:], predictions_sorted[i:])
        accepted[i] = func(ground_truth_sorted[:i], predictions_sorted[:i])
        samples[i] = ground_truth_sorted[:i].shape[0]
            

    return accepted, rejected, samples, thresholds[thresholds_sort_idx]

Here, we will sort the accepted samples by the confidence of the binary classifier's prediction (maximum posterior probability per sample). We will show the accuracy of the accepted and rejected samples.

In [8]:
from matplotlib import gridspec
from matplotlib.ticker import MaxNLocator
from sklearn.metrics import f1_score, accuracy_score

from functools import partial

#func = partial(f1_score, zero_division=0)
func = accuracy_score

x_aux = x_unlabel
y_aux = y_unlabel

predictions = clf.predict(x_aux)
p_conf = np.max(clf.predict_proba(x_aux), axis=1)

accepted, rejected, samples, sorting_value = function_vs_reject(y_aux, predictions, p_conf,
                                                                func=func)

spec = gridspec.GridSpec(ncols=1, nrows=2, height_ratios=[5, 1],
                         wspace=0.02, hspace=0.05, left=0.15)
fig = plt.figure()
ax = fig.add_subplot(spec[0])
ax.plot(sorting_value, accepted, label='accepted')
ax.plot(sorting_value, rejected, label='rejected')
ax.set_ylabel('Accuracy')
ax.set_xticklabels([])
ax.set_xlim([sorting_value.max(), sorting_value.min()])
ax.set_ylim([-0.02, 1.02])
ax.legend()
ax = fig.add_subplot(spec[1])
ax.plot(sorting_value, samples)
ax.plot(sorting_value, samples.max() - samples)
ax.set_ylabel('#count')
ax.yaxis.set_major_locator(MaxNLocator(integer=True, prune='upper', nbins=3))
ax.set_xlabel('threshold max$(p(y=1|x), p(y=0|x))$')
ax.set_xlim([sorting_value.max(), sorting_value.min()])
fig.align_labels()
fig.savefig('')
plt.savefig('{}_clf_{}_conf_reject'.format(dataset_name, clf_name))

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


<IPython.core.display.Javascript object>

In [9]:
spec = gridspec.GridSpec(ncols=1, nrows=2, height_ratios=[5, 1],
                         wspace=0.02, hspace=0.05, left=0.15)

percentage = np.linspace(0, 100, x_aux.shape[0])

fig = plt.figure()
ax = fig.add_subplot(spec[0])
ax.plot(percentage, accepted, label='accepted')
ax.plot(percentage, rejected, label='rejected')
ax.set_ylabel('Accuracy')
ax.set_xticklabels([])
#ax.set_xlim([sorting_value.max(), sorting_value.min()])
ax.set_ylim([-0.02, 1.02])
ax.legend()
ax = fig.add_subplot(spec[1])
ax.plot(percentage, samples)
ax.plot(percentage, samples.max() - samples)
ax.set_ylabel('#count')
ax.yaxis.set_major_locator(MaxNLocator(integer=True, prune='upper', nbins=3))
ax.set_xlabel('percentage')
#ax.set_xlim([sorting_value.max(), sorting_value.min()])
fig.align_labels()
fig.savefig('{}_clf_{}_conf_reject_perc'.format(dataset_name, clf_name))

<IPython.core.display.Javascript object>

# Labelled vs unlabelled

We will focus now on the labelled vs unlabelled data (not distinguishing between classes).

In [10]:
x_l_vs_u = np.concatenate([x_label, x_unlabel])
y_l_vs_u = np.concatenate([np.ones(x_label.shape[0]), np.zeros(x_unlabel.shape[0])])
fig = plt.figure()
ax = fig.add_subplot()
plt.scatter(x_l_vs_u[:,0], x_l_vs_u[:,1], c=y_l_vs_u, cmap='binary',
            edgecolor='k', alpha=0.5)
plt.savefig('{}_lab_vs_unl'.format(dataset_name))

<IPython.core.display.Javascript object>

## Supervised model

to differentiate between samples with or without a label. If the samples are difficult to differentiate, we can expect both labelled and unlabelled data comming from the same data distribution. On the oposite case, if the model is able to predict from which set different sample come, is indicative of out of distribution data, for which we should not make clear assumptions.

In [11]:
clf_fg_name = 'svm_rbf'

# TODO: Check what happens when I do not clone
# clf_fg = classifier_dict[clf_name]
clf_fg = clone(classifier_dict[clf_name])

clf_fg.fit(x_l_vs_u, y_l_vs_u)

SVC(probability=True)

In [12]:
# make predictions for the grid
yhat = clf_fg.predict_proba(grid)[:,1]
# reshape the predictions back into a grid
zz = yhat.reshape(xx.shape)
# plot the grid of x, y and z values as a surface
fig = plt.figure()
ax = fig.add_subplot()
im = plt.contourf(xx, yy, zz, cmap='binary')
fig.colorbar(im, label='$p(fg|x)$')
ax.scatter(x_l_vs_u[:,0], x_l_vs_u[:,1], c=y_l_vs_u, cmap='binary',
            edgecolor='grey', alpha=0.5)
ax.set_title('Accuarcy = {}'.format(accuracy_score(y_l_vs_u, clf_fg.predict(x_l_vs_u))));
fig.savefig('{}_clf_fg_{}'.format(dataset_name, clf_name))

<IPython.core.display.Javascript object>

Here, we will sort the accepted samples by the posterior probability of belonging to the same distribution as the labelled samples. We will show the accuracy of the accepted and rejected samples.

In [13]:
predictions = clf.predict(x_aux)
p_fg = clf_fg.predict_proba(x_aux)[:,1]

accepted, rejected, samples, sorting_value = function_vs_reject(y_aux, predictions, p_fg,
                                                                func=func)

spec = gridspec.GridSpec(ncols=1, nrows=2, height_ratios=[5, 1],
                         wspace=0.02, hspace=0.05, left=0.15)
fig = plt.figure()
ax = fig.add_subplot(spec[0])
ax.plot(sorting_value, accepted, label='accepted')
ax.plot(sorting_value, rejected, label='rejected')
ax.set_ylabel('Accuracy')
ax.set_xticklabels([])
ax.set_xlim([sorting_value.max(), sorting_value.min()])
ax.set_ylim([-0.02, 1.02])
ax.legend()
ax = fig.add_subplot(spec[1])
ax.plot(sorting_value, samples)
ax.plot(sorting_value, samples.max() - samples)
ax.set_ylabel('#count')
ax.yaxis.set_major_locator(MaxNLocator(integer=True, prune='upper', nbins=3))
ax.set_xlabel('threshold $p(fg|x)$')
ax.set_xlim([sorting_value.max(), sorting_value.min()])
fig.align_labels()
plt.savefig('{}_clf_{}_bg_reject_{}'.format(dataset_name, clf_name, clf_fg_name))

  avg = a.mean(axis)
  ret = ret.dtype.type(ret / rcount)


<IPython.core.display.Javascript object>

The same figure but with percentage in the x-axis instead of posterior probabilities.

In [14]:
spec = gridspec.GridSpec(ncols=1, nrows=2, height_ratios=[5, 1],
                         wspace=0.02, hspace=0.05, left=0.15)

percentage = np.linspace(0, 100, x_aux.shape[0])

fig = plt.figure()
ax = fig.add_subplot(spec[0])
ax.plot(percentage, accepted, label='accepted')
ax.plot(percentage, rejected, label='rejected')
ax.set_ylabel('Accuracy')
ax.set_xticklabels([])
#ax.set_xlim([sorting_value.max(), sorting_value.min()])
ax.set_ylim([-0.02, 1.02])
ax.legend()
ax = fig.add_subplot(spec[1])
ax.plot(percentage, samples)
ax.plot(percentage, samples.max() - samples)
ax.set_ylabel('#count')
ax.yaxis.set_major_locator(MaxNLocator(integer=True, prune='upper', nbins=3))
ax.set_xlabel('percentage')
#ax.set_xlim([sorting_value.max(), sorting_value.min()])
fig.align_labels()
plt.savefig('{}_clf_{}_bg_reject_{}_perc'.format(dataset_name, clf_name, clf_fg_name))

<IPython.core.display.Javascript object>

In [15]:
fig = plt.figure()
ax = fig.add_subplot()
ax.scatter(accepted, rejected, alpha=0.1)
ax.set_title('Pairs by threshold')
ax.set_xlabel('Accuracy on accepted')
ax.set_ylabel('Accuracy on rejected')
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)
plt.savefig('{}_clf_{}_bg_{}_acc_reject_vs_accept'.format(dataset_name, clf_name, clf_fg_name))

<IPython.core.display.Javascript object>

In [16]:
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(sorting_value, accepted, rejected, alpha=0.1)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_zlim(0, 1)
ax.set_xlabel('Reject threshold')
ax.set_ylabel('Accuracy on accepted')
ax.set_zlabel('Accuracy on rejected')

<IPython.core.display.Javascript object>

Text(0.5, 0, 'Accuracy on rejected')

- the x-axis represents the proportion of misclassifications that have been rejected: 'the amount of rejected misclassifications / total amount of misclassifications' for increasing rejection threshold
- the y-axis represents the proportion of correct predictions that have been rejected: 'the amount of rejected good classifications / total amount of good classifications' for increasing rejecting threshold

In [17]:
fig = plt.figure()
ax = fig.add_subplot()
ax.scatter((1 - rejected)/ (2 - (rejected + accepted)),
           (rejected)/ ((rejected + accepted)), alpha=0.1)
ax.set_ylim(0, 1)
ax.set_xlim(0, 1)
ax.set_xlabel('Proportion of rejected missclassifications')
ax.set_ylabel('Proportion of rejected correctly classified')

<IPython.core.display.Javascript object>

Text(0, 0.5, 'Proportion of rejected correctly classified')

In [18]:
fig = plt.figure()
ax = fig.add_subplot(projection='3d')
ax.scatter(sorting_value,
           (1 - rejected)/ (2 - (rejected + accepted)),
           (rejected)/ ((rejected + accepted)), alpha=0.1)
ax.set_xlim(0, 1)
ax.set_ylim(0, 1)
ax.set_zlim(0, 1)
ax.set_xlabel('Reject threshold')
ax.set_ylabel('Proportion of rejected missclassifications')
ax.set_zlabel('Proportion of rejected correctly classified')

<IPython.core.display.Javascript object>

Text(0.5, 0, 'Proportion of rejected correctly classified')