# Semi-supervised learning
## Label spreading

In [1]:
# Code source: Sebastian Curi and Andreas Krause, based on Jaques Grobler (sklearn demos).
# License: BSD 3 clause

# We start importing some modules and running some magic commands
% matplotlib inline
% reload_ext autoreload
% load_ext autoreload
% autoreload 2

# General math and plotting modules.
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
from matplotlib.colors import ListedColormap
import itertools
import plot_helpers

# Widget and formatting modules
import ipywidgets
from ipywidgets import interact, interactive, interact_manual, fixed
import pylab
# If the figures are not displayed well in your browser, change the following line.
pylab.rcParams['figure.figsize'] = (20, 5)

# Machine Learning library. 
from sklearn import datasets
from sklearn import svm
from sklearn.semi_supervised import label_propagation
from scipy import stats


The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [2]:
# Authors: Clay Woolam <clay@woolam.org>
# License: BSD


def label_spreading(labeled_data_percentage):
    """Fit label spreading on partially labelled iris data and plot the result.

    Only the first two iris features are used. A sample has its label hidden
    (replaced by -1) when a uniform draw from a fixed-seed generator falls
    below ``1 - labeled_data_percentage``. A ``LabelSpreading`` model with
    default settings is fitted on the partially labelled data, its predictions
    over a dense grid are drawn as filled contours, and the samples are
    overlaid on top.

    Parameters
    ----------
    labeled_data_percentage : float
        Despite the name this is used as a fraction: each sample keeps its
        label with this probability (1.0 keeps every label).
    """
    dataset = datasets.load_iris()
    features = dataset.data[:, :2]
    targets = dataset.target

    # Hide labels with a fixed seed so the same slider value always yields
    # the same labelled subset.
    rng = np.random.RandomState(0)
    hidden = rng.rand(len(targets)) < 1 - labeled_data_percentage
    partial_targets = np.copy(targets)
    partial_targets[hidden] = -1

    model = label_propagation.LabelSpreading()
    model.fit(features, partial_targets)

    # Dense evaluation grid covering the data with a margin of 1 on each side.
    grid_step = .02
    lower = features.min(axis=0) - 1
    upper = features.max(axis=0) + 1
    grid_x, grid_y = np.meshgrid(np.arange(lower[0], upper[0], grid_step),
                                 np.arange(lower[1], upper[1], grid_step))

    # Predict a class for every grid node and draw the resulting regions.
    grid_points = np.column_stack((grid_x.ravel(), grid_y.ravel()))
    grid_labels = model.predict(grid_points).reshape(grid_x.shape)
    plt.contourf(grid_x, grid_y, grid_labels, cmap=plt.cm.Paired)
    plt.axis('off')

    # Overlay the samples: white markers are the ones whose label was hidden.
    color_map = {-1: (1, 1, 1), 0: (0, 0, .9), 1: (1, 0, 0), 2: (.8, .6, 0)}
    point_colors = [color_map[label] for label in partial_targets]
    plt.scatter(features[:, 0], features[:, 1], c=point_colors, edgecolors='black')

    plt.show()

# Interactive demo: the slider sets the fraction of samples that keep their
# label (0.1 to 1.0 in steps of 0.1).
interact(
    label_spreading,
    labeled_data_percentage=ipywidgets.FloatSlider(
        value=0.5,
        min=0.1,
        max=1.0,
        step=0.1,
        readout_format='.1f',
        style={'description_width': 'initial'},
        continuous_update=False,
    ),
)

<function __main__.label_spreading>

In [3]:
def semisupervised_mnist(max_iterations, step):
    """Active-learning demo with label spreading on the scikit-learn digits data.

    A fixed random subset of 330 digits is used and the first 10 start out
    labelled. In every round a ``LabelSpreading`` model is fitted, the
    entropy of each sample's predicted label distribution is computed, and the
    ``step`` unlabelled samples with the highest entropy receive their true
    label for the next round. Each round adds one row to the figure showing
    (at most) the first five of those samples with predicted and true digit.

    Parameters
    ----------
    max_iterations : int
        Number of active-learning rounds (figure rows). It is also passed to
        the model as ``max_iter``.
    step : int
        Number of unlabelled samples that get labelled after each round.
    """
    digits = datasets.load_digits()
    rng = np.random.RandomState(0)
    indices = np.arange(len(digits.data))
    rng.shuffle(indices)

    # Work on a fixed (seeded) random subset of 330 digits.
    subset = indices[:330]
    X = digits.data[subset]
    y = digits.target[subset]
    images = digits.images[subset]

    n_total_samples = len(y)
    n_labeled_points = 10
    # Only the first `kmax` queried digits of a round are drawn (one row each).
    kmax = 5

    unlabeled_indices = np.arange(n_total_samples)[n_labeled_points:]
    f = plt.figure()

    for i in range(max_iterations):
        if len(unlabeled_indices) == 0:
            print("No unlabeled items left to label.")
            break
        y_train = np.copy(y)
        y_train[unlabeled_indices] = -1

        # NOTE(review): the solver's iteration cap is tied to the number of
        # active-learning rounds; confirm this coupling is intended.
        lp_model = label_propagation.LabelSpreading(gamma=0.25, max_iter=max_iterations)
        lp_model.fit(X, y_train)

        # Entropy of the transduced label distribution of every sample.
        pred_entropies = stats.entropy(lp_model.label_distributions_.T)

        # Pick the `step` still-unlabelled samples the model is least sure about.
        uncertainty_index = np.argsort(pred_entropies)[::-1]
        uncertainty_index = uncertainty_index[np.isin(uncertainty_index, unlabeled_indices)][:step]

        f.text(.05, (max_iterations - i - 0.5)/max_iterations,
               "model %d\n\nfit with\n%d labels" % ((i + 1), n_labeled_points),
               size=8)

        # Visualize only the first `kmax` queried samples of this round.
        for index, image_index in enumerate(uncertainty_index[:kmax]):
            sub = f.add_subplot(max_iterations, kmax, index + 1 + (kmax * i))
            sub.imshow(images[image_index], cmap=plt.cm.gray_r, interpolation='none')
            sub.set_title("predict: %i\ntrue: %i" % (lp_model.transduction_[image_index], y[image_index]), size=10)
            sub.axis('off')

        # Reveal the labels of ALL queried samples (not just the last one
        # drawn), so the pool shrinks by exactly the amount that is added to
        # the label count. A boolean mask avoids indexing with a float array.
        unlabeled_indices = unlabeled_indices[~np.isin(unlabeled_indices, uncertainty_index)]
        n_labeled_points += len(uncertainty_index)

    # Figure-level decoration does not depend on the round, so do it once.
    # The guard avoids a division by zero when no round was requested.
    if max_iterations > 0:
        f.suptitle("Active learning with Label Propagation.\nRows show 5 most "
                   "uncertain labels to learn with the next model.", y=1.15)

        plt.subplots_adjust(left=0.2, right=0.9, wspace=0.1, hspace=0.85,
                            bottom=0.2/(max_iterations),
                            top=1-0.15/max_iterations)

    plt.show()
        
# Interactive demo: one slider for the number of active-learning rounds and
# one for how many samples get labelled after each round.
interact(
    semisupervised_mnist,
    max_iterations=ipywidgets.IntSlider(
        value=6,
        min=1,
        max=10,
        step=1,
        description='Max iter:',
        style={'description_width': 'initial'},
        continuous_update=False,
    ),
    step=ipywidgets.IntSlider(
        value=5,
        min=2,
        max=8,
        step=1,
        description='Label step:',
        style={'description_width': 'initial'},
        continuous_update=False,
    ),
)

<function __main__.semisupervised_mnist>