In [None]:
from matplotlib import cm
import matplotlib.pyplot as plt

import numpy as np
from sklearn.semi_supervised import LabelPropagation, LabelSpreading, SelfTrainingClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [None]:
def forceAspect(ax,aspect=1):
    """Set the aspect of ``ax`` from the extent of its first image.

    The extent of the first image returned by ``ax.get_images()`` is read as
    ``(x0, x1, y0, y1)``; the axes aspect is then set to
    ``abs((x1 - x0) / (y1 - y0)) / aspect``.

    Parameters
    ----------
    ax : object providing ``get_images()`` and ``set_aspect()``
        Axes holding at least one image.
    aspect : number, default 1
        Divisor applied to the width/height ratio of the extent.
    """
    first_image = ax.get_images()[0]
    bounds = first_image.get_extent()
    width = bounds[1] - bounds[0]
    height = bounds[3] - bounds[2]
    ax.set_aspect(abs(width / height) / aspect)


### Dummy datasets

##### Utilities

In [None]:
def plot_data(X, y, cmap=None, title=None):
    """Scatter-plot 2D points coloured by their label or score.

    Parameters
    ----------
    X : array of shape (n_samples, 2)
        Point coordinates; the two columns are passed to ``scatter`` as x and y.
    y : array-like of shape (n_samples,)
        Value per point, fed to ``cmap`` (expected to lie in [0, 1], e.g. hard
        0/1 labels or class probabilities). Entries equal to -1 are treated as
        unlabeled and drawn with the colour for 0.5, the middle of the map.
    cmap : callable, optional
        Colormap turning the values into colours. Defaults to a 100-level
        'RdBu' colormap.
    title : str, optional
        Title of the figure; omitted when ``None``.
    """
    if cmap is None:
        # Resolved lazily through pyplot: ``matplotlib.cm.get_cmap`` was removed
        # in Matplotlib 3.9, whereas ``plt.get_cmap`` works on old and new versions.
        cmap = plt.get_cmap('RdBu', 100)
    fig, ax = plt.subplots()
    # astype returns a fresh array, so the caller's ``y`` is left untouched.
    z = np.asarray(y).astype(float)
    z[z == -1] = 0.5
    # Draw the values closest to 0.5 first so that confidently coloured points
    # end up on top of unlabeled / uncertain ones.
    order = np.argsort(np.abs(z - 0.5))
    ax.scatter(*X[order, :].T, c=cmap(z[order]))
    if title is not None:
        ax.set_title(title)
    ax.set_aspect('equal')

In [None]:
def mask_all_but(y, n):
    """Return an integer copy of ``y`` in which all but ``n`` random entries are -1.

    Parameters
    ----------
    y : array-like of shape (n_samples,)
        Class labels; they are cast to ``int``.
    n : int
        Number of entries that keep their label. If ``n`` is at least
        ``len(y)``, nothing is masked.

    Returns
    -------
    numpy.ndarray
        New integer array; the input is not modified. Masked entries hold -1.

    Notes
    -----
    Positions are drawn from the global ``np.random`` state, so seed it
    beforehand for reproducible masks.
    """
    y = np.asarray(y).astype(int)  # astype copies, so the caller's data stays intact
    # Clamp at zero: asking to keep more labels than there are points used to
    # request a negative sample size and crash; now it simply masks nothing.
    n_masked = max(len(y) - n, 0)
    y[np.random.choice(len(y), n_masked, replace=False)] = -1  # sklearn convention
    return y

In [None]:
# Seed NumPy's global random state so the np.random-based datasets and label masks
# generated below are the same on every fresh run.
np.random.seed(75)

In [None]:
# Base number of points per dataset that keep their label; it is passed to
# mask_all_but, which marks every other point as unlabeled (-1).
n_labels = 9

##### Band

A diagonal band of emptiness separates two triangles, each of which represents a class.

In [None]:
# Uniform points in [0, 3) x [0, 1); everything closer than 0.15 (measured along
# the vertical axis) to the line y = 0.3 * x is dropped to leave an empty band.
x_band = np.random.random([200, 2]) * np.array([3, 1])
band_offset = x_band[:, 1] - 0.3 * x_band[:, 0]
outside_band = np.abs(band_offset) > 0.15
x_band = x_band[outside_band]
# Class 1 lies above the line, class 0 below; only n_labels points stay labeled.
y_band = mask_all_but((band_offset[outside_band] > 0).astype(float), n_labels)

In [None]:
# Show the band dataset; points masked as -1 get the mid-colormap colour.
plot_data(X=x_band, y=y_band)

##### Rings

Two rings of different classes that touch each other.

In [None]:
# Uniform points in [0, 2) x [0, 1); only those inside one of two annuli
# (inner radius 0.3, outer radius 0.5, centred at (0.5, 0.5) and (1.5, 0.5)) are kept.
# Note: rad_1 / rad_2 hold *squared* distances, hence the squared thresholds.
x_rings = np.random.random([700, 2]) * np.array([2, 1])
rad_1 = np.square(x_rings - np.array([0.5, 0.5])).sum(axis=1)
rad_2 = np.square(x_rings - np.array([1.5, 0.5])).sum(axis=1)
in_ring_1 = (rad_1 > 0.3**2) & (rad_1 < 0.5**2)
in_ring_2 = (rad_2 > 0.3**2) & (rad_2 < 0.5**2)
x_rings = x_rings[in_ring_1 | in_ring_2, :]
# Recompute the distance for the surviving points: the left ring is class 1.
rad_1 = np.square(x_rings - np.array([0.5, 0.5])).sum(axis=1)
in_ring_1 = (rad_1 > 0.3**2) & (rad_1 < 0.5**2)
y_rings = mask_all_but(in_ring_1.astype(float), n_labels)

In [None]:
# Show the rings dataset; points masked as -1 get the mid-colormap colour.
plot_data(X=x_rings, y=y_rings)

##### Filled rings

The same rings, but each one is filled with a blob of the other ring's class. This is an example with disjoint target regions.

In [None]:
# Same two annuli as the plain rings, plus a disc of radius 0.2 at the centre of each.
# Note: rad_1 / rad_2 hold *squared* distances, hence the squared thresholds.
x_rings2 = np.random.random([700, 2]) * np.array([2, 1])
rad_1 = np.square(x_rings2 - np.array([0.5, 0.5])).sum(axis=1)
rad_2 = np.square(x_rings2 - np.array([1.5, 0.5])).sum(axis=1)
keep_rings2 = (
    ((rad_1 > 0.3**2) & (rad_1 < 0.5**2))
    | ((rad_2 > 0.3**2) & (rad_2 < 0.5**2))
    | (rad_1 < 0.2**2)
    | (rad_2 < 0.2**2)
)
x_rings2 = x_rings2[keep_rings2]
# Recompute both distances for the surviving points.
rad_1 = np.square(x_rings2 - np.array([0.5, 0.5])).sum(axis=1)
rad_2 = np.square(x_rings2 - np.array([1.5, 0.5])).sum(axis=1)
# Class 1 = left ring together with the disc inside the right ring.
is_class_1 = ((rad_1 > 0.3**2) & (rad_1 < 0.5**2)) | (rad_2 < 0.2 ** 2)
y_rings2 = mask_all_but(is_class_1.astype(float), n_labels*2)

In [None]:
# Show the filled-rings dataset; points masked as -1 get the mid-colormap colour.
plot_data(X=x_rings2, y=y_rings2)

##### Blobs

Two 2D spherical (isotropic) normal distributions, one per class.

In [None]:
# Two unit-variance Gaussian clouds of n points each: the first is shifted to
# (3, 3) and labeled 1, the second stays at the origin and is labeled 0.
n = 70
x_blobs = np.vstack((
    np.random.normal(size=[n, 2]) + np.array([3, 3]),
    np.random.normal(size=[n, 2]),
))
y_blobs = mask_all_but(np.repeat([1.0, 0.0], n), n_labels)

In [None]:
# Show the blobs dataset; points masked as -1 get the mid-colormap colour.
plot_data(X=x_blobs, y=y_blobs)

## Semi-Supervised classification on dummies

### Scikit-Learn

Scikit-learn offers some semi-supervised approaches [out of the box](https://scikit-learn.org/stable/modules/semi_supervised.html).

#### Self Training Classifiers

In [None]:
# Self-training wrappers around several base classifiers, keyed by the title
# used for their plots. All of them iterate without an iteration cap
# (max_iter=None).
#
# The wrapped estimator is passed positionally on purpose: the keyword was
# renamed from `base_estimator` to `estimator` in scikit-learn 1.6 (the old
# name is deprecated and scheduled for removal), while the first positional
# slot is the estimator under both spellings.
stcs = {
    "Logistic, threshold": SelfTrainingClassifier(
        LogisticRegression(),
        criterion="threshold",  # Only well calibrated estimator. Not enough labels for others.
        max_iter=None,
    ),
    "Logistic, k-best": SelfTrainingClassifier(
        LogisticRegression(),
        criterion="k_best",
        max_iter=None,
    ),
    "Random Forest": SelfTrainingClassifier(
        RandomForestClassifier(n_estimators=100, n_jobs=-1),
        criterion="k_best",
        max_iter=None,
    ),
    "Naive Bayes": SelfTrainingClassifier(
        GaussianNB(),
        criterion="k_best",
        max_iter=None,
    ),
    "SVC": SelfTrainingClassifier(
        SVC(probability=True),  # probability=True is needed for predict_proba
        criterion="k_best",
        max_iter=None,
    ),
}

In [None]:
# Fit each self-training classifier on the band data and plot the second
# predict_proba column (class 1 when both classes are among the labeled points).
for label, stc in stcs.items():
    stc.fit(x_band, y_band)
    proba_class_1 = stc.predict_proba(x_band)[:, 1]
    plot_data(X=x_band, y=proba_class_1, title=label)

In [None]:
# Fit each self-training classifier on the rings data and plot the second
# predict_proba column (class 1 when both classes are among the labeled points).
for label, stc in stcs.items():
    stc.fit(x_rings, y_rings)
    proba_class_1 = stc.predict_proba(x_rings)[:, 1]
    plot_data(X=x_rings, y=proba_class_1, title=label)

In [None]:
# Fit each self-training classifier on the filled-rings data and plot the second
# predict_proba column (class 1 when both classes are among the labeled points).
for label, stc in stcs.items():
    stc.fit(x_rings2, y_rings2)
    proba_class_1 = stc.predict_proba(x_rings2)[:, 1]
    plot_data(X=x_rings2, y=proba_class_1, title=label)

In [None]:
# Fit each self-training classifier on the blobs data and plot the second
# predict_proba column (class 1 when both classes are among the labeled points).
for label, stc in stcs.items():
    stc.fit(x_blobs, y_blobs)
    proba_class_1 = stc.predict_proba(x_blobs)[:, 1]
    plot_data(X=x_blobs, y=proba_class_1, title=label)

#### Label Propagation

#### Label Spreading