In [94]:
%%capture
%run ../pu_wrapper.py

In [169]:
import numpy as np
import sklearn.datasets as datasets
from matplotlib import pyplot as plt
from pandas import DataFrame
import seaborn as sns
from sklearn.linear_model import LogisticRegressionCV
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import accuracy_score, f1_score
from ipywidgets import interact, interactive, fixed, interact_manual
from sklearn.model_selection import train_test_split

# Notebook-wide default figure size, applied through seaborn's rc settings.
sns.set(rc={'figure.figsize':(12,8)})

In [134]:
def plot(x, y):
    """Scatter-plot a two-column dataset, one colour per class label.

    Parameters
    ----------
    x : array indexable as ``x[:, 0]`` and ``x[:, 1]``
        Column 0 is drawn on the horizontal axis, column 1 on the vertical.
    y : sequence
        One label per row of ``x``. Only the labels 0 (blue) and 1 (red)
        have a colour assigned; any other label raises ``KeyError``.
    """
    df = DataFrame(dict(x=x[:, 0], y=x[:, 1], label=y))
    colors = {1: 'red', 0: 'blue'}
    # The notebook imports matplotlib's pyplot under the alias `plt`; the bare
    # name `pyplot` is not imported by any cell here, so use the alias.
    _, ax = plt.subplots()
    # Draw each class as its own scatter series so it gets a legend entry.
    for key, group in df.groupby('label'):
        group.plot(ax=ax, kind='scatter', x='x', y='y', label=key, color=colors[key])
    plt.show()

def make_meshgrid(x, y, h=.02):
    """Build a regular 2-D grid covering the data with a margin of 1 unit.

    Parameters
    ----------
    x, y : numpy arrays
        Coordinates along the first and second axis; only their min and max
        are used.
    h : float, optional
        Step between neighbouring grid points along both axes.

    Returns
    -------
    tuple
        ``(xx, yy)`` coordinate matrices as produced by ``np.meshgrid``.
    """
    x_ticks = np.arange(x.min() - 1, x.max() + 1, h)
    y_ticks = np.arange(y.min() - 1, y.max() + 1, h)
    grid_x, grid_y = np.meshgrid(x_ticks, y_ticks)
    return grid_x, grid_y

def plot_decision_helper(ax, clf, x, y, index, title):
    """Fit ``clf`` on ``(x, y)`` and draw its predictions on ``ax[index]``.

    The classifier is fitted in place, then evaluated on a grid built by
    ``make_meshgrid`` from the two columns of ``x``. The predicted labels are
    drawn as filled contours, with the training points scattered on top.

    Parameters
    ----------
    ax : indexable collection of matplotlib axes
    clf : estimator exposing ``fit(x, y)`` and ``predict(points)``
    x : array indexable as ``x[:, 0]`` and ``x[:, 1]``
    y : labels, also used to colour the scatter points
    index : position in ``ax`` of the panel to draw on
    title : title set on that panel
    """
    clf.fit(x, y)

    feature_0 = x[:, 0]
    feature_1 = x[:, 1]
    grid_x, grid_y = make_meshgrid(feature_0, feature_1)

    # Predict one label per grid node, then fold back to the grid's shape.
    grid_points = np.c_[grid_x.ravel(), grid_y.ravel()]
    predicted = clf.predict(grid_points).reshape(grid_x.shape)

    panel = ax[index]
    panel.contourf(grid_x, grid_y, predicted)
    panel.scatter(feature_0, feature_1, c=y, cmap=plt.cm.coolwarm, s=20, edgecolors='k')
    panel.set_ylabel('y')
    panel.set_xlabel('x')
    panel.set_title(title)
    

def plot_decision_function(clf, x, y, holdout, params):
    """Draw side-by-side predictions of a plain and a PU-wrapped classifier.

    Parameters
    ----------
    clf : callable
        Classifier class (or factory); called as ``clf(**params)`` once per
        panel so each panel gets a fresh estimator.
    x, y : training data, forwarded to ``plot_decision_helper``
    holdout : second argument passed to ``PuWrapper``
    params : dict of keyword arguments for ``clf``
    """
    _, panels = plt.subplots(1, 2)

    # Left panel: the estimator on its own.
    vanilla_model = clf(**params)
    plot_decision_helper(panels, vanilla_model, x, y, 0, "Vanilla")

    # Right panel: an identically configured estimator wrapped in PuWrapper.
    pu_model = PuWrapper(clf(**params), holdout)
    plot_decision_helper(panels, pu_model, x, y, 1, "PU")

## SVM

**Circles**

In [135]:
def test_circles(n_samples, noise, holdout):
    """Compare a plain and a PU-wrapped RBF SVC on the circles dataset.

    Parameters
    ----------
    n_samples : int
        Number of points passed to ``datasets.make_circles``.
    noise : float
        Noise level passed to ``datasets.make_circles``.
    holdout : float
        Forwarded to ``plot_decision_function``.
    """
    circles_params = {
        "n_samples": n_samples,
        "noise": noise,
        # Fixed seed (same value test_moons uses) so a given slider position
        # always produces the same dataset instead of a fresh random draw.
        "random_state": 30,
    }

    x, y = datasets.make_circles(**circles_params)
    plot(x, y)

    plot_decision_function(SVC, x, y, holdout, {"kernel": "rbf", "probability": True})

interact(test_circles, n_samples=(10, 1000, 10), noise=(0.0, 1.0, 0.01), holdout=(0.0, 1.0, 0.01));

interactive(children=(IntSlider(value=500, description='n_samples', max=1000, min=10, step=10), FloatSlider(va…

**Classification blobs**

In [145]:
def test_classification(n_samples, weights, holdout):
    """Compare a plain and a PU-wrapped RBF SVC on ``make_classification`` data.

    Parameters
    ----------
    n_samples : int
        Number of points passed to ``datasets.make_classification``.
    weights : float
        Wrapped in a one-element tuple and passed as ``weights``.
    holdout : float
        Forwarded to ``plot_decision_function``.
    """
    classification_params = {
        "n_samples": n_samples,
        "n_features": 2,
        "n_informative": 2,
        "n_redundant": 0,
        "n_repeated": 0,
        "weights": (weights,),
        # Fixed seed (same value test_moons uses) so a given slider position
        # always produces the same dataset instead of a fresh random draw.
        "random_state": 30,
    }

    x, y = datasets.make_classification(**classification_params)
    plot(x, y)

    plot_decision_function(SVC, x, y, holdout, {"kernel": "rbf", "probability": True})

interact(test_classification, n_samples=(10, 10000, 10), weights=(0.01, 0.99, 0.01), holdout=(0.0, 1.0, 0.01));

interactive(children=(IntSlider(value=5000, description='n_samples', max=10000, min=10, step=10), FloatSlider(…

**Moons**

In [148]:
def test_moons(n_samples, noise, holdout):
    """Compare a plain and a PU-wrapped RBF SVC on the moons dataset.

    Parameters
    ----------
    n_samples : int
        Number of points passed to ``datasets.make_moons``.
    noise : float
        Noise level passed to ``datasets.make_moons``.
    holdout : float
        Forwarded to ``plot_decision_function``.
    """
    # The seed is pinned, so the data depends only on the slider values.
    x, y = datasets.make_moons(n_samples=n_samples, noise=noise, random_state=30)
    plot(x, y)

    svc_params = {"kernel": "rbf", "probability": True}
    plot_decision_function(SVC, x, y, holdout, svc_params)

interact(test_moons, n_samples=(10, 5000, 10), noise=(0.01, 0.99, 0.01), holdout=(0.0, 1.0, 0.01));

interactive(children=(IntSlider(value=2500, description='n_samples', max=5000, min=10, step=10), FloatSlider(v…

## Random forest

**Circles**

In [154]:
def test_circles(n_samples, noise, holdout):
    """Compare a plain and a PU-wrapped random forest on the circles dataset.

    Parameters
    ----------
    n_samples : int
        Number of points passed to ``datasets.make_circles``.
    noise : float
        Noise level passed to ``datasets.make_circles``.
    holdout : float
        Forwarded to ``plot_decision_function``.
    """
    circles_params = {
        "n_samples": n_samples,
        "noise": noise,
        # Fixed seed (same value test_moons uses) so a given slider position
        # always produces the same dataset instead of a fresh random draw.
        "random_state": 30,
    }

    x, y = datasets.make_circles(**circles_params)
    plot(x, y)

    plot_decision_function(RandomForestClassifier, x, y, holdout, {"n_estimators": 50, "max_depth": 10})

interact(test_circles, n_samples=(10, 5000, 50), noise=(0.0, 1.0, 0.01), holdout=(0.0, 1.0, 0.01));

interactive(children=(IntSlider(value=2460, description='n_samples', max=5000, min=10, step=50), FloatSlider(v…

In [158]:
def test_classification(n_samples, weights, holdout):
    """Compare a plain and a PU-wrapped random forest on ``make_classification`` data.

    Parameters
    ----------
    n_samples : int
        Number of points passed to ``datasets.make_classification``.
    weights : float
        Wrapped in a one-element tuple and passed as ``weights``.
    holdout : float
        Forwarded to ``plot_decision_function``.
    """
    classification_params = {
        "n_samples": n_samples,
        "n_features": 2,
        "n_informative": 2,
        "n_redundant": 0,
        "n_repeated": 0,
        "weights": (weights,),
        # Fixed seed (same value test_moons uses) so a given slider position
        # always produces the same dataset instead of a fresh random draw.
        "random_state": 30,
    }

    x, y = datasets.make_classification(**classification_params)
    plot(x, y)

    plot_decision_function(RandomForestClassifier, x, y, holdout, {"n_estimators": 50, "max_depth": 5})

interact(test_classification, n_samples=(10, 10000, 10), weights=(0.01, 0.99, 0.01), holdout=(0.0, 1.0, 0.01));

interactive(children=(IntSlider(value=5000, description='n_samples', max=10000, min=10, step=10), FloatSlider(…

## Gradient boosting

In [161]:
def test_circles(n_samples, noise, holdout):
    """Compare a plain and a PU-wrapped gradient boosting model on the circles dataset.

    Parameters
    ----------
    n_samples : int
        Number of points passed to ``datasets.make_circles``.
    noise : float
        Noise level passed to ``datasets.make_circles``.
    holdout : float
        Forwarded to ``plot_decision_function``.
    """
    circles_params = {
        "n_samples": n_samples,
        "noise": noise,
        # Fixed seed (same value test_moons uses) so a given slider position
        # always produces the same dataset instead of a fresh random draw.
        "random_state": 30,
    }

    x, y = datasets.make_circles(**circles_params)
    plot(x, y)

    plot_decision_function(GradientBoostingClassifier, x, y, holdout, {"n_estimators": 50, "max_depth": 5})

interact(test_circles, n_samples=(10, 5000, 50), noise=(0.0, 1.0, 0.01), holdout=(0.0, 1.0, 0.01));

interactive(children=(IntSlider(value=2460, description='n_samples', max=5000, min=10, step=50), FloatSlider(v…