# Support Vector Machines

<h2>Table of contents<span class="tocSkip"></span></h2>
<div class="toc"><ul class="toc-item"><li><span><a href="#Preparatory-cells" data-toc-modified-id="Preparatory-cells-1"><span class="toc-item-num">1&nbsp;&nbsp;</span>Preparatory cells</a></span></li><li><span><a href="#Hard-margin-classifier" data-toc-modified-id="Hard-margin-classifier-2"><span class="toc-item-num">2&nbsp;&nbsp;</span>Hard margin classifier</a></span></li><li><span><a href="#Soft-margin-classfier" data-toc-modified-id="Soft-margin-classfier-3"><span class="toc-item-num">3&nbsp;&nbsp;</span>Soft margin classfier</a></span></li><li><span><a href="#The-power-of-kernels" data-toc-modified-id="The-power-of-kernels-4"><span class="toc-item-num">4&nbsp;&nbsp;</span>The power of kernels</a></span><ul class="toc-item"><li><span><a href="#Kernel-construction" data-toc-modified-id="Kernel-construction-4.1"><span class="toc-item-num">4.1&nbsp;&nbsp;</span>Kernel construction</a></span></li><li><span><a href="#Properties-of-some-kernels" data-toc-modified-id="Properties-of-some-kernels-4.2"><span class="toc-item-num">4.2&nbsp;&nbsp;</span>Properties of some <em>kernels</em></a></span></li></ul></li><li><span><a href="#Kernels-in-action" data-toc-modified-id="Kernels-in-action-5"><span class="toc-item-num">5&nbsp;&nbsp;</span>Kernels in action</a></span></li><li><span><a href="#Your-turn!" data-toc-modified-id="Your-turn!-6"><span class="toc-item-num">6&nbsp;&nbsp;</span>Your turn!</a></span></li></ul></div>

## Preparatory cells

In [None]:
# To support both python 2 and python 3
from __future__ import division, print_function, unicode_literals

# Common imports
import numpy as np
import os, sys

import matplotlib.pyplot as plt
%matplotlib inline

# to make this notebook's output stable across runs
np.random.seed(42)

# To plot pretty figures
%matplotlib inline
import matplotlib as mpl
import matplotlib.pyplot as plt
mpl.rc('axes', labelsize=16)
mpl.rc('xtick', labelsize=14)
mpl.rc('ytick', labelsize=14)

# Where to save the figures
PROJECT_ROOT_DIR = "."
CHAPTER_ID = "10_SVM"
IMAGES_PATH = os.path.join(PROJECT_ROOT_DIR, "plots", CHAPTER_ID)

def save_fig(fig_id, tight_layout=True, fig_extension="png", resolution=300):
    """Save the current matplotlib figure as ``IMAGES_PATH/<fig_id>.<fig_extension>``.

    The target directory is created if it does not exist yet.

    Parameters
    ----------
    fig_id : str
        Base name of the output file (also echoed to stdout).
    tight_layout : bool
        If true, ``plt.tight_layout()`` is applied before saving.
    fig_extension : str
        File extension, also passed to ``savefig`` as the output format.
    resolution : int
        Value passed to ``savefig`` as ``dpi``.
    """
    # Make sure the output directory exists before writing into it
    os.makedirs(IMAGES_PATH, exist_ok=True)

    file_name = "".join((fig_id, ".", fig_extension))
    target = os.path.join(IMAGES_PATH, file_name)

    print("Saving figure", fig_id)
    if tight_layout:
        plt.tight_layout()
    plt.savefig(target, format=fig_extension, dpi=resolution)

# Ignore useless warnings (see SciPy issue #5998)
# import warnings
# warnings.filterwarnings(action="ignore", message="^internal gelsd")

In [None]:
if 'google.colab' in sys.modules:
    # Let us define a couple of useful functions
    def plot_clasi(x, t, ws, labels=(), xp=(-1., 1.), thr=(0,), spines='zero', equal=True,
                   join_centers=False, margin=None):
        """
        Plot a two-class data set together with linear decision boundaries.

        Every weight vector ``w`` in ``ws`` is drawn as a unit arrow, together
        with the line ``w[0]*x1 + w[1]*x2 = thr[i]``. When requested, the two
        margin lines ``w[0]*x1 + w[1]*x2 = thr[i] +/- 1`` are drawn dotted.

        Parameters
        ----------
        x : array
            Samples, one per row; the two columns are used as plot coordinates.
        t : array
            Class label of each sample. Samples carrying the largest label are
            drawn as 'C1' (blue), those with the smallest label as 'C2' (red).
        ws : sequence of arrays
            Weight vectors (two components each). May be empty, in which case
            only the data are drawn.
        labels : sequence of str, optional
            Legend entry for each weight vector. If empty, '0', '1', ... are used.
        xp : sequence of two floats, optional
            Abscissas between which each boundary line is drawn.
        thr : sequence, optional
            Threshold of each weight vector; must be as long as ``ws``.
        spines : optional
            If not None, the left/bottom spines are moved to the origin and the
            top/right ones are hidden.
        equal : bool, optional
            Use the same scale on both axes.
        join_centers : bool, optional
            Draw a dotted segment joining the centroids of the two classes.
        margin : bool or sequence of bool, optional
            Whether to draw the margin lines of each weight vector.

        Returns
        -------
        None
        """
        assert len(labels) == len(ws) or len(labels) == 0, \
            "labels must be empty or have one entry per weight vector"
        assert len(ws) == len(thr), "one threshold per weight vector is required"

        if margin is None:
            margin = [False] * len(ws)
        else:
            margin = np.atleast_1d(margin)
        assert len(margin) == len(ws), "one margin flag per weight vector is required"

        if len(labels) == 0:
            labels = np.arange(len(ws)).astype('str')

        fig = plt.figure(figsize=(9, 7))
        ax = fig.add_subplot(111)

        # Split the samples by class: largest label -> C1, smallest label -> C2
        classes = np.unique(t)
        xc1 = x[t == classes.max()]
        xc2 = x[t == classes.min()]

        ax.plot(*xc1.T, 'ob', mfc='None', label='C1')
        ax.plot(*xc2.T, 'or', mfc='None', label='C2')

        # The abscissas are the same for every boundary: convert them only once
        xp = np.array(xp)

        for i, w in enumerate(ws):
            color = 'C{}'.format(i + 2)

            # Compute vector norm
            wnorm = np.sqrt(np.sum(w**2))

            # Weight vector, drawn with unit length and anchored on the
            # boundary at x1 = 0 (where x2 = thr / w[1])
            ax.quiver(0, thr[i]/w[1], w[0]/wnorm, w[1]/wnorm,
                      color=color, scale=10, label=labels[i],
                      zorder=10)

            # Decision boundary, perpendicular to w
            yp = (thr[i] - w[0]*xp)/w[1]
            ax.plot(xp, yp, '-', color=color)

            # Margin lines: boundary shifted so that w.x - thr = -1 and +1
            if margin[i]:
                for marg in [-1, 1]:
                    ax.plot(xp, yp + marg/w[1], ':', color=color)

        if join_centers:
            # Class centroids: average over the samples (rows), not over the
            # two coordinates of each sample
            mu1 = xc1.mean(axis=0)
            mu2 = xc2.mean(axis=0)
            ax.plot([mu1[0], mu2[0]], [mu1[1], mu2[1]], 'o:k', mfc='None', ms=10)

        ax.legend(loc=0, fontsize=12)
        if equal:
            ax.set_aspect('equal')

        if spines is not None:
            for a in ['left', 'bottom']:
                ax.spines[a].set_position('zero')
            for a in ['top', 'right']:
                ax.spines[a].set_visible(False)

        return


    def makew(fitter):
        """
        Return the fitted weights of a linear estimator, transposed.

        A copy of ``fitter.coef_`` is taken; when the estimator was configured
        with ``fit_intercept`` the intercept is prepended as an extra leading
        column. The result is the transpose of that array, so the estimator's
        own attributes are never modified.
        """
        weights = fitter.coef_.copy()

        if fitter.fit_intercept:
            # Put the intercept in front of the feature weights
            bias = fitter.intercept_.reshape(1, 1)
            weights = np.hstack([bias, weights])

        # The weights are deliberately left un-normalised
        return weights.T
    
else:
    from utils import makew, plot_clasi

## Hard margin classifier

$\newcommand{\vv}[1]{\boldsymbol{#1}}$
$\newcommand{\om}[0]{\boldsymbol{\omega}}$
$\newcommand{\norm}[0]{\mathcal{N}}$
$\newcommand{\b}[1]{\mathrm{\mathbf{#1}}}$
$\newcommand{\T}{^\mathrm{T}}$
$\newcommand{\cu}{\mathcal{C}_1}$
$\newcommand{\cd}{\mathcal{C}_2}$

We have seen that for linearly separable problems there exists a linear combination of basis functions, $\phi_i(\boldsymbol{x})$, that creates a decision boundary with exact classification.

However, it is not clear that this is the optimal frontier in terms of generalisation error.

In [None]:
from numpy.random import multivariate_normal

def make_dataset(mu1=[0, 0], mu2=[-4, 1.5], 
                 cov1=[[1, 0.95],[0.95, 1]], cov2=[[1, 0.8],[0.8, 1]], 
                 size1=250, size2=200, random_state=20200922):
    """Sample a two-class data set from two multivariate normal distributions.

    The global NumPy random generator is re-seeded with ``random_state``
    before sampling, and the shapes of the two (transposed) samples are
    printed.

    Parameters
    ----------
    mu1, mu2 : means of class 1 and class 2.
    cov1, cov2 : covariance matrices of class 1 and class 2.
    size1, size2 : number of samples drawn for each class.
    random_state : seed passed to ``np.random.seed``.

    Returns
    -------
    x : samples, class 1 first and then class 2, one sample per row.
    t : labels, +1 for class 1 and -1 for class 2.
    phi : design matrix, ``x`` preceded by a column of ones.
    """
    np.random.seed(random_state)

    # Draw each class (one sample per row)
    samples_c1 = multivariate_normal(mean=mu1, cov=cov1, size=size1)
    samples_c2 = multivariate_normal(mean=mu2, cov=cov2, size=size2)

    print(samples_c1.T.shape, samples_c2.T.shape)

    # Stack both classes and build the matching +1 / -1 targets
    x = np.vstack([samples_c1, samples_c2])
    t = np.concatenate([np.ones(len(samples_c1)), -np.ones(len(samples_c2))])

    # Design matrix: a leading column of ones followed by the features
    phi = np.column_stack([np.ones(len(x)), x])

    return x, t, phi

x, t, phi = make_dataset()

In [None]:
# Let's see what the two classes look like
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

# Samples with t == 1 as blue open circles (C1), t == -1 as red open circles (C2)
ax.plot(*x[t==1].T, 'ob', mfc='None', label='C1')
ax.plot(*x[t==-1].T, 'or', mfc='None', label='C2')

ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.legend(loc='lower right', fontsize=16)
# Same scale on both axes
ax.set_aspect('equal')

Let's see how a **single layer perceptron** works in this case.

**What would you expect?**

In [None]:
from sklearn.linear_model import Perceptron

# phi already contains a column of ones, so the bias is learnt as an ordinary
# weight and the estimator's own intercept is switched off.
perce = Perceptron(max_iter=1000, fit_intercept=False, warm_start=False)

# Fit on the design matrix
perce = perce.fit(phi, t.flatten())

# Fitted weights; entry 0 is the weight of the column of ones
# (it is used, with opposite sign, as the threshold when plotting)
w_perce = makew(perce)

In [None]:
# Plot the perceptron boundary w[1:] . x = -w[0]
# (original note, translated: "we compare with Fisher")
plot_clasi(x, t, [w_perce[1:],], ['Perceptron',], xp=[-4, 2], thr=[-w_perce[0],])

Ok, but what happens if a new starting point is provided?

In [None]:
# Second perceptron, started from a user-supplied initial weight vector
perce2 = Perceptron(max_iter=1000, fit_intercept=False, warm_start=True, random_state=1234)
perce2 = perce2.fit(phi, t.flatten(), coef_init=[1.0, 4.0, 55.0])
w_perce2 = makew(perce2)

print(perce2.coef_)
# print(w_perce2)
# Plot both perceptron solutions on the same figure
plot_clasi(x, t, [w_perce[1:], w_perce2[1:]], ['Perceptron', 'Perceptron2'], xp=[-4, 2], 
           thr=[-w_perce[0], -w_perce2[0]])

We may ask which of the two decision curves is best. And why...

Alternatively, we can use hard-margin classifiers, as we have just seen.

In [None]:
from sklearn.svm import LinearSVC, SVC

# Instantiate a (numerically) hard-margin classifier: the huge C makes any
# margin violation prohibitively expensive. phi already contains the column
# of ones, hence fit_intercept=False.
# max_iter is raised above the library default so that the solver has room to
# converge with such a large C (same setting as the later fits in this notebook).
svc = LinearSVC(loss='hinge', C=1e120, fit_intercept=False, penalty='l2', max_iter=100000)

# Fit
svc.fit(phi, t.flatten())

# Obtain parameter vector for plot
w_hmc = makew(svc)

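We can find the *support vectors* by looking for the data points which are on the margin (or inside it), which means that $t_n\, y(\mathbf{x}_n) \leq 1$. For a perfectly separated training set this is the same as $|y(\mathbf{x}_n)| \leq 1$, which is what the code below checks.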

In [None]:
# Find support vectors: points whose decision value lies on or inside the
# margin, |y(x)| <= 1. The tiny tolerance absorbs floating-point round-off.
isv = np.abs(svc.decision_function(phi)) <= 1.0 + 1e-12

# Decision values of the selected points (shown as the cell output)
svc.decision_function(phi[isv])

In [None]:
# Let us use plot_clasi with the margin=True option.
# The boundary is w[1:] . x = -w[0], w[0] being the weight of the column of ones.
plot_clasi(x, t, [w_hmc[1:],], ['Hard-margin',], xp=[-6, 2], 
           thr=[-w_hmc[0],], margin=True)

# Plot support vectors (large black rings around the selected points)
plt.plot(x[isv, 0], x[isv, 1], 'o', ms=12, mfc='None', mec='k', mew=2)

In [None]:
# Histogram (50 bins) of the decision-function values over the training set
y = svc.decision_function(phi)
plt.hist(y, 50)

But this classifier is sensitive to outliers.
Let us be mean, and add an outlier point.

In [None]:
# Outlier in design matrix: the point (-3, 0.75), preceded by the 1 of the bias column
phi_out = np.array([[1, -3, 0.75],])
# The outlier is labelled as class +1
t_out = np.array([1,])

# Append the outlier to the original design matrix and targets
phi_bad = np.vstack([phi, phi_out])
t_bad = np.concatenate([t, t_out])

In [None]:
# Show the data including the outlier; no decision boundary is drawn (empty ws and thr)
plot_clasi(phi_bad[:, 1:], t_bad, [], [], xp=[-6, 2], thr=[])

<!-- <img src="https://media.giphy.com/media/3o6Ztq5WTvF7LqQBgI/giphy.gif" width="500" height="270" frameBorder="0"></img> -->

<img src="https://media.giphy.com/media/3orif7aLUehOfdmlXy/giphy.gif" width="450" height="270"></img>

In [None]:
# Same (numerically) hard-margin classifier, now fitted on the data with the outlier
svc = LinearSVC(loss='hinge', C=1e120, fit_intercept=False, penalty='l2', max_iter=100000)

# Fit
svc.fit(phi_bad, t_bad)

# Obtain parameter vector for plot
w_hmc_ol = makew(svc)

# Find support vectors: points on or inside the margin, or on the wrong side
# of the boundary, i.e. t * y(x) <= 1 (with a small tolerance for round-off).
# Using |y(x)| instead would miss misclassified points lying beyond the margin.
isv = t_bad * svc.decision_function(phi_bad) <= 1.0 + 1e-5

# Decision values of the selected points (shown as the cell output)
svc.decision_function(phi_bad[isv])

In [None]:
# Squared norm of the parameter vectors fitted with and without the outlier
print(np.sum(w_hmc_ol**2), np.sum(w_hmc**2))

In [None]:
# Boundary and margin of the hard-margin fit on the data with the outlier
plot_clasi(phi_bad[:, 1:], t_bad, [w_hmc_ol[1:],], ['Hard-margin',], xp=[-6, 2], 
           thr=[-w_hmc_ol[0],], margin=True)

# Plot support vectors (large black rings around the selected points)
plt.plot(phi_bad[isv, 1], phi_bad[isv, 2], 'o', ms=12, mfc='None', mec='k', mew=2)

In [None]:
# Histogram (50 bins) of the decision-function values, outlier included
y = svc.decision_function(phi_bad)

plt.hist(y, 50)

## Soft margin classfier

To avoid these issues, we can use a soft margin classifier that allows some misclassification in order to reach a larger margin.

In [None]:
# Same as before, except for the value of C
svc = LinearSVC(loss='hinge', C=1.0, fit_intercept=False, penalty='l2', max_iter=100000)

# Fit
svc.fit(phi_bad, t_bad)

# Obtain parameter vector for plot
w_smc_ol = makew(svc)

# Find support vectors: every point with t * y(x) <= 1, i.e. on the margin,
# inside it, or misclassified (small tolerance for round-off).
# Using |y(x)| instead would miss misclassified points lying beyond the margin.
isv = t_bad * svc.decision_function(phi_bad) <= 1.0 + 1e-5

In [None]:
# plot_clasi(x, t, [w_hmc[1:], w_perce2[1:]], ['Hard-margin', 'Perceptron'], xp=[-6, 2], 
#            thr=[-w_hmc[0], -w_perce2[0]], margin=[True, False])
# Boundary and margin of the soft-margin fit on the data with the outlier
plot_clasi(phi_bad[:, 1:], t_bad, [w_smc_ol[1:],], ['Soft-margin',], xp=[-6, 2], 
           thr=[-w_smc_ol[0],], margin=True)

# Plot support vectors (large black rings around the selected points)
plt.plot(phi_bad[isv, 1], phi_bad[isv, 2], 'o', ms=12, mfc='None', mec='k', mew=2)

Where, of course, we should find the value of the penalisation `C` by cross validation.

This should also work for non-separable sets.

In [None]:
# New data set with the second class centred at (-1, 1) instead of the default (-4, 1.5)
x, t, phi = make_dataset(mu2=[-1,1])

In [None]:
# Let's see what the two classes look like
fig = plt.figure(figsize=(8, 8))
ax = fig.add_subplot(111)

# Samples with t == 1 as blue open circles (C1), t == -1 as red open circles (C2)
ax.plot(*x[t==1].T, 'ob', mfc='None', label='C1')
ax.plot(*x[t==-1].T, 'or', mfc='None', label='C2')

ax.set_xlabel('$x_1$')
ax.set_ylabel('$x_2$')
ax.legend(loc='lower right', fontsize=16)
# Same scale on both axes
ax.set_aspect('equal')

In [None]:
# Instantiate the soft-margin classifier (hinge loss, C = 1); phi already
# contains the column of ones, hence fit_intercept=False
# hmc = LinearSVC(C=np.infty, fit_intercept=True, max_iter=1000)
smc = LinearSVC(C=1.0, fit_intercept=False, max_iter=10000, loss='hinge')
# smc = SVC(kernel='linear', degree=1, C=0.005, max_iter=1000)

# Fit
# hmc.fit(phi, t.flatten())
smc.fit(phi, t.flatten())

# Parameter vector for the plot
# w_hmc = makew(hmc).T
w_smc = makew(smc)

In [None]:
# Find support vectors: every point with t * y(x) <= 1, i.e. on the margin,
# inside it, or misclassified. With overlapping classes |y(x)| <= 1 is not
# enough: it misses misclassified points that lie beyond the margin.
isv = t * smc.decision_function(phi) <= 1

In [None]:
# Boundary and margin of the soft-margin classifier
plot_clasi(x, t, [w_smc[1:],], ['SMC', ], xp=[-4, 2], thr=[-w_smc[0],], margin=True)

# Mark the support vectors with large black rings
plt.plot(x[isv, 0], x[isv, 1], 'o', ms=12, mfc='None', mec='k', mew=2)
# plt.plot(smc.support_vectors_[:, 0], smc.support_vectors_[:, 1], 'o', ms=12, mfc='None', mec='k', mew=2)

<font size=5>Your turn!</font>

* Find the optimal value of the parameter `C` using cross-validation.
* Using the `plot_clasi` function, make a plot like the one above for the optimal value.

In [None]:
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
import scipy.stats as st

# Using randomised search: C is drawn log-uniformly between 1e-3 and 10.
# A fixed random_state makes the sampled candidate values of C reproducible.
parameters = {'C': st.loguniform(a=1e-3, b=10)}
cvs = RandomizedSearchCV(smc, parameters, cv=10, n_iter=50, random_state=42)
cvs.fit(phi, t.flatten())

# Using grid search
# parameters = {'C': np.logspace(-3, 0, 10)}
# cvs = GridSearchCV(smc, parameters, cv=5)
# cvs.fit(phi, t.flatten())

In [None]:
# Illustration: draw 50 values from the same log-uniform distribution used for C
dist = st.loguniform(a=1e-3, b=10)
dist.rvs(50)

In [None]:
# Best value of C found by the search
print(cvs.best_params_)

# Fit best model on the full data set
smc = cvs.best_estimator_
smc.fit(phi, t.flatten())

# Parameter vector for the plot
w_smc = makew(smc)

# Identify support vectors: every point with t * y(x) <= 1, i.e. on the
# margin, inside it, or misclassified. |y(x)| <= 1 would miss misclassified
# points that lie beyond the margin.
isv = t * smc.decision_function(phi) <= 1

In [None]:
# Boundary and margin of the classifier refitted with the selected C
plot_clasi(x, t, [w_smc[1:],], ['SMC', ], xp=[-4, 2], thr=[-w_smc[0],], margin=True)

# Mark the support vectors with large black rings
plt.plot(x[isv, 0], x[isv, 1], 'o', ms=12, mfc='None', mec='k', mew=2)

## The power of kernels

We have seen that predictions from a SVM can be expressed as:

$$
y(x_k) = \sum_{i=1}^N a_i\, t_i\, k(x_i, x_k)
$$

where the $a_i$, with $i = \{1, \ldots, N\}$ are the Lagrange multipliers that allow maximizing the margin. But more importantly, they can be shown to follow these conditions in the soft-margin case.

$$
\begin{array}{c}
a_i \geq 0\\
t_i y(\boldsymbol{x}_i) -1 + \xi_i \geq 0\\
a_i (t_i y(\boldsymbol{x}_i) -1 + \xi_i) = 0\\
\end{array}
$$
and where 
 $$
 \boxed{k(x, x^\prime) = \boldsymbol{\phi}(x)\T \boldsymbol{\phi}(x^\prime) = \sum_{i=1}^M \phi_i(x)\, \phi_i(x^\prime)}\;\;.$$

Therefore, it can be seen that the only terms contributing to the prediction of a new point $y(\boldsymbol{x_k})$ are the points which satisfy $t_i y(\boldsymbol{x}_i) = 1 - \xi_i$ . These are the **support vectors**.

The soft-margin classifier we have been using is therefore a **sparse kernel machine**: only the support vectors enter the prediction.

### Kernel construction

To build a valid *kernel* we can define a set of base functions $\phi_i$, for $i = \{1, \ldots, M\}$ and then perform the product as above.
 
However, the true versatility of the technique appears when one directly defines a *kernel* function. But in this case, we must ensure that it is a valid function, that is, that it corresponds to a product of vectors of base functions, $\boldsymbol{\phi}(x)$.
 
A necessary and sufficient condition is that the Gram matrix, $\mathbf{K}$, whose element $n, m$ is $k(\mathbf{x}_n, \mathbf{x}_m)$, be positive semi-definite for every possible choice of the set of points $\{\mathbf{x}_n\}$. This means that it must be fulfilled that

$$
\mathbf{v}\T \mathbf{K} \mathbf{v} \geq 0\;\;
$$
for any vector $\mathbf{v}$. This is equivalent to asking that the eigenvalues of the Gram matrix be greater than or equal to zero.

**Note**: this is not equivalent to saying that all the elements of the matrix are positive. In the book by Bishop you can find the case of this innocent-looking matrix
$$
\begin{pmatrix}
1 & 2\\
3 & 4\\
\end{pmatrix}\;\;,
$$

whose eigenvalues are $\lambda_1 = 5.37$ and $\lambda_2 = -0.37$ and is therefore not positive semi-definite.

One powerful way is to use *basic* kernels to build *more sophisticated* kernels, using the following properties (taken from Bishop):

<!-- ![title]("images/kernelconstruction.png") -->
<img src='images/kernelconstruction.png'>

### Properties of some *kernels*

$\newcommand{\k}[0]{k(\mathbf{x}, \mathbf{x}^\prime)}$
$\newcommand{\x}[0]{\mathbf{x}}$
We can see that the kernel $\k = (\mathbf{x}\T \mathbf{x}^\prime)^2$ has degree-two terms. In two dimensions, $\mathbf{x} = (x_1, x_2)$

$$
\begin{array}{ll}
(\mathbf{x}\T \mathbf{z})^2 &= (x_1 z_1 + x_2 z_2)^2\\
                            &= x_1^2 z_1^2 + 2 x_1 z_1 x_2 z_2 + x_2^2 z_2^2\\
                            &= (x_1^2, \sqrt{2}x_1 x_2, x_2^2) (z_1^2, \sqrt{2}z_1 z_2, z_2^2)\T\\
                            &= \boldsymbol{\phi}(\mathbf{x})\T \boldsymbol{\phi}(\mathbf{z})\;\;.
\end{array}
$$

In a similar way, $\k = (\mathbf{x}\T \mathbf{x}^\prime)^M$ has all degree-$M$ terms. 

Besides $\k = (\mathbf{x}\T \mathbf{x}^\prime + c)^2$ corresponds to a vector of base functions that contains constant and linear terms, in addition to those of order two.

One of the more interesting *kernels* is called the "Gaussian kernel" or Radial Basis Function (RBF):

$$
\k = \exp{\left(-\gamma ||\mathbf{x} - \mathbf{x}^\prime||^2\right)}\;\;,
$$

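which has a Gaussian shape, but is not normalised as a probability density. The parameter $\gamma$ sets the inverse squared length scale of the kernel ($\gamma = 1/2\sigma^2$ for a Gaussian of width $\sigma$): the larger $\gamma$, the shorter the distance over which two points are considered similar.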

An interesting feature of this *kernel* is that it corresponds to a vector of base functions of infinite dimensions.

<img src="https://media.giphy.com/media/jNYUeDwoUoloEswJm8/giphy.gif" width="350" height="270"></img>

<font size=4>Very important!</font> With kernels such as the Gaussian one, it is crucial that the features have the same scale. It is therefore essential to use `StandardScaler` or similar before analysing the data.

## Kernels in action

Let's see how this *kernel* works with the dataset we've just prepared, and try to see what role the $\gamma$ and $C$ hyperparameters play.

In [None]:
if 'google.colab' in sys.modules:

    def plot_svm(svc, x, t, colorbar=False, vmin=None, vmax=None):
        """
        Plot the decision function of a fitted SVM over a two-dimensional data set.

        The negated decision function is drawn as a colour map on a 200 x 200
        grid covering the data (plus one unit on every side), together with
        the decision boundary (level 0, solid) and the margins (levels -1 and
        +1, dashed). The samples and the support vectors are drawn on top.

        Parameters
        ----------
        svc : fitted estimator
            Must provide ``decision_function`` and ``support_vectors_``.
        x : array
            Samples, one per row; columns 0 and 1 are the plot coordinates.
        t : array
            Class label of each sample. Samples carrying the largest label are
            drawn as 'C1' (blue), those with the smallest label as 'C2' (red).
        colorbar : bool, optional
            If true, add a colour bar for the colour map.
        vmin, vmax : float, optional
            Limits of the colour scale; by default they are chosen by matplotlib.

        Returns
        -------
        None
        """
        plt.figure(figsize=(9, 7))

        xx, yy = np.meshgrid(np.linspace(x[:, 0].min()-1, x[:, 0].max()+1, 200),
                             np.linspace(x[:, 1].min()-1, x[:, 1].max()+1, 200))

        # Evaluate the decision function on every grid point
        Z = svc.decision_function(np.c_[xx.ravel(), yy.ravel()])
        Z = Z.reshape(xx.shape)

        # Colour map of the (negated) decision function
        mesh = plt.pcolormesh(xx, yy, -Z, cmap=plt.cm.RdBu_r, vmin=vmin, vmax=vmax)
        if colorbar:
            plt.colorbar(mesh)

        # Decision boundary and margins. The levels are given as explicit
        # lists: a bare integer would be read as a *number* of levels.
        plt.contour(xx, yy, -Z, levels=[0], colors='0.5', zorder=1)
        plt.contour(xx, yy, -Z, levels=[-1, 1], colors='0.25', linestyles='dashed', zorder=1)

        # Split the samples by class: largest label -> C1, smallest label -> C2
        labels = np.ravel(t)
        xc1 = x[labels == np.unique(labels).max()]
        xc2 = x[labels == np.unique(labels).min()]

        plt.plot(*xc1.T, 'ob', mfc='None', label='C1')
        plt.plot(*xc2.T, 'or', mfc='None', label='C2')

        # Mark the support vectors with large black rings
        xsv = svc.support_vectors_
        plt.plot(xsv[:, 0], xsv[:, 1], 'o', ms=12, mfc='None', mec='k', mew=2)

        plt.xticks(())
        plt.yticks(())
        plt.axis('tight')

        return
    
else:
    from utils import plot_svm

In [None]:
# Regenerate the overlapping data set; the samples are stored in X (upper case) here
X, t, phi = make_dataset(mu2=[-1,1])

In [None]:
# Plot the data set just generated (X, not the x left over from earlier cells);
# no decision boundary is drawn (empty ws, labels, xp and thr)
plot_clasi(X, t, [], [], [], [])

In [None]:
# Standardise the features; X is overwritten with its scaled version
from sklearn.preprocessing import StandardScaler
X = StandardScaler().fit_transform(X)

In [None]:
from sklearn.svm import SVC

# Instantiate class
svc_rbf = SVC(kernel='rbf', C=100.0, gamma=10.0)
# Fit on the standardised features X (not on the raw x): the scaling step
# would otherwise be silently ignored
svc_rbf.fit(X, t.flatten())

plot_svm(svc_rbf, X, t)

# svc_poly = SVC(kernel='poly', C=10000.0, degree=3)
# svc_poly.fit(X, t.flatten())

# plot_svm(svc_poly, X, t, colorbar=True,)# vmin=-200, vmax=200)

Very nice, but let's try something that we could not have done otherwise.

In [None]:
# Generate the "moons" toy data set: 200 samples, noise 0.15, fixed seed
from sklearn.datasets import make_moons
X, t = make_moons(n_samples=200, noise=0.15, random_state=42)

In [None]:
# Original (unscaled) data; no decision boundary is drawn (empty ws, labels, xp and thr)
plot_clasi(X, t, [], [], [], [], join_centers=False)

In [None]:
# Standardise the features; X is overwritten with its scaled version
X = StandardScaler().fit_transform(X)
# Scaled data; no decision boundary is drawn
plot_clasi(X, t, [], [], [], [], join_centers=False)

In [None]:
# RBF-kernel SVM on the scaled moons data
svc_rbf = SVC(kernel='rbf', C=1000.0, gamma=0.1)
svc_rbf.fit(X, t.flatten())

In [None]:
# import importlib
# import utils
# importlib.reload(utils)
# from utils import plot_svm

In [None]:
# Decision function of the RBF SVM on the moons data, with a colour bar requested.
# NOTE(review): assumes the plot_svm in scope accepts a `colorbar` keyword — confirm
plot_svm(svc_rbf, X, t, colorbar=1)

In [None]:
# Polynomial-kernel SVM (degree 5, coef0 = 10) on the scaled moons data
svc_poly = SVC(kernel='poly', degree=5, coef0=10.0, C=1.0)
svc_poly.fit(X, t.flatten())

# NOTE(review): vmin/vmax presumably limit the colour scale of the decision
# function, and plot_svm must accept these keywords — confirm
plot_svm(svc_poly, X, t, colorbar=True, vmin=-100, vmax=100)

## Your turn!

* Use cross-validation to find the optimal hyperparameters for the SVC with an `rbf` kernel on the `make_moons` dataset. Optimise both parameters.
* Explore the possibilities of SVMs on the Iris dataset, first using two classes (of your choice), and then for multi-class classification.

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV

# 20 x 20 logarithmic grid for C and gamma, both running from 1e-2 to 1e3
params = {'C': np.logspace(-2, 3, 20), 'gamma': np.logspace(-2, 3, 20)}

# 5-fold cross-validated grid search over both hyperparameters, scored by accuracy
gscv = GridSearchCV(svc_rbf, param_grid=params, cv=5, scoring='accuracy', n_jobs=-1)
gscv.fit(X, t)

In [None]:
# Best hyperparameters and their mean cross-validated score
print(gscv.best_params_, gscv.best_score_)
# NOTE(review): assumes the plot_svm in scope accepts a `colorbar` keyword — confirm
plot_svm(gscv.best_estimator_, X, t, colorbar=True)

***

In [None]:
from sklearn import datasets
iris = datasets.load_iris()

# Keep only the samples whose target is greater than 0
keep = iris.target > 0
X = iris.data[keep]
t = iris.target[keep]

# One scatter plot per pair of feature columns, coloured by class
for pair in ([0, 1], [0, 2], [2, 3]):
    col_x, col_y = pair
    plt.scatter(X[:, col_x], X[:, col_y], c=t, edgecolors='k', cmap=plt.cm.Paired)
    plt.xlabel(iris.feature_names[col_x])
    plt.ylabel(iris.feature_names[col_y])
    plt.show()