# Classical methods in machine learning

In [None]:
import numpy as np
import pandas as pd

import seaborn as sns
import matplotlib.cm as cm
import matplotlib.pyplot as plt
plt.rcParams['text.usetex'] = True

# Scikit-learn, tensorflow, torch, etc.
#import torch
#import tensorflow as tf

from sklearn.datasets import make_regression, make_classification, \
                             make_blobs, make_moons, make_circles
from sklearn.preprocessing import StandardScaler, Normalizer
# ...
# ...

In [None]:
# Initialize seaborn with custom settings
# Facecolor values from S. Conradi @S_Conradi/@profConradi
custom_settings = {
    # Figure and axes share one background colour
    'figure.facecolor': '#f4f0e8',
    'axes.facecolor': '#f4f0e8',
    # Grey axes frame
    'axes.edgecolor': '0.7',
    'axes.linewidth' : '2',
    # No grid lines by default; cells that want a grid call ax.grid(True, ls=...)
    'grid.color': '0.7',
    'grid.linestyle': 'none',
    'grid.alpha': 0.6,
}
# Apply the seaborn theme with the 'deep' palette plus the overrides above
sns.set_theme(palette=sns.color_palette('deep', as_cmap=False),
              rc=custom_settings)
# Overrides the `text.usetex = True` assigned in the import cell, so math in
# labels is rendered without an external LaTeX toolchain
plt.rcParams['text.usetex'] = False

# 2. Training ("fitting") a model

## 2.1. Regression

<p style="text-align:center; font-size:20px;">
  <b>Data and label -> Model -> Continuous value</b>
</p>

### 2.1.1. Generated datasets

In [None]:
# Synthetic regression problem: 5000 samples with 10 features (all of them
# informative) and a single target; the seed keeps the data reproducible.
X, y = make_regression(n_samples=5000, n_features=10, n_informative=10,
                       n_targets=1, random_state=57)
# Wrap the feature matrix so columns can be addressed as X[0], X[1], ...
X = pd.DataFrame(data=X)

In [None]:
# Display the generated feature table
X

In [None]:
# Line plot of the target values in sample order
fig, ax = plt.subplots(figsize=(20, 5), dpi=120)
ax.plot(y, lw=2, color='indianred')

# Title only; x ticks are hidden
ax.set_xticks([])
ax.set_title('$y_{\\text{values}}$', fontweight='bold', fontsize=30)

plt.show()

In [None]:
# One panel per feature: feature value on x, target on y
nr, nc = 2, 5
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(nc*5, nr*5), dpi=120)

for i, ax in enumerate(axes.ravel()):
    ax.scatter(X[i], y, alpha=0.6, color='indianred')
    # Axis names (features are numbered from 1 in the labels)
    ax.set_xlabel(f'$X_{{{i+1}}}$', fontsize=30, fontweight='bold')
    ax.set_ylabel('$y$', fontsize=30, fontweight='bold')
    # Hide tick marks on both axes
    ax.set_xticks([])
    ax.set_yticks([])

plt.show()

#### Pull up the regression methods...

There's lots of them...: https://en.wikipedia.org/wiki/Outline_of_machine_learning#Regression_analysis

#### 1. Split the data set to a train and a test set

One of the most fundamental principles of machine learning is splitting the dataset into at least two subsets&mdash;a "training set" and a "test set"&mdash;and, in more advanced workflows, a third subset called a "validation set." The main idea is to fit (or "train") the model on the training set, then evaluate its performance on the test set. 

Why do we do this? When creating abstract models, our primary goal is for them to generalize well to unseen data. A model should not only describe known observations but also predict future, unseen events. In statistics, we "simulate" novel observations by designating part of the dataset as "unseen"&mdash;that is, the test set. If a trained model performs well on the test set, we can be confident it will generalize effectively to new data.

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
# Hold out 33% of the samples as the test set; the fixed seed makes the
# split reproducible
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.33,
    random_state=57
)

In [None]:
# Same per-feature grid as before, now coloured by train/test membership
nr, nc = 2, 5
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(nc*5, nr*5), dpi=120)

# (features, target, legend label, colour) -- drawn in this order
subsets = (
    (X_train, y_train, 'Train set', 'indianred'),
    (X_test, y_test, 'Test set', 'cornflowerblue'),
)

for i, ax in enumerate(axes.ravel()):
    for X_sub, y_sub, subset_label, subset_color in subsets:
        ax.scatter(X_sub[i], y_sub, label=subset_label,
                   color=subset_color, s=4**2, alpha=0.6)
    ax.set_xlabel(f'$X_{{{i+1}}}$', fontsize=30, fontweight='bold')
    ax.set_ylabel('$y$', fontsize=30, fontweight='bold')
    ax.set_xticks([])
    ax.set_yticks([])
    ax.legend(loc='lower right', fontsize=15)

plt.show()

#### 2. Train and evaluate some linear models

In [None]:
from sklearn.linear_model import ARDRegression, BayesianRidge, ElasticNet, \
                                 Lasso, LinearRegression, Ridge, SGDRegressor
from sklearn.svm import SVR
from sklearn.metrics import r2_score

In [None]:
def regression(model, *, X_train, y_train, X_test, y_test):
    """
    Fit a default-configured estimator and report its test-set R^2 score.

    Parameters
    ----------
    model : class
        Estimator class (not an instance). It is instantiated without
        arguments and must provide ``fit`` and ``predict``.
    X_train, y_train
        Data passed to ``fit``.
    X_test, y_test
        Held-out data; ``y_test`` is compared with ``predict(X_test)``.

    Returns
    -------
    score
        The R^2 value that is also printed, so callers can collect and
        compare the scores of several models programmatically.
    """
    reg = model()
    reg.fit(X_train, y_train)
    y_pred = reg.predict(X_test)
    # Compute once; reused for both the printed report and the return value
    score = r2_score(y_test, y_pred)
    print(f"Score for {model.__name__} : {score:.4f}")
    return score

In [None]:
# Estimator classes (not instances): regression() instantiates each one
# with its default arguments, fits it and prints the test-set R^2 score
models = [
    ARDRegression, BayesianRidge, ElasticNet,
    Lasso, LinearRegression, Ridge, SGDRegressor, SVR
]
for model in models:
    regression(model,
               X_train=X_train, y_train=y_train,
               X_test=X_test, y_test=y_test)

#### +1. Fun fact: "regression" and "linear regression" are not necessarily "linear"

In [None]:
# Generate the oddly specific toy dataset that is shown as an example
# almost every time this fun fact comes up
x_1 = 1/4     # weight of the quadratic term
x_2 = -3.4    # weight of the linear term
x_3 = 90      # weight of the cos^3 term
x_4 = 2       # constant offset
X = np.linspace(-40, 40, num=500)
# Individual contributions, kept separate for readability
y_sq = x_1 * X**2
y_tr = x_3 * np.cos(X)**3
y_li = x_2 * X + x_4
# Target = quadratic + linear (with offset) + trigonometric part
y = y_sq + y_li + y_tr

In [None]:
# Scatter plot of the sampled curve, titled with the generating equation
fig, ax = plt.subplots(figsize=(9, 5), dpi=120)
ax.grid(True, ls='--', color='.7', alpha=0.4)

ax.scatter(X, y, label='Original data',
           s=7**2, color='0.4', ec='none', alpha=0.7)

# Two-line title: caption, then the equation with the true coefficients
title = (
    'Equation of sampled polynomial: \n'
    f'$({x_1:.3f}\\,x^2) + ({x_2:.3f}\\,x) + ({x_3:.3f}\\,\\cos^3(x)) + ({x_4:.3f})$'
)
ax.set_title(title, loc='left', fontsize=16, fontweight='bold')

plt.show()

In [None]:
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import FunctionTransformer

Using `sklearn` we can sequentially transform a dataset and then fit an estimator on it. This sequential list of transformations, finished by a single, final estimator, is referred to as a "pipeline" in `sklearn`. The pipeline below, defined as

```python
pipeline = Pipeline([("polynomial_variation", FunctionTransformer(poly2_reg)),
                     ("linear_regression", LinearRegression())])
```

contains an arbitrary transformation defined by the `poly2_reg` function (this can be anything that transforms `X` in any way), whose output is then fitted using `sklearn`'s built-in linear estimator, `LinearRegression`, which implements ordinary least squares linear regression.

In [None]:
def poly2_reg(X):
    """
    Build the feature columns for a model of the form
       ```A * X^2 + B * cos^3(X) + C * X + D```

    The equation itself is not evaluated here: the three basis terms are
    only stacked horizontally, in the order ``cos^3(X), X, X^2``. The
    coefficients and the constant are left to whatever estimator is fitted
    on the returned array.
    """
    cos_cubed = np.cos(X)**3
    squared = X**2
    return np.hstack((cos_cubed, X, squared))

In [None]:
# Two-step pipeline: expand x with poly2_reg, then run ordinary least
# squares on the expanded features
pipeline = Pipeline([
    ("polynomial_variation", FunctionTransformer(poly2_reg)),
    ("linear_regression", LinearRegression()),
])
# X is 1-D here; turn it into a single column before handing it to the pipeline
pipeline.fit(X.reshape(-1, 1), y)
# coef_ follows the column order produced by poly2_reg (cos^3(x), x, x^2),
# hence the reversed unpacking into c, b, a
fitted_linreg = pipeline.named_steps["linear_regression"]
c, b, a = fitted_linreg.coef_
d = fitted_linreg.intercept_

In [None]:
# a, b, c were unpacked from coef_ in reverse column order, so they belong to
# the x^2, x and cos^3(x) terms respectively; d is the intercept
print(f"Coefficients: x_1: {a:.2f}, x_2: {b:.2f}, x_3: {c:.2f}, const.: {d:.2f}")

In [None]:
# Overlay the pipeline's prediction on the sampled data
fig, ax = plt.subplots(figsize=(9, 5), dpi=120)
ax.grid(True, ls='--', color='.7', alpha=0.4)

ax.scatter(X, y, label='Original data',
           s=7**2, color='0.5', ec='none', alpha=0.7)
y_fit = pipeline.predict(X[:, np.newaxis])
ax.plot(X, y_fit, label='Fitted model',
        lw=3, ls=(0, (2, 1)), color='tab:red', alpha=0.8)

# Two-line title: caption, then the equation with the fitted coefficients
title = (
    'Equation of fitted polynomial: \n'
    f'$({a:.3f}\\,x^2) + ({b:.3f}\\,x) + ({c:.3f}\\,\\cos^3(x)) + ({d:.3f})$'
)
ax.set_title(title, loc='left', fontsize=15, fontweight='bold')
ax.legend(loc='best', fontsize=15,
          framealpha=0.7, facecolor='#f4f0e8', edgecolor='none')

plt.show()

### 2.1.2. Finally, some real data...

In this section, we will use the [Communities and Crime Data Set](https://archive.ics.uci.edu/ml/datasets/Communities+and+Crime) from the UCI Machine Learning Repository, which contains detailed crime statistics from various communities across the US. Using `LinearRegression` and `Lasso` regression models, we will try to identify which features contribute the most to the overall crime rate.


#### 0. Load the data set

Before we can even load the data set, we first need to understand its structure. The data is provided as a CSV file named `communities.data`, which we can easily load using `pandas` directly from the UCI archive via a URL. However, the feature names and their descriptions are not included in the CSV file, as they are shipped separately in a text file called `communities.names` along with additional metadata. To construct a proper `DataFrame` from the data itself, we need to load and parse this file to extract the relevant feature names.

In [None]:
import re
from urllib import request

In [None]:
# Feature names start with `@attribute`, followed by the feature name,
# then ending with the type of the feature values (numeric/string/etc.)
archive_url = 'https://web.archive.org/web/20230321133656/'
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.names'
try:
    with request.urlopen(url) as f:
        raw = f.read()
except OSError:
    # urllib's URLError/HTTPError derive from OSError. The live UCI link
    # failed, so fall back to the archived snapshot of the same file.
    # NOTE(review): assumes the snapshot serves the file unmodified -- verify.
    with request.urlopen(archive_url + url) as f:
        raw = f.read()
lines = raw.decode('utf-8').splitlines()
# Strip the `@attribute` keyword and keep the first remaining token (the name)
features = [re.sub(r'^@attribute\s+', '', line).split()[0]
            for line in lines if line.startswith('@attribute')]



In [None]:
# Missing values are marked with an `?` in the dataset
archive_url = 'https://web.archive.org/web/20240810114503/'
url = 'https://archive.ics.uci.edu/ml/machine-learning-databases/communities/communities.data'
# The file has no header row: column names come from the parsed `features`
read_options = dict(sep=',', names=features, na_values=['?'])
try:
    df = pd.read_csv(url, **read_options)
except OSError:
    # Download errors raised by urllib derive from OSError. The live UCI
    # link failed, so fall back to the archived snapshot of the same file.
    # NOTE(review): assumes the snapshot serves the file unmodified -- verify.
    df = pd.read_csv(archive_url + url, **read_options)

In [None]:
# Show the first and the last rows of the raw table, one after the other
display(df.head(), df.tail())

#### 1. Preprocess the dataset

#### Handle missing/ID labels

While missing values in meaningful features should be filled appropriately, columns representing ID-like variables can be deleted. Location and violent crime rates do correlate in real life, but the idea of a causal relationship between location and crime rates can be set aside for now. The description of each feature can be found in the `communities.names` file.

#### ID-like columns
The first 4 columns (`state`, `county`, `community`, `communityname`) can be deleted, because they are not meaningful features. The `state` and `county` columns are categorical variables, while the `community` column is a unique identifier for each community. Finally, the `communityname` column is a string representation of the community name, all of which are not useful for our analysis.

In [None]:
# Drop the four leading ID-like columns by name. `errors='ignore'` keeps the
# cell re-runnable: selecting `features[4:]` instead raises a KeyError once
# a later cell has removed further columns from `df`.
df = df.drop(columns=features[:4], errors='ignore')

The column `fold` is a debug feature, which is just a remnant from a cross-validation applied during creation of the data set; this can be also discarded.

In [None]:
# Keep everything from the sixth parsed feature name onwards; per the note
# above, the column skipped here (features[4]) is `fold`
df = df[features[5:]]

#### Features with missing values

According to the feature descriptions, all remaining columns are in a decimal format and scaled into the interval of $\left[ 0, 1 \right]$. The only exception is the feature `LemasGangUnitDeploy`, which is actually an ordinal with values $0.0$, $0.5$ and $1.0$. We can still however handle it as a decimal feature.

There is a table in the `communities.names` metadata file which summarizes the basic statistical attributes (mean, median, standard deviation, etc.) of each feature in the dataset. According to this table, any feature with missing entries has exactly $1675$ missing values. (There is only one exception: the column `OtherPerCap`, where only $1$ value is missing.) It is entirely logical to assume that, in this case, the missing features are always missing from the same lines. If this hypothesis is true, we can test it by visualizing the missing values on a matrix plot. If we plot features on the $y$-axis, we should see only horizontal lines (interrupted by vertical gaps) in the dataset, instead of individual points scattered throughout.

In [None]:
# Missing-value map of the raw table: one row per feature, one column per record
fig, ax = plt.subplots(figsize=(30, 30), dpi=300)
ax.set_aspect('equal')
ax.grid(False)

# Transposed boolean mask so that features run along the y-axis
missing_mask = df.isna().T
ax.imshow(missing_mask, interpolation='none')

ax.set_ylabel('Features', fontweight='bold')
ax.set_xlabel('Rows', fontweight='bold')
ax.set_yticks([])
ax.set_xticks([])

plt.show()

These are indeed "horizontal lines interrupted by vertical gaps." However, these features are missing most of their values. In this case, we should consider simply dropping these features from the model, since filling them with artificial values could reasonably distort their impact on the model. I will try this method for this dataset.

In [None]:
# Keep only the columns in which at least half of the rows hold a value
min_non_missing = int(0.5 * len(df))
df_n = df.dropna(axis=1, thresh=min_non_missing)

# Replace whatever is still missing with the mean of its column
column_means = df_n.mean()
df_n = df_n.fillna(column_means)

In [None]:
# Missing-value map of the cleaned table (same layout as the raw map)
fig, ax = plt.subplots(figsize=(30, 30), dpi=300)
ax.set_aspect('equal')
ax.grid(False)

# Transposed boolean mask so that features run along the y-axis
missing_mask_clean = df_n.isna().T
ax.imshow(missing_mask_clean, interpolation='none')

ax.set_ylabel('Features', fontweight='bold')
ax.set_xlabel('Rows', fontweight='bold')
ax.set_yticks([])
ax.set_xticks([])

plt.show()

#### Scale dataset

In [None]:
# Create the X and y datasets:
# every column except the last is a feature, the last column is the target
X = df_n[df_n.columns[:-1]]
y = df_n[df_n.columns[-1]]
# Scale the dataset
# NOTE(review): the scaler is fitted on the full table, i.e. before the
# train/test and cross-validation splits below, so held-out rows contribute
# to the scaling statistics. Consider scaling inside a pipeline instead.
X = StandardScaler().fit_transform(X)

In [None]:
# Target column in row order
fig, ax = plt.subplots(figsize=(10, 2), dpi=200)
ax.set_title('Violent crime rate per pop (normalized to [0, 1])', loc='left')
ax.plot(y, lw=2, color='indianred')

plt.show()

#### 2. Fit linear regression using 5-fold CV

<img width="800px" src="./images/5foldcv.png" style="display:block; margin:auto;"/>

5-fold cross-validation is a widely utilized technique used to assess the performance of a machine learning model. It helps to estimate how well a model generalizes to unseen data by repeatedly training and testing on different subsets of the data. The process involves the following steps:

1. **Split** the dataset into 5 equal "folds" (subsets).  
2. **Loop** over the 5 folds:
   - Use 4 folds for **training**.
   - Use the remaining 1 fold for **validation/testing**.
3. **Record** the validation error (e.g. MSE) for each fold.
4. **Average** the 5 validation errors, which will provide a robust estimate of model performance.

In [None]:
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score

In [None]:
# Number of folds
folds = 5
# Invoke the KFold class from sklearn for CV tests
# (rows are shuffled before splitting; the seed keeps the folds reproducible)
cv = KFold(n_splits=folds, shuffle=True, random_state=42)
# The model we use is linear regression
model = LinearRegression()

In [None]:
# Test R^2 score
# Reference: https://scikit-learn.org/stable/modules/model_evaluation.html#scoring-parameter
scores = cross_val_score(model, X, y, cv=cv, scoring='r2')

# Report: header, the per-fold scores, then their mean and spread
print('KFOLD SCORES:\n'
      '----------------')
print(scores)
print(f'Mean of scores : {np.mean(scores):.4f}')
print(f'Std of scores : {np.std(scores):.4f}')

#### 3. Fit Lasso regression using 5-fold CV

In [None]:
from sklearn.linear_model import Lasso

from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GridSearchCV

In [None]:
# Use just part of the full dataset for training with 5-fold CV
# Use the remaining values as a test dataset
# (20% of the rows are held out; the seed makes the split reproducible)
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    random_state=42
)

In [None]:
# 5-fold search is needed
folds = 5
cv = KFold(n_splits=folds, shuffle=True, random_state=42)
# Lasso estimator with scaling the data
# NOTE(review): X was already standardized in the "Scale dataset" cell, so
# this step standardizes the data a second time (now per training fold).
model = make_pipeline(StandardScaler(), Lasso(random_state=None))
# Parameters to explore:
# alpha and max_iter
# (keys use the `<step name>__<parameter>` form to reach into the pipeline)
param_grid = {
    'lasso__alpha' : np.logspace(-5, -1, 50),
    'lasso__max_iter' : np.linspace(10, 1000, 10, dtype=int)
}
# Grid search cross-validation
# No `scoring` is given -- presumably the estimator's own score is used; verify.
clf = GridSearchCV(estimator=model,
                   param_grid=param_grid,
                   cv=cv,
                   n_jobs=-1)

In [None]:
# Run the grid search on the training split and keep the best pipeline found
best_model = clf.fit(X_train, y_train).best_estimator_

In [None]:
# Show the selected Lasso step, then score the pipeline on the held-out split
best_lasso = best_model.named_steps['lasso']
print(f'Best model : {best_lasso}')
y_pred = best_model.predict(X_test)
print(f"Score for Lasso : {r2_score(y_test, y_pred):.4f}")

In [None]:
# Predicted vs. true target on the test split; the dashed diagonal marks
# a perfect prediction
fig, axes = plt.subplots(figsize=(5, 5), dpi=120)
axes.set_aspect('equal')

axes.plot([0, 1], [0, 1],
          lw=4, ls='--', color='red', alpha=0.5, zorder=3)
axes.scatter(y_test, y_pred,
             s=12**2, color='0.4', ec='black', alpha=0.4)

# Both axes span the [0, 1] range
axes.set_xlim(0, 1)
axes.set_ylim(0, 1)

axes.set_title('Predictions using the optimized\n5-fold Lasso regression',
               loc='left')
axes.set_xlabel('$\\mathrm{y_{groundtruth}}$', fontsize=20)
axes.set_ylabel('$\\mathrm{y_{inferred}}$', fontsize=20)

# Score annotation in the upper-left corner (axes coordinates)
score_box = dict(boxstyle='square,pad=0.3', fc='none', ec='black', lw=1)
axes.text(0.04, 0.96, f'R$^{{2}}$ score : {r2_score(y_test, y_pred):.4f}',
          ha='left', va='top', transform=axes.transAxes, bbox=score_box)

plt.show()

#### Comparing various models like previously

In [None]:
# Same comparison as for the generated data: each estimator class is
# instantiated with default arguments inside regression() and scored on
# the current train/test split
models = [
    ARDRegression, BayesianRidge, ElasticNet,
    Lasso, LinearRegression, Ridge, SGDRegressor, SVR
]
for model in models:
    regression(model,
               X_train=X_train, y_train=y_train,
               X_test=X_test, y_test=y_test)

#### Notes on the results

The grid search returned a very small (almost the smallest) alpha value in the analysis above. This is not an error, but an indication that plain linear regression can be used efficiently on this specific dataset. (In the limit $\alpha \to 0$, Lasso regression reduces to ordinary linear regression.)

#### 4. Evaluating the trained Lasso model with the shrinkage method

The shrinkage method is a "numerical Occam's razor", which helps simplify models by discouraging overly complex solutions, typically through penalizing large coefficients. This approach makes models more interpretable, robust, and reduces the risk of overfitting.

One popular shrinkage technique is **Lasso Regression** (Least Absolute Shrinkage and Selection Operator). Lasso modifies ordinary least squares regression by adding a penalty term to its loss function, proportional to the absolute values of the regression coefficients. Specifically, the Lasso regression loss function can be expressed mathematically as:

$$
    \mathcal{L}(\beta)
    =
    \underbrace{\frac{1}{2n} \sum_{i=1}^{n} (y_{i} - \mathbf{x}_{i}^{T} \beta)^{2}}_{\text{Ordinary Least Squares (OLS)}}
    +
    \underbrace{\alpha \sum_{j=1}^{p}|\beta_j|}_{\text{Lasso penalty term}}
$$

Here:
- $\mathbf{x}_i$ is the feature vector for observation $i$.
- $y_i$ is the actual outcome for observation $i$.
- $\beta_j$ represents the coefficients (weights) associated with each feature.
- $\alpha$ is a hyperparameter that controls the strength of regularization:
    - Larger $\alpha$ values impose stronger penalties, forcing more coefficients to exactly zero.
    - Smaller $\alpha$ values relax this penalty, allowing more coefficients to remain non-zero.

Because of the absolute value penalty $|\beta_j|$, Lasso regression can shrink coefficients exactly to zero. This property makes it particularly effective for **feature selection**, automatically identifying the most important variables in the dataset by eliminating irrelevant or redundant ones.

Essentially, the shrinkage method is the analysis of the coefficients of the trained model as a function of the regularization parameter $\alpha$. The coefficients are plotted against $\log(\alpha)$ (the natural logarithm, computed with `np.log` in the cells below), which allows us to visualize how the coefficients change as we vary the strength of the penalty. This plot is often referred to as a "coefficient path" or "regularization path."

In [None]:
def evaluate_lasso(X, y, alpha=1.0, max_iter=100_000):
    """
    Fit a StandardScaler + Lasso pipeline and return it.

    Parameters
    ----------
    X, y
        Training features and target passed to the pipeline's ``fit``.
    alpha : float, default 1.0
        Regularization strength handed to ``Lasso``.
    max_iter : int, default 100_000
        Iteration limit handed to ``Lasso``. Integral floats such as
        ``1e5`` are accepted and converted to ``int``.

    Returns
    -------
    The fitted pipeline; the Lasso step is available as
    ``model.named_steps['lasso']``.
    """
    model = make_pipeline(
        StandardScaler(),
        # Lasso expects an integer iteration count, so coerce float input
        Lasso(alpha=alpha, max_iter=int(max_iter), random_state=None))
    model.fit(X, y)
    return model

In [None]:
# Regularization strengths to scan: 100 values, log-spaced from 1e-5 to 1e1
lasso_alphas = np.logspace(-5, 1, 100)
# One row of coefficients per alpha, one column per feature
n_alphas, n_coefs = len(lasso_alphas), X.shape[1]
lasso_coeffs = np.zeros((n_alphas, n_coefs))

for i, a in enumerate(lasso_alphas):
    # Refit on the training split and store the fitted Lasso weights
    model = evaluate_lasso(X_train, y_train, alpha=a, max_iter=300)
    lasso_step = model.named_steps['lasso']
    lasso_coeffs[i] = lasso_step.coef_

In [None]:
# Coefficient paths on a dark background: full range (left), zoom (right)
nr, nc = 1, 2
fig, axes = plt.subplots(nr, nc, figsize=(nc*10, nr*10),
                         facecolor='black', subplot_kw={'facecolor' : 'black'})

# Natural logarithm of the scanned alphas; shared x values for both panels
log_alphas = np.log(lasso_alphas)
white_bold = dict(fontsize=16, fontweight='bold', color='white')

# Left panel: the whole scanned range
ax = axes[0]
ax.set_xlim(log_alphas.min(), log_alphas.max())
ax.set_title('Full test range', **white_bold)

# Right panel: fixed window around the region discussed below
ax = axes[1]
ax.set_xlim(-6, -1)
ax.set_ylim(-0.08, 0.08)
ax.set_title('Zoomed on interesting area', **white_bold)

for ax in axes:
    # One line per feature (the columns of lasso_coeffs)
    ax.plot(log_alphas, lasso_coeffs, lw=3, alpha=0.6)

    ax.set_xlabel('$\\log \\left( \\alpha \\right)$', **white_bold)
    ax.set_ylabel('Value of coefficients', **white_bold)
    ax.tick_params(axis='both', which='major',
                   labelsize=12, colors='white', rotation=20)

fig.suptitle('Shrinkage method used on the results of Lasso regression.',
             color='white', fontsize=21, y=0.03)

plt.show()

Most of the interesting events happen around $\log(\alpha) \approx -5$. That is the range where many coefficients diverge away from 0, while other coefficients vanish. Two other coefficients do the same, but to a much smaller extent, around $\log(\alpha) \approx -2$, before vanishing quickly. Let's see which features are responsible for these last anomalies.

In [None]:
# Heat map of the Lasso coefficients: one row per feature, one column per
# scanned alpha (in the order of `lasso_alphas`)
fig, ax = plt.subplots(figsize=(35, 35))

# 'seismic' is a diverging colormap; symmetric limits pin a coefficient of
# exactly zero to its midpoint, so the two colour arms separate positive
# from negative weights
coef_limit = np.abs(lasso_coeffs).max()
ax.imshow(lasso_coeffs.T, aspect=0.8, cmap='seismic',
          vmin=-coef_limit, vmax=coef_limit)

# No x ticks (removing the ticks removes their labels as well)
ax.set_xticks([])

# Label every row with its feature name (all columns except the target)
feature_names = df_n.columns[:-1].tolist()
ax.set_yticks(np.arange(len(feature_names)))
ax.set_yticklabels(feature_names)

plt.show()

## 2.2. Clustering (#3 in the `problems.ipynb`)

In [None]:
# Number of points in each of the three toy datasets
N = 1500
# Create a dummy dataset of blobs
Xb, yb = make_blobs(
    n_samples=N,    # Number of points in the dataset
    n_features=2,   # Dimension of the dataset (Here it's a 2D dataset)
    centers=3,      # Number of blobs to create
    cluster_std=[1.0, 2.5, 0.5],    # One standard deviation per blob
    center_box=(-10, 10),
    random_state=57
)

# Create a dummy dataset of circles
Xc, yc = make_circles(
    n_samples=N,    # Number of points in the dataset
    noise=0.05,
    factor=0.5,
    random_state=57
)

# Create a dummy dataset of moons
Xm, ym = make_moons(
    n_samples=N,    # Number of points in the dataset
    noise=0.1,
    random_state=57
)

In [None]:
# Visualize them
nr, nc = 1, 3
fig, axes = plt.subplots(nrows=nr, ncols=nc, figsize=(8*nc, 8*nr))

Xi = (Xb, Xc, Xm)
yi = (yb, yc, ym)
# Dedicated loop names: reusing `X`/`y` here would overwrite the
# notebook-level `X` and `y` that other cells rely on
for points, labels, ax in zip(Xi, yi, axes.flat):

    # Centre every coordinate on its own mean so the cloud sits at the origin
    # (a single scalar mean over both coordinates would leave it off-centre)
    centered = points - points.mean(axis=0)
    # Colour by ground-truth label, rescaled to [0, 1] for the colormap
    ax.scatter(*centered.T, c=cm.viridis(labels / labels.max()))

    # Symmetric square window that contains every point
    lim = 1.1 * np.max(np.abs(centered))
    ax.set_xlim(-lim, lim)
    ax.set_ylim(-lim, lim)

plt.show()

### An example for clustering: naive *k*-means algorithm

<img src="./images/kmeans.gif" style="display:block; margin:auto;"/>

### Compare different types of clustering methods

In [None]:
import time

from itertools import cycle, islice
from sklearn import cluster
from sklearn.cluster import MeanShift, MiniBatchKMeans, AffinityPropagation, \
                            AgglomerativeClustering, SpectralClustering, \
                            DBSCAN, OPTICS, Birch
from sklearn.mixture import GaussianMixture
from sklearn.neighbors import kneighbors_graph

### Define the datasets and corresponding clustering parameters

In [None]:
# Each entry is (data, overrides): the overrides are merged on top of
# `default_params` for that dataset in the comparison loop
datasets = [
  (
    Xb, {}
  ),
  (
    Xc, {
      "damping": 0.77,
      "preference": -240,
      "quantile": 0.2,
      "min_samples": 20,
      "xi": 0.25,
      "n_clusters": 2,
    }
  ),
  (
    Xm, {
      "damping": 0.75,
      "preference": -220,
      "n_clusters": 2
    }
  )
]

# Baseline hyperparameters, used whenever a dataset does not override a key
default_params = {
    "quantile": 0.3,
    "eps": 0.3,
    "damping": 0.9,
    "preference": -200,
    "n_neighbors": 10,
    "n_clusters": 3,
    "min_samples": 20,
    "xi": 0.05,
    "min_cluster_size": 0.1,
}

### Define different clustering methods

In [None]:
def return_clustering_algos(params, **kwargs):
    """
    Instantiate the clustering estimators that are compared side by side.

    Parameters
    ----------
    params : dict
        Hyperparameters. The keys ``n_clusters``, ``damping``,
        ``preference``, ``eps``, ``min_samples``, ``xi`` and
        ``min_cluster_size`` are read.
    **kwargs
        Optional extras, each of which may be omitted independently:
        ``bandwidth`` (handed to MeanShift, default None),
        ``connectivity`` (handed to both agglomerative models, default
        None) and ``random_state`` (seed for the estimators that accept
        one, default 0).

    Returns
    -------
    tuple of (str, estimator)
        Display name and unfitted estimator, in plotting order.
    """
    # .get() so that passing only some of the extras does not raise KeyError
    bandwidth = kwargs.get("bandwidth")
    connectivity = kwargs.get("connectivity")
    # One shared seed keeps repeated runs of the comparison reproducible
    random_state = kwargs.get("random_state", 0)

    ms = MeanShift(
        bandwidth=bandwidth,
        bin_seeding=True
    )
    two_means = MiniBatchKMeans(
        n_clusters=params["n_clusters"],
        random_state=random_state
    )
    affinity_propagation = AffinityPropagation(
        damping=params["damping"],
        preference=params["preference"],
        random_state=random_state
    )
    ward = AgglomerativeClustering(
        n_clusters=params["n_clusters"],
        linkage="ward",
        connectivity=connectivity
    )
    average_linkage = AgglomerativeClustering(
        linkage="average",
        metric="cityblock",
        n_clusters=params["n_clusters"],
        connectivity=connectivity,
    )
    spectral = SpectralClustering(
        n_clusters=params["n_clusters"],
        eigen_solver="arpack",
        affinity="nearest_neighbors",
        random_state=random_state,
    )
    dbscan = DBSCAN(
        eps=params["eps"]
    )
    optics = OPTICS(
        min_samples=params["min_samples"],
        xi=params["xi"],
        min_cluster_size=params["min_cluster_size"],
    )
    birch = Birch(
        n_clusters=params["n_clusters"]
    )
    gmm = GaussianMixture(
        n_components=params["n_clusters"],
        covariance_type="full",
        random_state=random_state
    )

    # Names double as column titles, hence the explicit line breaks
    clustering_algorithms = (
        ("MeanShift", ms),
        ("MiniBatch\nKMeans", two_means),
        ("Affinity\nPropagation", affinity_propagation),
        ("Ward", ward),
        ("Agglomerative\nClustering", average_linkage),
        ("Spectral\nClustering", spectral),
        ("DBSCAN", dbscan),
        ("OPTICS", optics),
        ("BIRCH", birch),
        ("Gaussian\nMixture", gmm),
    )

    return clustering_algorithms

#### Stolen and reworked from matplotlib's website

In [None]:
# ============
# Set up cluster parameters
# ============
# One row of panels per dataset, one column per clustering algorithm
nr, nc = len(datasets), len(return_clustering_algos(default_params))
fig, axes = plt.subplots(nr, nc, figsize=(4*nc, 4*nr))
fig.subplots_adjust(
    left=0.02, right=0.98, bottom=0.001, top=0.95, wspace=0.05, hspace=0.01
)

# Colours assigned to cluster labels 0, 1, 2, ... (cycled if there are more
# clusters than colours)
cluster_palette = [
    "#377eb8",
    "#ff7f00",
    "#4daf4a",
    "#f781bf",
    "#a65628",
    "#984ea3",
    "#999999",
    "#e41a1c",
    "#dede00",
]

for dataset_i, (dataset, algo_params) in enumerate(datasets):
    # update parameters with dataset-specific values
    params = default_params.copy()
    params.update(algo_params)

    # normalize dataset for easier parameter selection
    # (dedicated name: assigning to `X` would overwrite the notebook-level X)
    X_scaled = StandardScaler().fit_transform(dataset)

    # estimate bandwidth for mean shift
    bandwidth = cluster.estimate_bandwidth(X_scaled, quantile=params["quantile"])

    # connectivity matrix for structured Ward
    connectivity = kneighbors_graph(
        X_scaled, n_neighbors=params["n_neighbors"], include_self=False
    )
    # make connectivity symmetric
    connectivity = 0.5 * (connectivity + connectivity.T)

    extra_params = {"bandwidth" : bandwidth, "connectivity" : connectivity}

    # ============
    # Create cluster objects
    # ============
    clustering_algorithms = return_clustering_algos(params, **extra_params)

    for ax_i, (name, algorithm) in enumerate(clustering_algorithms):
        # Fit
        algorithm.fit(X_scaled)
        # Estimators without a `labels_` attribute are asked to predict instead
        if hasattr(algorithm, "labels_"):
            cluster_labels = algorithm.labels_.astype(int)
        else:
            cluster_labels = algorithm.predict(X_scaled)

        ax = axes[dataset_i, ax_i]
        ax.axis('off')
        # Algorithm names are shown once, above the first row
        if dataset_i == 0:
            ax.set_title(name, fontsize=18)

        # One colour per label 0..max, plus black as the last entry:
        # the label -1 (outliers, if any) indexes that last entry
        n_labels = int(max(cluster_labels) + 1)
        colors = np.array(
            list(islice(cycle(cluster_palette), n_labels)) + ["#000000"]
        )
        ax.scatter(*X_scaled.T, color=colors[cluster_labels], s=16)

        # Symmetric square window that contains every point
        lim = 1.1 * np.max(np.abs(X_scaled))
        ax.set_xlim(-lim, lim)
        ax.set_ylim(-lim, lim)

plt.show()

## 3. Classification (#2 in the `problems.ipynb`)

In [None]:
# Synthetic two-class problem; the seed keeps the data reproducible
X, y = make_classification(
    n_samples=100,    # Number of points in the data set
    n_features=6,     # Number of features in the data set
    n_informative=4,
    n_redundant=2,
    n_repeated=0,
    n_classes=2,
    n_clusters_per_class=2,
    random_state=0,
)

In [None]:
# Display the generated feature matrix as a table
pd.DataFrame(X)

### Literally the same as regression, but with different methods...