In [0]:
import numpy as np
import pandas as pd
import plotly.graph_objects as go

from sklearn.linear_model import Ridge, Lasso, ElasticNet, LinearRegression
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.datasets import load_boston

In [0]:
# adapted code from
# http://www.davidsbatista.net/blog/2018/02/23/model_optimization/

class EstimatorSelectionHelper:
    def __init__(self, models, params):
        if not set(models.keys()).issubset(set(params.keys())):
            missing_params = list(set(models.keys()) - set(params.keys()))
            raise ValueError(
                f'Some estimators are missing parameters: {missing_params}')
        self.models = models
        self.params = params
        self.keys = models.keys()
        self.grid_searches = {}

    def fit(self, X, y, cv=3, n_jobs=3, verbose=1, scoring=None, refit=False):
        for key in self.keys:
            print(f'Running GridSearchCV for {key}')
            model = self.models[key]
            params = self.params[key]
            gs = GridSearchCV(model, params, cv=cv, n_jobs=n_jobs,
                              verbose=verbose, scoring=scoring, refit=refit,
                              return_train_score=True)
            gs.fit(X, y)
            self.grid_searches[key] = gs

    def score_summary(self, sort_by='mean_score',
                      num_rows_per_estimator=None,
                      score_precision=5):
        def row(key, scores, params):
            d = {
                 'estimator': key,
                 'mean_score': round(np.mean(scores), score_precision),
                 'std_score': round(np.std(scores), score_precision),
                 'min_score': round(min(scores), score_precision),
                 'max_score': round(max(scores), score_precision),
            }
            return pd.Series({**params, **d})

        rows = []
        for k in self.grid_searches:
            params = self.grid_searches[k].cv_results_['params']
            scores = []
            for i in range(self.grid_searches[k].cv):
                key = "split{}_test_score".format(i)
                r = self.grid_searches[k].cv_results_[key]
                scores.append(r.reshape(len(params), 1))

            all_scores = np.hstack(scores)
            for p, s in zip(params, all_scores):
                rows.append((row(k, s, p)))

        df = pd.concat(rows, axis=1).T.sort_values([sort_by], ascending=False)

        columns = ['estimator', 'mean_score', 'std_score',
                   'min_score', 'max_score']
        columns = columns + [c for c in df.columns if c not in columns]

        if num_rows_per_estimator:
            df = df.groupby('estimator')
            df = df.head(num_rows_per_estimator)

        return df[columns]

# Minimisation du risque empirique régularisé

On s'intéresse aux algorithmes d'apprentissage basés sur la minimisation du
risque empirique régularisé

$$\min_f \sum_i \ell(y_i ,f(x_i)) + \alpha \Omega(f) $$

# Jeu de données Concrete

In [0]:
# download dataset
! wget https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls
df = pd.read_excel('Concrete_Data.xls')

# clean column names
df.columns = ['cement', 'slag', 'ash', 'water', 'superplasticizer',
              'coarse', 'aggregate', 'age', 'strength']

# get data and target
target = 'strength'
Y = df[target]
X = df.drop(columns=[target], axis=1)

# split into train/test
ratio, seed = 0.2, 42
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=ratio,
                                                    random_state=seed)

--2019-11-02 09:43:24--  https://archive.ics.uci.edu/ml/machine-learning-databases/concrete/compressive/Concrete_Data.xls
Resolving archive.ics.uci.edu (archive.ics.uci.edu)... 128.195.10.252
Connecting to archive.ics.uci.edu (archive.ics.uci.edu)|128.195.10.252|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 124928 (122K) [application/x-httpd-php]
Saving to: ‘Concrete_Data.xls’


2019-11-02 09:43:25 (290 KB/s) - ‘Concrete_Data.xls’ saved [124928/124928]



## Les régularisations étudiées

* $\ell(y_i,f(x_i)) = (y_i - \beta^T x_i)^2$

### Régression ridge

* $\Omega(f) = \|\beta\|_2^2$ (norme $L_2$)

### Régression Lasso

* $\Omega(f) = \|\beta\|_1$ (norme $L_1$)

### Régression ElasticNet

* combine une régularization $L_1$ et une régularization $L_2$
* coefficients de régularisation $\alpha$ $(L_2)$ et $\rho$ $(L_1)$
* $\Omega(f) = \alpha \rho \|\beta\|_1 + \frac{\alpha(1-\rho)}{2}\|\beta\|_2$

In [0]:
alpha = 5
l1_ratio = 0.5

# initialize classifiers
classifiers = {
    'Ridge': Ridge(alpha=alpha),
    'Lasso': Lasso(alpha=alpha),
    'ElasticNet': ElasticNet(alpha=alpha, l1_ratio=l1_ratio),
    'LinearRegression': LinearRegression()
}

results = []

# train classifiers while saving coefficients and scores
for clf_name, clf in classifiers.items():
    clf.fit(X_train, Y_train)
    results.append({
        'name': clf_name,
        'coefficients': clf.coef_,
        'train_score': clf.score(X_train, Y_train),
        'test_score': clf.score(X_test, Y_test)
    })

In [0]:
# ploting power !
traces = []

for result in results:
    traces.append(
        go.Bar(
            x=X.columns,
            y=result['coefficients'],
            name=f"Coefficients {result['name']}",
            text=result['coefficients'].round(3),
            textposition='auto'
        )
    )

fig = go.Figure(
    data=traces,
    layout={
        'legend': {
            'orientation': 'h'
        }
    }
)
fig.show()

## Regardons un peu le dataset

In [0]:
traces = []

for column in X.columns:
    traces.append(dict(label=column, values=df[column]))

fig = go.Figure(
    data=[
        go.Splom(
            dimensions=traces,
            showupperhalf=False
        )
    ]
)
fig.show()

### Sélection du paramètre $\alpha$

* Le choix de $\alpha$ est très important.

* On choisit $\alpha$ qui minimise l'erreur de cross validation parmi plusieurs
 valeurs définies dans une grille.  

In [0]:
# todo plot lines when alpha changes

# Jeu de données Boston

In [0]:
boston = load_boston()
X, Y = boston.data, boston.target

In [0]:
# split into train/test
ratio, seed = 0.2, 42
X_train, X_test, Y_train, Y_test = train_test_split(X, Y,
                                                    test_size=ratio,
                                                    random_state=seed)

* Appliquer et comparer la régression Ridge et la régression Lasso sur le jeu
 de données boston. 

In [0]:
alpha = 1
l1_ratio = 0.5

# initialize classifiers
classifiers = {
    f'Ridge_alpha={alpha}': Ridge(alpha=alpha),
    f'Lasso_alpha={alpha}': Lasso(alpha=alpha),
    f'ElasticNet_alpha={alpha} & l1_ratio={l1_ratio}': ElasticNet(alpha=alpha, l1_ratio=l1_ratio),
    'LinearRegression': LinearRegression()
}

results = []

# train classifiers while saving coefficients and scores
for clf_name, clf in classifiers.items():
    clf.fit(X_train, Y_train)
    results.append({
        'name': clf_name,
        'coefficients': clf.coef_,
        'train_score': clf.score(X_train, Y_train),
        'test_score': clf.score(X_test, Y_test)
    })

* Afficher les coefficients des régressions.

In [0]:
# ploting coefficients
traces = []

for result in results:
    traces.append(
        go.Bar(
            x=boston.feature_names,
            y=result['coefficients'],
            name=f"Coefficients {result['name']}",
            text=result['coefficients'].round(3),
            textposition='auto'
        )
    )

fig = go.Figure(
    data=traces,
    layout={
        'legend': {
            'orientation': 'h'
        }
    }
)
fig.show()

* Afficher l'erreur de prédiction sur les données de test et les données 
d'apprentissage en fonction du paramètre de régularisation.

In [0]:
# ploting train/test scores
results = pd.DataFrame(results)

fig = go.Figure(
    data=[
        go.Bar(
            x=results['name'],
            y=results['train_score'],
            name="Train scores",
            text=results['train_score'].round(5),
            textposition='auto'
        ),
        go.Bar(
            x=results['name'],
            y=results['test_score'],
            name="Test scores",
            text=results['test_score'].round(5),
            textposition='auto'
        ),
    ],
    layout={
        'legend': {
            'orientation': 'h'
        }
    }
)
fig.show()

* Afficher l'erreur de prédiction sur les données de test et les données 
d'apprentissage en fonction du nombre de données d'apprentissage dans les 
deux cas : $\alpha =0$ et $\alpha$ optimal choisi par cross-validation.

* Lire les sections 3.1 et 3.2 du chapitre 3 du livre "Pattern Recognition and
 Machine Learning" (Bishop, 2006).

In [0]:
models = {
    'Ridge': Ridge(),
    'Lasso': Lasso(),
    'ElasticNet': ElasticNet(),
    # 'LinearRegression': LinearRegression()
}

alphas = np.linspace(0, 10, 100)
l1_ratios = np.linspace(0, 1, 10)

params = {
    'Lasso': {
        'alpha': alphas,
    },
    'Ridge': {
        'alpha': alphas,
    },
    'ElasticNet': {
        'alpha': alphas,
        'l1_ratio': l1_ratios
    }
}

grid = EstimatorSelectionHelper(models, params)
grid.fit(X_train, Y_train, n_jobs=-1)

gs = grid.score_summary(sort_by='mean_score',
                        num_rows_per_estimator=5)

Running GridSearchCV for Ridge
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 243 tasks      | elapsed:    2.0s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    2.1s finished


Running GridSearchCV for Lasso
Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 274 tasks      | elapsed:    1.7s
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed:    1.8s finished


Running GridSearchCV for ElasticNet
Fitting 3 folds for each of 1000 candidates, totalling 3000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done 904 tasks      | elapsed:    3.2s
[Parallel(n_jobs=-1)]: Done 3000 out of 3000 | elapsed:    7.3s finished

Sorting because non-concatenation axis is not aligned. A future version
of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.





In [0]:
# ploting power !
fig = go.Figure(
    data=[
        go.Table(
            header=dict(
                values=list(
                    gs.columns
                )
            ),
            cells=dict(
                values=[
                    gs.estimator,
                    gs.mean_score,
                    gs.std_score,
                    gs.min_score,
                    gs.max_score,
                    gs.alpha,
                    gs.l1_ratio
                ]
            )
        )
    ]
)
fig.show()

# **Conclusions**

* On voit bien que Lasso "ramène" rapidement certains coefficients à 0 sans que ce mécanisme ne soit lié à la variance de chaque feature que la regression cherche à retrouver : le Lasso permet une sélection des variables prédictives.

* Un feature qui a une forte variance ne sera pas systématiquement priorisé ou délaissé par les régressions, ce qui nous empêche de prévoir à l'avance quels features seront principalement considérés par notre régression.

* Ridge, Lasso et ElasticNet permettent de contrôler les poids d'une régression linéaire tout en offrant une souplesse au niveau de cette régularisation grâce au paramètre $ \alpha $.

* Les régularisations permettent de résoudre le problème (ou du moins de réduire son importance) du sur-apprentissage : la différence entre le score train et celui du test se réduit plus $ \alpha $ croît.

* Le paramètre $ \alpha $ optimal est trouvé par cross-validation sur chaque jeu de données.