In [1]:
%load_ext watermark
%watermark

2017-09-18T21:24:19+02:00

CPython 3.6.1
IPython 5.3.0

compiler   : GCC 4.8.2 20140120 (Red Hat 4.8.2-15)
system     : Linux
release    : 4.10.0-33-generic
machine    : x86_64
processor  : x86_64
CPU cores  : 8
interpreter: 64bit


In [2]:
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.model_selection import cross_validate

In [3]:
# Load the preprocessed vehicles table from disk.
vehiculos = pd.read_csv("data/vehiculos_procesado.csv")

# Regression target, followed by the three predictor columns.
objetivo = vehiculos["co2"]
datos_entrenamiento = vehiculos[["desplazamiento", "cilindros", "consumo"]]

In [4]:
datos_entrenamiento.head()

Unnamed: 0,desplazamiento,cilindros,consumo
0,2.5,4.0,17
1,4.2,6.0,13
2,2.5,4.0,16
3,4.2,6.0,13
4,3.8,6.0,16


In [5]:
from sklearn.linear_model import (LinearRegression, Lasso,
                                  Ridge, ElasticNet)


In [6]:
# Baseline: ordinary least squares on the three raw predictor columns.
modelo_ols = LinearRegression().fit(datos_entrenamiento, objetivo)

# Show the fitted coefficients (one per predictor column).
modelo_ols.coef_

array([ 11.76787991,   1.23791071, -19.80355606])

In [7]:
def norma_l1(coeficientes):
    """Return the L1 norm: the sum of the absolute values of all entries."""
    valores_absolutos = np.abs(coeficientes)
    return np.sum(valores_absolutos)

def norma_l2(coeficientes):
    """Return the L2 norm: the square root of the sum of squared entries."""
    suma_de_cuadrados = np.sum(np.power(coeficientes, 2))
    return np.sqrt(suma_de_cuadrados)

# Print the L1 norm, then the L2 norm, of the fitted OLS coefficients.
for calcular_norma in (norma_l1, norma_l2):
    print(calcular_norma(modelo_ols.coef_))

32.8093466802
23.0693791245


En vez de definir estas funciones a mano, vamos a usar la función [`numpy.linalg.norm`](https://docs.scipy.org/doc/numpy-1.13.0/reference/generated/numpy.linalg.norm.html), que calcula varios tipos de normas.

In [8]:
def norma_l1(coeficientes):
    """Return the element-wise L1 norm (sum of absolute values).

    Parameters
    ----------
    coeficientes : array-like
        Coefficients of any shape; they are flattened before the norm is
        taken.

    Returns
    -------
    float
        Sum of the absolute values of every entry.
    """
    # Flatten first: on 2-D input np.linalg.norm(ord=1) is the matrix norm
    # (max absolute column sum), not the sum over every entry. For 1-D input
    # the result is unchanged.
    return np.linalg.norm(np.ravel(coeficientes), ord=1)

def norma_l2(coeficientes):
    """Return the element-wise L2 (Euclidean) norm.

    Parameters
    ----------
    coeficientes : array-like
        Coefficients of any shape; they are flattened before the norm is
        taken.

    Returns
    -------
    float
        Square root of the sum of the squares of every entry.
    """
    # Flatten first: on 2-D input np.linalg.norm(ord=2) is the spectral norm
    # (largest singular value), not the Euclidean norm over every entry. For
    # 1-D input the result is unchanged.
    return np.linalg.norm(np.ravel(coeficientes), ord=2)

# Same check as before, now with the numpy-backed norm functions.
for calcular_norma in (norma_l1, norma_l2):
    print(calcular_norma(modelo_ols.coef_))

32.8093466802
23.0693791245


In [None]:
def norma_l1_cv(estimator, X, y):
    """Return the L1 norm of a fitted estimator's coefficients.

    Takes the (estimator, X, y) arguments of a scoring callable; X and y are
    accepted but not used, only ``estimator.coef_`` is read.
    """
    coeficientes = estimator.coef_
    return norma_l1(coeficientes)

def norma_l2_cv(estimator, X, y):
    """Return the L2 norm of a fitted estimator's coefficients.

    Takes the (estimator, X, y) arguments of a scoring callable; X and y are
    accepted but not used, only ``estimator.coef_`` is read.
    """
    coeficientes = estimator.coef_
    return norma_l2(coeficientes)

In [9]:
from sklearn.preprocessing import PolynomialFeatures

In [10]:
PolynomialFeatures?

In [11]:
transformador_polinomial = PolynomialFeatures(5)

In [12]:
transformador_polinomial.fit(datos_entrenamiento)

PolynomialFeatures(degree=5, include_bias=True, interaction_only=False)

In [13]:
variables_polinomiales = transformador_polinomial.transform(
    datos_entrenamiento)

In [14]:
variables_polinomiales.shape

(35539, 56)

In [15]:
datos_entrenamiento.loc[0]

desplazamiento     2.5
cilindros          4.0
consumo           17.0
Name: 0, dtype: float64

In [16]:
variables_polinomiales[0]

array([  1.00000000e+00,   2.50000000e+00,   4.00000000e+00,
         1.70000000e+01,   6.25000000e+00,   1.00000000e+01,
         4.25000000e+01,   1.60000000e+01,   6.80000000e+01,
         2.89000000e+02,   1.56250000e+01,   2.50000000e+01,
         1.06250000e+02,   4.00000000e+01,   1.70000000e+02,
         7.22500000e+02,   6.40000000e+01,   2.72000000e+02,
         1.15600000e+03,   4.91300000e+03,   3.90625000e+01,
         6.25000000e+01,   2.65625000e+02,   1.00000000e+02,
         4.25000000e+02,   1.80625000e+03,   1.60000000e+02,
         6.80000000e+02,   2.89000000e+03,   1.22825000e+04,
         2.56000000e+02,   1.08800000e+03,   4.62400000e+03,
         1.96520000e+04,   8.35210000e+04,   9.76562500e+01,
         1.56250000e+02,   6.64062500e+02,   2.50000000e+02,
         1.06250000e+03,   4.51562500e+03,   4.00000000e+02,
         1.70000000e+03,   7.22500000e+03,   3.07062500e+04,
         6.40000000e+02,   2.72000000e+03,   1.15600000e+04,
         4.91300000e+04,

In [17]:
variables_polinomiales = PolynomialFeatures(5).fit_transform(
    datos_entrenamiento)

In [18]:
variables_polinomiales.shape

(35539, 56)

Ahora vamos a evaluar los distintos tipos de regularización.

**Modelo OLS con variables polinomiales**

In [19]:
RESULTADOS = {}

In [20]:
# Plain OLS (no regularisation) on the polynomial features.
modelo_ols = LinearRegression().fit(variables_polinomiales, objetivo)
coeficientes_ols = modelo_ols.coef_
print(coeficientes_ols)

# Record both coefficient norms; this entry is the reference for the
# regularised models.
RESULTADOS["ols"] = dict(
    norma_l1=norma_l1(coeficientes_ols),
    norma_l2=norma_l2(coeficientes_ols),
)

[ -1.33584245e-04  -2.16202014e+03  -4.74743022e+02  -9.33379437e+02
   2.06179369e+03  -1.98652141e+03   4.07183963e+02   6.98188920e+02
  -8.60206821e+00   5.68565588e+01  -5.04043555e+02   5.97572364e+02
  -2.39915727e+02  -1.93155159e+02   2.06936510e+02  -2.46367000e+01
  -7.53120030e+00  -6.16587971e+01   2.77167163e+00  -1.72213966e+00
   3.22404684e+01  -2.32860787e+01   3.67724866e+01  -1.28564530e+01
  -4.24157144e+01   9.08423003e+00   1.17166991e+01   1.52755499e+01
  -7.40121866e+00   6.04111961e-01  -9.27527797e-01  -8.19881992e-01
   1.96304324e+00  -8.55528666e-02   2.44388052e-02  -2.44193545e+00
   6.17851513e+00  -1.41371913e+00  -7.50158521e+00   1.59153882e+00
  -6.92569799e-01   5.18941968e+00  -3.95368961e-01   8.24178211e-01
  -1.15475754e-01  -1.85224537e+00  -2.10933016e-02  -3.68950001e-01
   9.99115995e-02  -5.66794749e-03   2.41619836e-01  -2.05795972e-02
   5.80537282e-02  -2.77634650e-02   1.11091111e-03  -1.26148038e-04]


**Modelo Regularización L1 con variables polinomiales**

In [21]:
Lasso?

In [22]:
# L1-regularised regression (Lasso) on the polynomial features.
modelo_l1 = Lasso(alpha=1.0, tol=0.01, max_iter=5000).fit(
    variables_polinomiales, objetivo)
coeficientes_l1 = modelo_l1.coef_
print(coeficientes_l1)

# Record both coefficient norms for the comparison table.
RESULTADOS["regularizacion_l1"] = dict(
    norma_l1=norma_l1(coeficientes_l1),
    norma_l2=norma_l2(coeficientes_l1),
)

[  0.00000000e+00   0.00000000e+00   0.00000000e+00  -3.30237023e+01
   5.12409278e+00   1.44708364e+00  -1.47969971e+00   6.52491862e-01
  -5.04276126e-01   5.26431228e-02  -1.48062992e-01  -0.00000000e+00
  -1.32137016e-01   1.02289253e-01  -9.17626300e-02  -1.30192905e-02
   6.15573125e-02  -3.01413839e-02   2.31053922e-02   5.17315962e-03
   1.91792946e-02  -1.51220456e-03  -1.88077866e-03  -1.09782669e-03
  -6.50140180e-03  -2.91026927e-04   1.73571147e-03  -3.33999083e-03
  -1.64104930e-03   1.59784747e-03   1.31929178e-03   1.69629743e-04
  -1.64562586e-03   2.03369130e-04   1.99415033e-05  -2.81072727e-03
   7.11467485e-04   2.46531750e-03  -2.20515928e-03   2.38235311e-04
   6.38712931e-04  -1.09015174e-03  -3.89575283e-04  -8.09552788e-05
   2.70933679e-04  -1.56154368e-04  -1.88340282e-04  -2.27097112e-04
   3.81560605e-05   2.29005451e-06  -5.56615246e-05   1.44114807e-04
   4.13806087e-06   5.51970343e-05   7.62940670e-07  -1.14293565e-06]


**Modelo Regularización L2 (Ridge) con variables polinomiales**

In [23]:
# L2-regularised regression (Ridge) on the polynomial features.
modelo_l2 = Ridge(alpha=1.0, tol=0.01, max_iter=5000).fit(
    variables_polinomiales, objetivo)
coeficientes_l2 = modelo_l2.coef_

print(coeficientes_l2)
# Record both coefficient norms for the comparison table.
RESULTADOS["regularizacion_l2"] = dict(
    norma_l1=norma_l1(coeficientes_l2),
    norma_l2=norma_l2(coeficientes_l2),
)

[  0.00000000e+00   1.51700247e+01   1.42698882e+00  -9.87023278e+00
   4.84917206e+01   3.40056616e+01   4.08618469e+01   1.87320061e+01
  -3.38485609e+01  -4.32030584e+00  -2.38650499e+01   2.49478095e+01
  -1.68754256e+01  -1.28015201e+01  -6.76127446e+00  -2.53957180e+00
  -4.79471152e+00   6.20678299e+00   2.06740033e+00   2.74905008e-01
   1.53996044e+01  -3.67045279e+01   3.58931719e+00   3.45190179e+01
  -4.38968686e+00   9.62645876e-01  -1.53177951e+01   4.03072220e+00
  -3.81915655e-02   4.61122793e-02   3.15169731e+00  -1.12388091e+00
  -2.48505884e-01  -3.05622232e-02  -7.73542279e-03  -2.33675034e+00
   6.80498512e+00  -8.69751682e-01  -7.95084511e+00   2.20023690e+00
  -1.37698389e-01   4.60217944e+00  -2.17511174e+00   2.25464545e-01
  -1.86520202e-02  -1.28786598e+00   9.40206797e-01  -1.93273412e-01
   1.34228203e-02  -4.68453867e-04   1.24956763e-01  -1.58111975e-01
   5.90668933e-02  -2.26321629e-03   2.49826293e-04   8.13220717e-05]


**Regularización Elasticnet con variables polinomiales**

In [24]:
ElasticNet?

In [25]:
# ElasticNet with l1_ratio=0.5 on the polynomial features.
modelo_elasticnet = ElasticNet(l1_ratio=0.5, tol=0.01, max_iter=5000).fit(
    variables_polinomiales, objetivo)
coeficientes_elasticnet = modelo_elasticnet.coef_
print(coeficientes_elasticnet)

# Record both coefficient norms for the comparison table.
RESULTADOS["regularizacion_elasticnet"] = dict(
    norma_l1=norma_l1(coeficientes_elasticnet),
    norma_l2=norma_l2(coeficientes_elasticnet),
)

[  0.00000000e+00   0.00000000e+00   0.00000000e+00  -1.33310440e+00
   4.89645007e+00   6.09439149e+00  -1.31816563e+00   2.37723778e+00
  -3.10050434e+00  -6.41934303e-01  -7.84374220e-01  -9.59081564e-02
  -7.18954629e-02   7.19478059e-03  -4.57252233e-02  -6.37787774e-02
   1.22426563e-02   3.46550752e-02   4.26955780e-02   2.66673958e-03
   1.39812144e-02  -8.09891082e-03   8.76409316e-03  -1.69690074e-02
  -2.05177215e-03   1.54020194e-03  -6.29758134e-03  -2.92855315e-03
  -6.86058449e-04   1.42133650e-03  -9.92879254e-04   1.28704557e-04
   5.03220762e-04   6.61024458e-04   5.80200778e-05   3.41020839e-03
   2.27563082e-03   2.71125911e-03  -1.90350591e-03  -7.82250851e-05
   6.56665428e-04  -7.34759017e-04  -9.12167373e-04  -2.43120629e-04
   1.60164558e-04   3.89147755e-04  -5.45731423e-04  -3.49391006e-04
   3.45240438e-05   3.46406797e-05   2.59823852e-04  -6.35175029e-05
  -9.14762151e-05   5.08095098e-05   1.14384625e-05   7.36969809e-07]


In [26]:
pd.set_option("display.float_format", lambda x: str(round(x,6)))

In [27]:
# Transpose so each model is a row and each norm is a column.
resultados_df = pd.DataFrame(RESULTADOS).T

# Norms of the unregularised OLS model serve as the baseline.
l1_ols = resultados_df.at["ols", "norma_l1"]
l2_ols = resultados_df.at["ols", "norma_l2"]

# Reduction relative to OLS, stored as a fraction (1 - ratio) despite the
# "pct" prefix in the column names.
resultados_df["pct_reduccion_l1"] = 1 - resultados_df["norma_l1"] / l1_ols
resultados_df["pct_reduccion_l2"] = 1 - resultados_df["norma_l2"] / l2_ols

resultados_df

Unnamed: 0,norma_l1,norma_l2,pct_reduccion_l1,pct_reduccion_l2
ols,10853.748045,3922.194755,0.0,0.0
regularizacion_elasticnet,21.002924,8.997096,0.998065,0.997706
regularizacion_l1,42.945147,33.494056,0.996043,0.99146
regularizacion_l2,457.523445,109.550681,0.957847,0.972069


In [None]:
from sklearn import datasets


# Switch to the Boston housing data for the cross-validated benchmarks.
# NOTE(review): load_boston was removed in scikit-learn 1.2; this cell needs
# an older scikit-learn or a different dataset.
boston = datasets.load_boston()

datos_entrenamiento = boston["data"]
objetivo = boston["target"]

# Rebuild the polynomial features from the new training data. Without this,
# `variables_polinomiales` still holds the vehicles features, whose row count
# does not match the Boston target, and the benchmark cells below would pass
# mismatched X and y to cross_validate.
# Degree 2 is assumed from the "*_cuadratico" result keys used below.
variables_polinomiales = PolynomialFeatures(2).fit_transform(
    datos_entrenamiento)

### Benchmarks con validación cruzada 

In [None]:
def rmse(objetivo, estimaciones):
    """Return the root mean squared error between targets and predictions."""
    error_cuadratico_medio = metrics.mean_squared_error(objetivo, estimaciones)
    return np.sqrt(error_cuadratico_medio)

def rmse_cv(estimator, X, y):
    """Return the RMSE of a fitted estimator's predictions on (X, y).

    Takes the (estimator, X, y) arguments of a scoring callable.
    """
    return rmse(y, estimator.predict(X))

def evaluar_modelo(modelo, X, y, cv=100, n_jobs=-1):
    """Cross-validate a model and return the mean of each reported metric.

    Parameters
    ----------
    modelo : estimator
        Unfitted estimator; the norm scorers read ``coef_`` after fitting.
    X : array-like
        Feature matrix.
    y : array-like
        Target values.
    cv : int or cross-validation splitter, default 100
        Passed straight to ``cross_validate``.
    n_jobs : int, default -1
        Passed straight to ``cross_validate``.

    Returns
    -------
    pandas.Series
        Mean over folds of every column ``cross_validate`` returns. The
        "mae" entry comes from the "neg_mean_absolute_error" scorer, so it is
        the negated MAE; the "rmse" entry is a plain, non-negated RMSE.
    """
    scoring = {
        "mae": "neg_mean_absolute_error",
        "rmse": rmse_cv,
        "l1_norm": norma_l1_cv,
        "l2_norm": norma_l2_cv,
    }
    scores = cross_validate(
        modelo, X, y,
        scoring=scoring,
        cv=cv,
        return_train_score=False,
        n_jobs=n_jobs,
    )
    return pd.DataFrame(scores).mean()

RESULTADOS = {}

In [None]:
# Cross-validated benchmark of plain OLS on the polynomial features.
modelo_ols = LinearRegression()
RESULTADOS["ols_cuadratico"] = evaluar_modelo(
    modelo_ols, variables_polinomiales, objetivo)

In [None]:
# Cross-validated benchmark of Lasso (L1) on the polynomial features.
modelo_l1 = Lasso(alpha=1.0, tol=0.015)

RESULTADOS["l1_cuadratico"] = evaluar_modelo(
    modelo_l1, variables_polinomiales, objetivo)

In [None]:
# Cross-validated benchmark of Ridge (L2) on the polynomial features.
modelo_l2 = Ridge(alpha=1.0, tol=0.015)

RESULTADOS["l2_cuadratico"] = evaluar_modelo(
    modelo_l2, variables_polinomiales, objetivo)

In [None]:
# Cross-validated benchmark of ElasticNet on the polynomial features.
modelo_elasticnet = ElasticNet(l1_ratio=0.5)

RESULTADOS["elasticnet_cuadratico"] = evaluar_modelo(
    modelo_elasticnet, variables_polinomiales, objetivo)

In [None]:
pd.DataFrame(RESULTADOS)