In [1]:
from sklearn.model_selection import cross_val_score, train_test_split, RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import MinMaxScaler
from sklearn.kernel_ridge import KernelRidge
from sklearn.datasets import load_boston
from sklearn.linear_model import Ridge
from sklearn.metrics import r2_score
import pandas as pd
import numpy as np

pd.set_option("display.max_columns", 100)

## Carga de datos

In [2]:
boston = load_boston()
df = pd.DataFrame(data = boston["data"], columns=boston["feature_names"])
df["target"] = boston["target"]
tgt = "target"
ls_features = [x for x in df.columns if x not in [tgt]]

## Preparación de datos

In [3]:
X = df[ls_features]
y = df[[tgt]]

In [4]:
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size = 0.7)

In [5]:
mm_x = MinMaxScaler()
mm_y = MinMaxScaler()

In [6]:
Xs = mm_x.fit_transform(X_train) 
ys = mm_y.fit_transform(y_train)

## Modelado

### Ridge Regression

In [7]:
param_grid = {
    "alpha": [x/1000 for x in range(1000)] + [x for x in range(100)],
    "solver": ['auto', 'svd', 'cholesky', 'lsqr', 'sparse_cg', 'sag', 'saga']
}

In [8]:
model = Ridge()

In [9]:
Ridge?

In [10]:
model.fit(Xs, ys)

Ridge()

In [11]:
clf = GridSearchCV(model, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5)
clf.fit(Xs, ys)
print("Best score: " + str(clf.best_score_))
print("Best estimator: " + str(clf.best_estimator_))

Fitting 4 folds for each of 7700 candidates, totalling 30800 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    1.1s
[Parallel(n_jobs=-1)]: Done  72 tasks      | elapsed:    1.5s
[Parallel(n_jobs=-1)]: Done 2696 tasks      | elapsed:    3.8s
[Parallel(n_jobs=-1)]: Done 10760 tasks      | elapsed:    7.5s
[Parallel(n_jobs=-1)]: Done 21128 tasks      | elapsed:   13.4s


Best score: 0.7217479663317007
Best estimator: Ridge(alpha=0.606, solver='sag')


[Parallel(n_jobs=-1)]: Done 30713 tasks      | elapsed:   18.5s
[Parallel(n_jobs=-1)]: Done 30800 out of 30800 | elapsed:   18.6s finished


### Kernel Ridge Regression

In [12]:
param_grid = {"alpha": [x/100 for x in range(100)],
              "kernel": ['linear', 'poly', 'rbf', 'sigmoid', "chi2", "laplacian", "exponential"], 
              "degree": [1, 2, 3],
              "gamma": [x/10 for x in range(10)]}

In [13]:
n_hyper = np.product([x for x in map(len, param_grid.values())])

In [14]:
n_hyper

21000

In [15]:
model = KernelRidge()

In [16]:
model.fit(Xs, ys)

KernelRidge()

In [17]:
KernelRidge?

In [18]:
clf = RandomizedSearchCV(model, param_grid, cv=4, error_score=-1000, n_jobs=-1, scoring="r2", verbose=5, n_iter=n_hyper*.25,)
clf.fit(Xs, ys)
print("Best score: " + str(clf.best_score_))

Fitting 4 folds for each of 5250 candidates, totalling 21000 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.
[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:    0.0s
[Parallel(n_jobs=-1)]: Done 176 tasks      | elapsed:    0.4s
[Parallel(n_jobs=-1)]: Done 2608 tasks      | elapsed:    4.7s
[Parallel(n_jobs=-1)]: Done 6640 tasks      | elapsed:   13.4s
[Parallel(n_jobs=-1)]: Done 11824 tasks      | elapsed:   23.9s
[Parallel(n_jobs=-1)]: Done 18160 tasks      | elapsed:   33.8s


Best score: 0.861432595439045


[Parallel(n_jobs=-1)]: Done 21000 out of 21000 | elapsed:   38.0s finished


In [19]:
clf.best_score_

0.861432595439045

In [20]:
clf.best_estimator_

KernelRidge(alpha=0.02, degree=1, gamma=0.3, kernel='laplacian')

### Pruebas del modelo ganador

In [21]:
y_test


Unnamed: 0,target
438,8.4
161,50.0
242,22.2
0,24.0
314,23.8
...,...
144,11.8
336,19.5
117,19.2
339,19.0


In [22]:
X_test

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
438,13.67810,0.0,18.10,0.0,0.740,5.935,87.9,1.8206,24.0,666.0,20.2,68.95,34.02
161,1.46336,0.0,19.58,0.0,0.605,7.489,90.8,1.9709,5.0,403.0,14.7,374.43,1.73
242,0.10290,30.0,4.93,0.0,0.428,6.358,52.9,7.0355,6.0,300.0,16.6,372.75,11.22
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.0900,1.0,296.0,15.3,396.90,4.98
314,0.36920,0.0,9.90,0.0,0.544,6.567,87.3,3.6023,4.0,304.0,18.4,395.69,9.28
...,...,...,...,...,...,...,...,...,...,...,...,...,...
144,2.77974,0.0,19.58,0.0,0.871,4.903,97.8,1.3459,5.0,403.0,14.7,396.90,29.29
336,0.03427,0.0,5.19,0.0,0.515,5.869,46.3,5.2311,5.0,224.0,20.2,396.90,9.80
117,0.15098,0.0,10.01,0.0,0.547,6.021,82.6,2.7474,6.0,432.0,17.8,394.51,10.30
339,0.05497,0.0,5.19,0.0,0.515,5.985,45.4,4.8122,5.0,224.0,20.2,396.90,9.74


In [23]:
mm_x

MinMaxScaler()

In [24]:
X_test_sc=mm_x.transform(X_test)

In [25]:
X_test_sc

array([[ 1.53641446e-01,  0.00000000e+00,  6.42962963e-01, ...,
         8.08510638e-01,  1.73054617e-01,  9.15573303e-01],
       [ 1.63464848e-02,  0.00000000e+00,  6.97777778e-01, ...,
         2.23404255e-01,  9.43340562e-01, -5.41928123e-03],
       [ 1.05477146e-03,  3.00000000e-01,  1.55185185e-01, ...,
         4.25531915e-01,  9.39104342e-01,  2.65259555e-01],
       ...,
       [ 1.59519571e-03,  0.00000000e+00,  3.43333333e-01, ...,
         5.53191489e-01,  9.93973473e-01,  2.39018825e-01],
       [ 5.16033223e-04,  0.00000000e+00,  1.64814815e-01, ...,
         8.08510638e-01,  1.00000000e+00,  2.23046207e-01],
       [ 2.91478404e-03,  0.00000000e+00,  3.31481481e-01, ...,
         7.02127660e-01,  1.00000000e+00,  3.47404450e-01]])

In [26]:
kernel_model=clf.best_estimator_

In [27]:
kernel_model.predict(X_test_sc)

array([[0.11852249],
       [0.86722594],
       [0.33685507],
       [0.45891776],
       [0.36612234],
       [0.10940708],
       [0.52383987],
       [0.16164811],
       [0.10926588],
       [0.5916373 ],
       [0.62800829],
       [0.25550449],
       [0.48556013],
       [0.27329064],
       [0.11360229],
       [0.30020138],
       [0.18742497],
       [0.11912544],
       [0.37154094],
       [0.23867953],
       [0.93254879],
       [0.29896216],
       [0.46875481],
       [0.3128527 ],
       [0.21084025],
       [0.34630574],
       [0.06766382],
       [0.23859132],
       [0.42395875],
       [0.36004799],
       [0.29899795],
       [0.232486  ],
       [0.22783143],
       [0.58899804],
       [0.14465979],
       [0.2215909 ],
       [0.17558096],
       [0.37057628],
       [0.3953385 ],
       [0.40112574],
       [0.10809224],
       [0.240209  ],
       [0.48075148],
       [0.41356006],
       [0.43395266],
       [0.48567552],
       [0.13709251],
       [0.232

In [28]:
mm_y.inverse_transform(kernel_model.predict(X_test_sc))

array([[10.33351195],
       [44.02516711],
       [20.15847795],
       [25.65129917],
       [21.47550519],
       [ 9.92331878],
       [28.57279437],
       [12.27416512],
       [ 9.91696443],
       [31.62367872],
       [33.2603732 ],
       [16.49770187],
       [26.85020591],
       [17.29807893],
       [10.11210288],
       [18.50906216],
       [13.43412365],
       [10.36064502],
       [21.71934245],
       [15.74057865],
       [46.9646955 ],
       [18.4532973 ],
       [26.09396649],
       [19.0783716 ],
       [14.48781143],
       [20.58375828],
       [ 8.04487181],
       [15.73660942],
       [24.0781439 ],
       [21.20215933],
       [18.45490771],
       [15.46186984],
       [15.25241427],
       [31.50491192],
       [11.50969038],
       [14.97159034],
       [12.90114328],
       [21.67593271],
       [22.79023257],
       [23.05065816],
       [ 9.86415062],
       [15.80940482],
       [26.63381641],
       [23.61020266],
       [24.52786988],
       [26

In [29]:
r2_score(y_pred=mm_y.inverse_transform(kernel_model.predict(X_test_sc)), y_true=y_test)

0.8951387504102031

In [30]:
r2_score(y_pred=mm_y.inverse_transform(model.predict(X_test_sc)), y_true=y_test)

0.6130016057464873