In [1]:
import os
import numpy as np
import matplotlib as mpl
import matplotlib.pyplot as plt
import pandas as pd

In [2]:
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet
from sklearn.svm import SVR
from sklearn.neighbors import KNeighborsRegressor
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, BaggingRegressor, VotingRegressor, StackingRegressor
from sklearn.preprocessing import PolynomialFeatures, StandardScaler
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.metrics import mean_squared_error
from sklearn.decomposition import PCA, FactorAnalysis, FastICA
from scipy.stats import randint

In [17]:
from sklearn.datasets import fetch_california_housing
from sklearn.model_selection import train_test_split

X,y = fetch_california_housing(return_X_y=True)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=43)

In [18]:
X_train.shape

(14448, 8)

In [57]:
p2 = PolynomialFeatures(degree=2)
X_train2 = p2.fit_transform(X_train)
X_train2.shape

(14448, 9)

In [20]:
p3 = PolynomialFeatures(degree=3)
X_train3 = p3.fit_transform(X_train)
X_train3.shape

(14448, 165)

In [21]:
reg = Pipeline([
            ("scaler", StandardScaler()),
            ("reg", LinearRegression())
        ])
print(np.mean(cross_val_score(reg,X_train,y_train,cv=5,scoring='neg_mean_squared_error')))

-0.6659087212206725


In [22]:
p_grid = [{'reg__alpha':[0.0,0.01,0.1,1.0,10.0,100.0]}]
reg = Pipeline([
            ("scaler", StandardScaler()),
            ("reg", Ridge())
        ])
gs = GridSearchCV(reg,p_grid,cv=5,return_train_score=True,scoring='neg_mean_squared_error')
gs.fit(X_train,y_train)
print(gs.best_params_)
print(gs.best_score_)
reg_ridge = gs.best_estimator_


{'reg__alpha': 0.0}
-0.6659087212206728


In [31]:
reg_ridge_bag = BaggingRegressor(reg_ridge,n_estimators=100,random_state=42)#.fit(X_train,y_train)
print(np.mean(cross_val_score(reg_ridge_bag,X_train,y_train,cv=5,scoring='neg_mean_squared_error')))

-1.4133534875981273


In [18]:
np.linspace(-5,5,10)

array([-5.        , -3.88888889, -2.77777778, -1.66666667, -0.55555556,
        0.55555556,  1.66666667,  2.77777778,  3.88888889,  5.        ])

In [24]:
list(np.exp(np.arange(-5,5,0.01)))

[0.006737946999085467,
 0.006805664492230543,
 0.006874062557496249,
 0.0069431480347461084,
 0.007012927832585418,
 0.007083408929052112,
 0.007154598372314572,
 0.0072265032813764495,
 0.00729913084678857,
 0.007372488331367998,
 0.007446583070924324,
 0.007521422474993249,
 0.007597014027577547,
 0.0076733652878954684,
 0.007750483891136671,
 0.007828377549225746,
 0.007907054051593414,
 0.007986521265955473,
 0.008066787139099587,
 0.008147859697679952,
 0.008229747049019994,
 0.008312457381923082,
 0.008395998967491432,
 0.008480380159953223,
 0.008565609397498015,
 0.008651695203120587,
 0.008738646185473244,
 0.008826471039726676,
 0.008915178548439498,
 0.009004777582436504,
 0.009095277101695758,
 0.009186686156244607,
 0.009279013887064678,
 0.00937226952700599,
 0.009466462401710255,
 0.009561601930543437,
 0.0096576976275377,
 0.009754759102342825,
 0.009852796061187177,
 0.00995181830784834,
 0.010051835744633496,
 0.010152858373369673,
 0.01025489629640393,
 0.01035795971

In [25]:
p_rand = {'reg__alpha':list(np.exp(np.arange(-5,5,0.01)))}
rs = RandomizedSearchCV(reg,p_rand,n_iter=50,cv=5,verbose=10,return_train_score=True,scoring='neg_mean_squared_error')
rs.fit(X_train,y_train)
print(rs.best_params_)
print(rs.best_score_)

Fitting 5 folds for each of 50 candidates, totalling 250 fits
[CV 1/5; 1/50] START reg__alpha=4.572225195141524...............................
[CV 1/5; 1/50] END reg__alpha=4.572225195141524;, score=(train=-0.518, test=-0.520) total time=   0.0s
[CV 2/5; 1/50] START reg__alpha=4.572225195141524...............................
[CV 2/5; 1/50] END reg__alpha=4.572225195141524;, score=(train=-0.522, test=-0.502) total time=   0.0s
[CV 3/5; 1/50] START reg__alpha=4.572225195141524...............................
[CV 3/5; 1/50] END reg__alpha=4.572225195141524;, score=(train=-0.517, test=-0.521) total time=   0.0s
[CV 4/5; 1/50] START reg__alpha=4.572225195141524...............................
[CV 4/5; 1/50] END reg__alpha=4.572225195141524;, score=(train=-0.521, test=-0.508) total time=   0.0s
[CV 5/5; 1/50] START reg__alpha=4.572225195141524...............................
[CV 5/5; 1/50] END reg__alpha=4.572225195141524;, score=(train=-0.511, test=-0.545) total time=   0.0s
[CV 1/5; 2/50] STA

In [23]:
p_grid = [{'reg__alpha':[0.01,0.1,1.0,10.0,100.0]}]
reg = Pipeline([
            ("scaler", StandardScaler()),
            ("reg", Lasso())
        ])
gs = GridSearchCV(reg,p_grid,cv=5,return_train_score=True,scoring='neg_mean_squared_error')
gs.fit(X_train,y_train)
print(gs.best_params_)
print(gs.best_score_)
reg_lasso = gs.best_estimator_


{'reg__alpha': 0.01}
-0.6480258355490132


In [27]:

p_rand = {'reg__alpha':list(np.exp(np.arange(-5,5,0.01)))}
rs = RandomizedSearchCV(reg,p_rand,n_iter=50,cv=5,verbose=0,return_train_score=True,scoring='neg_mean_squared_error')
rs.fit(X_train,y_train)
print(rs.best_params_)
print(rs.best_score_)

{'reg__alpha': 0.008565609397498015}
-0.522895546462318


In [24]:
p_grid = [{'reg__alpha':[0.01,0.1,1.0,10.0,100.0],'reg__l1_ratio':[0.1,0.25,0.5,0.75,0.9]}]
reg = Pipeline([
            ("scaler", StandardScaler()),
            ("reg", ElasticNet())
        ])
gs = GridSearchCV(reg,p_grid,cv=5,return_train_score=True,scoring='neg_mean_squared_error')
gs.fit(X_train,y_train)
print(gs.best_params_)
print(gs.best_score_)
reg_elastic = gs.best_estimator_


{'reg__alpha': 0.01, 'reg__l1_ratio': 0.9}
-0.6497517910760411


In [None]:

p_rand = {'reg__alpha':list(np.exp(np.arange(-5,5,0.01))),'reg__l1_ratio':list(np.arange(0.01,0.99,0.01))}
rs = RandomizedSearchCV(reg,p_rand,n_iter=50,cv=5,verbose=0,return_train_score=True,scoring='neg_mean_squared_error')
rs.fit(X_train,y_train)
print(rs.best_params_)
print(rs.best_score_)

In [25]:
reg = Pipeline([
            ("scaler", StandardScaler()),
            ("reg", SVR())
        ])

p_rand = {'reg__C':list(np.exp(np.arange(-5,5,0.01))),'reg__gamma':list(np.exp(np.arange(-5,5,0.01)))}
rs = RandomizedSearchCV(reg,p_rand,n_iter=10,cv=5,verbose=10,return_train_score=True,scoring='neg_mean_squared_error')
rs.fit(X_train[:2000,:],y_train[:2000])
print(rs.best_params_)
print(rs.best_score_)
reg_svr = rs.best_estimator_

Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV 1/5; 1/10] START reg__C=0.904837418035865, reg__gamma=0.018499714119818846..
[CV 1/5; 1/10] END reg__C=0.904837418035865, reg__gamma=0.018499714119818846;, score=(train=-0.464, test=-0.454) total time=   0.2s
[CV 2/5; 1/10] START reg__C=0.904837418035865, reg__gamma=0.018499714119818846..
[CV 2/5; 1/10] END reg__C=0.904837418035865, reg__gamma=0.018499714119818846;, score=(train=-0.439, test=-0.536) total time=   0.1s
[CV 3/5; 1/10] START reg__C=0.904837418035865, reg__gamma=0.018499714119818846..
[CV 3/5; 1/10] END reg__C=0.904837418035865, reg__gamma=0.018499714119818846;, score=(train=-0.474, test=-0.333) total time=   0.1s
[CV 4/5; 1/10] START reg__C=0.904837418035865, reg__gamma=0.018499714119818846..
[CV 4/5; 1/10] END reg__C=0.904837418035865, reg__gamma=0.018499714119818846;, score=(train=-0.397, test=-0.445) total time=   0.1s
[CV 5/5; 1/10] START reg__C=0.904837418035865, reg__gamma=0.018499714119818846..
[CV 5/

In [35]:
reg_svr_bag = BaggingRegressor(reg_svr,n_estimators=10,random_state=42)
print(np.mean(cross_val_score(reg_svr_bag,X_train[:2000,:],y_train[:2000],cv=5,scoring='neg_mean_squared_error')))

-0.4282204494089796


In [26]:
p_grid = [
    {'reg__n_neighbors':[1,3,5,10],
    'reg__weights':["uniform","distance"]}
]

reg = Pipeline([
            ("scaler", StandardScaler()),
            ("reg", KNeighborsRegressor())
])

gs_knn = GridSearchCV(reg,p_grid,cv=5,return_train_score=True,scoring='neg_mean_squared_error',verbose=10)
gs_knn.fit(X_train,y_train)
print(gs_knn.best_params_)
print(gs_knn.best_score_)
reg_knn = gs_knn.best_estimator_


Fitting 5 folds for each of 8 candidates, totalling 40 fits
[CV 1/5; 1/8] START reg__n_neighbors=1, reg__weights=uniform....................
[CV 1/5; 1/8] END reg__n_neighbors=1, reg__weights=uniform;, score=(train=-0.000, test=-0.727) total time=   0.1s
[CV 2/5; 1/8] START reg__n_neighbors=1, reg__weights=uniform....................
[CV 2/5; 1/8] END reg__n_neighbors=1, reg__weights=uniform;, score=(train=-0.000, test=-0.576) total time=   0.1s
[CV 3/5; 1/8] START reg__n_neighbors=1, reg__weights=uniform....................
[CV 3/5; 1/8] END reg__n_neighbors=1, reg__weights=uniform;, score=(train=-0.000, test=-0.681) total time=   0.1s
[CV 4/5; 1/8] START reg__n_neighbors=1, reg__weights=uniform....................
[CV 4/5; 1/8] END reg__n_neighbors=1, reg__weights=uniform;, score=(train=-0.000, test=-0.660) total time=   0.1s
[CV 5/5; 1/8] START reg__n_neighbors=1, reg__weights=uniform....................
[CV 5/5; 1/8] END reg__n_neighbors=1, reg__weights=uniform;, score=(train=-0.00

In [32]:
reg_knn_bag = BaggingRegressor(reg_knn,n_estimators=10,random_state=42)#.fit(X_train,y_train)
print(np.mean(cross_val_score(reg_knn_bag,X_train,y_train,cv=5,scoring='neg_mean_squared_error')))

-0.4066036873842034


In [36]:
p_grid = [{
    'reg__n_estimators':[100],
    'reg__max_depth':[None],
    'reg__min_samples_leaf':[1,],
    'reg__max_features':['auto']
    }]
reg = Pipeline([
            ("scaler", StandardScaler()),
            ("reg", RandomForestRegressor(random_state=42))
        ])
gs_rf = GridSearchCV(reg,p_grid,cv=5,return_train_score=True,scoring='neg_mean_squared_error',verbose=10)
gs_rf.fit(X_train,y_train)
print(gs_rf.best_params_)
print(gs_rf.best_score_)

Fitting 5 folds for each of 1 candidates, totalling 5 fits
[CV 1/5; 1/1] START reg__max_depth=None, reg__max_features=auto, reg__min_samples_leaf=1, reg__n_estimators=100
[CV 1/5; 1/1] END reg__max_depth=None, reg__max_features=auto, reg__min_samples_leaf=1, reg__n_estimators=100;, score=(train=-0.039, test=-0.286) total time=   8.7s
[CV 2/5; 1/1] START reg__max_depth=None, reg__max_features=auto, reg__min_samples_leaf=1, reg__n_estimators=100
[CV 2/5; 1/1] END reg__max_depth=None, reg__max_features=auto, reg__min_samples_leaf=1, reg__n_estimators=100;, score=(train=-0.040, test=-0.267) total time=   9.3s
[CV 3/5; 1/1] START reg__max_depth=None, reg__max_features=auto, reg__min_samples_leaf=1, reg__n_estimators=100
[CV 3/5; 1/1] END reg__max_depth=None, reg__max_features=auto, reg__min_samples_leaf=1, reg__n_estimators=100;, score=(train=-0.039, test=-0.288) total time=   7.4s
[CV 4/5; 1/1] START reg__max_depth=None, reg__max_features=auto, reg__min_samples_leaf=1, reg__n_estimators=10

In [38]:
reg_rf = gs.best_estimator_

In [42]:
reg_voting = VotingRegressor(
    [('knn',reg_knn),('rf',reg_rf)])

print(np.mean(cross_val_score(reg_voting,X_train,y_train,cv=5,scoring='neg_mean_squared_error')))

-0.4616942287074991


In [49]:
reg_voting = StackingRegressor(
    [('reg1',reg_svr),('reg2',reg_knn),('reg3',reg_rf)])

print(np.mean(cross_val_score(reg_voting,X_train[:2000,:],y_train[:2000],cv=5,scoring='neg_mean_squared_error')))

-0.3970254874332899


## K-Neighbors

In [55]:
p_grid = [
    {'poly__degree':[1],
    'pca__n_components':[2,5,8],
    'reg__n_neighbors':[5],
    'reg__weights':["uniform","distance"]},
    {'poly__degree':[2],
    'pca__n_components':np.floor(np.linspace(2,45,4)).astype(int),
    'reg__n_neighbors':[5],
    'reg__weights':["uniform","distance"]},
    {'poly__degree':[3],
    'pca__n_components':np.floor(np.linspace(2,195,4)).astype(int),
    'reg__n_neighbors':[5],
    'reg__weights':["uniform","distance"]}
]

reg = Pipeline([
            ("poly",PolynomialFeatures()),
            ("scaler", StandardScaler()),
            ("pca",FactorAnalysis()),
            ("reg", KNeighborsRegressor())
])

gs_knn = GridSearchCV(reg,p_grid,cv=5,return_train_score=True,scoring='neg_mean_squared_error',verbose=10)
gs_knn.fit(X_train[:2000,:],y_train[:2000])
print(gs_knn.best_params_)
print(gs_knn.best_score_)
reg_knn = gs_knn.best_estimator_


Fitting 5 folds for each of 22 candidates, totalling 110 fits
[CV 1/5; 1/22] START pca__n_components=2, poly__degree=1, reg__n_neighbors=5, reg__weights=uniform
[CV 1/5; 1/22] END pca__n_components=2, poly__degree=1, reg__n_neighbors=5, reg__weights=uniform;, score=(train=-0.655, test=-1.045) total time=   0.5s
[CV 2/5; 1/22] START pca__n_components=2, poly__degree=1, reg__n_neighbors=5, reg__weights=uniform
[CV 2/5; 1/22] END pca__n_components=2, poly__degree=1, reg__n_neighbors=5, reg__weights=uniform;, score=(train=-0.655, test=-1.015) total time=   0.6s
[CV 3/5; 1/22] START pca__n_components=2, poly__degree=1, reg__n_neighbors=5, reg__weights=uniform
[CV 3/5; 1/22] END pca__n_components=2, poly__degree=1, reg__n_neighbors=5, reg__weights=uniform;, score=(train=-0.696, test=-0.950) total time=   0.5s
[CV 4/5; 1/22] START pca__n_components=2, poly__degree=1, reg__n_neighbors=5, reg__weights=uniform
[CV 4/5; 1/22] END pca__n_components=2, poly__degree=1, reg__n_neighbors=5, reg__weigh

In [1]:
results = gs_knn.cv_results_
for mean_train, mean_score, params in zip(results["mean_train_score"], results["mean_test_score"], results["params"]):
    print(mean_train, mean_score, params)

NameError: name 'gs_knn' is not defined