### Exercise 146: Using the cross_val_score Function to Get Accurate Results on the Datasett

In [1]:
# import all the key libraries
import pandas as pd
import numpy as np
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

In [2]:
# load data
housing_df = pd.read_csv('HousingData.csv')
housing_df.head()

Unnamed: 0.1,Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,LSTAT,MEDV
0,0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1,296,15.3,4.98,24.0
1,1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2,242,17.8,9.14,21.6
2,2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2,242,17.8,4.03,34.7
3,3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3,222,18.7,2.94,33.4
4,4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3,222,18.7,,36.2


In [3]:
# drop null values
housing_df = housing_df.dropna()

In [4]:
# declare X and Y, X - predictor columns and Y the target column
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1]

In [5]:
# Create the regressor: reg
reg = LinearRegression()

In [6]:
from sklearn.model_selection import cross_val_score

In [7]:
def regression_model_cv(model, k=5):
    scores = cross_val_score(model, X, y, scoring='neg_mean_squared_error', cv=k)
    rmse = np.sqrt(-scores)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean ())

In [8]:
regression_model_cv(LinearRegression())

Reg rmse: [3.54810173 4.50937757 5.82330106 8.05737628 5.2191234 ]
Reg mean: 5.4314560081485475


In [9]:
regression_model_cv(LinearRegression(), k=3)

Reg rmse: [ 4.40674889  6.15899    19.11540116]
Reg mean: 9.893713348149246


In [10]:
regression_model_cv(LinearRegression(), k=6)

Reg rmse: [ 3.2569782   4.0645006   5.69790223  4.05568051 10.69290989  4.23497297]
Reg mean: 5.333824068010536


In [11]:
from sklearn.linear_model import Ridge
regression_model_cv(Ridge())

Reg rmse: [3.52479283 4.72296032 5.54622438 8.00759231 5.26861171]
Reg mean: 5.414036309884283


In [12]:
from sklearn.linear_model import Lasso
regression_model_cv(Lasso())

Reg rmse: [4.712548   5.83933857 8.02996117 7.89925202 4.38674414]
Reg mean: 6.173568778640691


### Exercise 147: Using K-Nearest Neighbors to Find the Median Value of the Dataset

In [13]:
from sklearn.neighbors import KNeighborsRegressor

In [14]:
regression_model_cv(KNeighborsRegressor())

Reg rmse: [ 9.45732105  7.9848147  10.9163503  11.09935544  6.57277173]
Reg mean: 9.206122643945283


In [15]:
regression_model_cv(KNeighborsRegressor(), 4)

Reg rmse: [ 5.86453026 10.665062   10.3490104   6.59496159]
Reg mean: 8.368391062917976


In [16]:
regression_model_cv(KNeighborsRegressor(), 7)

Reg rmse: [ 6.38094312  7.59697678 10.79359598 10.44880649 10.95244461  9.15753593
  4.82622967]
Reg mean: 8.593790368881992


In [17]:
regression_model_cv(KNeighborsRegressor(), 10)

Reg rmse: [ 5.10895488  6.73367953  7.22744422 12.18521358  9.6899144  11.45865254
  9.86220811 11.16437466  5.93686354  5.00012102]
Reg mean: 8.436742648104477


In [18]:
regression_model_cv(KNeighborsRegressor(), 3)

Reg rmse: [ 8.76663924 10.28732474 10.09593297]
Reg mean: 9.716632320260985


### Exercise 148: K-Nearest Neighbors with GridSearchCV to Find the Optimal Number of Neighbors

In [19]:
from sklearn.model_selection import GridSearchCV

In [20]:
neighbors = np.linspace(1, 20, 20)

In [21]:
k = neighbors.astype(int)

In [22]:
print("k: ", k)

k:  [ 1  2  3  4  5  6  7  8  9 10 11 12 13 14 15 16 17 18 19 20]


In [23]:
param_grid = {'n_neighbors': k}

In [24]:
knn = KNeighborsRegressor()

In [25]:
knn_tuned = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')

In [26]:
knn_tuned.fit(X, y)

In [27]:
k = knn_tuned.best_params_
print("Best n_neighbors: {}".format(k))
score = knn_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'n_neighbors': 17}
Best score: 8.90228252410123


### Exercise 149: Decision Trees and Random Forests

In [28]:
from sklearn import tree

In [29]:
regression_model_cv(tree.DecisionTreeRegressor())

Reg rmse: [3.78877422 5.87886151 6.51553159 6.78173272 9.01010117]
Reg mean: 6.395000243546111


In [30]:
from sklearn.ensemble import RandomForestRegressor

In [31]:
regression_model_cv(RandomForestRegressor())

Reg rmse: [3.44185845 3.56641302 4.96401892 6.57734796 5.4377748 ]
Reg mean: 4.7974826284025465


In [32]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=100))

Reg rmse: [3.52478667 3.5865979  4.94146038 6.60035729 6.62210362]
Reg mean: 5.055061171969096


In [33]:
from sklearn.model_selection import RandomizedSearchCV

In [34]:
param_grid = {'max_depth': [None, 10, 30, 50, 70, 100, 200, 400],
             'min_samples_split': [2, 3, 4, 5],
             'min_samples_leaf': [1, 2, 3],
             'max_features': ['auto', 'sqrt']}

In [35]:
reg = RandomForestRegressor(n_jobs = -1)

In [36]:
reg_tuned = RandomizedSearchCV(reg, param_grid, cv=5,  
  scoring='neg_mean_squared_error')

In [37]:
reg_tuned.fit(X, y)

20 fits failed out of a total of 50.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
20 fits failed with the following error:
Traceback (most recent call last):
  File "C:\Users\lorra\anaconda3\envs\pwkb01\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "C:\Users\lorra\anaconda3\envs\pwkb01\Lib\site-packages\sklearn\base.py", line 1466, in wrapper
    estimator._validate_params()
  File "C:\Users\lorra\anaconda3\envs\pwkb01\Lib\site-packages\sklearn\base.py", line 666, in _validate_params
    validate_parameter_constraints(
  File "C:\Users\lorra\anaconda3\envs\pwkb01\Lib\site-packages\sklearn\utils\_param_validation.py", line 95, in validate_

In [38]:
p = reg_tuned.best_params_
print("Best n_neighbors: {}".format(p))
score = reg_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'min_samples_split': 4, 'min_samples_leaf': 1, 'max_features': 'sqrt', 'max_depth': 50}
Best score: 4.6649090206085955


In [39]:
# Setup the hyperparameter grid
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=500))

Reg rmse: [3.41910852 3.58894279 4.86090382 6.52718889 6.63596963]
Reg mean: 5.006422731836785


In [40]:
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1]

In [41]:
from sklearn.ensemble import AdaBoostRegressor
regression_model_cv(AdaBoostRegressor())

Reg rmse: [3.63718631 3.43323877 5.99243295 6.25216942 4.47475404]
Reg mean: 4.757956297241335
