In [1]:
# Load the Boston house-prices dataset bundled with scikit-learn.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2,
# so this cell needs an older scikit-learn release — confirm the pinned version.
from sklearn.datasets import load_boston
boston = load_boston()

In [2]:
# Check what kind of container load_boston() returned.
type(boston)

sklearn.utils.Bunch

In [3]:
# List the fields available on the returned container.
boston.keys()

dict_keys(['data', 'target', 'feature_names', 'DESCR', 'filename'])

In [4]:
# Check the type of the feature matrix.
type(boston['data'])

numpy.ndarray

In [5]:
# Names of the feature columns.
boston['feature_names']

array(['CRIM', 'ZN', 'INDUS', 'CHAS', 'NOX', 'RM', 'AGE', 'DIS', 'RAD',
       'TAX', 'PTRATIO', 'B', 'LSTAT'], dtype='<U7')

In [7]:
# Print the dataset's bundled description text.
print(boston['DESCR'])

.. _boston_dataset:

Boston house prices dataset
---------------------------

**Data Set Characteristics:**  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive. Median Value (attribute 14) is usually the target.

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pu

In [8]:
from sklearn.ensemble import RandomForestRegressor

In [9]:
# Random forest regressor constructed with no arguments, i.e. library defaults.
# NOTE(review): the variable is named `clf` although the estimator is a regressor.
clf = RandomForestRegressor()

In [10]:
# Fit on the complete dataset; no rows are held out at this point.
clf.fit(boston['data'], boston['target'])

RandomForestRegressor(bootstrap=True, ccp_alpha=0.0, criterion='mse',
                      max_depth=None, max_features='auto', max_leaf_nodes=None,
                      max_samples=None, min_impurity_decrease=0.0,
                      min_impurity_split=None, min_samples_leaf=1,
                      min_samples_split=2, min_weight_fraction_leaf=0.0,
                      n_estimators=100, n_jobs=None, oob_score=False,
                      random_state=None, verbose=0, warm_start=False)

In [11]:
# R^2 computed on the same rows the model was fitted on, so this is a
# training score and not an estimate of performance on unseen data.
clf.score(boston['data'], boston['target'])

0.9827823988280268

In [12]:
# IPython help lookup: show the signature and docstring of score().
clf.score?

[0;31mSignature:[0m [0mclf[0m[0;34m.[0m[0mscore[0m[0;34m([0m[0mX[0m[0;34m,[0m [0my[0m[0;34m,[0m [0msample_weight[0m[0;34m=[0m[0;32mNone[0m[0;34m)[0m[0;34m[0m[0;34m[0m[0m
[0;31mDocstring:[0m
Return the coefficient of determination R^2 of the prediction.

The coefficient R^2 is defined as (1 - u/v), where u is the residual
sum of squares ((y_true - y_pred) ** 2).sum() and v is the total
sum of squares ((y_true - y_true.mean()) ** 2).sum().
The best possible score is 1.0 and it can be negative (because the
model can be arbitrarily worse). A constant model that always
predicts the expected value of y, disregarding the input features,
would get a R^2 score of 0.0.

Parameters
----------
X : array-like of shape (n_samples, n_features)
    Test samples. For some estimators this may be a
    precomputed kernel matrix or a list of generic objects instead,
    shape = (n_samples, n_samples_fitted),
    where n_samples_fitted is the number of
    samples used in t

In [13]:
# List every attribute and method name exposed by the fitted estimator.
dir(clf)

['__abstractmethods__',
 '__class__',
 '__delattr__',
 '__dict__',
 '__dir__',
 '__doc__',
 '__eq__',
 '__format__',
 '__ge__',
 '__getattribute__',
 '__getitem__',
 '__getstate__',
 '__gt__',
 '__hash__',
 '__init__',
 '__init_subclass__',
 '__iter__',
 '__le__',
 '__len__',
 '__lt__',
 '__module__',
 '__ne__',
 '__new__',
 '__reduce__',
 '__reduce_ex__',
 '__repr__',
 '__setattr__',
 '__setstate__',
 '__sizeof__',
 '__str__',
 '__subclasshook__',
 '__weakref__',
 '_abc_impl',
 '_estimator_type',
 '_get_param_names',
 '_get_tags',
 '_make_estimator',
 '_more_tags',
 '_required_parameters',
 '_set_oob_score',
 '_validate_X_predict',
 '_validate_estimator',
 '_validate_y_class_weight',
 'apply',
 'base_estimator',
 'base_estimator_',
 'bootstrap',
 'ccp_alpha',
 'class_weight',
 'criterion',
 'decision_path',
 'estimator_params',
 'estimators_',
 'feature_importances_',
 'fit',
 'get_params',
 'max_depth',
 'max_features',
 'max_leaf_nodes',
 'max_samples',
 'min_impurity_decrease',
 'm

In [14]:
# Number of features recorded by the estimator during fit.
# NOTE(review): newer scikit-learn releases expose this as `n_features_in_`
# and drop `n_features_` — confirm against the installed version.
clf.n_features_

13

In [15]:
# Shape of the feature matrix: (rows, feature columns).
boston['data'].shape

(506, 13)

In [16]:
# Pick one sample (row index 17) to try a single prediction on.
row = boston['data'][17]

In [17]:
# View the single sample as a 2-D array with one row. Using (1, -1) lets
# NumPy infer the column count instead of hard-coding the 13 features.
row.reshape(1, -1)

array([[  0.7842,   0.    ,   8.14  ,   0.    ,   0.538 ,   5.99  ,
         81.7   ,   4.2579,   4.    , 307.    ,  21.    , 386.75  ,
         14.67  ]])

In [18]:
# Predict for one sample. The sample is reshaped to a single-row 2-D array;
# (1, -1) infers the feature count rather than hard-coding 13.
clf.predict(row.reshape(1, -1))

array([17.701])

In [19]:
# Recorded target value for the same sample (index 17), for comparison
# with the model's prediction.
boston['target'][17]

17.5

In [20]:
from sklearn.model_selection import train_test_split

In [21]:
# Hold out 30% of the rows for evaluation. A fixed random_state makes the
# shuffle deterministic, so scores computed from this split are the same
# on every Restart & Run All.
X_train, X_test, y_train, y_test = train_test_split(
    boston['data'], boston['target'], test_size=0.3, random_state=0
)

In [22]:
# Refit on the training split only and report R^2 on the held-out split.
# random_state pins the forest's internal randomness so repeated runs on
# the same split give the same score.
clf = RandomForestRegressor(random_state=0)
clf.fit(X_train, y_train)
clf.score(X_test, y_test)

0.8767331209809486

# Preprocess data

In [23]:
import pandas as pd

In [24]:
# Wrap the raw feature matrix in a DataFrame with named columns so that
# per-feature statistics are easy to read.
df = pd.DataFrame(boston['data'], columns=boston['feature_names'])

In [25]:
# Maximum of each feature column (axis=0 reduces over the rows).
df.max(axis=0)

CRIM        88.9762
ZN         100.0000
INDUS       27.7400
CHAS         1.0000
NOX          0.8710
RM           8.7800
AGE        100.0000
DIS         12.1265
RAD         24.0000
TAX        711.0000
PTRATIO     22.0000
B          396.9000
LSTAT       37.9700
dtype: float64

The features span very different ranges: the maximum of NOX is 0.871, whereas the maximum of TAX is 711. Random forest regressors can handle differences in scale like this, but other models, such as support vector machines, cannot.

In [27]:
from sklearn.svm import SVR
# Baseline: SVR with default settings on the raw (unscaled) split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = SVR().fit(X_train, y_train)
clf.score(X_test, y_test)

0.16551662319499216

In [28]:
from sklearn import preprocessing
# Standardise the feature matrix with preprocessing.scale.
# NOTE(review): the scaling is computed from every row of the dataset, before
# any train/test split, so rows later used for testing influence the scaling.
Xs = preprocessing.scale(boston['data'])

In [31]:
# Rebuild the DataFrame from the scaled matrix. This rebinds `df`, replacing
# the earlier frame built from the unscaled data.
df = pd.DataFrame(Xs, columns=boston['feature_names'])

In [32]:
# Maximum of each feature column after scaling.
df.max(axis=0)

CRIM       9.933931
ZN         3.804234
INDUS      2.422565
CHAS       3.668398
NOX        2.732346
RM         3.555044
AGE        1.117494
DIS        3.960518
RAD        1.661245
TAX        1.798194
PTRATIO    1.638828
B          0.441052
LSTAT      3.548771
dtype: float64

After scaling, the differences between the feature ranges are much smaller.

In [33]:
# Hold out 30% of the scaled rows for evaluation. A fixed random_state makes
# the shuffle deterministic, so the score computed from this split is
# reproducible across runs.
Xs_train, Xs_test, ys_train, ys_test = train_test_split(
    Xs, boston['target'], test_size=0.3, random_state=0
)

In [34]:
# Same default SVR, now trained and scored on the scaled split.
# fit() returns the estimator itself, so construction and fitting are chained.
clf = SVR().fit(Xs_train, ys_train)
clf.score(Xs_test, ys_test)

0.6093876928733071

# Dimensionality reduction through PCA

The original dataset has 13 features. We'll now reduce them to 5 components with PCA.

In [35]:
from sklearn.decomposition import PCA

In [36]:
# Fit a PCA that keeps 5 components.
# NOTE(review): this is fitted on the raw feature matrix (boston['data'], not
# the scaled Xs), so large-magnitude columns such as TAX and B likely dominate
# the components — confirm this is intended.
pca = PCA(n_components=5)
pca.fit(boston['data'])

PCA(copy=True, iterated_power='auto', n_components=5, random_state=None,
    svd_solver='auto', tol=0.0, whiten=False)

In [37]:
# Project every sample onto the fitted PCA components.
Xp = pca.transform(boston['data'])

In [38]:
# Shape of the projected data: (rows, components).
Xp.shape

(506, 5)

In [39]:
# Train a random forest on the PCA-reduced features and report R^2 on a
# held-out 30% of the rows. Both the split and the forest are seeded so the
# score is the same on every Restart & Run All.
Xp_train, Xp_test, yp_train, yp_test = train_test_split(
    Xp, boston['target'], test_size=0.3, random_state=0
)
clf = RandomForestRegressor(random_state=0)
clf.fit(Xp_train, yp_train)
clf.score(Xp_test, yp_test)

0.6316234117112971

# Preprocessing with pipelines

In [40]:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline

In [None]:
# Chain the preprocessing steps and the model into a single estimator:
# standard scaling -> PCA (5 components) -> SVR.
# NOTE(review): the pipeline is only constructed here; no fit or score call
# is visible in this cell.
pipe = Pipeline([
    ('scale', StandardScaler()),
    ('pca', PCA(n_components=5)),
    ('svr', SVR()),
])