In [1]:
from sklearn.tree import DecisionTreeRegressor
from sklearn.linear_model import LinearRegression
from sklearn.datasets import make_regression
from sklearn.datasets import make_friedman1
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
import numpy as np

make_regression - генерируется случайная линейная зависимость


make_friedman1 - y(X) = 10 * sin(pi * X[:, 0] * X[:, 1]) + 20 * (X[:, 2] - 0.5) ** 2 + 10 * X[:, 3] + 5 * X[:, 4] + noise * N(0, 1) - нелинейная зависимость.

## Линейная зависимость

In [2]:
X_data, y_data = make_regression(n_samples=1000, noise=100, n_features=10)

In [3]:
np.mean(cross_val_score(DecisionTreeRegressor(max_depth=1), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-37860.71115053635

In [4]:
np.mean(cross_val_score(DecisionTreeRegressor(max_depth=5), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-29487.335360341509

In [5]:
np.mean(cross_val_score(DecisionTreeRegressor(max_depth=10), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-33215.415857271451

In [6]:
np.mean(cross_val_score(DecisionTreeRegressor(min_samples_leaf=2), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-32647.960677010735

In [7]:
np.mean(cross_val_score(DecisionTreeRegressor(min_samples_leaf=10), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-25656.711787815882

In [8]:
np.mean(cross_val_score(DecisionTreeRegressor(min_samples_leaf=20), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-26104.032506451251

In [9]:
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid ={
        'criterion': ['mse', 'mae'],
        #'max_depth': range(1, 21),
        'min_samples_leaf': range(1, 21)
    },
    scoring='neg_mean_squared_error'
)
gs.fit(X_data, y_data)

print gs.best_params_
print gs.best_score_

{'criterion': 'mae', 'min_samples_leaf': 14}
-25564.9287986


In [10]:
np.mean(cross_val_score(LinearRegression(), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-10014.530948277012

## Нелинейная зависимость

In [11]:
X_data, y_data = make_friedman1(n_samples=1000, noise=10, n_features=10)

In [12]:
np.mean(cross_val_score(DecisionTreeRegressor(max_depth=1), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-117.04933343320286

In [13]:
np.mean(cross_val_score(DecisionTreeRegressor(max_depth=5), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-130.94261887396394

In [14]:
np.mean(cross_val_score(DecisionTreeRegressor(max_depth=10), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-185.96876517522998

In [15]:
np.mean(cross_val_score(DecisionTreeRegressor(min_samples_leaf=2), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-205.41672446722677

In [16]:
np.mean(cross_val_score(DecisionTreeRegressor(min_samples_leaf=10), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-139.58380793814672

In [17]:
np.mean(cross_val_score(DecisionTreeRegressor(min_samples_leaf=20), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-125.33119604250444

In [18]:
gs = GridSearchCV(
    DecisionTreeRegressor(random_state=42),
    param_grid ={
        'criterion': ['mse', 'mae'],
        'max_depth': range(1, 21),
        #'min_samples_leaf': range(1, 21)
    },
    scoring='neg_mean_squared_error'
)
gs.fit(X_data, y_data)

print gs.best_params_
print gs.best_score_

{'criterion': 'mse', 'max_depth': 1}
-117.829746106


In [19]:
np.mean(cross_val_score(LinearRegression(), X_data, y_data, cv=5, scoring='neg_mean_squared_error'))

-109.61269108624897

## Оценка времени работы

In [20]:
X_data, y_data = make_regression(n_samples=100000, noise=1000, n_features=30, random_state=42)

In [21]:
%%time
DecisionTreeRegressor(max_depth=1).fit(X_data, y_data)

Wall time: 424 ms


DecisionTreeRegressor(criterion='mse', max_depth=1, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [22]:
%%time
DecisionTreeRegressor(max_depth=2).fit(X_data, y_data)

Wall time: 744 ms


DecisionTreeRegressor(criterion='mse', max_depth=2, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [23]:
%%time
DecisionTreeRegressor(max_depth=4).fit(X_data, y_data)

Wall time: 1.4 s


DecisionTreeRegressor(criterion='mse', max_depth=4, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [24]:
%%time
DecisionTreeRegressor(max_depth=10).fit(X_data, y_data)

Wall time: 3.06 s


DecisionTreeRegressor(criterion='mse', max_depth=10, max_features=None,
           max_leaf_nodes=None, min_impurity_split=1e-07,
           min_samples_leaf=1, min_samples_split=2,
           min_weight_fraction_leaf=0.0, presort=False, random_state=None,
           splitter='best')

In [25]:
%%time
LinearRegression().fit(X_data, y_data)

Wall time: 99 ms


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)