In [21]:
import numpy as np
import pandas as pd

# Загрузим данные

In [22]:
from sklearn.datasets import load_boston

In [23]:
# Load the Boston housing dataset; the returned object's .data, .target,
# .feature_names and .DESCR attributes are used in the cells below.
# NOTE(review): load_boston was deprecated in scikit-learn 1.0 and removed in 1.2 —
# confirm the pinned scikit-learn version still provides it.
bunch = load_boston()

In [24]:
# Print the description text bundled with the dataset.
print(bunch.DESCR)

Boston House Prices dataset

Notes
------
Data Set Characteristics:  

    :Number of Instances: 506 

    :Number of Attributes: 13 numeric/categorical predictive
    
    :Median Value (attribute 14) is usually the target

    :Attribute Information (in order):
        - CRIM     per capita crime rate by town
        - ZN       proportion of residential land zoned for lots over 25,000 sq.ft.
        - INDUS    proportion of non-retail business acres per town
        - CHAS     Charles River dummy variable (= 1 if tract bounds river; 0 otherwise)
        - NOX      nitric oxides concentration (parts per 10 million)
        - RM       average number of rooms per dwelling
        - AGE      proportion of owner-occupied units built prior to 1940
        - DIS      weighted distances to five Boston employment centres
        - RAD      index of accessibility to radial highways
        - TAX      full-value property-tax rate per $10,000
        - PTRATIO  pupil-teacher ratio by town
      

In [25]:
# Features go into a DataFrame with named columns; the target stays as returned.
feature_names = bunch.feature_names.astype(str)
X = pd.DataFrame(data=bunch.data, columns=feature_names)
y = bunch.target

In [26]:
# Preview the first rows of the feature table.
X.head()

Unnamed: 0,CRIM,ZN,INDUS,CHAS,NOX,RM,AGE,DIS,RAD,TAX,PTRATIO,B,LSTAT
0,0.00632,18.0,2.31,0.0,0.538,6.575,65.2,4.09,1.0,296.0,15.3,396.9,4.98
1,0.02731,0.0,7.07,0.0,0.469,6.421,78.9,4.9671,2.0,242.0,17.8,396.9,9.14
2,0.02729,0.0,7.07,0.0,0.469,7.185,61.1,4.9671,2.0,242.0,17.8,392.83,4.03
3,0.03237,0.0,2.18,0.0,0.458,6.998,45.8,6.0622,3.0,222.0,18.7,394.63,2.94
4,0.06905,0.0,2.18,0.0,0.458,7.147,54.2,6.0622,3.0,222.0,18.7,396.9,5.33


Зафиксируем генератор случайных чисел для воспроизводимости:

In [27]:
# Single seed shared by every stochastic step in the notebook.
SEED = 22
# Call the seeding function. Assigning to it (`np.random.seed = SEED`) would
# replace the function with an int and leave the global generator unseeded.
np.random.seed(SEED)

# Домашка

Разделим данные на условно обучающую и отложенную выборки:

In [28]:
from sklearn.model_selection import train_test_split

In [29]:
# Hold out a fifth of the rows for testing; the fixed seed keeps the split repeatable.
split_options = dict(test_size=0.2, random_state=SEED)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)

In [30]:
# Shapes of the split pieces, in the order: train X, train y, test X, test y.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

((404, 13), (404,), (102, 13), (102,))

Измерять качество будем с помощью метрики среднеквадратичной ошибки:

In [31]:
from sklearn.metrics import mean_squared_error

<div class="panel panel-info" style="margin: 50px 0 0 0">
    <div class="panel-heading">
        <h3 class="panel-title">Задача 1.</h3> 
    </div>
    <div class="panel">
        Обучите <b>LinearRegression</b> из пакета <b>sklearn.linear_model</b> на обучающей выборке (<i>X_train, y_train</i>) и измерьте качество на <i>X_test</i>.
        <br>
        <br>
        <i>P.S. Ошибка (MSE) должна быть в районе 20.</i>
    </div>
</div>

In [32]:
from sklearn.linear_model import LinearRegression

# Fit ordinary least squares on the training split (fit returns the estimator),
# then show the mean squared error on the held-out split.
clf = LinearRegression().fit(X_train, y_train)
mean_squared_error(y_test, clf.predict(X_test))

20.765767538051904

<div class="panel panel-info" style="margin: 50px 0 0 0">
    <div class="panel-heading">
        <h3 class="panel-title">Задача 2. (с подвохом)</h3> 
    </div>
    <div class="panel">
        Обучите <b>SGDRegressor</b> из пакета <b>sklearn.linear_model</b> на обучающей выборке (<i>X_train, y_train</i>) и измерьте качество на <i>X_test</i>.
    </div>
</div>

In [33]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import SGDRegressor

# Standardise the features before SGD (the "catch" in this task), keeping the
# scaler inside the pipeline so it is fitted on the training split only.
# random_state pins SGD's sample shuffling so the score is the same on every run.
clf = make_pipeline(StandardScaler(), SGDRegressor(random_state=SEED))
clf.fit(X_train, y_train)
mean_squared_error(y_test, clf.predict(X_test))

21.617625911636122

<div class="panel panel-info" style="margin: 50px 0 0 0">
    <div class="panel-heading">
        <h3 class="panel-title">Задача 3.</h3>
    </div>
    <div class="panel">
        Попробуйте все остальные классы:
        <ul>
            <li>Ridge
            <li>Lasso
            <li>ElasticNet
        </ul>

        <br>

        В них, как вам уже известно, используются параметры регуляризации <b>alpha</b>. Настройте его как с помощью <b>GridSearchCV</b>, так и с помощью готовых <b>-CV</b> классов (<b>RidgeCV</b>, <b>LassoCV</b> и т.д.).
        
        <br><br>

        Найдите уже, в конце концов, самую точную линейную модель!

    </div>
</div>


In [34]:
# GridSearchCV lives in sklearn.model_selection; the old sklearn.grid_search
# module no longer exists in current scikit-learn releases.
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LassoCV, RidgeCV, ElasticNet

# One regularisation grid shared by all three models.
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

elastic_params = {
    'alpha': alphas,
    'l1_ratio': np.linspace(0, 1, num=21)
}

# Each candidate picks its own hyper-parameters by internal cross-validation on
# the training split; the printed number is the MSE on the held-out split.
models = [
    ('Lasso', LassoCV(alphas=alphas)),
    ('Ridge', RidgeCV(alphas=alphas)),
    ('ElasticNet', GridSearchCV(ElasticNet(), elastic_params, cv=5)),
]

for name, clf in models:
    clf.fit(X_train, y_train)
    print(name, mean_squared_error(y_test, clf.predict(X_test)))

Lasso 20.763582305
Ridge 20.7678522486
ElasticNet 20.7722160404


<div class="panel panel-info" style="margin: 50px 0 0 0">
    <div class="panel-heading">
        <h3 class="panel-title">Задача 4.</h3>
    </div>
    <div class="panel">
        Проверять качество правильно на кросс-валидации, как известно. Вы знаете, что делать: подключаем <b>cross_val_score</b> из <b>sklearn.model_selection</b>. Параметр <b>cv</b> установите равным 5.
        <br><br>
        Вспомните про все штуки, которым мы с вами научились.
        <br><br>
        Добейтесь <b>MSE &lt; 27</b>.
    </div>
</div>

In [37]:
from sklearn.preprocessing import PolynomialFeatures
from sklearn.model_selection import cross_val_score

In [38]:
# One regularisation grid shared by the tuned models.
alphas = [0.0001, 0.001, 0.01, 0.1, 1, 10, 100]

elastic_params = {
    'alpha': alphas,
    'l1_ratio': np.linspace(0, 1, num=21)
}

candidates = [
    ('LinearRegression', LinearRegression()),
    # random_state pins SGD's sample shuffling so the CV scores are repeatable.
    ('SGD', SGDRegressor(random_state=SEED)),
    ('Lasso', LassoCV(alphas=alphas)),
    ('Ridge', RidgeCV(alphas=alphas)),
    ('ElasticNet', GridSearchCV(ElasticNet(), elastic_params, cv=5)),
]

for name, clf in candidates:
    # Degree-2 polynomial expansion -> standardisation -> estimator, all inside one
    # pipeline so the preprocessing is re-fitted on each fold's training part.
    mdl = make_pipeline(PolynomialFeatures(degree=2), StandardScaler(), clf)
    scores = cross_val_score(mdl, X, y, cv=5, scoring='neg_mean_squared_error')
    # The scorer returns negated MSE, so flip the sign of the mean.
    mn = -np.mean(scores)
    sd = np.std(scores)
    print(name, mn, '+/-', sd * 2)

LinearRegression 133.654063858 +/- 214.691151233
SGD 35.4982777262 +/- 34.6083185653
Lasso 28.6445833293 +/- 37.2853504943
Ridge 64.9836455214 +/- 99.9626384177
ElasticNet 26.0991348332 +/- 36.2554001897
