**OTUS - L1**  

Refactor example notebook on Linear Regression  
'House Price Dataset', ML_recap_30_11-163989-36aa70.ipynb  

Main goals: implement linear regression pipelines, stacking,  
feature selection, hyperparameter tuning.  

Dataset description and EDA part are omitted for brevity.

In [None]:
# Import libraries
# Standard library
import pickle
import warnings

# Third-party
import matplotlib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
import sklearn
import wget
from matplotlib import cm
from scipy.stats import norm
from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.ensemble import StackingRegressor
from sklearn.feature_selection import SelectKBest, f_classif, f_regression
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer, OneHotEncoder, StandardScaler
from yellowbrick.regressor import ResidualsPlot

from IPython.core.display import display, HTML
display(HTML("<style>.container { width:80% !important; }</style>"))

sklearn.set_config(display='diagram')

sns.set_style('darkgrid')
plt.rcParams["figure.figsize"] = [12, 8]
%matplotlib inline

warnings.filterwarnings("ignore")

Import dataset

In [None]:
# Data source: direct-download link to the dataset CSV
url = 'https://docs.google.com/uc?export=download&id=1k21iUIrz0NjfiLE_j-oBQm1bNu3wASX6'

# Read the CSV straight from the URL; the first column becomes the row index
data = pd.read_csv(url, index_col=0)

# Preview the first rows
data.head()

Save dataset

In [None]:
# Keep a local pickled copy of the freshly downloaded frame
with open("house_prices_df.pkl", "wb") as dataset_file:
    pickle.dump(data, dataset_file)

Select numerical and categorical columns

In [None]:
# Numerical columns; the last one is assumed to be the target and is
# therefore left out of the feature list
numerical_columns = list(
    data.select_dtypes(include=["int", "float"]).columns)
numerical_features = numerical_columns[:-1]

# Everything that is neither int nor float is treated as categorical
categorical_features = list(
    data.select_dtypes(exclude=["int", "float"]).columns)

# Feature order: categorical first, then numerical
all_features = categorical_features + numerical_features

Transform target variable, remove outliers


In [None]:
# Take log of target variable and
# Set kill_outliers=True to get rid of outliers > 3 sigma (optional)

def transform_data(df, kill_outliers=False, n_sigma=3):
    """Log-transform the target and optionally drop target outliers.

    The target is taken to be the last column of ``df``.

    Parameters
    ----------
    df : pandas.DataFrame
        Input data whose last column is the (strictly positive) target.
    kill_outliers : bool, default False
        When True, rows whose log-target lies more than ``n_sigma``
        population standard deviations from the mean are removed and both
        returned objects get a fresh 0..n-1 index.
    n_sigma : float, default 3
        Half-width of the accepted band, in standard deviations.

    Returns
    -------
    (pandas.DataFrame, pandas.Series)
        The (possibly filtered) frame and the log-transformed target.
        With ``kill_outliers=False`` the frame is returned as passed in.
    """
    target = np.log(df.iloc[:, -1])
    if kill_outliers:
        center = target.mean()
        spread = target.std(ddof=0)  # population std, same as np.std
        lower = center - n_sigma * spread
        upper = center + n_sigma * spread
        # Positional boolean mask instead of label-based drop/.loc: labels
        # would remove or duplicate unrelated rows when the index is not unique.
        is_outlier = ((target < lower) | (target > upper)).to_numpy()
        df = df.loc[~is_outlier].reset_index(drop=True)
        target = target.loc[~is_outlier].reset_index(drop=True)
    return df, target

# Outlier removal stays disabled here (kill_outliers defaults to False)
data, target = transform_data(data)

**Analyze target distribution**

Check target distribution before and after log transformation with boxplot

In [None]:
fig, axes = plt.subplots(nrows=1,
                         ncols=2, figsize=(14, 4))
fig.suptitle('Target variable distribution before and after log transformation')

# Pass the series as x= explicitly: a positional argument is read as `x` by
# older seaborn (with a FutureWarning) and as `data` by newer releases, which
# changes the orientation of the box.
axes[0].set_title('Before')
sns.boxplot(x=data['SalePrice'], ax=axes[0])

axes[1].set_title('After')
sns.boxplot(x=target, color='green', ax=axes[1])

plt.show()

Check if target distribution is normal

In [None]:
fig, axes = plt.subplots(nrows=1,
                         ncols=2, figsize=(14, 4))
fig.suptitle('Target variable distribution before and after log transformation')

# sns.distplot is deprecated; draw the same picture (density histogram, KDE
# and a fitted normal curve) with histplot and scipy.stats.norm instead.
panels = (('Before', data['SalePrice'], None),
          ('After', target, 'green'))
for ax, (title, values, color) in zip(axes, panels):
    values = values.dropna()  # norm.fit needs finite values
    ax.set_title(title)
    sns.histplot(x=values, kde=True, stat='density', color=color, ax=ax)
    # Overlay the normal density with maximum-likelihood mean and std
    mu, sigma = norm.fit(values)
    grid = np.linspace(values.min(), values.max(), 200)
    ax.plot(grid, norm.pdf(grid, mu, sigma), color='black')

plt.show()

**Proceed to ML**

Train-test split

In [None]:
# Hold out a quarter of the rows for testing; the seed is fixed so the split
# is reproducible
X_train, X_test, y_train, y_test = train_test_split(
    data[all_features],
    target,
    test_size=0.25,
    random_state=17,
)

**Build regression pipeline with stacking and hyperparameters search**

Define pipeline elements

In [None]:
# Steps

# Numerical branch: impute -> standardise -> keep the 10 best-scoring features
numerical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='mean')),
    ("scaler", StandardScaler()),
    # The target is continuous, so score features with the regression F-test;
    # f_classif would treat every distinct target value as a separate class.
    ("feature_selector", SelectKBest(score_func=f_regression, k=10))])

# Categorical branch: fill gaps with the mode, then one-hot encode
# (categories unseen during fit are ignored at transform time)
categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy='most_frequent')),
    ("onehot", OneHotEncoder(handle_unknown="ignore"))])

data_transformer = ColumnTransformer(transformers=[
    # NB: numerical_features leaves out the last numerical column (the target)
    ("numerical", numerical_transformer, numerical_features),
    ("categorical", categorical_transformer, categorical_features)])

preprocessor = Pipeline(steps=[("data_transformer", data_transformer)])

# Base learners of the stack; their names are part of the grid-search
# parameter keys, so keep them in sync with param_grid
estimators = [
    ("Linear_Regression", LinearRegression()),
    ("Lasso", Lasso()),
    ("Ridge",  Ridge())]

stacking_regressor = Pipeline(steps=[
    ("Preprocessor", preprocessor),
    ("Stacking_Regressor", StackingRegressor(
        estimators=estimators,
        passthrough=True,  # the final estimator also sees the preprocessed features
        n_jobs=-1,
        verbose=0))])

Define and execute grid search --  takes about 4 minutes on Binder

In [None]:
# Hyperparameter grid for the two regularised base learners
# (ranges were picked after a bit of experimentation)
param_grid = {
    'Stacking_Regressor__Lasso__alpha': np.linspace(0.2, 0.3, num=4),
    'Stacking_Regressor__Ridge__alpha': np.linspace(3.0, 5.0, num=4),
}

# Exhaustive 5-fold cross-validated search over the grid, using all cores
stacked_grid_search = GridSearchCV(
    estimator=stacking_regressor,
    param_grid=param_grid,
    cv=5,
    n_jobs=-1,
)

# fit() returns the search object itself, so `model` is the fitted grid search
stacked_grid_search.fit(X_train, y_train)
model = stacked_grid_search

Show the pipeline structure

In [None]:
# Bare expression: the notebook renders the estimator as a diagram
# (display='diagram' is set in the configuration cell)
stacked_grid_search

**Run model and evaluate results**

Get the score of the already fitted model on the train set

In [None]:
# `model` is the grid search fitted above; no `scoring` was passed, so
# score() presumably reports R^2 of the refitted best pipeline -- verify
train_score = model.score(X_train, y_train)
print(f'Model score on training data: {train_score: .4f}')

Evaluate model on test set

In [None]:
# Results of the grid search: best cross-validation score and its parameters
best_params = stacked_grid_search.best_params_
best_score = stacked_grid_search.best_score_

lasso_alpha = best_params["Stacking_Regressor__Lasso__alpha"]
ridge_alpha = best_params["Stacking_Regressor__Ridge__alpha"]

print(f'Best score: {best_score: .4f}\nwith best parameters:')
print(f'         Lasso alpha {lasso_alpha: .2f}')
print(f'         Ridge alpha {ridge_alpha: .2f}')

# From here on `model` is the best pipeline itself, scored on held-out data
model = stacked_grid_search.best_estimator_
test_score = model.score(X_test, y_test)
print(f'Model score on test data: {test_score: .4f}')

Visualize residuals

In [None]:
# Wrap the current `model` in yellowbrick's residuals visualizer
visualizer = ResidualsPlot(model)

visualizer.fit(X_train, y_train)  # Fit the training data to the visualizer
visualizer.score(X_test, y_test)  # Evaluate the model on the test data
# Assigning to `_` keeps the return value of show() out of the cell output
_ = visualizer.show()             # Finalize and render the figure

Evaluate model performance with several metrics  
Use SalePrice instead of its log

In [None]:
# Restore SalePrice from the log-transformed target

yhat = np.exp(model.predict(X_test))
y = np.exp(y_test)

# Metrics
# scikit-learn metrics take (y_true, y_pred) in that order. RMSE and MAE are
# symmetric, but MAPE divides by the true values and R2 is normalised by
# their variance, so swapping the arguments gives wrong numbers.
# RMSE is computed with np.sqrt because the `squared=False` flag is
# deprecated in recent scikit-learn releases.

RMSE = np.sqrt(mean_squared_error(y, yhat))
MAE = mean_absolute_error(y, yhat)
MAPE = mean_absolute_percentage_error(y, yhat)
R2 = r2_score(y, yhat)

print(f"""RMSE = {RMSE: 9.1f}
MAE = {MAE: 10.1f}
MAPE = {MAPE*100: 6.2f}%
R2 = {R2: 8.2f}""")

Save the model

In [None]:
# Persist the whole fitted grid search (it carries the best pipeline with it)
with open("stacked_regression_pipeline.pkl", "wb") as model_file:
    pickle.dump(stacked_grid_search, model_file)