# Sklearn Pipelines
Implement routine tasks as a pipelin. The intermediate steps must be a transformer.
Transofrmers must implement fit and trandform methods.

Note the while the input features are automatically scaled by the pipeline, the
predictions are not scaled back to original value. 

In [1]:
# Import CA housing dataset
from sklearn.datasets import fetch_california_housing
from sklearn.preprocessing import StandardScaler, MinMaxScaler, FunctionTransformer
from sklearn.metrics import mean_squared_error, r2_score, mean_absolute_error
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from copy import deepcopy

In [2]:
data_set = fetch_california_housing()
print(f"data_set: {data_set.data.shape}, features: {data_set.feature_names}")
print(f"target: {data_set.target.shape}, target_names: {data_set.target_names}")

data_set: (20640, 8), features: ['MedInc', 'HouseAge', 'AveRooms', 'AveBedrms', 'Population', 'AveOccup', 'Latitude', 'Longitude']
target: (20640,), target_names: ['MedHouseVal']


In [3]:
# Convert to test and training, mising DataFrame and np.array
X_train, X_test, y_train, y_test = train_test_split(
    data_set.data, data_set.target, test_size=0.2, random_state=42)
print(f"types: {type(X_train)}, {type(X_test)}, {type(y_train)}, {type(y_test)}")
print(f"shape: {X_train.shape}, {X_test.shape}, {y_train.shape}, {y_test.shape}")

types: <class 'numpy.ndarray'>, <class 'numpy.ndarray'>, <class 'numpy.ndarray'>, <class 'numpy.ndarray'>
shape: (16512, 8), (4128, 8), (16512,), (4128,)


In [4]:
# Apply different scalars. Note that the target is not scaled.

# Latitude and Longitude are applied StandardScaler
# Get the transform to be applied to the [:, -2:]
std_scaler = StandardScaler().fit(X_train[:, -2:])
# Remaining features are applied MinMaxScaler, all positive values
minmax_scaler = MinMaxScaler().fit(X_train[:, :-2])

In [5]:
# Print the scalers parameters
print(f"std_scaler: {std_scaler.mean_}, {std_scaler.scale_}")

std_scaler: [  35.64314922 -119.58229046], [2.1366006  2.00559281]


In [6]:
# Define a function to apply the scalers
def preprocessor(X):
    A = deepcopy(X)
    A[:, -2:] = std_scaler.transform(A[:, -2:])
    A[:, :-2] = minmax_scaler.transform(A[:, :-2])
    return A

In [7]:
# Make our transformer
our_transformer = FunctionTransformer(preprocessor)
# Create a pipeline
pipeline = Pipeline([
    ('transformer', our_transformer),
    ('regressor', LinearRegression())
])


In [8]:
def fit_model_and_print_results(p, X_train, X_test, y_train, y_test):
    p.fit(X_train, y_train)
    p.fit(X_test, y_test)
    y_train_pred = p.predict(X_train)
    y_test_pred = p.predict(X_test)
    abs_error_train = mean_absolute_error(y_train, y_train_pred)
    abs_error_test = mean_absolute_error(y_test, y_test_pred)
    mse_train = mean_squared_error(y_train, y_train_pred)
    mse_test = mean_squared_error(y_test, y_test_pred)
    r2_train = r2_score(y_train, y_train_pred)
    r2_test = r2_score(y_test, y_test_pred)
    print(f"abs error train: {abs_error_train.mean()}, abs error test: {abs_error_test.mean()}")
    print(f"MSE train: {mse_train}, MSE test: {mse_test}")
    print(f"R2 train: {r2_train}, R2 test: {r2_test}")

In [9]:
fit_model_and_print_results(pipeline, X_train, X_test, y_train, y_test)

abs error train: 0.5342218724337053, abs error test: 0.5289426614283463
MSE train: 0.987610851126717, MSE test: 0.5293336127912476
R2 train: 0.2612006670839664, R2 test: 0.596054650433006
