# Chapter 2: Model Selection and Validation

In [1]:
# Chapter 2 imports and data
import numpy as np
import pandas as pd

from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import LinearRegression, GammaRegressor
from sklearn.preprocessing import (
    OrdinalEncoder,
    StandardScaler,
    OneHotEncoder,
    FunctionTransformer,
    PolynomialFeatures,
)
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.metrics import root_mean_squared_error
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, KFold, GridSearchCV


# Diamonds data; sns.load_dataset("diamonds") is an alternative source
diamonds = pd.read_parquet("diamonds.parquet")

# Categorical features and, per feature, the levels stored in its categorical dtype
ord_vars = ["color", "cut", "clarity"]
ord_levels = [diamonds[name].cat.categories.tolist() for name in ord_vars]

## Exercise 1

In [2]:
# Unlike the lecture notes: keep only one row per combination of price,
# carat and the ordinal features before splitting
dia = diamonds.drop_duplicates(subset=["price", "carat", *ord_vars])

# Hold out 20% of the deduplicated rows as test data
df_train, df_test, y_train, y_test = train_test_split(
    dia,
    dia["price"],
    random_state=49,
    test_size=0.2,
)

In [3]:
# k-NN model: integer-code the ordinal features, pass carat through
# unchanged, then standardize every resulting column
knn_columns = ColumnTransformer(
    [
        ("ordered", OrdinalEncoder(categories=ord_levels), ord_vars),
        ("linear", "passthrough", ["carat"]),
    ],
    verbose_feature_names_out=False,
)
knn_preprocessor = make_pipeline(knn_columns, StandardScaler()).set_output(
    transform="pandas"
)

knn_model = Pipeline(
    [
        ("prep", knn_preprocessor),
        ("knn", KNeighborsRegressor(n_neighbors=5)),
    ]
)

# Linear model: log(carat) plus dummy-coded ordinal features
# (first level of each feature dropped)
linear_columns = ColumnTransformer(
    [
        ("log_carat", FunctionTransformer(np.log), ["carat"]),
        ("dummies", OneHotEncoder(categories=ord_levels, drop="first"), ord_vars),
    ]
)
linear_regression = make_pipeline(linear_columns, LinearRegression())

In [4]:
# Five shuffled folds with a fixed seed
cv = KFold(n_splits=5, shuffle=True, random_state=4432)

# Scores are negated RMSEs (one per fold): average them, then flip the sign
linear_fold_scores = cross_val_score(
    linear_regression,
    X=df_train,
    y=y_train,
    cv=cv,
    scoring="neg_root_mean_squared_error",
)
result_linear = -linear_fold_scores.mean()

print(f"Linear regression CV RMSE: {result_linear:.3f}")

Linear regression CV RMSE: 1938.481


In [5]:
# Tune the number of neighbours (k = 1, ..., 20) by cross-validated RMSE
knn_grid = {"knn__n_neighbors": range(1, 21)}
search = GridSearchCV(
    knn_model,
    param_grid=knn_grid,
    cv=cv,
    scoring="neg_root_mean_squared_error",
)

# Remember: after the search, the best model is refitted on the training data
search.fit(X=df_train, y=y_train)
best_k = search.best_params_["knn__n_neighbors"]
print(f"Best k of k-NN: {best_k}")
print(f"Its CV-RMSE: {-search.best_score_:.3f}")

Best k of k-NN: 7
Its CV-RMSE: 727.155


In [6]:
# 7-nearest-neighbour looks best overall in CV, so the refitted grid-search
# model is the one evaluated on the held-out test data
test_predictions = search.predict(df_test)
final_rmse = root_mean_squared_error(y_test, test_predictions)
print(f"Test RMSE of final model: {final_rmse:.3f}")

Test RMSE of final model: 700.575


**Comments:** The test performance of the best model (7-NN) seems clearly worse than the one without deduplication (~700 USD RMSE vs ~620 USD). Overall, this is probably a more realistic performance estimate than the one obtained from the original data set. Still, as certain rows could be identical by chance, our deduplication approach might be slightly too conservative. The true performance will probably be somewhere between the two approaches.

## Exercise 2

In [7]:
# Hold out 20% of the full (non-deduplicated) diamonds data as test set
df_train, df_test, y_train, y_test = train_test_split(
    diamonds,
    diamonds["price"],
    random_state=49,
    test_size=0.2,
)

# carat branch: log -> standardize -> polynomial expansion.
# The step names matter: they form the parameter path used to tune the degree.
prep_carat = Pipeline(
    [
        ("log", FunctionTransformer(np.log)),
        ("scaler", StandardScaler()),  # Not really necessary
        ("poly", PolynomialFeatures(degree=1, include_bias=False)),
    ]
)

# Full preprocessing: carat branch plus dummy-coded ordinal features
preprocessor = ColumnTransformer(
    [
        ("carat", prep_carat, ["carat"]),
        ("dummies", OneHotEncoder(categories=ord_levels, drop="first"), ord_vars),
    ]
)

# End-to-end model: preprocessing followed by a Gamma GLM
gamma_regression = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("glm", GammaRegressor(alpha=0, solver="newton-cholesky")),
    ]
)

In [8]:
# Candidate polynomial degrees 1, ..., 10 for the carat branch
param_grid = {"preprocessor__carat__poly__degree": range(1, 11)}

search = GridSearchCV(
    gamma_regression,
    param_grid=param_grid,
    cv=KFold(n_splits=5, shuffle=True, random_state=4302),
    scoring="neg_mean_gamma_deviance",
)
search.fit(X=df_train, y=y_train)

# One row per degree; the stored scores are negated deviances, so flip the sign
results = pd.DataFrame(
    {"mean_gamma_deviance": -search.cv_results_["mean_test_score"]},
    index=param_grid["preprocessor__carat__poly__degree"],
)
results

found 0 physical cores < 1
  File "d:\ml_lecture\.venv\Lib\site-packages\joblib\externals\loky\backend\context.py", line 282, in _count_physical_cores
    raise ValueError(f"found {cpu_count_physical} physical cores < 1")


Unnamed: 0,mean_gamma_deviance
1,0.018113
2,0.017934
3,0.0162
4,0.016201
5,0.016153
6,0.016619
7,0.016315
8,0.015877
9,0.015921
10,0.016914


In [9]:
# The grid search has already refitted the model on the training data with the
# best degree. The search was configured with scoring="neg_mean_gamma_deviance",
# so the score is negated here to report the deviance itself.
print(f"Mean deviance on the test data: {-search.score(df_test, y_test):.3f}")

Mean deviance on the test data: 0.016


**Comments:** The optimal degree seems to be 8 with a CV deviance of 0.016. The test performance is similar. Caution: Instead of using such a high-degree polynomial, it is better to use regression splines.

## Exercise 3 (optional)

Solution not shown here.