In [1]:
import pandas as pd


penguins = pd.read_csv("../datasets/penguins.csv")

# Feature columns used as model input, and the column holding the label.
columns = ["Body Mass (g)", "Flipper Length (mm)", "Culmen Length (mm)"]
target_name = "Species"

# Restrict the frame to the columns of interest, then drop every row that has
# a missing value in any of them. The original row index is left untouched.
penguins_non_missing = penguins.loc[:, [*columns, target_name]].dropna()

# Split the cleaned frame into the feature table and the label series.
data = penguins_non_missing[columns]
target = penguins_non_missing[target_name]

In [2]:
# Preview the first rows of the feature table.
data.head()

Unnamed: 0,Body Mass (g),Flipper Length (mm),Culmen Length (mm)
0,3750.0,181.0,39.1
1,3800.0,186.0,39.5
2,3250.0,195.0,40.3
4,3450.0,193.0,36.7
5,3650.0,190.0,39.3


In [3]:
# Preview the first rows of the label series.
target.head()

0    Adelie Penguin (Pygoscelis adeliae)
1    Adelie Penguin (Pygoscelis adeliae)
2    Adelie Penguin (Pygoscelis adeliae)
4    Adelie Penguin (Pygoscelis adeliae)
5    Adelie Penguin (Pygoscelis adeliae)
Name: Species, dtype: object

In [4]:
# Count the samples per species to see how balanced the classes are.
target.value_counts()

Adelie Penguin (Pygoscelis adeliae)          151
Gentoo penguin (Pygoscelis papua)            123
Chinstrap penguin (Pygoscelis antarctica)     68
Name: Species, dtype: int64

In [10]:
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline


# Two-step pipeline: standardize the features, then classify each sample by a
# vote among its 5 nearest neighbors. The step names ("preprocessor",
# "classifier") are the handles used to address these steps as parameters.
model5 = Pipeline(
    [
        ("preprocessor", StandardScaler()),
        ("classifier", KNeighborsClassifier(n_neighbors=5)),
    ]
)

In [11]:
from sklearn.model_selection import cross_validate

# 10-fold cross-validation of the 5-neighbor pipeline, scored with balanced
# accuracy. The resulting dict is displayed as the cell output.
cv_results5 = cross_validate(
    estimator=model5,
    X=data,
    y=target,
    cv=10,
    scoring="balanced_accuracy",
)
cv_results5

{'fit_time': array([0.00698948, 0.00700212, 0.00999928, 0.01000285, 0.01300001,
        0.00900126, 0.00700092, 0.00899863, 0.00799894, 0.00800157]),
 'score_time': array([0.00900221, 0.01000094, 0.00600123, 0.01499891, 0.01200342,
        0.00700021, 0.00700212, 0.00800323, 0.00800109, 0.00700068]),
 'test_score': array([1.        , 1.        , 1.        , 0.91880342, 0.88253968,
        0.95238095, 0.97777778, 0.93015873, 0.90793651, 0.95238095])}

In [12]:
# Same pipeline as the 5-neighbor one, but voting over 51 neighbors.
model51 = Pipeline(steps=[
    ("preprocessor", StandardScaler()),
    ("classifier", KNeighborsClassifier(n_neighbors=51)),
])
# Cross-validate the pipeline defined just above, so this cell does not depend
# on a `model` variable that only exists as leftover kernel state.
cv_results51 = cross_validate(
    model51, data, target, cv=10, scoring='balanced_accuracy')
cv_results51

{'fit_time': array([0.00799894, 0.01100492, 0.00900793, 0.01201415, 0.00798225,
        0.0100162 , 0.00901794, 0.00702429, 0.00700164, 0.0070076 ]),
 'score_time': array([0.00900626, 0.00599575, 0.00799561, 0.00600982, 0.00600147,
        0.00598073, 0.00700021, 0.00699115, 0.00698233, 0.00499368]),
 'test_score': array([0.95238095, 0.97777778, 1.        , 0.86324786, 0.88253968,
        0.95238095, 0.95555556, 0.95238095, 0.93015873, 0.95238095])}

In [14]:
# Difference between the mean cross-validation test scores stored in
# `cv_results5` and `cv_results51`; a positive value means `cv_results5`
# is higher on average.
cv_results5['test_score'].mean() - cv_results51['test_score'].mean()

0.01031746031746028

In [15]:
# Fold-to-fold spread of the `cv_results5` test scores (NumPy's default
# standard deviation), useful for judging the size of the difference in means.
cv_results5['test_score'].std()

0.0399020975957868

In [16]:
# Same 10-fold, balanced-accuracy evaluation, but on the bare 5-neighbor
# classifier: no scaling step is applied to the features ("ns" = not scaled).
cv_results_ns = cross_validate(
    estimator=KNeighborsClassifier(n_neighbors=5),
    X=data,
    y=target,
    cv=10,
    scoring="balanced_accuracy",
)
cv_results_ns

{'fit_time': array([0.00700355, 0.0040009 , 0.00400281, 0.00398755, 0.00599456,
        0.00600243, 0.00500011, 0.00800228, 0.00600076, 0.00599742]),
 'score_time': array([0.00699759, 0.00800037, 0.00699806, 0.00699854, 0.00500298,
        0.00698376, 0.010005  , 0.00899959, 0.00699878, 0.00800395]),
 'test_score': array([0.66468254, 0.73601954, 0.74102564, 0.7042735 , 0.58412698,
        0.66984127, 0.83492063, 0.74285714, 0.88253968, 0.83809524])}

In [17]:
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import QuantileTransformer
from sklearn.preprocessing import PowerTransformer


# Candidate values for the pipeline's "preprocessor" step; the grid search
# below tries each of them in turn.
all_preprocessors = [
    None,  # no preprocessing: the step is skipped
    StandardScaler(),
    MinMaxScaler(),
    QuantileTransformer(n_quantiles=100),
    PowerTransformer(method="box-cox"),  # Box-Cox accepts strictly positive inputs only
]

In [19]:
from sklearn.model_selection import GridSearchCV

# Search jointly over the preprocessing step and the number of neighbors.
param_grid = {
    "preprocessor": all_preprocessors,
    "classifier__n_neighbors": [5, 51, 101]
}
# Use `model51` as the base pipeline instead of a `model` variable that only
# exists as leftover kernel state. Every grid point overrides both the
# "preprocessor" step and `n_neighbors`, so the base pipeline's own settings
# for those two do not influence the search.
model_grid_search = GridSearchCV(model51, param_grid=param_grid,
                                 n_jobs=2, cv=10, scoring='balanced_accuracy')
model_grid_search.fit(data, target)

GridSearchCV(cv=10,
             estimator=Pipeline(steps=[('preprocessor', StandardScaler()),
                                       ('classifier',
                                        KNeighborsClassifier(n_neighbors=51))]),
             n_jobs=2,
             param_grid={'classifier__n_neighbors': [5, 51, 101],
                         'preprocessor': [None, StandardScaler(),
                                          MinMaxScaler(),
                                          QuantileTransformer(n_quantiles=100),
                                          PowerTransformer(method='box-cox')]},
             scoring='balanced_accuracy')

In [20]:
# NOTE(review): `data` / `target` are the same samples the grid search was
# fitted on, so this value is measured on data the model has already seen,
# not on a held-out test set. Because the search was built with
# scoring='balanced_accuracy', `.score` reports balanced accuracy here.
accuracy = model_grid_search.score(data, target)
print(f"The test accuracy score of the grid-searched pipeline is: {accuracy:.2f}")

The test accuracy score of the grid-searched pipeline is: 0.98


In [21]:
# One row per parameter combination tried by the grid search, ordered from the
# best to the worst mean cross-validation score.
cv_results = pd.DataFrame(model_grid_search.cv_results_)
cv_results = cv_results.sort_values(by="mean_test_score", ascending=False)
cv_results

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_classifier__n_neighbors,param_preprocessor,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,split5_test_score,split6_test_score,split7_test_score,split8_test_score,split9_test_score,mean_test_score,std_test_score,rank_test_score
1,0.0116,0.000917,0.010601,0.001021,5,StandardScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,1.0,1.0,0.918803,0.88254,0.952381,0.977778,0.930159,0.907937,0.952381,0.952198,0.039902,1
2,0.011001,0.001414,0.0098,0.001537,5,MinMaxScaler(),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.952381,1.0,0.944444,0.88254,0.930159,0.955556,0.952381,0.907937,0.952381,0.947778,0.034268,2
3,0.014001,0.002,0.009602,0.00102,5,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 5, 'preprocessor':...",0.952381,0.92674,1.0,0.918803,0.904762,1.0,0.977778,0.930159,0.907937,0.952381,0.947094,0.033797,3
4,0.021052,0.002706,0.010457,0.001387,5,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 5, 'preprocessor':...",1.0,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.930159,0.907937,1.0,0.94696,0.047387,4
6,0.009999,0.001099,0.009054,0.00115,51,StandardScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.952381,0.977778,1.0,0.863248,0.88254,0.952381,0.955556,0.952381,0.930159,0.952381,0.94188,0.038905,5
8,0.011705,0.000903,0.009296,0.001267,51,QuantileTransformer(n_quantiles=100),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.857143,0.952381,1.0,0.863248,0.904762,0.904762,0.977778,0.930159,0.930159,0.952381,0.927277,0.043759,6
9,0.017408,0.001276,0.009697,0.000907,51,PowerTransformer(method='box-cox'),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.977778,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.922833,0.047883,7
7,0.0096,0.001117,0.009752,0.001598,51,MinMaxScaler(),"{'classifier__n_neighbors': 51, 'preprocessor'...",0.904762,0.952381,1.0,0.863248,0.834921,0.952381,0.907937,0.952381,0.930159,0.904762,0.920293,0.045516,8
11,0.009701,0.000899,0.009701,0.001004,101,StandardScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.952381,0.944444,0.863248,0.834921,0.857143,0.834921,0.88254,0.834921,0.904762,0.876642,0.041618,9
12,0.009901,0.001042,0.0097,0.000782,101,MinMaxScaler(),"{'classifier__n_neighbors': 101, 'preprocessor...",0.857143,0.857143,0.944444,0.863248,0.834921,0.857143,0.765079,0.904762,0.834921,0.904762,0.862357,0.046244,10


In [22]:
# Keep one "param_<name>" column per searched parameter, followed by the
# summary score columns; every other column of the results table is dropped.
column_results = [
    *(f"param_{name}" for name in param_grid),
    "mean_test_score",
    "std_test_score",
    "rank_test_score",
]
cv_results = cv_results.loc[:, column_results]

In [23]:
def shorten_param(param_name):
    """Return the part of ``param_name`` that follows its last ``"__"``.

    A name that contains no double underscore is returned unchanged.
    """
    # rpartition yields an empty separator when "__" does not occur at all.
    _, separator, short_name = param_name.rpartition("__")
    return short_name if separator else param_name


# Run every column label through `shorten_param`, then display the table.
cv_results = cv_results.rename(columns=shorten_param)
cv_results

Unnamed: 0,param_preprocessor,n_neighbors,mean_test_score,std_test_score,rank_test_score
1,StandardScaler(),5,0.952198,0.039902,1
2,MinMaxScaler(),5,0.947778,0.034268,2
3,QuantileTransformer(n_quantiles=100),5,0.947094,0.033797,3
4,PowerTransformer(method='box-cox'),5,0.94696,0.047387,4
6,StandardScaler(),51,0.94188,0.038905,5
8,QuantileTransformer(n_quantiles=100),51,0.927277,0.043759,6
9,PowerTransformer(method='box-cox'),51,0.922833,0.047883,7
7,MinMaxScaler(),51,0.920293,0.045516,8
11,StandardScaler(),101,0.876642,0.041618,9
12,MinMaxScaler(),101,0.862357,0.046244,10
