# Model Tuning: Grid Search + Pipeline

In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
import seaborn as sns
from scipy import stats as stats

from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier

from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split, GridSearchCV,\
cross_val_score, RandomizedSearchCV

from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.impute import SimpleImputer
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer

# Objectives

- Explain what hyperparameters are
- Describe the purpose of grid searching
- Implement grid searching for the purposes of model optimization.

# Model Tuning

![](https://imgs.xkcd.com/comics/machine_learning.png)

## Hyperparameters

Many of the models we have looked at are really *families* of models in the sense that they make use of **hyperparameters**.

Thus for example the $k$-nearest-neighbors algorithm allows us to make:

- a 1-nearest-neighbor model
- a 2-nearest-neighbors model
- a 3-nearest-neighbors model
- etc.

Or, for another example, the decision tree algorithm allows us to make:

- a classifier that branches according to information gain
- a classifier that branches according to Gini impurity
- a regressor that branches according to mean squared error
- etc.

Depending on the sort of problem and data at hand, it is natural to experiment with different values of these hyperparameters to try to improve model performance.

> We can think of these **hyperparameters** as _dials_ of the base model

<img width=60% src='images/dials.png'/>

### Difference from Parametric / Non-Parametric Models

Contrast the notion of hyperparameters with the distinction between parametric and non-parametric models.

A linear regression model is parametric in the sense that we start with a given model *form* and we then search for the optimal parameters to fill in that form. But *those* parameters are not the sort we might tweak for the purposes of improving model performance. On the contrary, there is one best set of parameters, and the training of the model is a matter of finding those optimal values.

## Data Example

![Penguins](https://raw.githubusercontent.com/allisonhorst/palmerpenguins/69530276d74b99df81cc385f4e95c644da69ebfa/man/figures/lter_penguins.png)

> Images source: @allison_horst [github.com/allisonhorst/penguins](github.com/allisonhorst/penguins)

In [2]:
penguins = sns.load_dataset('penguins')

![Bill length & depth](https://raw.githubusercontent.com/allisonhorst/palmerpenguins/69530276d74b99df81cc385f4e95c644da69ebfa/man/figures/culmen_depth.png)

> Images source: @allison_horst [github.com/allisonhorst/penguins](github.com/allisonhorst/penguins)

In [3]:
penguins.head()

Unnamed: 0,species,island,bill_length_mm,bill_depth_mm,flipper_length_mm,body_mass_g,sex
0,Adelie,Torgersen,39.1,18.7,181.0,3750.0,Male
1,Adelie,Torgersen,39.5,17.4,186.0,3800.0,Female
2,Adelie,Torgersen,40.3,18.0,195.0,3250.0,Female
3,Adelie,Torgersen,,,,,
4,Adelie,Torgersen,36.7,19.3,193.0,3450.0,Female


In [4]:
penguins.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 344 entries, 0 to 343
Data columns (total 7 columns):
 #   Column             Non-Null Count  Dtype  
---  ------             --------------  -----  
 0   species            344 non-null    object 
 1   island             344 non-null    object 
 2   bill_length_mm     342 non-null    float64
 3   bill_depth_mm      342 non-null    float64
 4   flipper_length_mm  342 non-null    float64
 5   body_mass_g        342 non-null    float64
 6   sex                333 non-null    object 
dtypes: float64(4), object(3)
memory usage: 18.9+ KB


### Data Prep

We'll try to predict species given the other columns' values. Let's dummy-out `island` and `sex`:

In [5]:
penguins.isna().sum().sum()

19

In [6]:
penguins = penguins.dropna()

In [7]:
y = penguins.pop('species')

In [8]:
# Note we're dedicating a lot of data to the testing set just for demonstrative purposes
X_train, X_test, y_train, y_test = train_test_split(
    penguins, y, test_size=0.5, random_state=42)

In [9]:
X_train_cat = X_train.select_dtypes('object')

ohe = OneHotEncoder(
    drop='first',
    sparse=False)

dums = ohe.fit_transform(X_train_cat)
dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names_out(),
                       index=X_train_cat.index)



In [10]:
dums_df.head()

Unnamed: 0,island_Dream,island_Torgersen,sex_Male
160,1.0,0.0,0.0
237,0.0,0.0,1.0
2,0.0,1.0,0.0
121,0.0,1.0,1.0
179,1.0,0.0,1.0


In [11]:
X_train_nums = X_train.select_dtypes('float64')

ss = StandardScaler()

ss.fit(X_train_nums)
nums_df = pd.DataFrame(ss.transform(X_train_nums),
                      index=X_train_nums.index)

In [12]:
X_train_clean = pd.concat([nums_df, dums_df], axis=1)

In [13]:
X_train_clean.head()

Unnamed: 0,0,1,2,3,island_Dream,island_Torgersen,sex_Male
160,0.362748,0.903276,-0.472344,-0.094599,1.0,0.0,0.0
237,0.973499,-0.977375,1.408317,2.512546,0.0,0.0,1.0
2,-0.725152,0.44582,-0.472344,-1.185963,0.0,1.0,0.0
121,-1.221387,1.360731,-0.255345,-0.882806,0.0,1.0,1.0
179,1.030757,0.954104,-0.110678,-0.519018,1.0,0.0,1.0


#### Preparing the Test Set

In [14]:
X_test_cat = X_test.select_dtypes('object')

test_dums = ohe.transform(X_test_cat)
test_dums_df = pd.DataFrame(test_dums,
                       columns=ohe.get_feature_names_out(),
                      index=X_test_cat.index)

In [15]:
X_test_nums = X_test.select_dtypes('float64')

test_nums = ss.transform(X_test_nums)
test_nums_df = pd.DataFrame(test_nums,
                           index=X_test_nums.index)

In [16]:
X_test_clean = pd.concat([test_nums_df,
                 test_dums_df], axis=1)

In [17]:
X_test_clean.head()

Unnamed: 0,0,1,2,3,island_Dream,island_Torgersen,sex_Male
30,-0.877839,-0.214949,-1.702007,-1.185963,1.0,0.0,0.0
317,0.534522,-1.282345,1.48065,0.784554,0.0,0.0,0.0
79,-0.381604,1.004932,-0.472344,-0.276493,0.0,1.0,1.0
201,1.088015,0.090021,-0.255345,-0.670597,1.0,0.0,0.0
63,-0.572464,0.547477,-0.689343,-0.215862,0.0,0.0,1.0


### Trying Different Models & Values

#### $k$-Nearest Neighbors Model

In [18]:
knn_model = KNeighborsClassifier()
X_train_clean.columns = X_train_clean.columns.astype(str)
knn_model.fit(X_train_clean, y_train)

In [19]:
X_train_clean.columns

Index(['0', '1', '2', '3', 'island_Dream', 'island_Torgersen', 'sex_Male'], dtype='object')

In [20]:
scores = cross_val_score(estimator=knn_model, X=X_train_clean,
               y=y_train, cv=10)
scores

array([1.        , 1.        , 1.        , 1.        , 1.        ,
       0.94117647, 1.        , 1.        , 1.        , 1.        ])

In [21]:
np.median(scores)

1.0

In [22]:
X_test_clean.columns = X_test_clean.columns.astype(str)


In [23]:

knn_model.score(X_test_clean, y_test)

0.9880239520958084

##### Decreasing $k$

In [24]:
knn3 = KNeighborsClassifier(n_neighbors=3)

knn3.fit(X_train_clean, y_train)

In [25]:
knn3.score(X_test_clean, y_test)

0.9940119760479041

#### Decision Tree

In [26]:
ct = DecisionTreeClassifier(random_state=10)

ct.fit(X_train_clean, y_train)

In [27]:
ct.score(X_test_clean, y_test)

0.9760479041916168

##### Changing the branching criterion

In [28]:
ct = DecisionTreeClassifier(criterion='entropy',
                          random_state=10)

ct.fit(X_train_clean, y_train)

In [29]:
ct.score(X_test_clean, y_test)

0.9760479041916168

# Automatically Searching with Grid Search

It's not a bad idea to experiment with the values of your models' hyperparameters a bit as you're getting a feel for your models' performance. But there are more systematic ways of going about the search for optimal hyperparameters. One method of hyperparameter tuning is **grid searching**. 

The idea is to build multiple models with different hyperparameter values and then see which one performs the best. The hyperparameters and the values to try form a sort of *grid* along which we are looking for the best performance. For example:


    1           | 'minkowski' | 'uniform'
    3           | 'manhattan' | 'distance'
    5           |
    ______________________________________
    n_neighbors | metric      | weights

Scikit-Learn has a [`GridSearchCV`](https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.GridSearchCV.html) class whose `fit()` method runs this procedure. Note that this can be quite computationally expensive since:

- A model is constructed for each combination of hyperparameter values that we input; and
- Each model is cross-validated.

### `GridSearchCV`

In [30]:
# Define the parameter grid
grid = {'metric': ['minkowski', 'manhattan'],
       'n_neighbors': [1,3,5],
       'weights':['uniform', 'distance']}
gs = GridSearchCV(estimator=knn_model, param_grid=grid, verbose=3, n_jobs=-2)

**Question: How many models will we be constructing with this grid?**

In [31]:
gs.fit(X_train_clean,y_train)

Fitting 5 folds for each of 12 candidates, totalling 60 fits


In [32]:
# Initialize the grid search object with five-fold cross-validation

gs.best_params_

{'metric': 'minkowski', 'n_neighbors': 1, 'weights': 'uniform'}

In [33]:
gs.best_score_

0.9939393939393939

In [34]:
gs.best_estimator_.score(X_test_clean, y_test)

0.9940119760479041

In [35]:
pd.DataFrame(gs.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_metric,param_n_neighbors,param_weights,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003419,0.001787,0.005621,0.002467,minkowski,1,uniform,"{'metric': 'minkowski', 'n_neighbors': 1, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
1,0.001653,0.000712,0.002833,0.002186,minkowski,1,distance,"{'metric': 'minkowski', 'n_neighbors': 1, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
2,0.002568,0.001348,0.005161,0.003149,minkowski,3,uniform,"{'metric': 'minkowski', 'n_neighbors': 3, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
3,0.001735,0.000723,0.001706,0.000908,minkowski,3,distance,"{'metric': 'minkowski', 'n_neighbors': 3, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
4,0.003673,0.002824,0.004367,0.001833,minkowski,5,uniform,"{'metric': 'minkowski', 'n_neighbors': 5, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
5,0.001753,0.001126,0.001124,6.2e-05,minkowski,5,distance,"{'metric': 'minkowski', 'n_neighbors': 5, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
6,0.001437,0.000527,0.002121,0.000304,manhattan,1,uniform,"{'metric': 'manhattan', 'n_neighbors': 1, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
7,0.001338,0.000414,0.001633,0.000607,manhattan,1,distance,"{'metric': 'manhattan', 'n_neighbors': 1, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
8,0.00219,0.000967,0.002158,0.000312,manhattan,3,uniform,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
9,0.001621,0.000469,0.00104,0.000154,manhattan,3,distance,"{'metric': 'manhattan', 'n_neighbors': 3, 'wei...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1


### Choice of Grid Values

Which values should you pick for your grid? Intuitively, you should try both "large" and "small" values, but of course what counts as large and small will really depend on the type of hyperparameter.

- For a k-nearest neighbors model, 1 or 3 would be a small value for the number of neighbors and 15 or 17 would be a large value.
- For a decision tree model, what counts as a small `max_depth` will really depend on the size of your training data. A `max_depth` of 5 would likely have little effect on a very small dataset but, at the same time, it would probably significantly decrease the variance of a model where the dataset is large.
- For a logistic regression's regularization constant, you may want to try a set of values that are exponentially separated, like \[1, 10, 100, 1000\].
- **If a grid search finds optimal values at the ends of your hyperparameter ranges, you might try another grid search with more extreme values.**

### Exercise

Do a grid search on a **decision tree model** of penguin species. What are the optimal values for the hyperparameters you've chosen?

# Better Process: Pipelines

> **Pipelines** can keep our code neat and clean all the way from gathering & cleaning our data, to creating models & fine-tuning them!

![](https://imgs.xkcd.com/comics/data_pipeline.png)

The `Pipeline` class from [Scikit-Learn's API](https://scikit-learn.org/stable/modules/generated/sklearn.pipeline.Pipeline.html) is especially convenient since it allows us to use our other Estimators that we know and love!

## Advantages of `Pipeline`

### Reduces Complexity

> You can focus on particular parts of the pipeline one at a time and debug or adjust parts as needed.

### Convenient

> The pipeline summarizes your fine-detail steps. That way you can focus on the big-picture aspects.

### Flexible

> You can use pipelines with different models and with GridSearch.

### Prevent Mistakes

> We can focus on one section at a time.
>
> We also can ensure data leakage between our training and doesn't occur between our training dataset and validation/testing datasets!

## Example of Using `Pipeline`

In [36]:
# Getting some data
from sklearn import datasets

iris = datasets.load_iris()
X = iris.data
y = iris.target

# Split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, 
                                                    random_state=27)

### Without the Pipeline class

In [37]:
# Define transformers (will adjust/massage the data)
imputer = SimpleImputer(strategy="median") # replaces missing values
std_scaler = StandardScaler() # scales the data

# Define the classifier (predictor) to train
rf_clf = DecisionTreeClassifier(random_state=42)

# Have the classifer (and full pipeline) learn/train/fit from the data
X_train_filled = imputer.fit_transform(X_train)
X_train_scaled = std_scaler.fit_transform(X_train_filled)
rf_clf.fit(X_train_scaled, y_train)

# Predict using the trained classifier (still need to do the transformations)
X_test_filled = imputer.transform(X_test)
X_test_scaled = std_scaler.transform(X_test_filled)
y_pred = rf_clf.predict(X_test_scaled)
print(y_pred)

[2 0 2 2 1 1 1 2 2 0 1 1 0 1 1 1 0 2 2 2 1 0 1 0 2 2 1 0 1 2]


> Note that if we were to add more steps in this process, we'd have to change both the *training* and *testing* processes.

### With `Pipeline` Class

In [38]:
pipeline = Pipeline([
        ('imputer', SimpleImputer(strategy="median")), 
        ('std_scaler', StandardScaler()),
        ('rf_clf', DecisionTreeClassifier(random_state=42)),
])


# Train the pipeline (tranformations & predictor)
pipeline.fit(X_train, y_train)

# Predict using the pipeline (includes the transfomers & trained predictor)
predicted = pipeline.predict(X_test)
print(predicted)

[2 0 2 2 1 1 1 2 2 0 1 1 0 1 1 1 0 2 2 2 1 0 1 0 2 2 1 0 1 2]


In [39]:
pipeline['imputer']

In [40]:
print(pipeline)

Pipeline(steps=[('imputer', SimpleImputer(strategy='median')),
                ('std_scaler', StandardScaler()),
                ('rf_clf', DecisionTreeClassifier(random_state=42))])


> If we need to change our process, we change it _just once_ in the Pipeline

## Grid Searching a Pipeline

> Let's first get our data prepared like we did before

In [41]:
penguins = sns.load_dataset('penguins')
penguins = penguins.dropna()

In [42]:
y = penguins.pop('species')
X_train, X_test, y_train, y_test = train_test_split(
    penguins, y, test_size=0.5, random_state=42)

In [43]:
X_train_nums = X_train.select_dtypes('float64')

ss = StandardScaler()

ss.fit(X_train_nums)
nums_df = pd.DataFrame(ss.transform(X_train_nums),
                      index=X_train_nums.index)

In [44]:
X_train_cat = X_train.select_dtypes('object')

ohe = OneHotEncoder(
    drop='first',
    sparse=False)

dums = ohe.fit_transform(X_train_cat)
dums_df = pd.DataFrame(dums,
                       columns=ohe.get_feature_names_out(),
                       index=X_train_cat.index)



> Intermediary step to treat categorical and numerical data differently

### Using `ColumnTransformer`

In [45]:
X_train_nums.columns

Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')

In [46]:
numerical_pipeline = Pipeline(steps=[('ss', StandardScaler())])

categorical_pipeline = Pipeline(steps=[('ohe', OneHotEncoder(sparse=False, 
                                                            handle_unknown='ignore', 
                                                           drop ='first'))])
transformer = ColumnTransformer(transformers=[('num', numerical_pipeline, X_train_nums.columns), 
                                             ('cat', categorical_pipeline, X_train_cat.columns)])

In [47]:
model_pipe = Pipeline(steps=[('col_tr', transformer),
                            ('knn', KNeighborsClassifier())])


> Finally showing we can fit the full pipeline

In [48]:
model_pipe.fit(X_train, y_train)



In [49]:
model_pipe.score(X_train,y_train)

0.9939759036144579

In [50]:
model_pipe['col_tr'].named_transformers_['num']['ss']

In [51]:
model_pipe

In [52]:
model_pipe.named_steps

{'col_tr': ColumnTransformer(transformers=[('num',
                                  Pipeline(steps=[('ss', StandardScaler())]),
                                  Index(['bill_length_mm', 'bill_depth_mm', 'flipper_length_mm', 'body_mass_g'], dtype='object')),
                                 ('cat',
                                  Pipeline(steps=[('ohe',
                                                   OneHotEncoder(drop='first',
                                                                 handle_unknown='ignore',
                                                                 sparse=False))]),
                                  Index(['island', 'sex'], dtype='object'))]),
 'knn': KNeighborsClassifier()}

> Performing grid search on the full pipeline

In [53]:
pipe_grid = {'knn__n_neighbors':[3, 5, 7],
             'knn__p':[1, 2, 3],
             'col_tr__num__ss__with_std':[True, False]}
gs_pipe = GridSearchCV(model_pipe, pipe_grid, verbose=3)

In [54]:
gs_pipe.fit(X_train, y_train)

Fitting 5 folds for each of 18 candidates, totalling 90 fits
[CV 1/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=1;, score=1.000 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=1;, score=1.000 total time=   0.0s
[CV 3/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=1;, score=0.970 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=1;, score=1.000 total time=   0.0s
[CV 5/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=1;, score=1.000 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=2;, score=1.000 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=2;, score=1.000 total time=   0.0s
[CV 3/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=2;, score=0.970 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=2;, score=1




[CV 5/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=2;, score=1.000 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=3;, score=1.000 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=3;, score=1.000 total time=   0.0s
[CV 3/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=3;, score=0.970 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=3;, score=1.000 total time=   0.0s
[CV 5/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=3, knn__p=3;, score=1.000 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=5, knn__p=1;, score=1.000 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=5, knn__p=1;, score=1.000 total time=   0.0s
[CV 3/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=5, knn__p=1;, score=0.970 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_s



[CV 5/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=1;, score=1.000 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=2;, score=1.000 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=2;, score=1.000 total time=   0.0s
[CV 3/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=2;, score=0.970 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=2;, score=0.970 total time=   0.0s
[CV 5/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=2;, score=1.000 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=3;, score=1.000 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=3;, score=1.000 total time=   0.0s
[CV 3/5] END col_tr__num__ss__with_std=True, knn__n_neighbors=7, knn__p=3;, score=0.970 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_st



[CV 3/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=1;, score=0.727 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=1;, score=0.515 total time=   0.0s
[CV 5/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=1;, score=0.879 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=2;, score=0.765 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=2;, score=0.727 total time=   0.0s
[CV 3/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=2;, score=0.727 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=2;, score=0.515 total time=   0.0s
[CV 5/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=2;, score=0.848 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=3, knn__p=3;, score=0.765 total time=   0.0s
[CV 2/5] END col_tr__num__ss



[CV 4/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=5, knn__p=2;, score=0.515 total time=   0.0s
[CV 5/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=5, knn__p=2;, score=0.818 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=5, knn__p=3;, score=0.618 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=5, knn__p=3;, score=0.727 total time=   0.0s
[CV 3/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=5, knn__p=3;, score=0.727 total time=   0.0s
[CV 4/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=5, knn__p=3;, score=0.515 total time=   0.0s
[CV 5/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=5, knn__p=3;, score=0.818 total time=   0.0s
[CV 1/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=7, knn__p=1;, score=0.676 total time=   0.0s
[CV 2/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=7, knn__p=1;, score=0.697 total time=   0.0s
[CV 3/5] END col_tr__num__ss



[CV 4/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=7, knn__p=3;, score=0.485 total time=   0.0s
[CV 5/5] END col_tr__num__ss__with_std=False, knn__n_neighbors=7, knn__p=3;, score=0.818 total time=   0.0s


In [55]:
pd.DataFrame(gs_pipe.cv_results_)

Unnamed: 0,mean_fit_time,std_fit_time,mean_score_time,std_score_time,param_col_tr__num__ss__with_std,param_knn__n_neighbors,param_knn__p,params,split0_test_score,split1_test_score,split2_test_score,split3_test_score,split4_test_score,mean_test_score,std_test_score,rank_test_score
0,0.003357,0.00051,0.002637,0.000173,True,3,1,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
1,0.002713,0.000136,0.002431,0.00022,True,3,2,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
2,0.003067,0.000331,0.002499,0.000143,True,3,3,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
3,0.002777,0.000111,0.002496,0.000255,True,5,1,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
4,0.00266,0.000195,0.002205,0.000162,True,5,2,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
5,0.002779,0.000197,0.002552,0.000242,True,5,3,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,1.0,1.0,0.993939,0.012121,1
6,0.002771,0.00031,0.002512,0.000303,True,7,1,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,0.969697,1.0,0.987879,0.014845,7
7,0.003187,0.000256,0.002901,0.000356,True,7,2,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,0.969697,1.0,0.987879,0.014845,7
8,0.002923,0.000263,0.002428,0.000139,True,7,3,"{'col_tr__num__ss__with_std': True, 'knn__n_ne...",1.0,1.0,0.969697,0.969697,1.0,0.987879,0.014845,7
9,0.002987,0.000348,0.002616,0.000267,False,3,1,"{'col_tr__num__ss__with_std': False, 'knn__n_n...",0.794118,0.727273,0.727273,0.515152,0.878788,0.72852,0.120343,10


In [56]:
gs_pipe.best_params_

{'col_tr__num__ss__with_std': True, 'knn__n_neighbors': 3, 'knn__p': 1}

In [57]:
gs_pipe.best_estimator_

## A Note on Data Leakage

Note we still have to be careful in performing a grid search!

We can accidentally "leak" information by doing transformations with the **whole data set**, instead of just the **training set**!

### Example of leaking information

In [58]:
scaler = StandardScaler()
# Scales over all of the X-train data! (validation set will be considered in scaling)
scaled_data = scaler.fit_transform(X_train.select_dtypes('float64'))

parameters = {
    'n_neighbors': [1, 3, 5],
    'metric': ['minkowski', 'manhattan'],
    'weights': ['uniform', 'distance']
}

clf_dt = KNeighborsClassifier()
clf = GridSearchCV(clf_dt, parameters)
clf.fit(X_train.select_dtypes('float64'), y_train)

### Example of Grid Search with no leakage

In [59]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', KNeighborsClassifier())
])

# Note you use the part of the pipeline's name `NAME__{parameter}`
parameters = {
    'scaler__with_mean': [True, False],
    'clf__n_neighbors': [1, 3, 5],
    'clf__metric': ['minkowski', 'manhattan'],
    'clf__weights': ['uniform', 'distance']
}

cv = GridSearchCV(pipeline, param_grid=parameters)

cv.fit(X_train.select_dtypes('float64'), y_train)
y_pred = cv.predict(X_test.select_dtypes('float64'))
print(y_pred)

['Adelie' 'Gentoo' 'Adelie' 'Chinstrap' 'Adelie' 'Gentoo' 'Gentoo'
 'Chinstrap' 'Chinstrap' 'Chinstrap' 'Adelie' 'Adelie' 'Gentoo' 'Adelie'
 'Gentoo' 'Adelie' 'Adelie' 'Chinstrap' 'Adelie' 'Gentoo' 'Adelie'
 'Adelie' 'Gentoo' 'Chinstrap' 'Adelie' 'Adelie' 'Gentoo' 'Gentoo'
 'Chinstrap' 'Gentoo' 'Chinstrap' 'Gentoo' 'Adelie' 'Adelie' 'Gentoo'
 'Gentoo' 'Chinstrap' 'Gentoo' 'Adelie' 'Adelie' 'Adelie' 'Adelie'
 'Chinstrap' 'Chinstrap' 'Adelie' 'Adelie' 'Gentoo' 'Adelie' 'Adelie'
 'Gentoo' 'Adelie' 'Gentoo' 'Gentoo' 'Adelie' 'Adelie' 'Gentoo'
 'Chinstrap' 'Adelie' 'Chinstrap' 'Chinstrap' 'Gentoo' 'Gentoo' 'Gentoo'
 'Adelie' 'Adelie' 'Gentoo' 'Adelie' 'Gentoo' 'Adelie' 'Gentoo' 'Adelie'
 'Adelie' 'Gentoo' 'Gentoo' 'Gentoo' 'Chinstrap' 'Adelie' 'Adelie'
 'Adelie' 'Gentoo' 'Chinstrap' 'Adelie' 'Chinstrap' 'Adelie' 'Gentoo'
 'Gentoo' 'Adelie' 'Gentoo' 'Chinstrap' 'Chinstrap' 'Gentoo' 'Gentoo'
 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Adelie' 'Chinstrap' 'Adelie'
 'Adelie' 'Adelie' 'Gentoo' 'Gentoo

# Grid Search Exercise

Use a classifier of your choice to predict the category of price range for the phones in this dataset. Try tuning some hyperparameters using a grid search, and then write up a short paragraph about your findings.

In [60]:
phones_train = pd.read_csv('data/train.csv')
phones_test = pd.read_csv('data/test.csv')

# Level Up: Random Searching

It is also possible to search for good hyperparameter values randomly. This is a nice choice if computation time is an issue or if you are tuning over continuous hyperparameters.

### `RandomizedSearchCV` with `LogisticRegression`

In [61]:
log_reg_grid = {'C': stats.uniform(loc=0, scale=10),
               'l1_ratio': stats.expon(scale=0.2)}

In [62]:
rs = RandomizedSearchCV(estimator=LogisticRegression(penalty='elasticnet',
                                                    solver='saga',
                                                    max_iter=1000,
                                            random_state=42),
                        param_distributions=log_reg_grid,
                       random_state=42)

rs.fit(X_train_clean, y_train)

rs.best_params_

{'C': 3.745401188473625, 'l1_ratio': 0.6020242861835042}

# Level Up: SMOTE

Often we encounter a problem of imbalance classification that there are too few observations of the minority class for a model to effectively learn the decision boundary. 

One way to solve this problem is to **oversample** the observations in the minority class (or alternatively **undersample** the observations in the majority class) by synthesizing new observation from the minority class.

The most widely used approach to synthesizing new observations is called the **Synthetic Minority Oversampling Technique**, or **SMOTE** for short. 

Resource: https://arxiv.org/abs/1106.1813

Before getting into the example, please note the following,

1. Oversampling process is based on **k-nearest neighbors** of the minority class.
2. Oversampling only works with **numerical predictors** since the synthetic observations are created based on the k-nearest neighbors algorithm, which is a distance based algorithm.  

We use the Scikit-Learn's breast cancer dataset to demonstrate the use of SMOTE from imblearn package.

In [63]:
from sklearn.datasets import load_breast_cancer

# Load the data
preds, target = load_breast_cancer(return_X_y=True)

# Split into train and test
X_train, X_test, y_train, y_test = train_test_split(preds, target,
                                                   random_state=42)

In [64]:
# Import imblearn dependencies
from imblearn.over_sampling import SMOTE
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline as imbPipeline
from collections import Counter
from matplotlib import pyplot
from numpy import where

Note that we have an imbalance class for the target variable in this dataset.

In [65]:
# Check the class distribution of the target
counter = Counter(y_train)
print(counter)

# scatter plot of examples by class label
for label, _ in counter.items():
    row_ix = where(y_train == label)[0]
    pyplot.scatter(X_train_sc[row_ix, 0], X_train_sc[row_ix, 1], label=str(label))
pyplot.legend()
pyplot.show()

Counter({1: 268, 0: 158})


NameError: name 'X_train_sc' is not defined

In [None]:
# Create the oversampler and undersampler objects
over = SMOTE(sampling_strategy=0.7)
under = RandomUnderSampler(sampling_strategy=0.8)

# transform the dataset
X, y = under.fit_resample(X_train_sc, y_train)

After the oversampling and undersampling process, we observe a more balance class distribution in the target variable.

In [66]:
# Check the class distribution of the target
counter = Counter(y)
print(counter)

# scatter plot of examples by class label
for label, _ in counter.items():
    row_ix = where(y == label)[0]
    pyplot.scatter(X_train_sc[row_ix, 0], X_train_sc[row_ix, 1], label=str(label))
pyplot.legend()
pyplot.show()

Counter({'Adelie': 146, 'Gentoo': 119, 'Chinstrap': 68})


NameError: name 'X_train_sc' is not defined

In [67]:
# Create imblearn pipeline for the oversampler and undersampler
steps = [('o', over), ('u', under),('scale', StandardScaler()),
         ('model', DecisionTreeClassifier())]
pipeline = imbPipeline(steps=steps)


# Create the GridSearchCV object with different hyperparameters
parameters = {
    'model__max_depth': [2, 4, 6, 8],
    'model__min_samples_split': [5, 10, 15],
    'model__criterion': ['gini', 'entropy']
}

cv = GridSearchCV(pipeline, param_grid=parameters)

cv.fit(X_train, y_train)

# Predict the label with the best model
y_pred = cv.predict(X_test)
print(y_pred)

NameError: name 'over' is not defined

[CV 1/5] END metric=minkowski, n_neighbors=1, weights=distance;, score=1.000 total time=   0.0s
[CV 5/5] END metric=manhattan, n_neighbors=1, weights=uniform;, score=1.000 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=1, weights=uniform;, score=1.000 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=3, weights=uniform;, score=1.000 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=3, weights=uniform;, score=1.000 total time=   0.0s
[CV 1/5] END metric=minkowski, n_neighbors=3, weights=distance;, score=1.000 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=3, weights=distance;, score=1.000 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=3, weights=distance;, score=1.000 total time=   0.0s
[CV 1/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=1.000 total time=   0.0s
[CV 4/5] END metric=minkowski, n_neighbors=5, weights=uniform;, score=1.000 total time=   0.0s
[CV 2/5] END metric=minkowski, n_neighbors=5, 