## Pipeline Implementation Image recognition [To Fix it]

In [6]:
import pandas as pd
import numpy  as np
import matplotlib.pyplot as plt
import seaborn as sns

import os
from pathlib   import Path
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.metrics         import accuracy_score, make_scorer, roc_auc_score
from sklearn.linear_model    import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline        import Pipeline

from sklearn.ensemble          import RandomForestClassifier
from sklearn.feature_selection import SelectKBest, f_classif

In [2]:
# global setting
plt.style.use('ggplot')

### Load dataset

In [11]:
dataset = load_breast_cancer()

In [12]:
X, y  = dataset.data, dataset.target
# Split the data into train and test, with 20% as test
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 1, stratify = y)

## Hyperparameter tuning using `Pipeline`

In [None]:
# Create pipeline with feature selector and classifier
pipe = Pipeline([
    ('feature_selection', SelectKBest(score_func = f_classif, k = 20)),
    ('clf', RandomForestClassifier(random_state = 2))])

# Create a parameter grid
params = {
   'feature_selection__k':[100, 150],
   'clf__n_estimators':[2, 5, 10, 50, 100, 150]}

# Initialise the grid search object
grid_search = GridSearchCV(estimator = pipe, param_grid = params, cv = 3)

# Fit it to the data and print the best value combination
print(grid_search.fit(X_train, y_train).best_params_)

In [15]:
# Print the tuned parameters and score
print(f"o Tuned Random Forest Parameters: {grid_search.best_params_}") 
print(f"o Best score is {round(grid_search.best_score_ * 100, 2)} %")

o Tuned Random Forest Parameters: {'clf__n_estimators': 150, 'feature_selection__k': 150}
o Best score is 58.75 %


## Pipeline with custom scores

**Using sklearn function**

In [None]:
# Create a custom scorer
scorer = make_scorer(roc_auc_score)

# Initialize the CV object
gs = GridSearchCV(pipe, param_grid = params, scoring=scorer)

# Fit it to the data and print the winning combination
print(gs.fit(X_train, y_train).best_params_)

**Using custom function**

In [None]:
def my_metric(y_test, y_est, cost_fp=10.0, cost_fn=1.0):
    tn, fp, fn, tp = confusion_matrix(y_test, y_est).ravel()
    return cost_fp * fp + cost_fn * fn

In [None]:
# Create a custom scorer
scorer = make_scorer(my_metric)

# Initialise the CV object
gs = GridSearchCV(pipe, param_grid=params, scoring=scorer)

# Fit it to the data and print the winning combination
print(gs.fit(X_train, y_train).best_params_)