## Problem Statement
- Build a machine learning pipeline and optimize its hyperparameters

#### Prerequisite 
- Grid search parameter tuning
- Also called **hyperparameter optimization**
- scikit-learn provides **GridSearchCV()** class

#### Load Python libraries and dataset

In [None]:
import pandas as pd
from sklearn.preprocessing import StandardScaler, Normalizer, MinMaxScaler
from sklearn.model_selection import KFold
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import cross_val_score
from numpy import set_printoptions
from sklearn.pipeline import Pipeline

In [None]:
# Read the Pima Indians diabetes CSV into a DataFrame.
# NOTE(review): read_csv treats the first row as the header by default —
# confirm that this CSV actually has a header row.
data = pd.read_csv(filepath_or_buffer="../data/pima-indians-diabetes.csv")

#### Check Your Data

In [None]:
# Preview the first 5 rows of the dataset (DataFrame.head() defaults to n=5)
print(data.head())

### Separate input and target variables

In [None]:
# Separate input and target variables:
# columns 0-7 are the input features, column 8 is the target.
data_array = data.to_numpy()
X, y = data_array[:, :8], data_array[:, 8]


## <span style="color:red"> Grid Search Parameter Tuning</span>

- Grid search is used for parameter tuning
- It will evaluate the model for each combination of algorithm parameters specified in the grid.


### Problem Statement

- Create a classifier using **RandomForestClassifier** algorithm
- Tune the number of trees in the classifier and get the best parameter for the model.

class sklearn.ensemble.RandomForestClassifier(**n_estimators=100**, *, criterion='gini', max_depth=None, min_samples_split=2, min_samples_leaf=1, min_weight_fraction_leaf=0.0, max_features='auto', max_leaf_nodes=None, min_impurity_decrease=0.0, bootstrap=True, oob_score=False, n_jobs=None, random_state=None, verbose=0, warm_start=False, class_weight=None, ccp_alpha=0.0, max_samples=None)

In [None]:
# Load Python libraries
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

In [None]:
# Parameter grid: the candidate numbers of trees to evaluate
param_grid = {'n_estimators': [25, 50, 75, 95]}

# Base model. random_state is fixed so that every forest is built
# reproducibly; without it the scores (and therefore the selected
# n_estimators) can change from one run to the next.
clf = RandomForestClassifier(random_state=42)

# Evaluate each grid entry with 5-fold cross-validation and fit on (X, y)
grid = GridSearchCV(estimator=clf, param_grid=param_grid, cv=5)
grid.fit(X, y)

# Report the best mean cross-validated score and the estimator that achieved it
print(f'Best Score -> {grid.best_score_}')
print(f'Selected best estimator -> {grid.best_estimator_}')



## <span style="color:red"> ML Pipeline Tuning - 1</span>

### Problem Statement

- Create a pipeline, tune the hyperparameters of its workflow steps, and optimize the pipeline.
### [input] - [standardize] - [feature-selection] - [classifier] - [predictions]

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier

# Pipeline workflow: scale -> select the k best features -> classify.
# The 'scaler' step is replaced by each candidate listed in the grid below.
# random_state is fixed on the forest so the search result is reproducible
# instead of varying with the random seeding of each run.
estimator = [
    ('scaler', StandardScaler()),
    ('select_best', SelectKBest()),
    ('clf', RandomForestClassifier(random_state=42)),
]

# Instantiate the Pipeline class with the workflow steps
pipe = Pipeline(estimator)

# Grid for the pipeline: a bare step name swaps the whole step, while
# '<step>__<param>' tunes one parameter of that step.
parameters = {
    'scaler': [StandardScaler(), MinMaxScaler(), Normalizer()],
    'select_best__k': [4, 5, 6],
    'clf__n_estimators': [25, 50, 75, 95],
}

# Instantiate GridSearchCV() with 5-fold cross-validation
grid = GridSearchCV(pipe, parameters, cv=5)

# Fit the grid
grid.fit(X, y)

# Access the best set of parameters
best_params = grid.best_params_
print(f'Best parameters for workflows \n==============================\n{best_params}\n')

# Store the optimum pipeline in best_pipe
best_pipe = grid.best_estimator_
print(f'Best Pipeline\n============== \n{best_pipe}')


In [None]:
# Analyze results
# cv_results_ maps column names to per-candidate values, so it converts
# directly into a DataFrame for easier inspection.
df = pd.DataFrame(grid.cv_results_)

# List the available result columns
print(df.columns)

In [None]:
# Remove pandas' row/column display limits so the whole results table prints
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print(df)

In [None]:
import matplotlib.pyplot as plt
import seaborn as sns

# One panel per value of k (number of selected features); within each panel,
# one line per scaler showing the mean test score against the number of trees.
sns.relplot(
    x='param_clf__n_estimators',
    y='mean_test_score',
    hue='param_scaler',
    col='param_select_best__k',
    kind='line',
    data=df,
)
plt.show()

## <span style="color:red"> ML Pipeline Tuning  - 2 </span>

### Problem Statement

- Create a pipeline, tune the hyperparameters of its workflow steps, and optimize the pipeline.
### [input] - [standardize] - [feature-selection] - [classifier] - [predictions]

In [None]:
from sklearn.feature_selection import SelectKBest
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression

# Pipeline workflow: scale -> select the k best features -> classify.
# Both the 'scaler' and the 'clf' steps are replaced by the candidates listed
# in the grid below. random_state is fixed on the decision trees because tree
# fitting is randomized; without it the tree-vs-logistic-regression
# comparison is not reproducible between runs.
estimator2 = [
    ('scaler', StandardScaler()),
    ('select_best', SelectKBest()),
    ('clf', DecisionTreeClassifier(random_state=42)),
]

# Instantiate the Pipeline class with the workflow steps
pipe2 = Pipeline(estimator2)

# Grid for the pipeline: here the search is over whole steps (which scaler,
# which classifier) plus the number of selected features.
parameters2 = {
    'scaler': [StandardScaler(), MinMaxScaler(), Normalizer()],
    'select_best__k': [4, 5, 6],
    'clf': [DecisionTreeClassifier(random_state=42), LogisticRegression()],
}

# Instantiate GridSearchCV() with 5-fold cross-validation
grid2 = GridSearchCV(pipe2, parameters2, cv=5)

# Fit the grid
grid2.fit(X, y)

# Access the best set of parameters
best_params = grid2.best_params_
print(f'Best parameters for workflows \n==============================\n{best_params}\n')

# Store the optimum pipeline in best_pipe
best_pipe = grid2.best_estimator_
print(f'Best Pipeline\n============== \n{best_pipe}')

In [None]:
# Analyze results
# cv_results_ maps column names to per-candidate values, so it converts
# directly into a DataFrame for easier inspection.
df2 = pd.DataFrame(grid2.cv_results_)

# List the available result columns
print(df2.columns)

In [None]:
# Remove pandas' row/column display limits so the whole results table prints
pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)
print(df2)