## Pipeline in Sklearn

In [24]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
# from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

import numpy as np

In [4]:
iris_df = load_iris()

In [6]:
iris_df.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [20]:
X_train, X_test, y_train, y_test = train_test_split(iris_df.data, iris_df.target, test_size=0.3, random_state=100)


### Pipeline Creation
1. Data Preprocessing by using Standard Scaler
2. Reduce Dimension using PCA
3. Apply Classifier

In [14]:
pipeline_lr = Pipeline([
    ("scaler1", StandardScaler()), 
    ("pca1", PCA(n_components=2)), 
    ("lr_classifier", LogisticRegression(random_state=0))
])

In [15]:
pipeline_dt = Pipeline([
    ("scaler2", StandardScaler()), 
    ("pca2", PCA(n_components=2)), 
    ("dt_classifier", DecisionTreeClassifier())
])

In [16]:
pipeline_randomforest = Pipeline([
    ("scaler3", StandardScaler()), 
    ("pca3", PCA(n_components=2)), 
    ("rf_classifier", RandomForestClassifier())
])

In [17]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [18]:
best_accuracy = 0.0
best_classifier = 0
best_pipeline = ""

In [19]:
pipe_dict = {0: "Logistic Regression", 1: "Decision Tree", 2: "RandomForest"}

for pipe in pipelines:
    pipe.fit(X_train, y_train)

In [21]:
for i, model in enumerate(pipelines):
    print(f"{pipe_dict[i]} Test Accuracy: {model.score(X_test, y_test)}")

Logistic Regression Test Accuracy: 0.9555555555555556
Decision Tree Test Accuracy: 0.9777777777777777
RandomForest Test Accuracy: 0.9555555555555556


In [22]:
for i, model in enumerate(pipelines):
    if model.score(X_test, y_test) > best_accuracy:
        best_accuracy = model.score(X_test, y_test)
        best_pipeline = model
        best_classifier = i

print(f"Classifier with best accuracy: {pipe_dict[best_classifier]}")

Classifier with best accuracy: Decision Tree


### Pipelines Perform Hyperparameter Tuning Using Grid SearchCV

* Voi gridSearch co the truyen vao pipeline hoac model cu the

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline

In [27]:
# Create a pipeline 
pipe = Pipeline([
    ("classifier", RandomForestClassifier())
])

grid_param = [
    {
        "classifier": [LogisticRegression()], 
        "classifier__penalty": ['l2', 'l1'], 
        "classifier__C": np.logspace(0, 4, 10)
    },
    {
        "classifier": [LogisticRegression()], 
        "classifier__penalty": ['l2'], 
        "classifier__C": np.logspace(0, 4, 10), 
        "classifier__solver": ["newton-cg", "saga", "sag", "liblinear"] # this solvers don't allow L1 penalty
    }, 
    {
        "classifier": [RandomForestClassifier()], 
        "classifier__n_estimators": [10, 100, 1000], 
        "classifier__max_depth": [5, 8, 15, 25, 30, None], 
        "classifier__min_samples_leaf": [1, 2, 5, 10, 15, 100], 
        "classifier__max_leaf_nodes": [2, 5, 10]
    }
]

gridsearch = GridSearchCV(
    pipe, 
    grid_param, 
    cv=5, # cross-validation (None: default 5-fold)
    verbose=0, 
    n_jobs=-1 # use all processor
)
best_model = gridsearch.fit(X_train, y_train)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver opt

In [29]:
print(best_model.best_estimator_)
print(f"The mean accuracy of the model is: ", best_model.score(X_test, y_test))

Pipeline(steps=[('classifier', LogisticRegression(solver='sag'))])
The mean accuracy of the model is:  1.0


In [31]:
# show parameter
best_model.best_params_

{'classifier': LogisticRegression(solver='sag'),
 'classifier__C': 1.0,
 'classifier__penalty': 'l2',
 'classifier__solver': 'sag'}

### GridSearchCV with SVM (or any Classification Model)

In [33]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# import the dataset
data = pd.read_csv("../../../data/Advertising_data.csv")
X = data.iloc[:, [2, 3]].values
y = data.iloc[:, 4].values

In [34]:
data.head()

Unnamed: 0,User ID,Gender,Age,EstimatedSalary,Purchased
0,15624510,Male,19.0,19000.0,0
1,15810944,Male,35.0,20000.0,0
2,15668575,Female,26.0,43000.0,0
3,15603246,Female,27.0,57000.0,0
4,15804002,Male,19.0,76000.0,0


In [35]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=5)

In [36]:
# feature scaling
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()

X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

In [42]:
from sklearn.svm import SVC
classifier = SVC(kernel="linear", random_state=0)
classifier.fit(X_train, y_train)

In [43]:
y_pred = classifier.predict(X_test)

In [45]:
classifier.score(X_test, y_test)

0.85

In [48]:
# making ther confusion matrix
from sklearn.metrics import confusion_matrix
cm = confusion_matrix(y_test, y_pred)

cm

array([[60,  6],
       [ 9, 25]])

In [50]:
from sklearn.metrics import accuracy_score
accuracy = accuracy_score(y_test, y_pred)
accuracy

0.85

In [70]:
## applying gridSearch to find the best model and the best parameter

from sklearn.model_selection import GridSearchCV

parameters = [
    {'C': [1, 10, 100, 1000], 'kernel': ['linear']}, 
    {
        'C': np.arange(1, 500, 1), 
        'kernel': ['rbf'], 
        'gamma': [0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
    }
]

grid_search = GridSearchCV(
    estimator=classifier, 
    param_grid=parameters, 
    scoring="accuracy", 
    cv=10, # Kfold
    n_jobs = -1, 
    verbose=3
)

grid_search = grid_search.fit(X_train, y_train)

Fitting 10 folds for each of 4495 candidates, totalling 44950 fits
[CV 1/10] END ...............C=1, kernel=linear;, score=0.833 total time=   0.0s
[CV 2/10] END ...............C=1, kernel=linear;, score=0.800 total time=   0.0s
[CV 4/10] END ...............C=1, kernel=linear;, score=0.900 total time=   0.0s
[CV 3/10] END ...............C=1, kernel=linear;, score=0.867 total time=   0.0s
[CV 5/10] END ...............C=1, kernel=linear;, score=0.733 total time=   0.0s
[CV 6/10] END ...............C=1, kernel=linear;, score=0.867 total time=   0.0s
[CV 7/10] END ...............C=1, kernel=linear;, score=0.800 total time=   0.0s
[CV 9/10] END ...............C=1, kernel=linear;, score=0.900 total time=   0.0s
[CV 8/10] END ...............C=1, kernel=linear;, score=0.767 total time=   0.0s
[CV 10/10] END ..............C=1, kernel=linear;, score=0.833 total time=   0.0s
[CV 1/10] END ..............C=10, kernel=linear;, score=0.833 total time=   0.0s
[CV 2/10] END ..............C=10, kernel=l

In [71]:
grid_search

In [72]:
best_accuracy = grid_search.best_score_

best_accuracy

0.9133333333333334

In [73]:
grid_search.best_estimator_

In [74]:
grid_search.best_params_

{'C': 5, 'gamma': 0.4, 'kernel': 'rbf'}

In [75]:
classifier = SVC(kernel="rbf", C=5, gamma=0.4)
classifier.fit(X_train, y_train)

In [76]:
classifier.score(X_test, y_test)

0.93