In [3]:
!pip install joblib



In [16]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
import joblib
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
iris_df=load_iris()

In [3]:
iris_df.data

array([[5.1, 3.5, 1.4, 0.2],
       [4.9, 3. , 1.4, 0.2],
       [4.7, 3.2, 1.3, 0.2],
       [4.6, 3.1, 1.5, 0.2],
       [5. , 3.6, 1.4, 0.2],
       [5.4, 3.9, 1.7, 0.4],
       [4.6, 3.4, 1.4, 0.3],
       [5. , 3.4, 1.5, 0.2],
       [4.4, 2.9, 1.4, 0.2],
       [4.9, 3.1, 1.5, 0.1],
       [5.4, 3.7, 1.5, 0.2],
       [4.8, 3.4, 1.6, 0.2],
       [4.8, 3. , 1.4, 0.1],
       [4.3, 3. , 1.1, 0.1],
       [5.8, 4. , 1.2, 0.2],
       [5.7, 4.4, 1.5, 0.4],
       [5.4, 3.9, 1.3, 0.4],
       [5.1, 3.5, 1.4, 0.3],
       [5.7, 3.8, 1.7, 0.3],
       [5.1, 3.8, 1.5, 0.3],
       [5.4, 3.4, 1.7, 0.2],
       [5.1, 3.7, 1.5, 0.4],
       [4.6, 3.6, 1. , 0.2],
       [5.1, 3.3, 1.7, 0.5],
       [4.8, 3.4, 1.9, 0.2],
       [5. , 3. , 1.6, 0.2],
       [5. , 3.4, 1.6, 0.4],
       [5.2, 3.5, 1.5, 0.2],
       [5.2, 3.4, 1.4, 0.2],
       [4.7, 3.2, 1.6, 0.2],
       [4.8, 3.1, 1.6, 0.2],
       [5.4, 3.4, 1.5, 0.4],
       [5.2, 4.1, 1.5, 0.1],
       [5.5, 4.2, 1.4, 0.2],
       [4.9, 3

In [4]:
x_train,x_test,y_train,y_test=train_test_split(iris_df.data,iris_df.target,test_size=0.3,random_state=42)

In [5]:
# Pipelines Creations
# 1) data Preprocessing by using Standard Scaler
# 2) Reduce dimension using PCA
# 3) Apply Classifier

In [6]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=42))])

In [7]:
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())])

In [8]:
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])

In [9]:
## List of Pipelines
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

In [10]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

In [11]:
pipe_dict= {0:'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

for pipe in pipelines:
    pipe.fit(x_train,y_train)

In [12]:
for i,model in enumerate(pipelines):
    print('{} Test Accuracy: {}'.format(pipe_dict[i],model.score(x_test,y_test)))

Logistic Regression Test Accuracy: 0.9111111111111111
Decision Tree Test Accuracy: 0.9555555555555556
RandomForest Test Accuracy: 0.9555555555555556


In [13]:
for i,model in enumerate(pipelines):
    if model.score(x_test,y_test)>best_accuracy:
        best_accuracy=model.score(x_test,y_test)
        best_pipeline=model
        best_classifier=i
print('Classifier with best accuracy:{}'.format(pipe_dict[best_classifier]))

Classifier with best accuracy:Decision Tree


### Pipeline Perform Hyperparameter Tuning Using Grid SearchCV

In [14]:
from sklearn.model_selection import GridSearchCV

In [17]:
# create a Pipeline
pipe = Pipeline([("classifier",RandomForestClassifier())])
grid_param = [
    {"classifier":[LogisticRegression()],
    "classifier__penalty": ['l1','l2'],
    "classifier__C": np.logspace(0,4,10)
    },
    {"classifier": [LogisticRegression()],
     "classifier__penalty": ['l2'],
     "classifier__C":np.logspace(0,4,10),
     "classifier__solver":['newton-cg','saga','sag','liblinear']
    },
    {"classifier":[RandomForestClassifier()],
     "classifier__n_estimators":[10,100,1000],
     "classifier__max_depth":[5,8,15,25,30,None],
     "classifier__min_samples_leaf":[1,2,5,10,15,100],
     "classifier__max_leaf_nodes":[2,10,5]}]

gridsearch = GridSearchCV(pipe,grid_param,cv=5,verbose=0,n_jobs=-1)
best_model = gridsearch.fit(x_train,y_train)

50 fits failed out of a total of 1920.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
50 fits failed with the following error:
Traceback (most recent call last):
  File "E:\Users\HP ELITEBOOK 810 G3\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 686, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "E:\Users\HP ELITEBOOK 810 G3\anaconda3\lib\site-packages\sklearn\pipeline.py", line 382, in fit
    self._final_estimator.fit(Xt, y, **fit_params_last_step)
  File "E:\Users\HP ELITEBOOK 810 G3\anaconda3\lib\site-packages\sklearn\linear_model\_logistic.py", line 1091, in fit
    solver = _check_solver(self.solver, self.penalty, self.dual)
  File "E:\Users\HP ELITEBOOK 810 G3\anaconda3\lib\

In [18]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(x_test,y_test))

Pipeline(steps=[('classifier', LogisticRegression(solver='sag'))])
The mean accuracy of the model is: 1.0
