# **PipeLine**

Compare the models LogisticRegression, DecisionTreeClassifier,RandomForestClassifier using Pipeline

In [1]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

In [2]:
#load data
iris_df=load_iris()
#Train-Test split
X_train,X_test,y_train,y_test=train_test_split(iris_df.data,iris_df.target,test_size=0.3,random_state=0)


In [3]:
## Pipelines Creation
## 1. Data Preprocessing by using Standard Scaler
## 2. Reduce Dimension using PCA
## 3. Apply  Classifier
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))]) # For LogisticRegression
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier())]) # For DecisionTreeClassifier
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier())])  # For RandomForestClassifier         

In [4]:
## Make the list of pipelines created above
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]
# Fit the pipelines
for pipe in pipelines:
	pipe.fit(X_train, y_train)

In [5]:
#Score comparison of the different models


# Dictionary of pipelines and classifier types for ease of reference
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}
for i,model in enumerate(pipelines):
    print("{} Test Accuracy: {}".format(pipe_dict[i],model.score(X_test,y_test)))

Logistic Regression Test Accuracy: 0.8666666666666667
Decision Tree Test Accuracy: 0.9111111111111111
RandomForest Test Accuracy: 0.9111111111111111


# **Pipeline & GridSearchCV** 

In [6]:
from sklearn.pipeline import make_pipeline
# Create a pipeline
pipe = make_pipeline((RandomForestClassifier()))

In [7]:
from sklearn.model_selection import GridSearchCV
# Create dictionary with model and their hyperparameters
grid_param = [
                {"randomforestclassifier": [RandomForestClassifier()],
                 "randomforestclassifier__n_estimators": [10, 50, 100],
                 "randomforestclassifier__max_depth":[5,8,15,25,30,None],
                 "randomforestclassifier__min_samples_leaf":[1,2,5,10,15,30],
                 "randomforestclassifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
model = gridsearch.fit(X_train,y_train)

In [8]:
#Best Parameters
model.best_params_

{'randomforestclassifier': RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                        criterion='gini', max_depth=5, max_features='auto',
                        max_leaf_nodes=10, max_samples=None,
                        min_impurity_decrease=0.0, min_impurity_split=None,
                        min_samples_leaf=10, min_samples_split=2,
                        min_weight_fraction_leaf=0.0, n_estimators=50,
                        n_jobs=None, oob_score=False, random_state=None,
                        verbose=0, warm_start=False),
 'randomforestclassifier__max_depth': 5,
 'randomforestclassifier__max_leaf_nodes': 10,
 'randomforestclassifier__min_samples_leaf': 10,
 'randomforestclassifier__n_estimators': 50}

In [9]:
# Best score
model.best_score_

0.9619047619047618