<a href="https://colab.research.google.com/github/plaban1981/Pipelines/blob/master/Sckitlearn_PipeLine.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

##Creating Piplines using SKLEARN Machine Learning

In [0]:
import pandas as pd
import numpy as np
from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split

In [3]:
iris = load_iris()
iris.keys()

dict_keys(['data', 'target', 'target_names', 'DESCR', 'feature_names', 'filename'])

In [4]:
iris['feature_names']

['sepal length (cm)',
 'sepal width (cm)',
 'petal length (cm)',
 'petal width (cm)']

In [5]:
iris['target_names']

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [6]:
iris['target'][:5]

array([0, 0, 0, 0, 0])

In [0]:
df_iris = pd.DataFrame(iris['data'],columns= iris['feature_names'])

In [8]:
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2


In [0]:
df_iris['target'] = iris['target']

In [10]:
df_iris.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [11]:
df_iris.shape

(150, 5)

In [0]:
X = iris['data']
Y = iris['target']

## Split the sample data into training and test set

In [0]:
X_train,X_test,y_train,y_test = train_test_split(X,Y,test_size=0.15,random_state=2)

In [14]:
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(127, 4)
(23, 4)
(127,)
(23,)


## Pipelines Creation
* 1. Data Preprocessing by using Standard Scaler
* 2. Reduce Dimension using PCA
* 3. Apply  Classifier

In [0]:
pipeline_lr=Pipeline([('scalar1',StandardScaler()),
                     ('pca1',PCA(n_components=2)),
                     ('lr_classifier',LogisticRegression(random_state=0))])

In [0]:
pipeline_dt=Pipeline([('scalar2',StandardScaler()),
                     ('pca2',PCA(n_components=2)),
                     ('dt_classifier',DecisionTreeClassifier(random_state=0))])

In [0]:
pipeline_randomforest=Pipeline([('scalar3',StandardScaler()),
                     ('pca3',PCA(n_components=2)),
                     ('rf_classifier',RandomForestClassifier(random_state=0))])

##making list of pipelines

In [0]:
pipelines = [pipeline_lr, pipeline_dt, pipeline_randomforest]

##Initialization

In [0]:
best_accuracy=0.0
best_classifier=0
best_pipeline=""

### Dictionary of pipelines and classifier types for ease of reference

In [0]:
pipe_dict = {0: 'Logistic Regression', 1: 'Decision Tree', 2: 'RandomForest'}

##Fit the pipelines

In [0]:
for pipe in pipelines:
	pipe.fit(X_train,y_train)

##Evaluate the models in the pipeline

In [22]:
for i,model in enumerate(pipelines):
  print(f'{pipe_dict[i]} : model_accuracy {model.score(X_test,y_test)}')

Logistic Regression : model_accuracy 0.9130434782608695
Decision Tree : model_accuracy 0.9130434782608695
RandomForest : model_accuracy 0.9130434782608695


## Selecting Best Classifier

In [23]:
for i , model in enumerate(pipelines):
  if model.score(X_test,y_test) > best_accuracy:
    best_accuracy = model.score(X_test,y_test)
    best_pipeline = model
    best_classifier = pipe_dict[i]
print(f'The classifier with best accuracy : {best_classifier}')

The classifier with best accuracy : Logistic Regression


##Pipelines Perform Hyperparameter Tuning Using Grid SearchCV

In [0]:
from sklearn.model_selection import GridSearchCV

In [0]:

# Create a pipeline
pipe = Pipeline([("classifier", RandomForestClassifier())])
# Create dictionary with candidate learning algorithms and their hyperparameters
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2'],
                 "classifier__C": np.logspace(0, 4, 10),
                 "classifier__solver":['newton-cg','saga','sag','liblinear'] ##This solvers don't allow L1 penalty
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

In [26]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

Pipeline(memory=None,
         steps=[('classifier',
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
                                        class_weight=None, criterion='gini',
                                        max_depth=25, max_features='auto',
                                        max_leaf_nodes=10, max_samples=None,
                                        min_impurity_decrease=0.0,
                                        min_impurity_split=None,
                                        min_samples_leaf=15,
                                        min_samples_split=2,
                                        min_weight_fraction_leaf=0.0,
                                        n_estimators=10, n_jobs=None,
                                        oob_score=False, random_state=None,
                                        verbose=0, warm_start=False))],
         verbose=False)
The mean accuracy of the model is: 0.9565217391304348


In [27]:
# Create a pipeline
pipe = Pipeline([("classifier", LogisticRegression())])
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=0,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)



In [28]:
print(best_model.best_estimator_)
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

Pipeline(memory=None,
         steps=[('classifier',
                 LogisticRegression(C=1.0, class_weight=None, dual=False,
                                    fit_intercept=True, intercept_scaling=1,
                                    l1_ratio=None, max_iter=100,
                                    multi_class='auto', n_jobs=None,
                                    penalty='l2', random_state=None,
                                    solver='saga', tol=0.0001, verbose=0,
                                    warm_start=False))],
         verbose=False)
The mean accuracy of the model is: 1.0


In [0]:
grid_param = [
                {"classifier": [LogisticRegression()],
                 "classifier__penalty": ['l2','l1'],
                 "classifier__C": np.logspace(0, 4, 10)
                 },
                {"classifier":[DecisionTreeClassifier(random_state=0)],
                  "classifier__max_depth": [5,8,15,25,30,None],
                  "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 },
                {"classifier": [RandomForestClassifier()],
                 "classifier__n_estimators": [10, 100, 1000],
                 "classifier__max_depth":[5,8,15,25,30,None],
                 "classifier__min_samples_leaf":[1,2,5,10,15,100],
                 "classifier__max_leaf_nodes": [2, 5,10]}]

In [66]:
# Create a pipeline
pipe = Pipeline([("classifier", DecisionTreeClassifier(random_state=2))])
# create a gridsearch of the pipeline, the fit the best model
gridsearch = GridSearchCV(pipe, grid_param, cv=5, verbose=2,n_jobs=-1) # Fit grid search
best_model = gridsearch.fit(X_train,y_train)

Fitting 5 folds for each of 380 candidates, totalling 1900 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done  92 tasks      | elapsed:    2.6s
[Parallel(n_jobs=-1)]: Done 512 tasks      | elapsed:  1.9min
[Parallel(n_jobs=-1)]: Done 715 tasks      | elapsed:  3.7min
[Parallel(n_jobs=-1)]: Done 998 tasks      | elapsed:  6.1min
[Parallel(n_jobs=-1)]: Done 1363 tasks      | elapsed:  9.1min
[Parallel(n_jobs=-1)]: Done 1808 tasks      | elapsed: 12.9min
[Parallel(n_jobs=-1)]: Done 1900 out of 1900 | elapsed: 13.7min finished


In [67]:
print("The mean accuracy of the model is:",best_model.score(X_test,y_test))

The mean accuracy of the model is: 0.9130434782608695


In [73]:
model_score = {}
classifier = [ LogisticRegression(),DecisionTreeClassifier(),RandomForestClassifier()]
for i,model in enumerate(classifier):
  print(model)
  pipe =Pipeline([('classifier',model)])
  gridsearchcv = GridSearchCV(pipe, grid_param[i], cv=5, verbose=0,n_jobs=4)
  gridsearchcv.fit(X_train,y_train)
  model_score[pipe_dict[i]] = gridsearchcv.score(X_test,y_test)
  print('*'*80)
  print(model_score)


LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='auto', n_jobs=None, penalty='l2',
                   random_state=None, solver='lbfgs', tol=0.0001, verbose=0,
                   warm_start=False)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


********************************************************************************
{'Logistic Regression': 0.9565217391304348}
DecisionTreeClassifier(ccp_alpha=0.0, class_weight=None, criterion='gini',
                       max_depth=None, max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort='deprecated',
                       random_state=None, splitter='best')
********************************************************************************
{'Logistic Regression': 0.9565217391304348, 'Decision Tree': 0.9130434782608695}
RandomForestClassifier(bootstrap=True, ccp_alpha=0.0, class_weight=None,
                       criterion='gini', max_depth=None, max_features='auto',
                       max_leaf_nodes=None, max_samples=None,
                       min_impurity_decrease=0.0, min_impurity_split=None

In [76]:
import pandas as pd
score = []
model = []
for k,v in model_score.items():
  score.append(v)
  model.append(k)

df_score = pd.DataFrame({'Model':model,'Accuracy':score})
df_score

Unnamed: 0,Model,Accuracy
0,Logistic Regression,0.956522
1,Decision Tree,0.913043
2,RandomForest,0.956522
