# <center> Pipelines </center>

In [None]:
from sklearn.pipeline import Pipeline

In [None]:
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.externals import joblib
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
import pandas as pd

In [None]:
## Load the iris dataset and preview it as a DataFrame.
iris = load_iris()
X, y = iris.data, iris.target

# Feature table with the target values attached as the 'iris' column.
df = pd.DataFrame(data=X, columns=iris.feature_names).assign(iris=y)
df.head()  # last expression in the cell, so the notebook renders the first rows

In [None]:
## Hold out 20% of the samples for testing; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=42,
    test_size=0.2,
)

In [None]:
## Candidate classifiers, each constructed with the same random seed.
random_state = 4
classifiers = [
    estimator_cls(random_state=random_state)
    for estimator_cls in (LogisticRegression, SVC, DecisionTreeClassifier)
]

In [None]:
## Hyperparameter grids for the grid search, one per classifier.
## Every key is prefixed with 'clf__' so it addresses the 'clf' pipeline step,
## and the list order must match the order of the classifier list.

# Logistic Regression
logreg_grid = {'clf__C': [0.25, 0.50, 0.75, 1.0]}

# SVM
svm_grid = {
    'clf__kernel': ['poly', 'sigmoid'],
    'clf__C': [0.01, 1, 100],
    'clf__degree': [2, 3, 4, 5],
    'clf__gamma': [0.001, 0.01],
}

# Decision Tree
tree_grid = {
    'clf__criterion': ['gini', 'entropy'],
    'clf__max_depth': [1, 2, 3, 4],
    'clf__min_impurity_decrease': [0, 0.25, 0.50, 0.75],
}

params = [logreg_grid, svm_grid, tree_grid]

In [None]:
## Tune every classifier inside a scale -> PCA -> classify pipeline and report
## the selected parameters together with train/test accuracy.
pipelines = []   # the pipeline object handed to each grid search
best_clfs = []   # best_estimator_ of each grid search
for clf, param in zip(classifiers, params):
    steps = [
        ('scl', StandardScaler()),
        ('pca', PCA(n_components=2)),
        ('clf', clf),
    ]
    pipe = Pipeline(steps)

    # 3-fold cross-validated search over this classifier's grid, ranked by accuracy.
    gs = GridSearchCV(pipe, param, scoring='accuracy', cv=3)
    gs.fit(X_train, y_train)

    best_model = gs.best_estimator_
    best_clfs.append(best_model)
    pipelines.append(pipe)

    print(type(clf).__name__)
    print('Best params:', gs.best_params_)
    print('Train Accuracy:', best_model.score(X_train, y_train))
    print('Test Accuracy:', best_model.score(X_test, y_test))
    print('')

# <center> Activity </center>

In [None]:
from sklearn.datasets import make_classification
# Synthetic 5-class problem for the activity: 11000 samples by 5000 features,
# every feature informative. Note that this rebinds X and y.
X, y = make_classification(
    # dataset size
    n_samples=11000,
    n_features=5000,
    n_informative=5000,
    n_redundant=0,
    # class structure
    n_classes=5,
    n_clusters_per_class=3,
    class_sep=0.5,
    # feature scaling and reproducibility
    scale=None,
    random_state=4,
)