# Pipelines

In [1]:
import numpy as np

np.random.seed(42)

# Test data
X = np.random.rand(1000, 2)
Y = (X[:, 0] + X[:, 1]) > 1

X.shape, Y.shape

((1000, 2), (1000,))

In [2]:
from sklearn.model_selection import train_test_split

x_nontest, x_test, y_nontest, y_test = train_test_split(
    X, Y, test_size=0.3, stratify=Y, shuffle=True, random_state=42
)

x_nontest.shape, x_test.shape, y_nontest.shape, y_test.shape

((700, 2), (300, 2), (700,), (300,))

In [3]:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score

from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.linear_model import LogisticRegression

In [4]:
rf_pl = Pipeline([
    ("scaler", StandardScaler()), 
    ("classifier", RandomForestClassifier())
])
rf_grid_search_params = {
    "classifier__n_estimators": [50, 100, 150],
    "classifier__criterion": ["entropy", "gini"]
}
rf_gs = GridSearchCV(rf_pl, rf_grid_search_params, n_jobs=-1)
rf_gs.fit(x_nontest, y_nontest)

print("Best parameters", rf_gs.best_params_)
print("CrossVal accuracy", rf_gs.best_score_)
print("Test Set accuracy", accuracy_score(y_test, rf_gs.predict(x_test)))

Best parameters {'classifier__criterion': 'gini', 'classifier__n_estimators': 50}
CrossVal accuracy 0.97
Test Set accuracy 0.97


In [5]:
logreg_pl = Pipeline([
    ("scaler", StandardScaler()), 
    ("classifier", LogisticRegression())
])
logreg_grid_search_params = {
    "classifier__penalty": ['l1', 'l2', 'elasticnet', 'none'],
    "classifier__class_weight" : [None, "balanced"]
}
logreg_gs = GridSearchCV(logreg_pl, logreg_grid_search_params, n_jobs=-1)
logreg_gs.fit(x_nontest, y_nontest)

print("Best parameters", logreg_gs.best_params_)
print("CrossVal accuracy", logreg_gs.best_score_)
print("Test Set accuracy", accuracy_score(y_test, logreg_gs.predict(x_test)))

Best parameters {'classifier__class_weight': None, 'classifier__penalty': 'none'}
CrossVal accuracy 0.9971428571428571
Test Set accuracy 1.0
