<h3>Pipeline Example</h3>
Scikit-learn Pipelines Explained: Streamline and Optimize Your Machine Learning Processes

In [3]:
import numpy as np, pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.model_selection import train_test_split

In [11]:
# Toy dataset: two categorical and two numeric features, each containing one
# missing value, plus a binary label column.
data = dict(
    state=["CA", "WA", "CA", np.nan, "NV", "WA"],
    gender=["male", "female", "female", "male", np.nan, "female"],
    age=[34, 29, 22, 44, 55, np.nan],
    weight=[122, 150, 130, np.nan, 140, 175],
    target=[0, 1, 0, 1, 0, 1],
)
df = pd.DataFrame(data)

# Separate the label from the feature matrix.
y = df["target"]
X = df.drop(columns=["target"])

X

Unnamed: 0,state,gender,age,weight
0,CA,male,34.0,122.0
1,WA,female,29.0,150.0
2,CA,female,22.0,130.0
3,,male,44.0,
4,NV,,55.0,140.0
5,WA,female,,175.0


In [12]:
# Categorical branch: replace missing entries with the literal "missing",
# then one-hot encode; handle_unknown="ignore" is set on the encoder.
categorical_preprocessor = Pipeline(
    [
        ("imputation_constant", SimpleImputer(strategy="constant", fill_value="missing")),
        ("onehot", OneHotEncoder(handle_unknown="ignore")),
    ]
)

# Numeric branch: fill NaNs with the column mean, then apply StandardScaler.
numeric_preprocessor = Pipeline(
    [
        ("imputation_mean", SimpleImputer(strategy="mean", missing_values=np.nan)),
        ("scaler", StandardScaler()),
    ]
)

# Route each group of columns through its own branch.
preprocessor = ColumnTransformer(
    transformers=[
        ("categorical", categorical_preprocessor, ["state", "gender"]),
        ("numerical", numeric_preprocessor, ["age", "weight"]),
    ]
)

# Full model: preprocessing followed by a logistic-regression classifier.
pipe = make_pipeline(preprocessor, LogisticRegression(max_iter=500))
pipe  # click on the diagram below to see the details of each step

Train the pipeline and predict on a held-out test set

In [13]:
# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split reproducible across runs.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, random_state=42, test_size=0.2
)

# Fit on the training rows, then predict labels for the held-out rows.
predictions = pipe.fit(X_train, y_train).predict(X_test)

print("Actual values:", y_test.to_numpy())
print("Predicted values:", predictions)


Actual values: [0 1]
Predicted values: [0 1]


In [14]:
# Accuracy is the fraction of held-out rows whose predicted label equals the
# true label. An element-wise array comparison replaces the hand-written
# counting loop, and the unused `accuracy = 0` initialiser is dropped.
matches = y_test.values == predictions
correct = int(matches.sum())  # plain int, as the loop counter was
accuracy = correct / len(y_test)
print("Accuracy:", accuracy)

Accuracy: 1.0
