# Pipeline

### Simple pipeline

In [5]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import load_iris

# Iris data as a (features, labels) pair.
X, y = load_iris(return_X_y=True)

# Two-step pipeline; steps run in the order listed:
#   1. 'scaler' - StandardScaler applied to the features
#   2. 'rf'     - RandomForestClassifier trained on the scaled features
pipeline = Pipeline(
    steps=[
        ('scaler', StandardScaler()),
        ('rf', RandomForestClassifier()),
    ]
)

# Train the whole pipeline (scaler, then classifier) on the data.
pipeline.fit(X, y)

### Numerical and categorical variables

In [4]:
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.ensemble import RandomForestClassifier
from sklearn.datasets import fetch_openml

# Load a sample dataset
# Load the Titanic dataset from OpenML as a (DataFrame, Series) pair.
# `parser` is passed explicitly: scikit-learn 1.2/1.3 emit a FutureWarning
# when it is left at its default, because that default changes in 1.4.
X, y = fetch_openml(
    'titanic', version=1, as_frame=True, return_X_y=True, parser='auto'
)

# Columns fed to the model, grouped by the preprocessing they need.
numeric_features = ['age', 'fare']
categorical_features = ['sex', 'embarked']

# Keep only rows where every modelled column is present. The result is
# assigned to a new frame rather than dropped in place, so the object
# returned by the loader is not mutated. The target is then realigned by
# label (.loc) to the rows that survived.
X = X.dropna(subset=numeric_features + categorical_features)
y = y.loc[X.index]

# Numeric columns: standardise.
numeric_transformer = Pipeline(steps=[
    ('scaler', StandardScaler())
])

# Categorical columns: one-hot encode. handle_unknown='ignore' keeps
# transform() from raising on categories that were not seen during fit.
categorical_transformer = Pipeline(steps=[
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])

# Route each column group to its transformer. Columns not listed here are
# dropped (ColumnTransformer's default remainder='drop').
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ])

# Full pipeline: preprocessing followed by the classifier.
pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('classifier', RandomForestClassifier())
])

# Fit preprocessing and classifier together on the cleaned data.
pipeline.fit(X, y)

  warn(
