# Pipelines

In [11]:
import numpy as np
from random import randint
from sklearn.datasets import make_classification

def get_data(n_features=20, n_samples=2000, n_missing=100):
    """Generate a synthetic binary-classification dataset with missing values.

    The features and labels come from ``make_classification`` with a fixed
    ``random_state`` of 37. Afterwards ``n_missing`` distinct cells of the
    feature matrix are overwritten with ``np.nan``. The cell positions are
    drawn with the standard ``random`` module, so call ``random.seed``
    beforehand if they need to be reproducible.

    :param n_features: number of feature columns to generate
    :param n_samples: number of rows to generate
    :param n_missing: number of distinct cells to set to NaN; values <= 0
        leave the data untouched
    :return: tuple ``(X, y)`` of the feature matrix (with NaNs injected) and
        the class labels
    :raises ValueError: if ``n_missing`` exceeds ``n_samples * n_features``
    """
    # Function-scope import: the notebook only imports ``randint`` from random.
    from random import sample

    n_cells = n_samples * n_features
    if n_missing > n_cells:
        raise ValueError(
            'n_missing ({}) cannot exceed the number of cells ({})'.format(
                n_missing, n_cells))

    X, y = make_classification(
        n_samples=n_samples,
        n_features=n_features,
        n_informative=2,
        n_redundant=2,
        n_repeated=0,
        n_classes=2,
        n_clusters_per_class=2,
        random_state=37,
    )

    if n_missing > 0:
        # Sampling flat indices without replacement guarantees exactly
        # n_missing distinct cells; divmod maps each one back to (row, col).
        for flat_index in sample(range(n_cells), n_missing):
            i, j = divmod(flat_index, n_features)
            X[i, j] = np.nan

    return X, y

# Build the dataset used by the cells below.
# To check how many missing values X contains, run:
#   np.count_nonzero(np.isnan(X))
X, y = get_data()

In [14]:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import IterativeImputer

# Individual pipeline steps. Every stochastic step is seeded with 37 so that
# refitting on the same X and y gives the same result.
imputer = IterativeImputer(missing_values=np.nan, random_state=37)
scaler = StandardScaler()
pca = PCA(n_components=3, random_state=37)
rf = RandomForestClassifier(n_estimators=100, random_state=37)

# Steps are applied in order: fill in NaNs, standardize the features,
# project onto 3 principal components, then classify with the random forest.
pipeline = Pipeline([
    ('imputer', imputer),
    ('scaler', scaler),
    ('pca', pca),
    ('rf', rf),
])

pipeline.fit(X, y)

Pipeline(memory=None,
         steps=[('imputer',
                 IterativeImputer(add_indicator=False, estimator=None,
                                  imputation_order='ascending',
                                  initial_strategy='mean', max_iter=10,
                                  max_value=None, min_value=None,
                                  missing_values=nan, n_nearest_features=None,
                                  random_state=37, sample_posterior=False,
                                  tol=0.001, verbose=0)),
                ('scaler',
                 StandardScaler(copy=True, with_mean=True, with_std=T...
                 RandomForestClassifier(bootstrap=True, class_weight=None,
                                        criterion='gini', max_depth=None,
                                        max_features='auto',
                                        max_leaf_nodes=None,
                                        min_impurity_decrease=0.0,
                           