## Pipelines

Pipelines in scikit-learn offer a convenient way to build models. Typically, building a model involves several steps: imputing missing values using an imputer, applying PCA for dimensionality reduction, building the model, and then making predictions. Instead of performing these steps separately, you can combine them into a single pipeline, which applies each step in order whenever the pipeline is fitted or used for prediction.

In [3]:
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.datasets import make_regression
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Dataset dimensions. The sample count is named explicitly (100 is also
# make_regression's default) so that the NaN-injection loop below always
# draws row indices that are valid for the generated data.
n_samples = 100
n_features = 10

# Generate synthetic regression data
x, y = make_regression(
    n_samples=n_samples,
    n_informative=5,
    n_features=n_features,
)

# Introduce NaN values randomly in the dataset.
# The same (row, column) pair can be drawn more than once, so this blanks out
# at most 20 cells.
for _ in range(20):
    rand_row = np.random.randint(n_samples)
    rand_col = np.random.randint(n_features)
    x[rand_row, rand_col] = np.nan
    print(f"x[{rand_row}][{rand_col}] = np.nan")

# Define the pipeline: each step is a (name, estimator) pair and the steps
# are applied in the order listed.
pipeline = Pipeline([
    ('imputer', SimpleImputer(strategy='mean')),  # Impute missing values
    ('scaler', StandardScaler()),                # Standardize features
    ('pca', PCA(n_components=5)),                # Reduce dimensionality
    ('svr', SVR())                               # Support Vector Regression
])

# Fit the pipeline to the data
pipeline.fit(x, y)

# Example usage: predict using the pipeline.
# The training data is reused here purely to demonstrate the call.
y_pred = pipeline.predict(x)


x[59][8] = np.nan
x[6][6] = np.nan
x[50][4] = np.nan
x[41][9] = np.nan
x[91][5] = np.nan
x[31][8] = np.nan
x[25][6] = np.nan
x[86][5] = np.nan
x[64][6] = np.nan
x[9][3] = np.nan
x[30][5] = np.nan
x[70][7] = np.nan
x[0][5] = np.nan
x[70][8] = np.nan
x[72][5] = np.nan
x[6][2] = np.nan
x[80][9] = np.nan
x[83][0] = np.nan
x[18][6] = np.nan
x[49][4] = np.nan


In [5]:
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR

# Build the pipeline, passing its stages through the explicit `steps` keyword.
# Each stage is a (name, estimator) pair, listed in the order it is applied.
pipeline = Pipeline(
    steps=[
        # Impute missing values with the column mean
        ("imputer", SimpleImputer(strategy="mean")),
        # Standardize features
        ("scaler", StandardScaler()),
        # Reduce dimensionality to 6 components
        ("pca", PCA(n_components=6)),
        # Support Vector Regression as the final estimator
        ("svr", SVR()),
    ]
)

# Show the (name, estimator) pairs that make up the pipeline
pipeline.steps


[('imputer', SimpleImputer()),
 ('scaler', StandardScaler()),
 ('pca', PCA(n_components=6)),
 ('svr', SVR())]