# Pipelines: Basic Template

In [1]:
# import modules
import pandas as pd
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score

### Import and Split Data

In [2]:
# import data
my_df = pd.read_csv('data/pipeline_data.csv')
my_df.head()

Unnamed: 0,purchase,age,gender,credit_score
0,0,47.0,F,309.0
1,1,18.0,F,230.0
2,1,25.0,M,92.0
3,0,38.0,M,486.0
4,1,38.0,M,236.0


In [3]:
# split data into input and output variables
X = my_df.drop(['purchase'], axis = 1)
y = my_df['purchase']

In [4]:
# split data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, stratify = y)

In [5]:
# specify numeric and categorical features
numeric_features = ['age', 'credit_score']
categorical_features = ['gender']

### Set Up Pipelines

In [6]:
# numerical feature transformer
numeric_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer()),
    ('scaler', StandardScaler())
])

In [7]:
# categorical feature transformer
categorical_transformer = Pipeline(steps = [
    ('imputer', SimpleImputer(strategy = 'constant', fill_value = 'U')),
    ('ohe', OneHotEncoder(handle_unknown = 'ignore'))
])

In [8]:
# preprocessing pipeline
preprocessing_pipeline = ColumnTransformer(transformers = [
    ('numeric', numeric_transformer, numeric_features),
    ('categorical', categorical_transformer, categorical_features)
])

### Apply the Pipeline

In [9]:
# apply pipeline to logistic regression model
log_clf = Pipeline(steps = [
    ('preprocessing_pipeline', preprocessing_pipeline),
    ('classifier', LogisticRegression())
])

# train model
log_clf.fit(X_train, y_train)

# run predictions
y_pred_class = log_clf.predict(X_test)

# output accuracy score
log_accuracy = accuracy_score(y_test, y_pred_class)
log_accuracy

0.85

In [10]:
# apply pipeline to logistic regression model
forest_clf = Pipeline(steps = [
    ('preprocessing_pipeline', preprocessing_pipeline),
    ('classifier', RandomForestClassifier())
])

# train model
forest_clf.fit(X_train, y_train)

# run predictions
y_pred_class = forest_clf.predict(X_test)

# output accuracy score
forest_accuracy = accuracy_score(y_test, y_pred_class)
forest_accuracy

0.85

### Save Pipeline

In [11]:
import joblib

In [12]:
# save pipelines
joblib.dump(log_clf, 'data/log_clf_model.joblib')
joblib.dump(forest_clf, 'data/forest_clf_model.joblib')

['data/forest_clf_model.joblib']

### Test Pipeline on New Data

In [13]:
# import modules
import joblib
import pandas as pd
import numpy as np

In [14]:
# import pipelines
log_clf_test = joblib.load('data/log_clf_model.joblib')
forest_clf_test = joblib.load('data/forest_clf_model.joblib')

In [15]:
# create sample test data
new_data = pd.DataFrame({
    'age': [25, np.nan, 50],
    'gender': ['M', 'F', 'np.nan'],
    'credit_score': [200, 100, 500]
})

new_data

Unnamed: 0,age,gender,credit_score
0,25.0,M,200
1,,F,100
2,50.0,np.nan,500


In [16]:
# pass new data into logistic regression pipeline object and return predictions
log_clf_test_predictions = log_clf_test.predict(new_data)
log_clf_test_predictions

array([1, 1, 0])

In [17]:
# pass new data into random forest pipeline object and return predictions
forest_clf_test = forest_clf_test.predict(new_data)
forest_clf_test

array([1, 0, 0])

In [18]:
# append predictions

# convert predictions to dataframes
df_log = pd.DataFrame(log_clf_test_predictions).rename({0: 'log_prediction'}, axis = 1)
df_forest = pd.DataFrame(forest_clf_test).rename({0: 'forest_prediction'}, axis = 1)

# merge dataframe predictions
combined = pd.merge(df_log, df_forest, left_index = True, right_index = True)

# merge final dataframe and output
final_df = pd.merge(new_data, combined, left_index = True, right_index = True)
final_df

Unnamed: 0,age,gender,credit_score,log_prediction,forest_prediction
0,25.0,M,200,1,1
1,,F,100,1,0
2,50.0,np.nan,500,0,0
