In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.impute import KNNImputer, SimpleImputer
from sklearn.experimental import enable_iterative_imputer

In [2]:
data = pd.read_csv("/home/mike/Downloads/data/archive(4)/bank.csv")

In [3]:
data.head()

Unnamed: 0,age,job,marital,education,default,balance,housing,loan,contact,day,month,duration,campaign,pdays,previous,poutcome,y
0,30,unemployed,married,primary,no,1787,no,no,cellular,19,oct,79,1,-1,0,unknown,no
1,33,services,married,secondary,no,4789,yes,yes,cellular,11,may,220,1,339,4,failure,no
2,35,management,single,tertiary,no,1350,yes,no,cellular,16,apr,185,1,330,1,failure,no
3,30,management,married,tertiary,no,1476,yes,yes,unknown,3,jun,199,4,-1,0,unknown,no
4,59,blue-collar,married,secondary,no,0,yes,no,unknown,5,may,226,1,-1,0,unknown,no


### Make pipeline to preprocess the data

In [4]:
y = data.pop('y')
X = data

In [5]:
from sklearn.compose import make_column_selector

cat_selector = make_column_selector(dtype_include=object)
num_selector = make_column_selector(dtype_include=np.number)
cat_selector(X)

['job',
 'marital',
 'education',
 'default',
 'housing',
 'loan',
 'contact',
 'month',
 'poutcome']

In [6]:
num_selector(X)

['age', 'balance', 'day', 'duration', 'campaign', 'pdays', 'previous']

#### We will first design the pipeline required for the tree-based models.

In [7]:
from sklearn.compose import make_column_transformer
from sklearn.impute import SimpleImputer
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import OrdinalEncoder

cat_tree_processor = OrdinalEncoder(
  
)
num_tree_processor = SimpleImputer(strategy="mean", add_indicator=True)

tree_preprocessor = make_column_transformer(
    (num_tree_processor, num_selector), (cat_tree_processor, cat_selector)
)
tree_preprocessor

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('simpleimputer',
                                 SimpleImputer(add_indicator=True, copy=True,
                                               fill_value=None,
                                               missing_values=nan,
                                               strategy='mean', verbose=0),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f65d324f410>),
                                ('ordinalencoder',
                                 OrdinalEncoder(categories='auto',
                                                dtype=<class 'numpy.float64'>),
                                 <sklearn.compose._column_transformer.make_column_selector object at 0x7f65d324f3d0>)],
                  verbose=False)

#### hen, we will now define the preprocessor used when the ending regressor is a linear model.

In [8]:
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler

cat_linear_processor = OneHotEncoder(handle_unknown="ignore")
num_linear_processor = make_pipeline(
    StandardScaler(), SimpleImputer(strategy="mean", add_indicator=True)
)

linear_preprocessor = make_column_transformer(
    (num_linear_processor, num_selector), (cat_linear_processor, cat_selector)
)
linear_preprocessor

ColumnTransformer(n_jobs=None, remainder='drop', sparse_threshold=0.3,
                  transformer_weights=None,
                  transformers=[('pipeline',
                                 Pipeline(memory=None,
                                          steps=[('standardscaler',
                                                  StandardScaler(copy=True,
                                                                 with_mean=True,
                                                                 with_std=True)),
                                                 ('simpleimputer',
                                                  SimpleImputer(add_indicator=True,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
      

In [10]:
from itertools import product

import matplotlib.pyplot as plt

from sklearn import datasets
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier
# from sklearn.inspection import DecisionBoundaryDisplay

In [11]:
# Training classifiers
clf1 = DecisionTreeClassifier(max_depth=4)
clf2 = KNeighborsClassifier(n_neighbors=7)
clf3 = SVC(gamma=0.1, kernel="rbf", probability=True)
eclf = VotingClassifier(
    estimators=[("dt", clf1), ("knn", clf2), ("svc", clf3)],
    voting="soft",
    weights=[2, 1, 2],
)

clf1.fit(X, y)
clf2.fit(X, y)
clf3.fit(X, y)
eclf.fit(X, y)

# Plotting decision regions
f, axarr = plt.subplots(2, 2, sharex="col", sharey="row", figsize=(10, 8))
for idx, clf, tt in zip(
    product([0, 1], [0, 1]),
    [clf1, clf2, clf3, eclf],
    ["Decision Tree (depth=4)", "KNN (k=7)", "Kernel SVM", "Soft Voting"],
):
    DecisionBoundaryDisplay.from_estimator(
        clf, X, alpha=0.4, ax=axarr[idx[0], idx[1]], response_method="predict"
    )
    axarr[idx[0], idx[1]].scatter(X[:, 0], X[:, 1], c=y, s=20, edgecolor="k")
    axarr[idx[0], idx[1]].set_title(tt)

plt.show()

ValueError: could not convert string to float: 'unemployed'

## Stack of predictors on a single data set

In [9]:
from sklearn.linear_model import LogisticRegressionCV

log_pipeline = make_pipeline(linear_preprocessor, LogisticRegressionCV())
log_pipeline

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('pipeline',
                                                  Pipeline(memory=None,
                                                           steps=[('standardscaler',
                                                                   StandardScaler(copy=True,
                                                                                  with_mean=True,
                                                                                  with_std=True)),
                                                                  ('simpleimputer',
                                                                   SimpleImputer(add_indicator=True,
                                                              

In [10]:
from sklearn.ensemble import RandomForestClassifier

rf_pipeline = make_pipeline(tree_preprocessor, RandomForestClassifier(random_state=42))
rf_pipeline

Pipeline(memory=None,
         steps=[('columntransformer',
                 ColumnTransformer(n_jobs=None, remainder='drop',
                                   sparse_threshold=0.3,
                                   transformer_weights=None,
                                   transformers=[('simpleimputer',
                                                  SimpleImputer(add_indicator=True,
                                                                copy=True,
                                                                fill_value=None,
                                                                missing_values=nan,
                                                                strategy='mean',
                                                                verbose=0),
                                                  <sklearn.compose._column_transformer.make_column_selector object at 0x7fb0f...
                 RandomForestClassifier(bootstrap=True, ccp_alpha=0.0,
        

In [11]:
from sklearn.ensemble import StackingClassifier


estimators = [
    ("Random Forest", rf_pipeline),
    ("log", log_pipeline)
]

stacking_classifier = StackingClassifier(estimators=estimators, final_estimator=LogisticRegressionCV())
stacking_classifier

StackingClassifier(cv=None,
                   estimators=[('Random Forest',
                                Pipeline(memory=None,
                                         steps=[('columntransformer',
                                                 ColumnTransformer(n_jobs=None,
                                                                   remainder='drop',
                                                                   sparse_threshold=0.3,
                                                                   transformer_weights=None,
                                                                   transformers=[('simpleimputer',
                                                                                  SimpleImputer(add_indicator=True,
                                                                                                copy=True,
                                                                                                fill_value=None,
                 

In [14]:
# import time
# import matplotlib.pyplot as plt
# # from sklearn.metrics import PredictionErrorDisplay
# from sklearn.model_selection import cross_validate, cross_val_predict

# fig, axs = plt.subplots(2, 2, figsize=(9, 7))
# axs = np.ravel(axs)

# for ax, (name, est) in zip(
#     axs, estimators + [("Stacking Regressor", stacking_classifier)]
# ):
#     scorers = {"R2": "r2", "MAE": "neg_mean_absolute_error"}

#     start_time = time.time()
#     scores = cross_validate(
#         est, X, y, scoring=list(scorers.values()), n_jobs=-1, verbose=0
#     )
#     elapsed_time = time.time() - start_time

#     y_pred = cross_val_predict(est, X, y, n_jobs=-1, verbose=0)
#     scores = {
#         key: (
#             f"{np.abs(np.mean(scores[f'test_{value}'])):.2f} +- "
#             f"{np.std(scores[f'test_{value}']):.2f}"
#         )
#         for key, value in scorers.items()
#     }

#     display = PredictionErrorDisplay.from_predictions(
#         y_true=y,
#         y_pred=y_pred,
#         kind="actual_vs_predicted",
#         ax=ax,
#         scatter_kwargs={"alpha": 0.2, "color": "tab:blue"},
#         line_kwargs={"color": "tab:red"},
#     )
#     ax.set_title(f"{name}\nEvaluation in {elapsed_time:.2f} seconds")

#     for name, score in scores.items():
#         ax.plot([], [], " ", label=f"{name}: {score}")
#     ax.legend(loc="upper left")

# plt.suptitle("Single predictors versus stacked predictors")
# plt.tight_layout()
# plt.subplots_adjust(top=0.9)
# plt.show()