# Import

## Tools

In [None]:
# standard library and settings
import copy
import os
import pickle
import sys
import importlib
import itertools
from functools import reduce
import time

rundate = time.strftime("%Y%m%d")

import warnings

warnings.simplefilter("ignore")

# `display` and `HTML` are part of the public IPython.display module; importing them from
# IPython.core.display has been deprecated since IPython 7.14 and breaks on newer releases
from IPython.display import display, HTML

# inject CSS so the notebook container uses 95% of the browser width
display(HTML("<style>.container { width:95% !important; }</style>"))

# data extensions and settings
import numpy as np

np.set_printoptions(threshold=np.inf, suppress=True)

import pandas as pd

pd.set_option("display.max_rows", 500)
pd.set_option("display.max_columns", 500)
pd.options.display.float_format = "{:,.6f}".format

# modeling extensions
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.datasets import load_breast_cancer
from sklearn.ensemble import (
    RandomForestClassifier,
    GradientBoostingClassifier,
    AdaBoostClassifier,
    ExtraTreesClassifier,
    IsolationForest,
)
from sklearn.impute import SimpleImputer
from sklearn.linear_model import (
    Lasso,
    Ridge,
    ElasticNet,
    LinearRegression,
    LogisticRegression,
    SGDRegressor,
)
from sklearn.model_selection import (
    KFold,
    train_test_split,
    GridSearchCV,
    StratifiedKFold,
    cross_val_score,
    RandomizedSearchCV,
)
from sklearn.neighbors import KNeighborsClassifier, KNeighborsRegressor
from sklearn.pipeline import make_pipeline, Pipeline, FeatureUnion
from sklearn.preprocessing import (
    StandardScaler,
    RobustScaler,
    PolynomialFeatures,
    OrdinalEncoder,
    LabelEncoder,
    OneHotEncoder,
    KBinsDiscretizer,
    QuantileTransformer,
    PowerTransformer,
    MinMaxScaler,
)
from sklearn.svm import SVC, SVR
from category_encoders import (
    WOEEncoder,
    TargetEncoder,
    CatBoostEncoder,
    BinaryEncoder,
    CountEncoder,
)

from lightgbm import LGBMClassifier, LGBMRegressor
from xgboost import XGBClassifier, XGBRegressor

from hyperopt import hp

import eif
import shap

shap.initjs()

# visualization extensions and settings
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

import missingno as msno
import squarify

sys.path.append(f"{os.environ['REPOS']}/mlmachine")
sys.path.append(f"{os.environ['REPOS']}/prettierplot")

import mlmachine as mlm
import mlmachine.data as data
from mlmachine.features.preprocessing import (
    DataFrameSelector,
    PandasTransformer,
    KFoldEncoder,
    GroupbyImputer,
    PandasFeatureUnion,
    DualTransformer,
)
from prettierplot.plotter import PrettierPlot
import prettierplot.style as style

%load_ext autoreload
%autoreload 2


## Reload objects

In [None]:
# #

# experiment_path_root = "/data/t1-tpeterso/repos/kaggle-titanic/experiments/titanic_survivorship_classification"
# experiment = "210801185140"

# # reload objects
# machine = pickle.load(open(os.path.join(experiment_path_root, experiment, "machine", "machine.pkl"), 'rb'))
# # impute_pipe = pickle.load(open(os.path.join(experiment_path_root, experiment, "transformers", "impute_pipe.pkl"), 'rb'))
# # polynomial_pipe = pickle.load(open(os.path.join(experiment_path_root, experiment, "transformers", "polynomial_pipe.pkl"), 'rb'))
# # encode_pipe = pickle.load(open(os.path.join(experiment_path_root, experiment, "transformers", "encode_pipe.pkl"), 'rb'))
# # target_encode_pipe = pickle.load(open(os.path.join(experiment_path_root, experiment, "transformers", "target_encode_pipe.pkl"), 'rb'))
# # skew_pipe = pickle.load(open(os.path.join(experiment_path_root, experiment, "transformers", "skew_pipe.pkl"), 'rb'))
# scale_pipe = pickle.load(open(os.path.join(experiment_path_root, experiment, "transformers", "scale_pipe.pkl"), 'rb'))
# fs = pickle.load(open(os.path.join(experiment_path_root, experiment, "feature_selection", "FeatureSelector.pkl"), 'rb'))


## Data

### Load & review dataset

In [None]:
# load the Titanic frames through mlmachine.data and report the shape of each
df_train, df_valid = data.titanic()

train_shape = df_train.shape
valid_shape = df_valid.shape
print("Training data dimensions: {}".format(train_shape))
print("Validation data dimensions: {}".format(valid_shape))


In [None]:
# column dtypes and non-null counts, followed by a preview of the first 5 rows
df_train.info()
display(df_train.head(5))


In [None]:
# tally how many columns share each pandas dtype
df_train.dtypes.value_counts()


### Create machine object

In [None]:
# derive the train/validation frames from the labeled data, keyed on the target column
# NOTE(review): presumably a train/validation split — confirm in mlmachine. This rebinds
# df_train/df_valid, so re-run the load cell before re-running this one.
df_train, df_valid = mlm.train_test_df_compile(data=df_train, target_col="Survived")

# explicit level order for each ordinal feature
ordinal_encodings = {"Pclass": [1, 2, 3]}

# create the mlmachine experiment object with explicit feature-type assignments
machine = mlm.Machine(
    experiment_name="titanic_survivorship_classification",
    target="Survived",
    is_classification=True,
    training_dataset=df_train,
    validation_dataset=df_valid,
    remove_features=["PassengerId", "Ticket", "Cabin"],
    identify_as_continuous=["Age", "Fare"],
    identify_as_count=["Parch", "SibSp"],
    identify_as_nominal=["Embarked", "Name"],
    identify_as_ordinal=["Pclass"],
    ordinal_encodings=ordinal_encodings,
)


In [None]:
# review the mlmachine feature-type assignments (mlm_dtypes) stored on the training features
machine.training_features.mlm_dtypes


# EDA

## Category feature EDA

In [None]:
# plot every category feature against the categorical target, using the training data
category_features = machine.training_features.mlm_dtypes["category"]
for feature in category_features:
    machine.eda_cat_target_cat_feat(
        feature=feature,
        training_data=True,
        chart_scale=15,
        level_count_cap=10,
        legend_labels=["Died", "Survived"],
    )


## Count feature EDA

In [None]:
# plot every count feature against the categorical target
# (training_data is not passed here, so the method's default applies)
count_features = machine.training_features.mlm_dtypes["count"]
for feature in count_features:
    machine.eda_cat_target_cat_feat(
        feature=feature,
        chart_scale=15,
        level_count_cap=10,
        legend_labels=["Died", "Survived"],
    )


## Continuous feature EDA

In [None]:
# plot every continuous feature against the categorical target
continuous_features = machine.training_features.mlm_dtypes["continuous"]
for feature in continuous_features:
    machine.eda_cat_target_num_feat(
        feature=feature,
        chart_scale=12,
        outliers_out_of_scope=5,
        legend_labels=["Died", "Survived"],
    )


In [None]:
# run the machine's bundled EDA routine with plot saving enabled
# NOTE(review): presumably covers all feature types and writes into the experiment directory — confirm
machine.eda(save_plots=True)


### Correlation

In [None]:
# annotated correlation heat map of the frame returned by machine.recombine_data
p = PrettierPlot()
ax = p.make_canvas()
training_df = machine.recombine_data(training_data=True)
p.corr_heatmap(df=training_df, ax=ax, annot=True)


In [None]:
# heat map of feature correlations relative to the target
# (thresh=0.01 — presumably a minimum correlation magnitude; confirm in prettierplot)
p = PrettierPlot(chart_scale=10, plot_orientation="tall")
ax = p.make_canvas()
p.corr_heatmap_target(
    df=machine.training_features,
    target=machine.training_target,
    ax=ax,
    annot=True,
    thresh=0.01,
)


### Pair plot

In [None]:
# pair plot of the two continuous features, colored by the target
continuous_pair = machine.training_features[["Age", "Fare"]]

p = PrettierPlot(chart_scale=15)
p.pair_plot(
    df=continuous_pair,
    target=machine.training_target,
    legend_labels=["Died", "Survived"],
    diag_kind="auto",
)


## Faceting

In [None]:
# target by embarkation port (x), split by passenger class
p = PrettierPlot(chart_scale=12)
ax = p.make_canvas(y_shift=0.7, title="Survivorship, embark location by passenger class")
p.facet_two_cat_bar(
    df=machine.recombine_data(training_data=True),
    ax=ax,
    x="Embarked",
    split="Pclass",
    y=machine.training_target.name,
    y_units="ff",
)


In [None]:
# target by passenger class (x), split by sex
p = PrettierPlot(chart_scale=12)
ax = p.make_canvas(y_shift=0.7, title="Survivorship, passenger class by gender")
p.facet_two_cat_bar(
    df=machine.recombine_data(training_data=True),
    ax=ax,
    x="Pclass",
    split="Sex",
    y=machine.training_target.name,
    y_units="ff",
)


In [None]:
# target by embarkation port (x), split by sex
p = PrettierPlot(chart_scale=12)
ax = p.make_canvas(y_shift=0.7, title="Survivorship,embark location by gender")
p.facet_two_cat_bar(
    df=machine.recombine_data(training_data=True),
    ax=ax,
    x="Embarked",
    split="Sex",
    y=machine.training_target.name,
    y_units="ff",
)


In [None]:
# point plot of the target by sex, split by passenger class, one panel per embarkation port
p = PrettierPlot()
p.facet_two_cat_point(
    df=machine.recombine_data(training_data=True),
    x="Sex",
    y=machine.training_target.name,
    cat_col="Embarked",
    split="Pclass",
    legend_labels=["1st class", "2nd class", "3rd class"],
    bbox=(1.3, 1.2),
    height=5,
    aspect=1.0,
)


In [None]:
# point plot of the target by embarkation port, split by passenger class, one panel per sex;
# rows with a missing Embarked value are dropped before plotting
embarked_known = machine.recombine_data(training_data=True).dropna(subset=["Embarked"])

p = PrettierPlot()
p.facet_two_cat_point(
    df=embarked_known,
    x="Embarked",
    y=machine.training_target.name,
    cat_col="Sex",
    split="Pclass",
    legend_labels=["1st class", "2nd class", "3rd class"],
    bbox=(1.5, 0.8),
    height=5,
    aspect=1.0,
)


In [None]:
# Age histograms split by the target, faceted by sex (rows) and embarkation port (columns)
p = PrettierPlot()
p.facet_cat_num_hist(
    df=machine.recombine_data(training_data=True),
    num_col="Age",
    cat_row="Sex",
    cat_col="Embarked",
    split=machine.training_target.name,
    legend_labels=["Died", "Lived"],
    bbox=(1.9, 1.0),
    aspect=1,
    height=4,
)


In [None]:
# Fare vs. Age scatter plots split by the target, faceted by sex (rows) and embarkation port (columns)
p = PrettierPlot(chart_scale=15)
p.facet_cat_num_scatter(
    df=machine.recombine_data(training_data=True),
    x="Fare",
    y="Age",
    cat_row="Sex",
    cat_col="Embarked",
    split=machine.training_target.name,
    legend_labels=["Died", "Lived"],
    bbox=(1.9, 1.0),
    aspect=1,
    height=4,
)


## Target variable evaluation

In [None]:
# null score: proportion of each target class in the training data
# (the largest proportion is the accuracy of always predicting the majority class)
pd.Series(machine.training_target).value_counts(normalize=True)


# Data preparation

## Missing data


### Training

In [None]:
# summarize missing data in the training features
machine.eda_missing_summary(training_data=True)


In [None]:
# missingno matrix of the training features
msno.matrix(machine.training_features)


In [None]:
# missingno bar chart of the training features
msno.bar(machine.training_features)


In [None]:
# missingno heatmap of the training features
msno.heatmap(machine.training_features)


In [None]:
# missingno dendrogram of the training features
msno.dendrogram(machine.training_features)


### Validation

In [None]:
# summarize missing data in the validation features
machine.eda_missing_summary(training_data=False)


In [None]:
# missingno matrix of the validation features
msno.matrix(machine.validation_features)


In [None]:
# missingno bar chart of the validation features
msno.bar(machine.validation_features)


In [None]:
# missingno heatmap of the validation features
msno.heatmap(machine.validation_features)


In [None]:
# missingno dendrogram of the validation features
msno.dendrogram(machine.validation_features)


### Training vs. validation


In [None]:
# compare which features have missing data in the training vs. validation datasets
machine.missing_column_compare()


### Impute

In [None]:
# impute pipeline: one branch per column being imputed, plus a pass-through branch
# NOTE(review): the "age" and "fare" branches both select "Pclass", which the "diff" branch
# also keeps; presumably GroupbyImputer returns only the imputed column — confirm that
# Pclass is not duplicated in the output
impute_pipe = PandasFeatureUnion([
    # Age: GroupbyImputer with strategy="mean", grouped by Pclass
    ("age", make_pipeline(
        DataFrameSelector(include_columns=["Age","Pclass"]),
        GroupbyImputer(null_column="Age", groupby_column="Pclass", strategy="mean")
    )),
    # Fare: GroupbyImputer grouped by Pclass, using the imputer's default strategy
    ("fare", make_pipeline(
        DataFrameSelector(include_columns=["Fare","Pclass"]),
        GroupbyImputer(null_column="Fare", groupby_column="Pclass")
    )),
    # Embarked: fill with the most frequent value
    ("embarked", make_pipeline(
        DataFrameSelector(include_columns=["Embarked"]),
        PandasTransformer(SimpleImputer(strategy="most_frequent"))
    )),
#     ("cabin", make_pipeline(
#         DataFrameSelector(include_columns=["Cabin"]),
#         PandasTransformer(SimpleImputer(strategy="constant", fill_value="X"))
#     )),
    # every column other than the three imputed above is passed through
    ("diff", make_pipeline(
        DataFrameSelector(exclude_columns=["Age","Fare","Embarked"])
    )),
])

# fit & save objects
# NOTE(review): the pipe is pickled right after this fit() and then fit again by
# fit_transform() below, so the pickle holds the state from the first fit only.
# The second fit is redundant if fitting is deterministic — confirm.
impute_pipe.fit(machine.training_features)
with open(os.path.join(machine.current_experiment_dir, "transformers", "impute_pipe.pkl"), 'wb') as handle:
    pickle.dump(impute_pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

# transform datasets: validation data is only transformed, never fit
machine.training_features = impute_pipe.fit_transform(machine.training_features)
machine.validation_features = impute_pipe.transform(machine.validation_features)


In [None]:
# re-check missing data in the training features after imputation
machine.eda_missing_summary(training_data=True)


In [None]:
# re-check missing data in the validation features after imputation
machine.eda_missing_summary(training_data=False)


## Feature engineering

### Handcrafted

In [None]:
### training data
# uncommon titles that get collapsed into a single "Rare" level
rare_titles = [
    "Lady", "the Countess", "Countess", "Capt", "Col", "Don",
    "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona",
]
# integer code assigned to each remaining title
title_codes = {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}

# the title is the text between the comma and the first period of Name
title = [
    full_name.split(",")[1].split(".")[0].strip()
    for full_name in machine.training_features["Name"]
]
machine.training_features["Title"] = pd.Series(
    title, index=machine.training_features.index, dtype="object"
)

# collapse rare titles, map to integer codes, then store as a category
# (titles missing from title_codes become NaN under .map)
machine.training_features["Title"] = machine.training_features["Title"].replace(rare_titles, "Rare")
machine.training_features["Title"] = machine.training_features["Title"].map(title_codes)
machine.training_features["Title"] = machine.training_features["Title"].astype("category")

# (no Cabin-derived feature: Cabin is listed in remove_features when the machine is created)

# family size = SibSp + Parch + 1, stored as int64
family_size = machine.training_features["SibSp"] + machine.training_features["Parch"] + 1
machine.training_features["FamilySize"] = family_size
machine.training_features["FamilySize"] = machine.training_features["FamilySize"].astype("int64")


In [None]:
### validation data
# uncommon titles that get collapsed into a single "Rare" level
rare_titles = [
    "Lady", "the Countess", "Countess", "Capt", "Col", "Don",
    "Dr", "Major", "Rev", "Sir", "Jonkheer", "Dona",
]
# integer code assigned to each remaining title
title_codes = {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}

# the title is the text between the comma and the first period of Name
title = [
    full_name.split(",")[1].split(".")[0].strip()
    for full_name in machine.validation_features["Name"]
]
machine.validation_features["Title"] = pd.Series(
    title, index=machine.validation_features.index, dtype="object"
)

# collapse rare titles, map to integer codes, then store as a category
# (titles missing from title_codes become NaN under .map)
machine.validation_features["Title"] = machine.validation_features["Title"].replace(rare_titles, "Rare")
machine.validation_features["Title"] = machine.validation_features["Title"].map(title_codes)
machine.validation_features["Title"] = machine.validation_features["Title"].astype("category")

# (no Cabin-derived feature: Cabin is listed in remove_features when the machine is created)

# family size = SibSp + Parch + 1, stored as int64
family_size = machine.validation_features["SibSp"] + machine.validation_features["Parch"] + 1
machine.validation_features["FamilySize"] = family_size
machine.validation_features["FamilySize"] = machine.validation_features["FamilySize"].astype("int64")


### Polynomial features

In [None]:
# polynomial feature pipeline: degree-2 terms for the continuous features, pass-through for the rest
polynomial_pipe = PandasFeatureUnion([
    # degree-2 polynomial and interaction terms, without the bias column
    ("polynomial", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["continuous"]),
        PandasTransformer(PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
    )),
    # all non-continuous columns are passed through, except "Name", which is dropped here
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["continuous"], exclude_columns=["Name"]),
    )),
])

# fit & save objects
# NOTE(review): the pipe is pickled after this fit() and fit again by fit_transform() below;
# the pickle holds the state from the first fit only — confirm the refit is intended
polynomial_pipe.fit(machine.training_features)
with open(os.path.join(machine.current_experiment_dir, "transformers", "polynomial_pipe.pkl"), 'wb') as handle:
    pickle.dump(polynomial_pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

# transform datasets: validation data is only transformed, never fit
machine.training_features = polynomial_pipe.fit_transform(machine.training_features)
machine.validation_features = polynomial_pipe.transform(machine.validation_features)

# refresh the mlm_dtypes bookkeeping now that the column set has changed
machine.update_dtypes()


### Encoding

#### Evaluate

In [None]:
### training data
# number of unique values in each category feature of the training data
machine.training_features[machine.training_features.mlm_dtypes["category"]].apply(pd.Series.nunique, axis=0)


In [None]:
### training data
# print the unique values of each category column (called with the method's default dataset)
machine.unique_category_levels()


In [None]:
### validation data
# number of unique values in each category feature of the validation data
# (the column list is taken from the training features' mlm_dtypes)
machine.validation_features[machine.training_features.mlm_dtypes["category"]].apply(pd.Series.nunique, axis=0)


In [None]:
### validation data
# print the unique values of each category column in the validation data
machine.unique_category_levels(training_data=False)


In [None]:
# identify category values that are present in the training data but not the validation data, and vice versa
machine.compare_train_valid_levels()


#### Encode

In [None]:
# encode pipeline: one-hot for nominal features, ordinal for the features in ordinal_encodings
encode_pipe = PandasFeatureUnion([
    # nominal features: one-hot encoded, dropping the first level of each feature
    # NOTE(review): OneHotEncoder keeps its default unknown-level handling, so validation
    # levels unseen in training may raise at transform time — confirm
    ("nominal", make_pipeline(
        DataFrameSelector(include_columns=machine.training_features.mlm_dtypes["nominal"]),
        PandasTransformer(OneHotEncoder(drop="first")),
    )),
    # ordinal features: encoded with the explicit level order given in ordinal_encodings
    ("ordinal", make_pipeline(
        DataFrameSelector(include_columns=list(ordinal_encodings.keys())),
        PandasTransformer(OrdinalEncoder(categories=list(ordinal_encodings.values()))),
    )),
#     ("bin", make_pipeline(
#         DataFrameSelector(include_columns=machine.training_features.mlm_dtypes["continuous"]),
#         PandasTransformer(KBinsDiscretizer(encode="ordinal")),
#     )),
    # every column that is neither nominal nor ordinal is passed through
    ("diff", make_pipeline(
        DataFrameSelector(exclude_columns=machine.training_features.mlm_dtypes["nominal"] + list(ordinal_encodings.keys())),
    )),
])

# fit & save objects
# NOTE(review): the pipe is pickled after this fit() and fit again by fit_transform() below;
# the pickle holds the state from the first fit only — confirm the refit is intended
encode_pipe.fit(machine.training_features)
with open(os.path.join(machine.current_experiment_dir, "transformers", "encode_pipe.pkl"), 'wb') as handle:
    pickle.dump(encode_pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

# transform datasets: validation data is only transformed, never fit
machine.training_features = encode_pipe.fit_transform(machine.training_features)
machine.validation_features = encode_pipe.transform(machine.validation_features)

# refresh the mlm_dtypes bookkeeping now that the column set has changed
machine.update_dtypes()


In [None]:
# target encoding pipeline: K-fold target encoding of the category features, pass-through for the rest
target_encode_pipe = PandasFeatureUnion([
    # TargetEncoder wrapped in mlmachine's KFoldEncoder, using 5 unshuffled folds
    # NOTE(review): presumably training rows are encoded out-of-fold to limit target
    # leakage — confirm in KFoldEncoder
    ("target", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["category"]),
        KFoldEncoder(
            target=machine.training_target,
            cv=KFold(n_splits=5, shuffle=False),
            encoder=TargetEncoder,
        ),
    )),
#     ("woe", make_pipeline(
#         DataFrameSelector(include_mlm_dtypes=["category"]),
#         KFoldEncoder(
#             target=machine.training_target,
#             cv=KFold(n_splits=5, shuffle=False),
#             encoder=WOEEncoder,
#         ),
#     )),
#     ("catboost", make_pipeline(
#         DataFrameSelector(include_mlm_dtypes=["category"]),
#         KFoldEncoder(
#             target=machine.training_target,
#             cv=KFold(n_splits=5, shuffle=False),
#             encoder=CatBoostEncoder,
#         ),
#     )),
    # all non-category columns are passed through
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["category"]),
    )),
])

# fit & save objects
# NOTE(review): the pipe is pickled after fit() only, before fit_transform() runs below.
# If KFoldEncoder stores encoding state while transforming, the pickle will not contain it — confirm.
target_encode_pipe.fit(machine.training_features)
with open(os.path.join(machine.current_experiment_dir, "transformers", "target_encode_pipe.pkl"), 'wb') as handle:
    pickle.dump(target_encode_pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

# transform datasets: validation data is only transformed, never fit
machine.training_features = target_encode_pipe.fit_transform(machine.training_features)
machine.validation_features = target_encode_pipe.transform(machine.validation_features)

# refresh the mlm_dtypes bookkeeping now that the column set has changed
machine.update_dtypes()


## Feature transformation

### Skew correction

In [None]:
### training data
# evaluate skew of the continuous features (called with the method's default dataset)
machine.skew_summary(columns=machine.training_features.mlm_dtypes["continuous"])


In [None]:
### validation data
# evaluate skew of the continuous features in the validation data
# (the column list is taken from the training features' mlm_dtypes)
machine.skew_summary(training_data=False, columns=machine.training_features.mlm_dtypes["continuous"])


In [None]:
# skew correction pipeline: DualTransformer on the continuous features, pass-through for the rest
skew_pipe = PandasFeatureUnion([
    # mlmachine's DualTransformer with its default settings
    # NOTE(review): the transformations it applies are not visible here — see mlmachine
    ("skew", make_pipeline(
        DataFrameSelector(include_mlm_dtypes=["continuous"]),
        DualTransformer(),
    )),    
    # all non-continuous columns are passed through
    ("diff", make_pipeline(
        DataFrameSelector(exclude_mlm_dtypes=["continuous"]),
    )),
])

# fit & save objects
# NOTE(review): the pipe is pickled after this fit() and fit again by fit_transform() below;
# the pickle holds the state from the first fit only — confirm the refit is intended
skew_pipe.fit(machine.training_features)
with open(os.path.join(machine.current_experiment_dir, "transformers", "skew_pipe.pkl"), 'wb') as handle:
    pickle.dump(skew_pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

# transform datasets: validation data is only transformed, never fit
machine.training_features = skew_pipe.fit_transform(machine.training_features)
machine.validation_features = skew_pipe.transform(machine.validation_features)

# refresh the mlm_dtypes bookkeeping now that the column set has changed
machine.update_dtypes()


### Scaling

In [None]:
# scaling pipeline: RobustScaler on the columns chosen by a DataFrameSelector with no arguments
# NOTE(review): presumably this selects every column — confirm in DataFrameSelector
scale_pipe = PandasFeatureUnion([
    ("scale", make_pipeline(
        DataFrameSelector(),
        PandasTransformer(RobustScaler())
    )),
])

# fit & save objects
# NOTE(review): the pipe is pickled after this fit() and fit again by fit_transform() below;
# the pickle holds the state from the first fit only — confirm the refit is intended
scale_pipe.fit(machine.training_features)
with open(os.path.join(machine.current_experiment_dir, "transformers", "scale_pipe.pkl"), 'wb') as handle:
    pickle.dump(scale_pipe, handle, protocol=pickle.HIGHEST_PROTOCOL)

# transform datasets: validation data is only transformed, never fit
machine.training_features = scale_pipe.fit_transform(machine.training_features)
machine.validation_features = scale_pipe.transform(machine.validation_features)

# refresh the mlm_dtypes bookkeeping after the transformation
machine.update_dtypes()


## Outliers


In [None]:
# flag outliers on the continuous features with mlmachine's IQR detector (nothing is dropped)
iqr_step = machine.OutlierIQR(
    features=machine.training_features.mlm_dtypes["continuous"],
    iqr_step=1.5,
    outlier_count=10,
    drop_outliers=False,
)
# NOTE(review): the pipeline is used through transform() without a prior fit()
train_pipe = Pipeline([("outlier", iqr_step)])
machine.training_features = train_pipe.transform(machine.training_features)

# collect the flagged observations as a sorted array
flagged = train_pipe.named_steps["outlier"].outliers
iqr_outliers = np.array(sorted(flagged))
print(iqr_outliers)


In [None]:
# flag outliers with scikit-learn's Isolation Forest, fit on every training feature
features = machine.training_features[machine.training_features.columns]
clf = IsolationForest(
    contamination=0.01,
    max_samples=machine.training_features.shape[0],
    random_state=0,
)
clf.fit(features)
preds = clf.predict(features)

# predict() labels outliers as -1; collect the index values of those rows
mask = preds == -1
if_outliers = np.array(machine.training_features.index[mask])
print(if_outliers)


In [None]:
# flag outliers on the continuous features with mlmachine's extended isolation forest (nothing is dropped)
eif_step = machine.ExtendedIsoForest(
    columns=machine.training_features.mlm_dtypes["continuous"],
    anomalies_ratio=0.03,
    extension_level=1,
    sample_size=256,
    n_trees=100,
    drop_outliers=False,
)
# NOTE(review): the pipeline is used through transform() without a prior fit()
train_pipe = Pipeline([("outlier", eif_step)])
machine.training_features = train_pipe.transform(machine.training_features)

# collect the flagged observations as a sorted array
flagged = train_pipe.named_steps["outlier"].outliers
eif_outliers = np.array(sorted(flagged))
print(eif_outliers)


In [None]:
# keep only the observations flagged by all three detectors (IQR, isolation forest, extended isolation forest)
iqr_and_if = np.intersect1d(iqr_outliers, if_outliers)
outliers = np.intersect1d(iqr_and_if, eif_outliers)
print(outliers)


In [None]:
# build the per-detector outlier summary, then show the rows whose "count" is at least 3
outlier_summary = machine.outlier_summary(
    iqr_outliers=iqr_outliers,
    if_outliers=if_outliers,
    eif_outliers=eif_outliers,
)
count_at_least_three = outlier_summary["count"] >= 3
outlier_summary[count_at_least_three]


In [None]:
# # remove outliers from predictors and response
# outliers = np.array([258, 305, 438, 679, 737, 745])
# machine.training_features = machine.training_features.drop(outliers)
# machine.training_target = machine.training_target.drop(index=outliers)


## Additional exploratory data analysis

In [None]:
# heat map of feature correlations relative to the target, after feature engineering
# (thresh=0.3 — presumably a minimum correlation magnitude; confirm in prettierplot)
p = PrettierPlot(chart_scale=15, plot_orientation="tall")
ax = p.make_canvas()
p.corr_heatmap_target(
    df=machine.training_features,
    target=machine.training_target,
    ax=ax,
    annot=True,
    thresh=0.3,
)


In [None]:
# correlation heat map of the transformed training features, without annotations
p = PrettierPlot(chart_scale=25)
ax = p.make_canvas()
p.corr_heatmap(
    df=machine.training_features,
    ax=ax,
    annot=False,
)


## Machine checkpoint

In [None]:
# checkpoint: pickle the machine object into the experiment's "machine" folder
machine_pkl_path = os.path.join(machine.current_experiment_dir, "machine", "machine.pkl")
with open(machine_pkl_path, "wb") as handle:
    pickle.dump(machine, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Feature selection

In [None]:
# pre-configured estimator instances (10 neighbors; tree depths 2-4 with 500 estimators);
# defined for convenience but not included in `estimators` below — swap them in as needed
knn10 = KNeighborsClassifier(n_neighbors=10)
lgb2, lgb3, lgb4 = (LGBMClassifier(max_depth=depth, n_estimators=500) for depth in (2, 3, 4))
xgb2, xgb3, xgb4 = (XGBClassifier(max_depth=depth, n_estimators=500) for depth in (2, 3, 4))
rf2, rf3, rf4 = (RandomForestClassifier(max_depth=depth, n_estimators=500) for depth in (2, 3, 4))

# estimator classes (not instances) handed to the feature selector
estimators = [
    SVC,
    LGBMClassifier,
    LogisticRegression,
    XGBClassifier,
    RandomForestClassifier,
    GradientBoostingClassifier,
    KNeighborsClassifier,
]

# build the feature selector on the prepared training/validation data
fs = machine.FeatureSelector(
    estimators=estimators,
    experiment_dir=machine.current_experiment_dir,
    training_features=machine.training_features,
    training_target=machine.training_target,
    validation_features=machine.validation_features,
    validation_target=machine.validation_target,
)

# generate the feature importance summary, scoring the sequential methods on ROC AUC
# (alternative metric set: ["accuracy","precision","recall","roc_auc"])
fs.feature_selector_suite(
    sequential_scoring=["roc_auc"],
    save_to_csv=True,
    verbose=True,
    n_jobs=4,
)


In [None]:
# cross-validate each estimator with 5 folds, scored on ROC AUC, and save the results to CSV
fs.run_cross_val(
    estimators=estimators,
    scoring=["roc_auc"],
    save_to_csv=True,
    n_folds=5,
    n_jobs=2,
    step=1,
)


In [None]:
# fs.cv_summary[fs.cv_summary["estimator"] == "LGBMClassifier"].sort_values("validation score")

## ROC AUC

In [None]:
# plot cross-validation performance across the diminishing feature set and save the plots
fs.plot_results(scoring="roc_auc_score", save_plots=True, title_scale=0.8)


In [None]:
# build the cross-validation features DataFrame for the ROC AUC metric
fs.create_cross_val_features_df(scoring="roc_auc_score")
# fs.cross_val_features_df


In [None]:
# build the cross-validation features dictionary for the ROC AUC metric, then display it
fs.create_cross_val_features_dict(scoring="roc_auc_score")
fs.cross_val_features_dict


In [None]:
# pickle the feature selector into the experiment's "feature_selection" folder
feature_selector_pkl_path = os.path.join(
    machine.current_experiment_dir, "feature_selection", "FeatureSelector.pkl"
)
with open(feature_selector_pkl_path, "wb") as handle:
    pickle.dump(fs, handle, protocol=pickle.HIGHEST_PROTOCOL)


# Modeling

## Data preparation

In [None]:
# #################################################################################
# # import training data
# df_train, df_valid = data.titanic()

# #
# df_train, df_valid = mlm.train_test_df_compile(data=df_train, target_col="Survived")


# # Load training data into mlmachine
# ordinal_encodings = {
#         "Pclass": [1, 2, 3], # Pclass
#     }

# machine = mlm.Machine(
#     experiment_name="titanic_survivorship_classification",
#     training_dataset=df_train,
#     validation_dataset=df_valid,    
#     target="Survived",
#     remove_features=["PassengerId", "Ticket"],
#     identify_as_continuous=["Age","Fare"],
#     identify_as_count=["Parch","SibSp"],
#     identify_as_nominal=["Cabin","Embarked","Name"],
#     identify_as_ordinal=["Pclass"],
#     ordinal_encodings = ordinal_encodings,
#     is_classification=True,
# )

# #################################################################################
# # impute pipeline
# impute_pipe = PandasFeatureUnion([
#     ("age", make_pipeline(
#         DataFrameSelector(include_columns=["Age","Pclass"]),
#         GroupbyImputer(null_column="Age", groupby_column="Pclass", strategy="mean")
#     )),
#     ("fare", make_pipeline(
#         DataFrameSelector(include_columns=["Fare","Pclass"]),
#         GroupbyImputer(null_column="Fare", groupby_column="Pclass")
#     )),
#     ("embarked", make_pipeline(
#         DataFrameSelector(include_columns=["Embarked"]),
#         PandasTransformer(SimpleImputer(strategy="most_frequent"))
#     )),
#     ("cabin", make_pipeline(
#         DataFrameSelector(include_columns=["Cabin"]),
#         PandasTransformer(SimpleImputer(strategy="constant", fill_value="X"))
#     )),
#     ("diff", make_pipeline(
#         DataFrameSelector(exclude_columns=["Age","Fare","Embarked","Cabin"])
#     )),
# ])

# machine.training_features = impute_pipe.fit_transform(machine.training_features)
# machine.validation_features = impute_pipe.transform(machine.validation_features)

# #################################################################################
# # feature engineering - training

# # parse titles to learn gender, and identify rare titles which may convey status
# title = [i.split(",")[1].split(".")[0].strip() for i in machine.training_features["Name"]]
# machine.training_features["Title"] = pd.Series(
#     title,
#     index=machine.training_features.index,
#     dtype="object",
# )
# machine.training_features["Title"] = machine.training_features["Title"].replace(
#     [
#         "Lady",
#         "the Countess",
#         "Countess",
#         "Capt",
#         "Col",
#         "Don",
#         "Dr",
#         "Major",
#         "Rev",
#         "Sir",
#         "Jonkheer",
#         "Dona",
#     ],
#     "Rare",
# )
# machine.training_features["Title"] = machine.training_features["Title"].map(
#     {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
# )
# machine.training_features["Title"] = machine.training_features["Title"].astype("category")

# # distill cabin feature
# machine.training_features["CabinQuarter"] = pd.Series(
#     [i[0] if not pd.isnull(i) else "X" for i in machine.training_features["Cabin"]],
#     index=machine.training_features.index,
#     dtype="category",
# )

# # family size features
# machine.training_features["FamilySize"] = pd.to_numeric(machine.training_features["SibSp"]) + pd.to_numeric(machine.training_features["Parch"]) + 1

# #################################################################################
# # feature engineering - validation

# # parse titles to learn gender, and identify rare titles which may convey status
# title = [i.split(",")[1].split(".")[0].strip() for i in machine.validation_features["Name"]]
# machine.validation_features["Title"] = pd.Series(
#     title,
#     index=machine.validation_features.index,
#     dtype="object"
# )
# machine.validation_features["Title"] = machine.validation_features["Title"].replace(
#     [
#         "Lady",
#         "the Countess",
#         "Countess",
#         "Capt",
#         "Col",
#         "Don",
#         "Dr",
#         "Major",
#         "Rev",
#         "Sir",
#         "Jonkheer",
#         "Dona",
#     ],
#     "Rare",
# )
# machine.validation_features["Title"] = machine.validation_features["Title"].map(
#     {"Master": 0, "Miss": 1, "Ms": 1, "Mme": 1, "Mlle": 1, "Mrs": 1, "Mr": 2, "Rare": 3}
# )
# machine.validation_features["Title"] = machine.validation_features["Title"].astype("category")

# # distill cabin feature
# machine.validation_features["CabinQuarter"] = pd.Series(
#     [i[0] if not pd.isnull(i) else "X" for i in machine.validation_features["Cabin"]],
#     index=machine.validation_features.index,
#     dtype="category",
# )

# # additional features
# machine.validation_features["FamilySize"] = pd.to_numeric(machine.validation_features["SibSp"]) + pd.to_numeric(machine.validation_features["Parch"]) + 1

# machine.update_dtypes()


# #################################################################################
# ### feature transformation pipeline
# # polynomial feature pipe
# polynomial_pipe = PandasFeatureUnion([
#     ("polynomial", make_pipeline(
#         DataFrameSelector(include_mlm_dtypes=["continuous"]),
#         PandasTransformer(PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)),
#     )),
#     ("diff", make_pipeline(
#         DataFrameSelector(exclude_mlm_dtypes=["continuous"], exclude_columns=["Name","Cabin"]),
#     )),
# ])

# machine.training_features = polynomial_pipe.fit_transform(machine.training_features)
# machine.validation_features = polynomial_pipe.transform(machine.validation_features)

# machine.update_dtypes()


# # encode & bin pipeline
# encode_pipe = PandasFeatureUnion([
#     ("nominal", make_pipeline(
#         DataFrameSelector(include_columns=machine.training_features.mlm_dtypes["nominal"]),
#         PandasTransformer(OneHotEncoder(drop="first")),
#     )),
#     ("ordinal", make_pipeline(
#         DataFrameSelector(include_columns=list(ordinal_encodings.keys())),
#         PandasTransformer(OrdinalEncoder(categories=list(ordinal_encodings.values()))),
#     )),
# #     ("bin", make_pipeline(
# #         DataFrameSelector(include_columns=machine.training_features.mlm_dtypes["continuous"]),
# #         PandasTransformer(KBinsDiscretizer(encode="ordinal")),
# #     )),
#     ("diff", make_pipeline(
#         DataFrameSelector(exclude_columns=machine.training_features.mlm_dtypes["nominal"] + list(ordinal_encodings.keys())),
#     )),
# ])

# machine.training_features = encode_pipe.fit_transform(machine.training_features)
# machine.validation_features = encode_pipe.transform(machine.validation_features)

# machine.update_dtypes()


# ###
# # complex encoding
# target_encode_pipe = PandasFeatureUnion([
#     ("target", make_pipeline(
#         DataFrameSelector(include_mlm_dtypes=["category"]),
#         KFoldEncoder(
#             target=machine.training_target,
#             cv=KFold(n_splits=5, shuffle=False),
#             encoder=TargetEncoder,
#         ),
#     )),
# #     ("woe", make_pipeline(
# #         DataFrameSelector(include_mlm_dtypes=["category"]),
# #         KFoldEncoder(
# #             target=machine.training_target,
# #             cv=KFold(n_splits=5, shuffle=False),
# #             encoder=WOEEncoder,
# #         ),
# #     )),
# #     ("catboost", make_pipeline(
# #         DataFrameSelector(include_mlm_dtypes=["category"]),
# #         KFoldEncoder(
# #             target=machine.training_target,
# #             cv=KFold(n_splits=5, shuffle=False),
# #             encoder=CatBoostEncoder,
# #         ),
# #     )),
#     ("diff", make_pipeline(
#         DataFrameSelector(exclude_mlm_dtypes=["category"]),
#     )),
# ])

# machine.training_features = target_encode_pipe.fit_transform(machine.training_features)
# machine.validation_features = target_encode_pipe.transform(machine.validation_features)

# machine.update_dtypes()


# ### scale features
# scale_pipe = PandasFeatureUnion([
#     ("scale", make_pipeline(
#         DataFrameSelector(),
#         PandasTransformer(RobustScaler())
#     )),
# ])

# machine.training_features = scale_pipe.fit_transform(machine.training_features)
# machine.validation_features = scale_pipe.transform(machine.validation_features)

# machine.update_dtypes()


# # #################################################################################
# # # remove outliers
# # outliers = np.array([258, 305, 438, 679, 737, 745])
# # machine.training_features = machine.training_features.drop(outliers)
# # machine.training_target = machine.training_target.drop(index=outliers)

# print('completed')


## Bayesian hyper-parameter optimization

In [None]:
# Hyperopt search space for the primary (first-level) models.
# Outer keys are estimator class names; each inner mapping pairs a constructor
# argument with the hyperopt distribution it is sampled from.
estimator_parameter_space = dict(
    SVC=dict(
        C=hp.uniform("C", 0.001, 15),
        decision_function_shape=hp.choice("decision_function_shape", ["ovo", "ovr"]),
        gamma=hp.uniform("gamma", 1e-9, 5),
    ),
    LGBMClassifier=dict(
        class_weight=hp.choice("class_weight", [None, "balanced"]),
        colsample_bytree=hp.uniform("colsample_bytree", 0.5, 1.0),
        boosting_type=hp.choice("boosting_type", ["gbdt", "dart", "goss"]),
        # Alternative kept for reference: nest "subsample" under the boosting type
        # 'boosting_type': hp.choice('boosting_type', [
        #     {'boosting_type': 'gbdt', 'subsample': hp.uniform('gdbt_subsample', 0.5, 1)},
        #     {'boosting_type': 'dart', 'subsample': hp.uniform('dart_subsample', 0.5, 1)},
        #     {'boosting_type': 'goss', 'subsample': 1.0},
        # ])
        learning_rate=hp.uniform("learning_rate", 0.01, 0.3),
        max_depth=hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        min_child_samples=hp.uniform("min_child_samples", 20, 500),
        n_estimators=hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        num_leaves=hp.uniform("num_leaves", 8, 150),
        reg_alpha=hp.uniform("reg_alpha", 0.0, 1.5),
        reg_lambda=hp.uniform("reg_lambda", 0.0, 1.0),
        subsample_for_bin=hp.uniform("subsample_for_bin", 20000, 400000),
    ),
    # Disabled for now:
    # LogisticRegression=dict(
    #     C=hp.loguniform("C", np.log(0.001), np.log(0.2)),
    #     penalty=hp.choice("penalty", ["l2", 'none']),
    # ),
    XGBClassifier=dict(
        colsample_bytree=hp.uniform("colsample_bytree", 0.5, 1.0),
        gamma=hp.uniform("gamma", 0.0, 10),
        learning_rate=hp.uniform("learning_rate", 0.01, 0.3),
        max_depth=hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        min_child_weight=hp.uniform("min_child_weight", 1, 20),
        n_estimators=hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        subsample=hp.uniform("subsample", 0.3, 1),
    ),
    RandomForestClassifier=dict(
        bootstrap=hp.choice("bootstrap", [True, False]),
        max_depth=hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        n_estimators=hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        max_features=hp.choice("max_features", ["auto", "sqrt"]),
        min_samples_split=hp.choice("min_samples_split", np.arange(2, 40, dtype=int)),
        min_samples_leaf=hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    ),
    GradientBoostingClassifier=dict(
        n_estimators=hp.choice("n_estimators", np.arange(100, 10000, 1, dtype=int)),
        max_depth=hp.choice("max_depth", np.arange(2, 20, dtype=int)),
        max_features=hp.choice("max_features", ["auto", "sqrt"]),
        learning_rate=hp.uniform("learning_rate", 0.01, 0.3),
        loss=hp.choice("loss", ["deviance", "exponential"]),
        min_samples_split=hp.choice("min_samples_split", np.arange(2, 40, dtype=int)),
        min_samples_leaf=hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    ),
    KNeighborsClassifier=dict(
        algorithm=hp.choice("algorithm", ["auto", "ball_tree", "kd_tree", "brute"]),
        n_neighbors=hp.choice("n_neighbors", np.arange(1, 20, dtype=int)),
        weights=hp.choice("weights", ["distance", "uniform"]),
    ),
)


In [None]:
# Run the Bayesian hyper-parameter search over every estimator in
# estimator_parameter_space (scoring="roc_auc", 5 folds, 125 iterations).
machine.exec_bayes_optim_search(
    estimator_parameter_space=estimator_parameter_space,
    training_features=machine.training_features,
    training_target=machine.training_target,
    validation_features=machine.validation_features,
    validation_target=machine.validation_target,
    scoring="roc_auc",
    n_folds=5,
    n_jobs=2,
    iters=125,
    show_progressbar=True,
    columns=fs.cross_val_features_dict
)

# Save the Machine object. Create the target directory first so that a fresh
# experiment folder does not make open() fail with FileNotFoundError.
machine_pickle_dir = os.path.join(machine.current_experiment_dir, "machine")
os.makedirs(machine_pickle_dir, exist_ok=True)
with open(os.path.join(machine_pickle_dir, "machine.pkl"), "wb") as handle:
    pickle.dump(machine, handle, protocol=pickle.HIGHEST_PROTOCOL)


In [None]:
# Ten best search iterations, ranked by validation score (highest first).
machine.bayes_optim_summary.sort_values("validation_score", ascending=False).head(10)

### Model loss by iteration

In [None]:
# Loss-by-iteration plot, one figure per estimator class found in the summary.
search_summary = machine.bayes_optim_summary
for estimator in np.unique(search_summary["estimator"]):
    machine.model_loss_plot(
        estimator_class=estimator,
        bayes_optim_summary=search_summary,
        save_plots=True,
    )

### Parameter selection by iteration

In [None]:
# Parameter-selection plots, one set per estimator class found in the summary.
# (chart_scale=15 was tried previously and is left at its default here.)
search_summary = machine.bayes_optim_summary
for estimator in np.unique(search_summary["estimator"]):
    machine.model_param_plot(
        estimator_class=estimator,
        bayes_optim_summary=search_summary,
        estimator_parameter_space=estimator_parameter_space,
        n_iter=1000,
        title_scale=1.2,
        save_plots=True,
    )

In [None]:
# Pair-wise comparison of selected LGBMClassifier search columns,
# with points coloured along the "iteration" column.
lgbm_search_results = machine.unpack_bayes_optim_summary(
    machine.bayes_optim_summary, "LGBMClassifier"
)

p = PrettierPlot(chart_scale=12)
p.pair_plot_custom(
    df=lgbm_search_results,
    columns=["colsample_bytree", "learning_rate", "iteration", "iter_loss"],
    gradient_col="iteration",
    color=style.style_grey,
)

## Model performance evaluation - standard models

In [None]:
# Pick the single best iteration per estimator, ranked on "validation_score".
top_models = machine.top_bayes_optim_models(
    bayes_optim_summary=machine.bayes_optim_summary,
    metric="validation_score",
    num_models=1,
)
top_models


In [None]:
# Rebuild one tuned model from the search summary for closer inspection.
# Other (estimator_class, model_iter) pairs used in earlier runs:
#   ("LGBMClassifier", 27)
#   ("XGBClassifier", 148)
#   ("RandomForestClassifier", 48)
#   ("GradientBoostingClassifier", 402)
#   ("AdaBoostClassifier", 418)
#   ("ExtraTreesClassifier", 261)
#   ("KNeighborsClassifier", 466)
estimator_class, model_iter = "SVC", 61

model = machine.BayesOptimClassifierBuilder(
    model_iter=model_iter,
    estimator_class=estimator_class,
    bayes_optim_summary=machine.bayes_optim_summary,
)


In [None]:
# Classification panel for the selected model on the training and validation sets.
# (n_folds=3 was used in an earlier run and is currently disabled.)
machine.binary_classification_panel(
    model=model,
    labels=["Dies", "Survives"],
    X_train=machine.training_features,
    y_train=machine.training_target,
    X_valid=machine.validation_features,
    y_valid=machine.validation_target,
)


In [None]:
# Prediction summary for the selected model on the training data.
machine.binary_prediction_summary(
    y_train=machine.training_target,
    X_train=machine.training_features,
    model=model,
)


In [None]:
# Classification panels (training + validation data) for every model in top_models.
panel_data = dict(
    X_train=machine.training_features,
    y_train=machine.training_target,
    X_valid=machine.validation_features,
    y_valid=machine.validation_target,
)

for estimator, model_iters in top_models.items():
    for model_iter in model_iters:
        # rebuild the tuned estimator for this search iteration
        model = machine.BayesOptimClassifierBuilder(
            model_iter=model_iter,
            estimator_class=estimator,
            bayes_optim_summary=machine.bayes_optim_summary,
        )
        machine.binary_classification_panel(
            model=model,
            labels=["Dies", "Survives"],
            **panel_data,
        )


# Model explainability

In [None]:
# Build and fit the model that the explainability cells below operate on.
# Other (estimator, model_iter) pairs used in earlier runs:
# estimator = 'LGBMClassifier'; model_iter = 21
# estimator = 'XGBClassifier'; model_iter = 148
# estimator = 'RandomForestClassifier'; model_iter = 493
# estimator = 'GradientBoostingClassifier'; model_iter = 402
# estimator = 'AdaBoostClassifier'; model_iter = 418
# estimator = 'ExtraTreesClassifier'; model_iter = 261
# estimator = 'KNeighborsClassifier'; model_iter = 466
estimator = 'SVC'; model_iter = 61

# Pass the estimator chosen in THIS cell. The previous code passed
# `estimator_class`, a variable left over from an earlier cell, so changing
# `estimator` above had no effect.
# NOTE(review): the SHAP helpers used below are named *_tree; confirm they
# support the estimator selected here.
model = machine.BayesOptimClassifierBuilder(
    bayes_optim_summary=machine.bayes_optim_summary,
    estimator_class=estimator,
    model_iter=model_iter,
)

model.fit(machine.training_features.values, machine.training_target.values)


## Permutation importance

## Partial dependence plots

## SHAP

### Training

In [None]:
# SHAP force plot for a single training observation (index 444).
machine.single_shap_viz_tree(
    obs_ix=444,
    model=model,
    data=machine.training_features,
    target=machine.training_target,
)

In [None]:
# SHAP force plots for the first five training observations.
for i in machine.training_features.index[:5]:
    machine.single_shap_viz_tree(
        obs_ix=i,
        model=model,
        data=machine.training_features,
        target=machine.training_target,
    )

In [None]:
# SHAP force plot covering every training observation.
visual = machine.multi_shap_viz_tree(
    obs_ixs=machine.training_features.index,
    model=model,
    data=machine.training_features,
)
visual

In [None]:
# SHAP values for the full training set; the middle return value is unused.
obs_data, _, obs_shap_values = machine.multi_shap_value_tree(
    obs_ixs=machine.training_features.index,
    model=model,
    data=machine.training_features,
)

In [None]:
# Grid of SHAP dependence plots for a hand-picked subset of training features.
# Earlier selections, kept for reference:
#   ["Pclass", "Age", "Fare", "SibSp", "Parch"]
#   ['Age*Fare', 'Title_ordinal_encoded', 'Sex_male', 'Fare', 'Pclass_ordinal_encoded', 'CabinQuarter_X']
grid_features = [
    "Age*Fare",
    "Title_ordinal_encoded",
    "Fare",
    "Pclass_ordinal_encoded",
    "Sex_male",
]

machine.shap_dependence_grid(
    grid_features=grid_features,
    all_features=machine.training_features.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    alpha=0.5,
    dot_size=35,
)

In [None]:
# One SHAP dependence plot: "Fare" on the scatter axis, coloured by "Sex_male".
p = PrettierPlot()
ax = p.make_canvas()

machine.shap_dependence_plot(
    scatter_feature="Fare",
    color_feature="Sex_male",
    feature_names=machine.training_features.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    alpha=0.5,
    dot_size=50,
    ax=ax,
)

In [None]:
# SHAP dependence plot for every training feature, each coloured by "Fare".
# Features are visited in descending order of total absolute SHAP value.
feature_names = machine.training_features.columns.tolist()
top_shap = np.argsort(-np.abs(obs_shap_values).sum(axis=0))

for top_ix in top_shap:
    p = PrettierPlot()
    ax = p.make_canvas()

    machine.shap_dependence_plot(
        scatter_feature=feature_names[top_ix],
        color_feature="Fare",
        feature_names=feature_names,
        obs_data=obs_data,
        obs_shap_values=obs_shap_values,
        alpha=0.5,
        dot_size=50,
        ax=ax,
    )

In [None]:
# SHAP summary plot over the training observations.
machine.shap_summary_plot(
    feature_names=machine.training_features.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
)

### Validation

In [None]:
# SHAP force plots for the first two validation observations.
# The keyword is `obs_ix`, as in the training-set calls; the previous `obsIx`
# spelling does not match that usage.
# NOTE(review): the training-set calls also pass `target=`; confirm whether
# it should be supplied here as well.
for i in machine.validation_features.index[:2]:
    machine.single_shap_viz_tree(
        obs_ix=i,
        model=model,
        data=machine.validation_features,
    )

In [None]:
# SHAP force plot covering every validation observation.
visual = machine.multi_shap_viz_tree(
    obs_ixs=machine.validation_features.index,
    model=model,
    data=machine.validation_features,
)
visual

In [None]:
# SHAP values for the full validation set; the middle return value is unused.
# This rebinds obs_data / obs_shap_values from the training section.
obs_data, _, obs_shap_values = machine.multi_shap_value_tree(
    obs_ixs=machine.validation_features.index,
    model=model,
    data=machine.validation_features,
)

In [None]:
# Grid of SHAP dependence plots for a subset of validation features.
# The earlier raw-column selection was overwritten immediately and never
# used, so it is kept only as a comment:
#   ["Pclass", "Age", "Fare", "SibSp", "Parch"]
grid_features = ["Pclass_ordinal_encoded", "Age", "Fare"]

machine.shap_dependence_grid(
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    grid_features=grid_features,
    all_features=machine.validation_features.columns,
    dot_size=35,
    alpha=0.5,
)

In [None]:
# One SHAP dependence plot on validation data: "Age" on the scatter axis,
# coloured by "Parch".
p = PrettierPlot()
ax = p.make_canvas()

machine.shap_dependence_plot(
    scatter_feature="Age",
    color_feature="Parch",
    feature_names=machine.validation_features.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
    alpha=0.5,
    dot_size=50,
    ax=ax,
)


In [None]:
# SHAP dependence plot for every validation feature, each coloured by "Age".
# Features are visited in descending order of total absolute SHAP value.
feature_names = machine.validation_features.columns.tolist()
top_shap = np.argsort(-np.abs(obs_shap_values).sum(axis=0))

for top_ix in top_shap:
    p = PrettierPlot()
    ax = p.make_canvas()

    machine.shap_dependence_plot(
        scatter_feature=feature_names[top_ix],
        color_feature="Age",
        feature_names=feature_names,
        obs_data=obs_data,
        obs_shap_values=obs_shap_values,
        alpha=0.5,
        dot_size=50,
        ax=ax,
    )

In [None]:
# SHAP summary plot over the validation observations.
machine.shap_summary_plot(
    feature_names=machine.validation_features.columns,
    obs_data=obs_data,
    obs_shap_values=obs_shap_values,
)

# Stacking

## Primary models

In [None]:
# Reference record: estimator class -> search iteration(s) used as primary models.
dict(
    LGBMClassifier=[778],
    LogisticRegression=[730],
    XGBClassifier=[371],
    RandomForestClassifier=[712],
    GradientBoostingClassifier=[965],
    AdaBoostClassifier=[512],
    ExtraTreesClassifier=[244],
    SVC=[551],
    KNeighborsClassifier=[576],
)

In [None]:
def build_primary_model(estimator_class, model_iter):
    """Rebuild one tuned first-level model from the primary search summary.

    Args:
        estimator_class: estimator class name as recorded in the summary,
            e.g. "LGBMClassifier".
        model_iter: search iteration whose parameters should be used.

    Returns:
        Whatever machine.BayesOptimClassifierBuilder returns for that
        estimator/iteration pair.
    """
    # Use the summary stored on `machine` and the `estimator_class` keyword,
    # matching the other BayesOptimClassifierBuilder calls in this notebook.
    # The previous code read a bare `bayes_optim_summary` global and passed
    # `estimator=`.
    return machine.BayesOptimClassifierBuilder(
        bayes_optim_summary=machine.bayes_optim_summary,
        estimator_class=estimator_class,
        model_iter=model_iter,
    )


# First-level models used for stacking (iterations recorded in the cell above).
lgb = build_primary_model("LGBMClassifier", 778)
lr = build_primary_model("LogisticRegression", 730)
xgb = build_primary_model("XGBClassifier", 371)
rf = build_primary_model("RandomForestClassifier", 712)
gb = build_primary_model("GradientBoostingClassifier", 965)
ada = build_primary_model("AdaBoostClassifier", 512)
ext = build_primary_model("ExtraTreesClassifier", 244)
svc = build_primary_model("SVC", 551)
kn = build_primary_model("KNeighborsClassifier", 576)

In [None]:
from vecstack import StackingTransformer
from sklearn.metrics import accuracy_score

# First-level estimators: (label, underlying model) pairs taken from the
# builders created in the previous cell.
primary_builders = {
    "lgb": lgb,
    "lr": lr,
    "xgb": xgb,
    "rf": rf,
    "gb": gb,
    "ada": ada,
    "ext": ext,
    "svc": svc,
    "kn": kn,
}
estimators = [(label, builder.model) for label, builder in primary_builders.items()]

# Configure the stacking transformer for classification, scored by accuracy,
# then fit it on the training data.
stack = StackingTransformer(
    estimators,
    regression=False,
    metric=accuracy_score,
    verbose=2,
).fit(machine.training_features, machine.training_target)

# Stacked features for the training and validation sets.
oof_train = stack.transform(machine.training_features)
oof_valid = stack.transform(machine.validation_features)

# TODO: use a 2nd-level estimator with the stacked features

In [None]:
# Get out-of-fold predictions from the models listed in top_models.
# The search summary is read from `machine`, as in the rest of the notebook;
# the previous code used a bare `bayes_optim_summary` global.
# NOTE(review): this rebinds oof_train / oof_valid produced by the vecstack
# cell above; confirm which stacking path is meant to feed the meta model.
oof_train, oof_valid, columns = machine.model_stacker(
    models=top_models,
    bayes_optim_summary=machine.bayes_optim_summary,
    X_train=machine.training_features.values,
    y_train=machine.training_target.values,
    X_valid=machine.validation_features.values,
    n_folds=10,
    n_jobs=10,
)

In [None]:
# Correlation heatmap of the stacked (first-level) predictions.
oof_train_frame = pd.DataFrame(oof_train, columns=columns)

p = PrettierPlot()
ax = p.make_canvas()
p.corr_heatmap(df=oof_train_frame, annot=True, ax=ax, vmin=0)
plt.show()

## Meta model

In [None]:
# Hyperopt search space for the meta (second-level) models.
# This rebinds estimator_parameter_space used by the primary search above.
estimator_parameter_space = dict(
    LGBMClassifier=dict(
        class_weight=hp.choice("class_weight", [None]),
        colsample_bytree=hp.uniform("colsample_bytree", 0.4, 0.7),
        boosting_type=hp.choice("boosting_type", ["dart"]),
        subsample=hp.uniform("subsample", 0.5, 1),
        learning_rate=hp.uniform("learning_rate", 0.15, 0.25),
        max_depth=hp.choice("max_depth", np.arange(4, 20, dtype=int)),
        min_child_samples=hp.quniform("min_child_samples", 50, 150, 5),
        n_estimators=hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        num_leaves=hp.quniform("num_leaves", 30, 70, 1),
        reg_alpha=hp.uniform("reg_alpha", 0.75, 1.25),
        reg_lambda=hp.uniform("reg_lambda", 0.0, 1.0),
        subsample_for_bin=hp.quniform("subsample_for_bin", 100000, 350000, 20000),
    ),
    XGBClassifier=dict(
        colsample_bytree=hp.uniform("colsample_bytree", 0.4, 0.7),
        gamma=hp.quniform("gamma", 0.0, 10, 0.05),
        learning_rate=hp.quniform("learning_rate", 0.01, 0.2, 0.01),
        max_depth=hp.choice("max_depth", np.arange(2, 15, dtype=int)),
        min_child_weight=hp.quniform("min_child_weight", 2.5, 7.5, 1),
        n_estimators=hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        subsample=hp.uniform("subsample", 0.4, 0.7),
    ),
    RandomForestClassifier=dict(
        bootstrap=hp.choice("bootstrap", [True, False]),
        max_depth=hp.choice("max_depth", np.arange(2, 10, dtype=int)),
        n_estimators=hp.choice("n_estimators", np.arange(100, 8000, 10, dtype=int)),
        max_features=hp.choice("max_features", ["sqrt"]),
        min_samples_split=hp.choice("min_samples_split", np.arange(15, 25, dtype=int)),
        min_samples_leaf=hp.choice("min_samples_leaf", np.arange(2, 20, dtype=int)),
    ),
    GradientBoostingClassifier=dict(
        n_estimators=hp.choice("n_estimators", np.arange(100, 4000, 10, dtype=int)),
        max_depth=hp.choice("max_depth", np.arange(2, 11, dtype=int)),
        max_features=hp.choice("max_features", ["sqrt"]),
        learning_rate=hp.quniform("learning_rate", 0.01, 0.09, 0.01),
        loss=hp.choice("loss", ["deviance", "exponential"]),
        min_samples_split=hp.choice("min_samples_split", np.arange(2, 40, dtype=int)),
        min_samples_leaf=hp.choice("min_samples_leaf", np.arange(2, 40, dtype=int)),
    ),
    SVC=dict(
        C=hp.uniform("C", 1e-8, 15),
        decision_function_shape=hp.choice("decision_function_shape", ["ovr", "ovo"]),
        gamma=hp.uniform("gamma", 1e-8, 1.5),
    ),
)

In [None]:
# Execute the Bayesian optimization search for the meta (second-level) models,
# using the stacked predictions in oof_train as the feature matrix.
# NOTE(review): the primary search earlier in this notebook calls this method
# with training_features / training_target / validation_features /
# validation_target / show_progressbar / columns. The data / target / verbose
# keywords used here look like an older signature - confirm against the
# current Machine API before running.
machine.exec_bayes_optim_search(
    estimator_parameter_space=estimator_parameter_space,
    data=oof_train,
    target=machine.training_target,
    scoring="accuracy",
    n_folds=8,
    n_jobs=8,
    iters=1000,
    verbose=0,
)

In [None]:
# Load the meta-model search summary written for this run date / analysis name
# and preview its first five rows.
meta_summary_path = "{}_hyperopt_meta_{}.csv".format(rundate, analysis)
bayes_optim_summary_meta = pd.read_csv(meta_summary_path)
bayes_optim_summary_meta.head()

In [None]:
# Loss-by-iteration plot for each meta-model estimator class.
# The keyword is `estimator_class`, matching the primary-model loss-plot cell;
# the previous `estimator=` keyword does not match that usage.
for estimator in np.unique(bayes_optim_summary_meta["estimator"]):
    machine.model_loss_plot(
        bayes_optim_summary=bayes_optim_summary_meta,
        estimator_class=estimator,
    )

In [None]:
# Parameter-selection plots for each meta-model estimator class.
# Method name and keyword follow the primary-model cell (model_param_plot,
# estimator_class=); the previous modelParamPlot / estimator= spelling does
# not match that usage.
for estimator in np.unique(bayes_optim_summary_meta["estimator"]):
    machine.model_param_plot(
        bayes_optim_summary=bayes_optim_summary_meta,
        estimator_class=estimator,
        estimator_parameter_space=estimator_parameter_space,
        n_iter=100,
        chart_scale=15,
    )

## Model performance evaluation - stacked models

In [None]:
# Best iteration per meta-model estimator.
# This rebinds top_models from the primary-model section.
top_models = machine.top_bayes_optim_models(
    num_models=1,
    bayes_optim_summary=bayes_optim_summary_meta,
)
top_models

In [None]:
# Classification panel for a single meta model, evaluated on the stacked
# training predictions with 4 folds.
# Other (estimator, model_iter) pairs used in earlier runs:
# estimator = 'GradientBoostingClassifier'; model_iter = 590
# estimator = 'XGBClassifier'; model_iter = 380
estimator = "SVC"; model_iter = 135

# `estimator_class` is the keyword used by the other
# BayesOptimClassifierBuilder calls in this notebook; the previous
# `estimator=` keyword does not match that usage.
model = machine.BayesOptimClassifierBuilder(
    bayes_optim_summary=bayes_optim_summary_meta,
    estimator_class=estimator,
    model_iter=model_iter,
)

machine.binary_classification_panel(
    model=model,
    X_train=oof_train,
    y_train=machine.training_target,
    labels=[0, 1],
    n_folds=4,
)

In [None]:
# Classification panels on the stacked training predictions for every model
# in top_models.
for estimator, model_iters in top_models.items():
    for model_iter in model_iters:
        # `estimator_class` is the keyword used by the primary-model cells;
        # the previous `estimator=` keyword does not match that usage.
        model = machine.BayesOptimClassifierBuilder(
            bayes_optim_summary=bayes_optim_summary_meta,
            estimator_class=estimator,
            model_iter=model_iter,
        )
        machine.binary_classification_panel(
            model=model,
            X_train=oof_train,
            y_train=machine.training_target,
            labels=[0, 1],
            n_folds=4,
        )

## Submission - stacked models

In [None]:
# Best second-level model. Trailing numbers are scores noted from earlier runs.
# estimator = "LGBMClassifier"; model_iter = 876  # 0.75119
# estimator = "XGBClassifier"; model_iter = 821  # 0.779
# estimator = "RandomForestClassifier"; model_iter = 82
# estimator = "GradientBoostingClassifier"; model_iter = 673  # 0.77511
estimator = "SVC"; model_iter = 538  # 0.77511

# Extract the parameters and instantiate the model. `estimator_class` is the
# keyword used by the other BayesOptimClassifierBuilder calls in this
# notebook; the previous `estimator=` keyword does not match that usage.
model = machine.BayesOptimClassifierBuilder(
    bayes_optim_summary=bayes_optim_summary_meta,
    estimator_class=estimator,
    model_iter=model_iter,
)

# Fit on the stacked training predictions, then predict on the stacked
# validation predictions.
model.fit(oof_train, machine.training_target.values)
y_pred = model.predict(oof_valid)

# quick sanity check: sum of the predicted labels
print(sum(y_pred))

In [None]:
# Generate the prediction submission file (two columns: PassengerId, Survived)
# and write it to "submission.csv" without the index.
# NOTE(review): y_pred is predicted from oof_valid in the previous cell, while
# PassengerId is read from df_train. Confirm that these refer to the same rows
# and have the same length; otherwise the ids should come from the validation
# data.
submit = pd.DataFrame({"PassengerId": df_train.PassengerId, "Survived": y_pred})
submit.to_csv("submission.csv", index=False)