In [None]:
import os.path

import joblib
import mlflow
import plotly.express as px
import pandas as pd
import numpy as np
import plotly.graph_objects as go
from matplotlib import pyplot as plt
from plotly.subplots import make_subplots

from sklearn.compose import ColumnTransformer, make_column_selector
from sklearn.dummy import DummyClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import (
    accuracy_score,
    precision_score,
    recall_score,
    f1_score,
    make_scorer,
    confusion_matrix,
    RocCurveDisplay,
    ConfusionMatrixDisplay,
    PrecisionRecallDisplay,
)
from sklearn.model_selection import train_test_split, GridSearchCV
from imblearn.over_sampling import SMOTE
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import (
    StandardScaler,
    OneHotEncoder,
    PolynomialFeatures,
)
from warnings import filterwarnings

from sklearn.tree import DecisionTreeClassifier

filterwarnings("ignore")

### Understanding the business

It comes as no surprise that large marketing campaigns are viewed negatively by the general public. Think about the last time you answered an unexpected phone call from an unknown number: if your experience is anything like mine, it was a scam caller, a telemarketer or a survey taker. I find myself hanging up quickly on these types of calls, if I answer them at all. Every failed cold call costs the company commissioning the campaign time and money. The bank partner commissioning this study is seeking to increase campaign success and reduce costs by focusing on profiles that are more likely to accept its offerings. To that end, the bank would like a model that better predicts which type of person will accept its offers.

### Understanding the Features

```
Input variables:
# bank client data:
1 - age (numeric)
2 - job : type of job (categorical: 'admin.','blue-collar','entrepreneur','housemaid','management','retired','self-employed','services','student','technician','unemployed','unknown')
3 - marital : marital status (categorical: 'divorced','married','single','unknown'; note: 'divorced' means divorced or widowed)
4 - education (categorical: 'basic.4y','basic.6y','basic.9y','high.school','illiterate','professional.course','university.degree','unknown')
5 - default: has credit in default? (categorical: 'no','yes','unknown')
6 - housing: has housing loan? (categorical: 'no','yes','unknown')
7 - loan: has personal loan? (categorical: 'no','yes','unknown')
# related with the last contact of the current campaign:
8 - contact: contact communication type (categorical: 'cellular','telephone')
9 - month: last contact month of year (categorical: 'jan', 'feb', 'mar', ..., 'nov', 'dec')
10 - day_of_week: last contact day of the week (categorical: 'mon','tue','wed','thu','fri')
11 - duration: last contact duration, in seconds (numeric). Important note: this attribute highly affects the output target (e.g., if duration=0 then y='no'). Yet, the duration is not known before a call is performed. Also, after the end of the call y is obviously known. Thus, this input should only be included for benchmark purposes and should be discarded if the intention is to have a realistic predictive model.
# other attributes:
12 - campaign: number of contacts performed during this campaign and for this client (numeric, includes last contact)
13 - pdays: number of days that passed by after the client was last contacted from a previous campaign (numeric; 999 means client was not previously contacted)
14 - previous: number of contacts performed before this campaign and for this client (numeric)
15 - poutcome: outcome of the previous marketing campaign (categorical: 'failure','nonexistent','success')
# social and economic context attributes
16 - emp.var.rate: employment variation rate - quarterly indicator (numeric)
17 - cons.price.idx: consumer price index - monthly indicator (numeric)
18 - cons.conf.idx: consumer confidence index - monthly indicator (numeric)
19 - euribor3m: euribor 3 month rate - daily indicator (numeric)
20 - nr.employed: number of employees - quarterly indicator (numeric)

Output variable (desired target):
21 - y - has the client subscribed a term deposit? (binary: 'yes','no')
```



### Understanding the Data


In [None]:
df = pd.read_csv("data/bank-additional-full.csv", sep=";")

In [None]:
df.head()

In [None]:
df.info()

In [None]:
df.describe(include="object")

In [None]:
# Only 12 duplicates
row_count = df.shape[0]
unique_row_count = df.drop_duplicates().shape[0]
print(f"Row count: {row_count}, Duplicate count: {row_count - unique_row_count}")

In [None]:
# No columns missing data
df.isna().mean().round(2)

In [None]:
df.sample(5)

In [None]:
# Class balance of the target column: number of rows per outcome.
y_counts = df["y"].value_counts()
acceptance_labels = {"count": "Count", "y": "Accepted Campaign"}
fig = px.bar(
    y_counts,
    y="count",
    labels=acceptance_labels,
    title="Those that say yes are in a minority class, suggesting an imbalanced dataset",
)
fig.show()
fig.write_image("images/acceptance_count.png")

In [None]:
# Box plot of client age over the whole dataset.
age_box_title = "Most potential calls are towards people aged 32-47"
fig = px.box(df, y="age", labels={"age": "Age"}, title=age_box_title)
fig.show()
fig.write_image("images/age_box.png")

In [None]:
# Store commonly used group by: rows grouped by the campaign outcome `y`.
# It is reused by the per-feature ratio plots that follow.
by_y_df = df.groupby("y")
# Per-outcome summary (count / unique / top / freq) of the text columns.
by_y_df.describe(include="object")

In [None]:
# Ratio calculation, this should be a better metric to track over counts:
# for every age, the share of rows that ended in each outcome.
age_counts_by_outcome = by_y_df["age"].value_counts()
age_counts_total = df["age"].value_counts()
age_ratio = (age_counts_by_outcome / age_counts_total).reset_index()

age_labels = {
    "age": "Age",
    "count": "Ratio accepting campaign",
    "y": "Accepted Campaign",
}
fig = px.bar(
    age_ratio.sort_values(by="count"),
    x="age",
    y="count",
    color="y",
    labels=age_labels,
    title="Older than 60 and younger than 23 gives at least a 20% success rate",
)
fig.show()
fig.write_image("images/age_acceptance_ratio.png")

In [None]:
# For every job, the share of rows that ended in each outcome.
job_counts_by_outcome = by_y_df["job"].value_counts()
job_counts_total = df["job"].value_counts()
job_ratio = (job_counts_by_outcome / job_counts_total).reset_index()

job_labels = {
    "job": "Job",
    "count": "Ratio accepting campaign",
    "y": "Accepted Campaign",
}
fig = px.bar(
    job_ratio.sort_values(by="count"),
    x="job",
    y="count",
    color="y",
    labels=job_labels,
    title="Students and retirees are more likely to accept campaign",
)
fig.show()
fig.write_image("images/job_acceptance_ratio.png")

In [None]:
# For every education level, the share of rows that ended in each outcome.
education_counts_by_outcome = by_y_df["education"].value_counts()
education_counts_total = df["education"].value_counts()
education_ratio = (education_counts_by_outcome / education_counts_total).reset_index()

education_labels = {
    "education": "Education",
    "count": "Ratio accepting campaign",
    "y": "Accepted Campaign",
}
fig = px.bar(
    education_ratio.sort_values(by="count"),
    x="education",
    y="count",
    color="y",
    labels=education_labels,
    title="People who are illiterate are more likely to accept campaign",
)
fig.show()
fig.write_image("images/education_acceptance_ratio.png")

In [None]:
# For every contact month, the share of rows that ended in each outcome.
month_counts_by_outcome = by_y_df["month"].value_counts()
month_counts_total = df["month"].value_counts()
month_ratio = (month_counts_by_outcome / month_counts_total).reset_index()

month_labels = {
    "month": "Month",
    "count": "Ratio accepting campaign",
    "y": "Accepted Campaign",
}
fig = px.bar(
    month_ratio.sort_values(by="count"),
    x="month",
    y="count",
    color="y",
    labels=month_labels,
    title="Month has a strong effect on acceptance",
)
fig.show()
fig.write_image("images/month_acceptance_ratio.png")

In [None]:
# Histogram of call duration, coloured by campaign outcome.
duration_labels = {"duration": "Duration", "y": "Accepted Campaign"}
fig = px.histogram(
    df,
    x="duration",
    color="y",
    labels=duration_labels,
    title="The longer a call goes the higher the likelihood of acceptance",
)
fig.show()
fig.write_image("images/duration_acceptance_ratio.png")

In [None]:
# Histogram of the number of contacts made in this campaign, coloured by outcome.
campaign_labels = {"campaign": "Number of calls", "y": "Accepted Campaign"}
fig = px.histogram(
    df,
    x="campaign",
    color="y",
    labels=campaign_labels,
    title="There are diminishing returns when making a call to the same person",
)
fig.show()
fig.write_image("images/calls_acceptance_ratio.png")

In [None]:
# pdays == 999 is the sentinel for "never contacted before", so those rows are
# left out of the days-since-last-contact histogram.
previously_contacted = df[df["pdays"] != 999]
pdays_labels = {
    "pdays": "Days since last contact",
    "y": "Accepted Campaign",
    "count": "Count",
}
fig = px.histogram(
    previously_contacted,
    x="pdays",
    color="y",
    labels=pdays_labels,
    title="Following up within a week increases the chance of success",
)
fig.show()
fig.write_image("images/days_since_last_contact_acceptance_ratio.png")

In [None]:
# Correlation plots: pairwise correlation of the numeric columns, rounded for display.
numeric_corr = df.corr(numeric_only=True)
corr_matrix = numeric_corr.round(2)

heatmap_title = "Unsurprisingly, social and economic attributes highly correlated"
fig = px.imshow(
    corr_matrix,
    aspect="auto",
    color_continuous_scale="RdBu_r",
    title=heatmap_title,
)
fig.update_layout(height=1000, width=1000, showlegend=False)
fig.show()
fig.write_image("images/correlation.png")

In [None]:
# Row counts per consumer-price-index value, split by campaign outcome.
price_idx_counts = by_y_df["cons.price.idx"].value_counts().reset_index()
price_idx_labels = {
    "cons.price.idx": "Consumer price index",
    "y": "Accepted Campaign",
    "count": "Count",
}
fig = px.histogram(
    price_idx_counts,
    x="cons.price.idx",
    y="count",
    color="y",
    labels=price_idx_labels,
    title="Number of acceptors look static across consumer price index",
)
fig.show()
fig.write_image("images/consumer_price_index_acceptance_ratio.png")

In [None]:
# Row counts per consumer-confidence-index value, split by campaign outcome.
conf_idx_counts = by_y_df["cons.conf.idx"].value_counts().reset_index()
conf_idx_labels = {
    "cons.conf.idx": "Consumer confidence index",
    "y": "Accepted Campaign",
    "count": "Count",
}
fig = px.histogram(
    conf_idx_counts,
    x="cons.conf.idx",
    y="count",
    color="y",
    labels=conf_idx_labels,
    title="Consumer confidence index may be used to improve acceptance",
)
fig.show()
fig.write_image("images/consumer_confidence_index_acceptance_ratio.png")

In [None]:
# For every previous-campaign outcome, the share of rows that ended in each
# outcome of the current campaign.
poutcome_counts_by_outcome = by_y_df["poutcome"].value_counts()
poutcome_counts_total = df["poutcome"].value_counts()
poutcome_ratio = (poutcome_counts_by_outcome / poutcome_counts_total).reset_index()

poutcome_labels = {
    "poutcome": "Previous outcome",
    "y": "Accepted Campaign",
    "count": "Ratio accepting campaign",
}
fig = px.bar(
    poutcome_ratio,
    x="poutcome",
    y="count",
    color="y",
    labels=poutcome_labels,
    title="Those that accept the previous campaign are more likely to accept the next campaign",
)
fig.update_layout(height=800, width=1000)
fig.show()
fig.write_image("images/poutcome_acceptance_ratio.png")

In [None]:
# Scatter of call duration against number of calls; zero-length calls are excluded.
nonzero_duration = df[df["duration"] != 0]
scatter_labels = {
    "campaign": "Number of calls",
    "duration": "Duration",
    "y": "Accepted Campaign",
}
fig = px.scatter(
    nonzero_duration,
    x="duration",
    y="campaign",
    color="y",
    labels=scatter_labels,
    title="Number of calls does not correlate with duration of the call",
)
fig.show()
fig.write_image("images/calls_vs_duration.png")

### Understanding the data
The first thing that jumps out is how imbalanced the dataset is. This is to be expected considering that we are working with telemarketing data. I was also able to find some strong predictors for the accepting class. In particular, the following fields show strong promise: month, employment, number of contacts. It is also important to note that the pdays column uses 999 to signify that the client was not previously contacted. The authors of this dataset also recommend excluding the duration column from any realistic predictive model: it strongly affects the output, but it is only known once the call has ended, at which point the outcome is known as well.

### Data Preparation

In [None]:
# Remove outliers: keep rows whose age is positive and lies within the
# 1.5 * IQR fences around the first and third quartiles.
ages = df["age"]
q1 = ages.quantile(0.25)
q3 = ages.quantile(0.75)
iqr = q3 - q1
lower_bound = q1 - 1.5 * iqr
upper_bound = q3 + 1.5 * iqr
within_fences = (ages > 0) & (ages >= lower_bound) & (ages <= upper_bound)
df = df[within_fences]

In [None]:
# `duration` is only known after the call has finished, so it is not kept as a feature.
df = df.drop(columns="duration")

In [None]:
# Keep only the first occurrence of each fully identical row.
df = df.drop_duplicates()

In [None]:
# Enable MLflow's automatic experiment tracking for scikit-learn. This will help with tracking experiments
mlflow.sklearn.autolog()

In [None]:
# Make preprocessor.
# Three transformers run side by side on the input frame and their outputs are
# stacked in this order: scaled numerics, one-hot categoricals, polynomial terms.
# NOTE(review): "poly" receives the raw (unscaled) numeric columns, and with
# include_bias=False its degree-1 terms repeat those columns next to the
# scaled copies from "scaler" - confirm this is intended.
scaler_columns = make_column_selector(dtype_include=np.number)
onehot_columns = make_column_selector(dtype_include=np.object_)
poly_columns = make_column_selector(dtype_include=np.number)

preprocessor = ColumnTransformer(
    transformers=[
        ("scaler", StandardScaler(), scaler_columns),
        ("onehot", OneHotEncoder(handle_unknown="ignore"), onehot_columns),
        ("poly", PolynomialFeatures(include_bias=False), poly_columns),
    ]
)

In [None]:
# Split data into features and target.
X = df.drop("y", axis=1)
y = df["y"]

# Hold out 20% for testing. The target is heavily imbalanced, so stratify on
# it: train and test then keep the same yes/no proportion instead of leaving
# the size of the minority class in the test set to chance.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)

### Models

In [None]:
def eval_model(pipe, model_name, key, X_train, X_test, y_train, y_test, best_params):
    """Score a fitted classifier on the train and test splits.

    Args:
        pipe: Fitted estimator or pipeline exposing ``predict``.
        model_name: Label stored under ``"model_name"`` in the result.
        key: Name of the pipeline step whose ``coef_`` should be reported.
        X_train, X_test: Feature sets for the two splits.
        y_train, y_test: True labels for the two splits.
        best_params: Hyper-parameters to record alongside the scores.

    Returns:
        dict with accuracy, precision, recall and F1 for both splits (the
        positive class is ``"yes"``), plus ``"coef"`` and ``"best_params"``.
        ``"coef"`` is None when ``pipe`` has no ``named_steps`` or the step
        named ``key`` has no ``coef_``.
    """
    yes = "yes"

    # Predict once per split and reuse the result for all four metrics.
    train_pred = pipe.predict(X_train)
    test_pred = pipe.predict(X_test)

    # scikit-learn metrics take (y_true, y_pred). Precision and recall are not
    # symmetric, so passing the predictions first would swap the two.
    train_accuracy = accuracy_score(y_train, train_pred)
    train_precision = precision_score(y_train, train_pred, pos_label=yes)
    train_recall = recall_score(y_train, train_pred, pos_label=yes)
    train_f1 = f1_score(y_train, train_pred, pos_label=yes)

    test_accuracy = accuracy_score(y_test, test_pred)
    test_precision = precision_score(y_test, test_pred, pos_label=yes)
    test_recall = recall_score(y_test, test_pred, pos_label=yes)
    test_f1 = f1_score(y_test, test_pred, pos_label=yes)

    # Only pipelines have named steps, and only some models expose coef_.
    coef = None
    if hasattr(pipe, "named_steps"):
        coef = getattr(pipe.named_steps[key], "coef_", None)

    return {
        "model_name": model_name,
        "train_accuracy": train_accuracy,
        "test_accuracy": test_accuracy,
        "train_precision": train_precision,
        "test_precision": test_precision,
        "train_recall": train_recall,
        "test_recall": test_recall,
        "train_f1": train_f1,
        "test_f1": test_f1,
        "coef": coef,
        "best_params": best_params,
    }

In [None]:
# Scorer passed to every grid search below: precision on the "yes" class.
precision_scorer = make_scorer(precision_score, pos_label="yes")

In [None]:
scores = []

In [None]:
# Baseline that always predicts the most frequent class seen in training.
dummy_clf = DummyClassifier(strategy="most_frequent").fit(X_train, y_train)

In [None]:
# Score the baseline and add it to the shared `scores` list.
dummy_evaluation = eval_model(
    pipe=dummy_clf,
    model_name="DummyClassifier",
    key="dummy",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    best_params={},
)
scores.append(dummy_evaluation)

In [None]:
# Preprocess -> SMOTE resampling -> decision tree.
tree_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("tree", DecisionTreeClassifier(random_state=42)),
]
tree_pipe = Pipeline(steps=tree_steps)

# Search space; every key is "<step name>__<parameter>".
params = {
    "smote__k_neighbors": [3, 5],
    "smote__sampling_strategy": [0.3, 0.5],
    "tree__criterion": ["gini", "entropy"],
    "tree__max_depth": [3, 4, 5],
    "tree__min_samples_split": [4, 5],
    "tree__min_samples_leaf": [3, 4, 5],
    "preprocessor__poly__degree": [2, 3],
}

grid_tree = GridSearchCV(
    estimator=tree_pipe,
    param_grid=params,
    scoring=precision_scorer,
    verbose=1,
)
grid_tree.fit(X_train, y_train)
print(grid_tree.best_params_)

In [None]:
# Score the tuned decision-tree pipeline and add it to the shared `scores` list.
tree_evaluation = eval_model(
    pipe=grid_tree.best_estimator_,
    model_name="DecisionTreeClassifier",
    key="tree",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    best_params=grid_tree.best_params_,
)
scores.append(tree_evaluation)

In [None]:
# Persist the tuned decision-tree pipeline (preprocessor + SMOTE + model) to disk.
joblib.dump(grid_tree.best_estimator_, "bank_marketing_decision_tree.pkl")

In [None]:
# Preprocess -> SMOTE resampling -> logistic regression.
lr_pipe = Pipeline(
    [
        ("preprocessor", preprocessor),
        ("smote", SMOTE(random_state=42)),
        ("lr", LogisticRegression(random_state=42)),
    ]
)

# Hyper-parameters searched for every penalty.
common_params = {
    "smote__k_neighbors": [3, 5],
    "smote__sampling_strategy": [0.3, 0.5],
    "lr__max_iter": [500, 1000],
    "lr__C": [0.01, 0.1, 1],
    "preprocessor__poly__degree": [2, 3],
}

# The default solver (lbfgs) only supports the l2 penalty; with it, every l1 /
# elasticnet candidate fails to fit and is scored NaN. Give each penalty a
# solver that supports it: saga handles both l1 and elasticnet, and
# elasticnet additionally requires l1_ratio.
params = [
    {**common_params, "lr__penalty": ["l2"]},
    {**common_params, "lr__penalty": ["l1"], "lr__solver": ["saga"]},
    {
        **common_params,
        "lr__penalty": ["elasticnet"],
        "lr__solver": ["saga"],
        "lr__l1_ratio": [0.5],
    },
]

grid_lr = GridSearchCV(lr_pipe, param_grid=params, scoring=precision_scorer, verbose=1)
grid_lr.fit(X_train, y_train)
print(grid_lr.best_params_)

In [None]:
# Score the tuned logistic-regression pipeline and add it to the shared `scores` list.
lr_evaluation = eval_model(
    pipe=grid_lr.best_estimator_,
    model_name="LogisticRegression",
    key="lr",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    best_params=grid_lr.best_params_,
)
scores.append(lr_evaluation)

In [None]:
# Persist the tuned logistic-regression pipeline (preprocessor + SMOTE + model) to disk.
joblib.dump(grid_lr.best_estimator_, "bank_marketing_logistic_regression.pkl")

In [None]:
# Preprocess -> SMOTE resampling -> k-nearest neighbours.
knn_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("knn", KNeighborsClassifier()),
]
knn_pipe = Pipeline(steps=knn_steps)

# Search space; every key is "<step name>__<parameter>".
params = {
    "smote__k_neighbors": [3, 5],
    "smote__sampling_strategy": [0.3, 0.5],
    "knn__n_neighbors": [3, 5],
    "preprocessor__poly__degree": [2, 3],
}

grid_knn = GridSearchCV(
    estimator=knn_pipe,
    param_grid=params,
    scoring=precision_scorer,
    verbose=1,
)
grid_knn.fit(X_train, y_train)
print(grid_knn.best_params_)

In [None]:
# Score the tuned KNN pipeline and add it to the shared `scores` list.
knn_evaluation = eval_model(
    pipe=grid_knn.best_estimator_,
    model_name="KNeighborsClassifier",
    key="knn",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    best_params=grid_knn.best_params_,
)
scores.append(knn_evaluation)

In [None]:
# Persist the tuned KNN pipeline (preprocessor + SMOTE + model) to disk.
joblib.dump(grid_knn.best_estimator_, "bank_marketing_knn.pkl")

In [None]:
# Preprocess -> SMOTE resampling -> support vector classifier.
svc_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("svc", SVC(random_state=42)),
]
svc_pipe = Pipeline(steps=svc_steps)

# Search space; every key is "<step name>__<parameter>".
params = {
    "smote__k_neighbors": [3, 5],
    "smote__sampling_strategy": [0.3, 0.5],
    "svc__C": [0.01, 0.1, 1],
    "preprocessor__poly__degree": [2, 3],
}

grid_svc = GridSearchCV(
    estimator=svc_pipe,
    param_grid=params,
    scoring=precision_scorer,
    verbose=1,
)
grid_svc.fit(X_train, y_train)
print(grid_svc.best_params_)

In [None]:
# Score the tuned SVC pipeline and add it to the shared `scores` list.
svc_evaluation = eval_model(
    pipe=grid_svc.best_estimator_,
    model_name="SVC",
    key="svc",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    best_params=grid_svc.best_params_,
)
scores.append(svc_evaluation)

In [None]:
# Persist the tuned SVC pipeline (preprocessor + SMOTE + model) to disk.
joblib.dump(grid_svc.best_estimator_, "bank_marketing_svc.pkl")

In [None]:
# Preprocess -> SMOTE resampling -> random forest.
forest_steps = [
    ("preprocessor", preprocessor),
    ("smote", SMOTE(random_state=42)),
    ("forest", RandomForestClassifier(random_state=42)),
]
forest_pipe = Pipeline(steps=forest_steps)

# Search space; every key is "<step name>__<parameter>".
params = {
    "smote__k_neighbors": [3, 5],
    "smote__sampling_strategy": [0.3, 0.5],
    "forest__n_estimators": [100, 200],
    "forest__max_depth": [3, 4, 5],
    "forest__min_samples_split": [4, 5],
    "forest__min_samples_leaf": [3, 4, 5],
    "preprocessor__poly__degree": [2, 3],
}

grid_forest = GridSearchCV(
    estimator=forest_pipe,
    param_grid=params,
    scoring=precision_scorer,
    verbose=1,
)
grid_forest.fit(X_train, y_train)
print(grid_forest.best_params_)

In [None]:
# Score the tuned random-forest pipeline and add it to the shared `scores` list.
forest_evaluation = eval_model(
    pipe=grid_forest.best_estimator_,
    model_name="RandomForestClassifier",
    key="forest",
    X_train=X_train,
    X_test=X_test,
    y_train=y_train,
    y_test=y_test,
    best_params=grid_forest.best_params_,
)
scores.append(forest_evaluation)

In [None]:
# Persist the tuned random-forest pipeline (preprocessor + SMOTE + model) to disk.
joblib.dump(grid_forest.best_estimator_, "bank_marketing_forest.pkl")

In [None]:
# One row per evaluated model.
scores_df = pd.DataFrame(scores)
# index=False: the positional index carries no information and would come back
# as a stray "Unnamed: 0" column when the file is re-read with pd.read_csv.
scores_df.to_csv("data/scores_df.csv", index=False)

### Evaluation

In [None]:
def load_scores_data():
    """Load the saved model scores from ``data/scores_df.csv``.

    Returns:
        The scores as a DataFrame, or None (after printing a hint) when the
        file does not exist yet.
    """
    scores_path = "data/scores_df.csv"
    if not os.path.exists(scores_path):
        print("Scores csv doesn't exist, train and score models first")
        return None
    return pd.read_csv(scores_path)

In [None]:
# Numeric columns: the numeric inputs from the feature description, without
# `duration`, which is dropped during data preparation.
numeric_columns = [
    "age",
    "campaign",
    "pdays",
    "previous",
    "emp.var.rate",
    "cons.price.idx",
    "cons.conf.idx",
    "euribor3m",
    "nr.employed",
]

# Non-numeric columns: the categorical inputs (the target `y` is not included).
category_columns = [
    "job",
    "marital",
    "education",
    "default",
    "housing",
    "loan",
    "contact",
    "month",
    "day_of_week",
    "poutcome",
]

In [None]:
def get_feature_names(grid):
    """Return the model's input feature names, in model column order.

    Args:
        grid: Fitted grid search whose ``best_estimator_`` is a pipeline with
            a ``"preprocessor"`` ColumnTransformer step holding the
            ``"scaler"``, ``"onehot"`` and ``"poly"`` transformers.

    Returns:
        1-D array of names, aligned position by position with the columns the
        final estimator was trained on (and so with ``coef_`` /
        ``feature_importances_``).
    """
    preprocessor = grid.best_estimator_.named_steps["preprocessor"]
    fitted = preprocessor.named_transformers_

    # A ColumnTransformer stacks its outputs in the order the transformers are
    # listed: scaler, onehot, poly. The names must follow that same order or
    # every one-hot / polynomial name ends up attached to the wrong column.
    transformer_order = ("scaler", "onehot", "poly")

    # Called without arguments, each transformer reports names based on the
    # column names it recorded when it was fitted.
    return np.concatenate(
        [fitted[name].get_feature_names_out() for name in transformer_order]
    )

In [None]:
scores_df = load_scores_data()

In [None]:
# (train column, test column) pairs, one pair per subplot row.
metrics = [
    ("train_accuracy", "test_accuracy"),
    ("train_precision", "test_precision"),
    ("train_recall", "test_recall"),
    ("train_f1", "test_f1"),
]
metric_titles = (
    "Train vs Test Accuracy",
    "Train vs Test Precision",
    "Train vs Test Recall",
    "Train vs Test F1",
)
fig = make_subplots(rows=len(metrics), cols=1, subplot_titles=metric_titles)


def title_case(word):
    """Turn a snake_case name into a title, e.g. "train_f1" -> "Train F1"."""
    # str.replace returns a new string, so its result has to be used;
    # calling it for its side effect leaves the underscores in place.
    return word.replace("_", " ").title()


# One subplot row per metric; within a row the train bar is added before the test bar.
for row, metric_pair in enumerate(metrics, start=1):
    for metric_column in metric_pair[:2]:
        fig.add_trace(
            go.Bar(
                x=scores_df["model_name"],
                y=scores_df[metric_column],
                name=title_case(metric_column),
            ),
            row=row,
            col=1,
        )

layout_options = {
    "title_text": "Train vs Test Accuracy, Precision, Recall and F1",
    "yaxis_title": "Metric performance",
    "height": 1000,
    "width": 1800,
    "showlegend": True,
    "barmode": "group",
    "bargap": 0.35,
}
fig.update_layout(**layout_options)
fig.show()
fig.write_image("images/metric_comparison.png")

In [None]:
# ROC curves of all tuned models on the test split, drawn on shared axes.
fig, ax = plt.subplots()
roc_models = [
    (grid_lr, "LogisticRegression"),
    (grid_knn, "KNN"),
    (grid_tree, "DecisionTree"),
    (grid_svc, "SVC"),
    (grid_forest, "RandomForest"),
]
for fitted_grid, curve_name in roc_models:
    RocCurveDisplay.from_estimator(
        fitted_grid, X_test, y_test, pos_label="yes", ax=ax, name=curve_name
    )

plt.grid()
# Diagonal reference line.
plt.plot(np.arange(0, 1.1, 0.1), np.arange(0, 1.1, 0.1), label="baseline")
ax.set_title("Roc Auc curve comparison")
# Build the legend before saving; otherwise the saved image is missing the
# "baseline" entry, which is only added by this legend call.
plt.legend()
plt.savefig("images/roc_curve_comparison.png")
plt.show()

In [None]:
# Precision-recall curves of all tuned models on the test split, on shared axes.
fig, ax = plt.subplots()
pr_models = [
    (grid_lr, "LogisticRegression"),
    (grid_knn, "KNN"),
    (grid_tree, "DecisionTree"),
    (grid_svc, "SVC"),
    (grid_forest, "RandomForest"),
]
for fitted_grid, curve_name in pr_models:
    PrecisionRecallDisplay.from_estimator(
        fitted_grid, X_test, y_test, pos_label="yes", ax=ax, name=curve_name
    )

plt.grid()
# The chance level of a precision-recall curve is a horizontal line at the
# share of positives in the data, not the diagonal used for ROC curves.
positive_rate = (y_test == "yes").mean()
plt.plot([0, 1], [positive_rate, positive_rate], label="baseline")
ax.set_title("Precision Recall curve comparison")
# Build the legend before saving so the saved image includes "baseline".
plt.legend()
plt.savefig("images/pr_curve_comparison.png")
plt.show()

In [None]:
# Absolute logistic-regression coefficients, paired with the feature names.
lr_model = grid_lr.best_estimator_.named_steps["lr"]
logistic_importance = {
    "Feature": get_feature_names(grid_lr),
    "Importance - Coef": np.abs(lr_model.coef_[0]),
}
logistic_importance_df = pd.DataFrame(logistic_importance)

In [None]:
# Largest absolute coefficients first; plot the top ten.
logistic_importance_df = logistic_importance_df.sort_values(
    by="Importance - Coef", ascending=False
)
top_logistic_features = logistic_importance_df[:10]
fig = px.bar(
    top_logistic_features,
    x="Importance - Coef",
    y="Feature",
    orientation="h",
    title="Top Ten Features LogisticRegression",
)
fig.show()
fig.write_image("images/logistic_regression_top_features.png")

In [None]:
# Logistic Regression confusion matrix on the test split
lr_best = grid_lr.best_estimator_
lr_test_pred = lr_best.predict(X_test)
cm = confusion_matrix(y_test, lr_test_pred)
fig = ConfusionMatrixDisplay(cm, display_labels=lr_best.classes_)
fig.plot()
fig.ax_.set_title("LogisticRegression Confusion Matrix")
plt.savefig("images/logistic_regression_cm.png", dpi=100)
plt.show()

In [None]:
# Decision-tree feature importances, paired with the feature names.
tree_model = grid_tree.best_estimator_.named_steps["tree"]
tree_importance = {
    "Feature": get_feature_names(grid_tree),
    "Importance": np.abs(tree_model.feature_importances_),
}
tree_importance_df = pd.DataFrame(tree_importance)

In [None]:
# Largest importances first; plot the top ten.
tree_importance_df = tree_importance_df.sort_values(by="Importance", ascending=False)
top_tree_features = tree_importance_df[:10]
fig = px.bar(
    top_tree_features,
    x="Importance",
    y="Feature",
    orientation="h",
    title="Top Ten Features DecisionTree",
)
fig.show()
fig.write_image("images/decision_tree_top_features.png")

In [None]:
# DecisionTree confusion matrix on the test split
tree_best = grid_tree.best_estimator_
tree_test_pred = tree_best.predict(X_test)
cm = confusion_matrix(y_test, tree_test_pred)
fig = ConfusionMatrixDisplay(cm, display_labels=tree_best.classes_)
fig.plot()
fig.ax_.set_title("DecisionTree Confusion Matrix")
plt.savefig("images/tree_cm.png", dpi=100)
plt.show()

In [None]:
# KNN confusion matrix on the test split
knn_best = grid_knn.best_estimator_
knn_test_pred = knn_best.predict(X_test)
cm = confusion_matrix(y_test, knn_test_pred)
fig = ConfusionMatrixDisplay(cm, display_labels=knn_best.classes_)
fig.plot()
fig.ax_.set_title("KNN Confusion Matrix")
plt.savefig("images/knn_cm.png", dpi=100)
plt.show()

In [None]:
# SVC confusion matrix on the test split
svc_best = grid_svc.best_estimator_
svc_test_pred = svc_best.predict(X_test)
cm = confusion_matrix(y_test, svc_test_pred)
fig = ConfusionMatrixDisplay(cm, display_labels=svc_best.classes_)
fig.plot()
fig.ax_.set_title("SVC Confusion Matrix")
plt.savefig("images/svc_cm.png", dpi=100)
plt.show()

In [None]:
# Random-forest feature importances, paired with the feature names.
forest_model = grid_forest.best_estimator_.named_steps["forest"]
forest_importance = {
    "Feature": get_feature_names(grid_forest),
    "Importance": np.abs(forest_model.feature_importances_),
}
forest_importance_df = pd.DataFrame(forest_importance)

In [None]:
# Largest importances first; plot the top ten.
forest_importance_df = forest_importance_df.sort_values(
    by="Importance", ascending=False
)
top_forest_features = forest_importance_df[:10]
fig = px.bar(
    top_forest_features,
    x="Importance",
    y="Feature",
    orientation="h",
    title="Top Ten Features RandomForest",
)
fig.show()
fig.write_image("images/forest_top_features.png")

In [None]:
# RandomForest confusion matrix on the test split
forest_best = grid_forest.best_estimator_
forest_test_pred = forest_best.predict(X_test)
cm = confusion_matrix(y_test, forest_test_pred)
fig = ConfusionMatrixDisplay(cm, display_labels=forest_best.classes_)
fig.plot()
fig.ax_.set_title("RandomForest Confusion Matrix")
plt.savefig("images/forest_cm.png", dpi=100)
plt.show()