ENH optimize & add treeple tutorial figures (#260)
* DOC update section titles for ordering

* DOC specify sphinx thumbnail figure for pAUC&S@98

* ENH optimize figure style

* ENH optimize figure styles for single view

* ENH optimize figures for multiview

* ENH optimize figures styles

* ENH add 2D plot for permuted data

* ENH add hellinger distance tutorial & DOC optimize text

* ENH add tight layout to prevent figure cutoff

* DOC modify code ref color

* DOC optimize figure size & legend
PSSF23 committed Apr 24, 2024
1 parent 638dd7c commit 920a819
Showing 10 changed files with 520 additions and 166 deletions.
6 changes: 3 additions & 3 deletions doc/_static/style.css
@@ -42,17 +42,17 @@ h4 {

/* Links in the Note boxes */
.note a {
-  color: white;
+  color: blue;
   text-decoration: underline;
}

.note a:hover {
-  color: white;
+  color: blue;
   font-weight: bold;
   text-decoration: underline;
}

/* Links in "Note" boxes */
.alert-info a code span {
-  color: white;
+  color: blue;
}
70 changes: 45 additions & 25 deletions examples/treeple/treeple_tutorial_CMI.py
@@ -1,18 +1,24 @@
"""
-====================================
-Treeple tutorial for calculating CMI
-====================================
+=====================
+2-1b: Calculating CMI
+=====================
"""

import matplotlib.pyplot as plt
import numpy as np
+import pandas as pd
+import seaborn as sns
from scipy.stats import entropy

from sktree.datasets import make_trunk_classification
from sktree.ensemble import HonestForestClassifier
from sktree.stats import build_hyppo_oob_forest
from sktree.tree import MultiViewDecisionTreeClassifier

+sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
+PALETTE = sns.color_palette("Set1")
+sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
+sns.set_style("white", {"axes.edgecolor": "#dddddd"})
# %%
# CMI
# ---
@@ -44,29 +50,32 @@
    seed=1,
)


# class one has a mean at two for X
X, y = make_trunk_classification(
    n_samples=1000,
    n_dim=1,
    mu_0=0,
    mu_1=2,
    n_informative=1,
-    seed=1,
+    seed=2,
)

+Z_X = np.hstack((Z, X))

-# scatter plot the samples for Z
-plt.hist(Z[:500], bins=15, alpha=0.6, color="blue", label="negative")
-plt.hist(Z[500:], bins=15, alpha=0.6, color="red", label="positive")
-plt.legend()
-plt.show()

+Z_X_y = np.hstack((Z_X, y.reshape(-1, 1)))
+Z_X_y = pd.DataFrame(Z_X_y, columns=["Z", "X", "y"])
+Z_X_y = Z_X_y.replace({"y": 0.0}, "Class Zero")
+Z_X_y = Z_X_y.replace({"y": 1.0}, "Class One")

-# scatter plot the samples for X
-plt.hist(X[:500], bins=15, alpha=0.6, color="blue", label="negative")
-plt.hist(X[500:], bins=15, alpha=0.6, color="red", label="positive")
-plt.legend()
-plt.show()
+fig, ax = plt.subplots(figsize=(6, 6))
+fig.tight_layout()
+ax.tick_params(labelsize=15)
+sns.scatterplot(data=Z_X_y, x="Z", y="X", hue="y", palette=PALETTE[:2], alpha=0.2)
+sns.kdeplot(data=Z_X_y, x="Z", y="X", hue="y", palette=PALETTE[:2], alpha=0.6)
+ax.set_ylabel("X", fontsize=15)
+ax.set_xlabel("Z", fontsize=15)
+plt.legend(frameon=False, fontsize=15)


# %%
@@ -86,16 +95,22 @@
)

# fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, np.hstack((X, Z)), y)
+_, observe_proba = build_hyppo_oob_forest(est, Z_X, y)

# generate forest posteriors for the two classes
observe_proba = np.nanmean(observe_proba, axis=0)


-# scatter plot the posterior probabilities for class one
-plt.hist(observe_proba[:500][:, 1], bins=30, alpha=0.6, color="blue", label="negative")
-plt.hist(observe_proba[500:][:, 1], bins=30, alpha=0.6, color="red", label="positive")
-plt.legend()
+fig, ax = plt.subplots(figsize=(6, 6))
+fig.tight_layout()
+ax.tick_params(labelsize=15)
+
+# histogram plot the posterior probabilities for class one
+ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
+ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
+ax.set_ylabel("# of Samples", fontsize=15)
+ax.set_xlabel("Class One Posterior", fontsize=15)
+plt.legend(frameon=False, fontsize=15)
plt.show()

# %%
@@ -120,12 +135,17 @@
single_proba = np.nanmean(single_proba, axis=0)


-# scatter plot the posterior probabilities for class one
-plt.hist(single_proba[:500][:, 1], bins=30, alpha=0.6, color="blue", label="negative")
-plt.hist(single_proba[500:][:, 1], bins=30, alpha=0.6, color="red", label="positive")
-plt.legend()
-plt.show()
+fig, ax = plt.subplots(figsize=(6, 6))
+fig.tight_layout()
+ax.tick_params(labelsize=15)
+
+# histogram plot the posterior probabilities for class one
+ax.hist(single_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
+ax.hist(single_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
+ax.set_ylabel("# of Samples", fontsize=15)
+ax.set_xlabel("Class One Posterior", fontsize=15)
+plt.legend(frameon=False, fontsize=15)
+plt.show()

# %%
# Calculate the statistic
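The "Calculate the statistic" step is collapsed in this view. For reference, a minimal sketch of how a plug-in CMI estimate is typically formed from the two forests' posteriors, assuming the entropy-difference form I(X; y | Z) = H(y | Z) - H(y | Z, X) and the `single_proba` (Z-only forest) and `observe_proba` ((Z, X) forest) arrays computed above; this is an illustration, not the commit's exact code:

import numpy as np
from scipy.stats import entropy

# per-sample posterior entropies averaged over samples estimate the
# conditional entropies; their difference is the plug-in CMI (in nats)
H_yZ = np.mean(entropy(single_proba, axis=1))    # H(y | Z)
H_yZX = np.mean(entropy(observe_proba, axis=1))  # H(y | Z, X)
cmi = H_yZ - H_yZX
print("CMI =", round(cmi, 2))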
67 changes: 50 additions & 17 deletions examples/treeple/treeple_tutorial_GMM.py
@@ -1,20 +1,26 @@
"""
-============================================================
-Treeple tutorial for estimating true posteriors & statistics
-============================================================
+==========================================
+0: Estimating true posteriors & statistics
+==========================================
"""

+import warnings

import matplotlib.pyplot as plt
import numpy as np
+import seaborn as sns
from scipy.stats import entropy, multivariate_normal
-from sklearn.metrics import RocCurveDisplay, roc_auc_score, roc_curve
+from sklearn.metrics import roc_auc_score, roc_curve

from sktree.datasets import make_trunk_mixture_classification

+sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
+PALETTE = sns.color_palette("Set1")
+sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
+sns.set_style("white", {"axes.edgecolor": "#dddddd"})
+warnings.filterwarnings("ignore")


# %%
# True posterior estimation
# -------------------------
@@ -40,10 +46,16 @@
)


-# scatter plot the samples
-plt.hist(X[:5000], bins=15, alpha=0.6, color="blue", label="negative")
-plt.hist(X[5000:], bins=25, alpha=0.6, color="red", label="positive")
-plt.legend()
+fig, ax = plt.subplots(figsize=(6, 6))
+fig.tight_layout()
+ax.tick_params(labelsize=15)
+
+# histogram plot the samples
+ax.hist(X[:5000], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
+ax.hist(X[5000:], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
+ax.set_xlabel("X", fontsize=15)
+ax.set_ylabel("Likelihood", fontsize=15)
+plt.legend(frameon=False, fontsize=15)
plt.show()

# %%
@@ -85,6 +97,7 @@


def Calculate_SA(y_true, y_pred_proba, max_fpr=0.02) -> float:
+    """Calculate the sensitivity at a specific specificity"""
    # check the shape of true labels
    if y_true.squeeze().ndim != 1:
        raise ValueError(f"y_true must be 1d, not {y_true.shape}")
@@ -99,24 +112,34 @@ def Calculate_SA(y_true, y_pred_proba, max_fpr=0.02) -> float:
        y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False
    )
    sa98 = max([tpr for (fpr, tpr) in zip(fpr, tpr) if fpr <= max_fpr])
-    RocCurveDisplay(fpr=fpr, tpr=tpr).plot(label="ROC Curve")

+    fig, ax = plt.subplots(figsize=(6, 6))
+    fig.tight_layout()
+    ax.tick_params(labelsize=15)
+    ax.set_xlim([-0.005, 1.005])
+    ax.set_ylim([-0.005, 1.005])
+    ax.set_xlabel("False Positive Rate", fontsize=15)
+    ax.set_ylabel("True Positive Rate", fontsize=15)
+
+    ax.plot(fpr, tpr, label="ROC curve", color=PALETTE[1])

    spec = int((1 - max_fpr) * 100)
-    plt.axvline(
+    ax.axvline(
        x=max_fpr,
-        color="r",
+        color=PALETTE[0],
        ymin=0,
        ymax=sa98,
        label="S@" + str(spec) + " = " + str(round(sa98, 2)),
        linestyle="--",
    )
-    plt.axhline(y=sa98, xmin=0, xmax=max_fpr, color="r", linestyle="--")
-    plt.legend()
+    ax.axhline(y=sa98, xmin=0, xmax=max_fpr, color="r", linestyle="--")
+    ax.legend(frameon=False, fontsize=15)

    return sa98


sa98 = Calculate_SA(y, pos, max_fpr=0.02)
print("S@98 =", round(sa98, 2))

# %%
# Generate true statistic estimates: MI
@@ -157,21 +180,31 @@ def Calculate_pAUC(y_true, y_pred_proba, max_fpr=0.1) -> float:
    fpr, tpr, thresholds = roc_curve(
        y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False
    )
-    RocCurveDisplay(fpr=fpr, tpr=tpr).plot(label="ROC Curve")

+    fig, ax = plt.subplots(figsize=(6, 6))
+    fig.tight_layout()
+    ax.tick_params(labelsize=15)
+    ax.set_xlim([-0.005, 1.005])
+    ax.set_ylim([-0.005, 1.005])
+    ax.set_xlabel("False Positive Rate", fontsize=15)
+    ax.set_ylabel("True Positive Rate", fontsize=15)
+
+    ax.plot(fpr, tpr, label="ROC curve", color=PALETTE[1])
    # Calculate pAUC at the specific threshold
    pAUC = roc_auc_score(y_true, y_pred_proba[:, 1], max_fpr=max_fpr)

    pos = np.where(fpr == max_fpr)[0][-1]
-    plt.fill_between(
+    ax.fill_between(
        fpr[:pos],
        tpr[:pos],
-        color="r",
+        color=PALETTE[0],
        alpha=0.6,
        label="pAUC@90 = " + str(round(pAUC, 2)),
        linestyle="--",
    )
-    plt.legend()
+    ax.legend(frameon=False, fontsize=15)
    return pAUC


pAUC = Calculate_pAUC(y, pos, max_fpr=0.1)
print("pAUC@90 =", round(pAUC, 2))
108 changes: 108 additions & 0 deletions examples/treeple/treeple_tutorial_HD.py
@@ -0,0 +1,108 @@
"""
====================================
1-1d: Calculating Hellinger Distance
====================================
"""

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sktree.datasets import make_trunk_classification
from sktree.ensemble import HonestForestClassifier
from sktree.stats import build_hyppo_oob_forest

sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
PALETTE = sns.color_palette("Set1")
sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
sns.set_style("white", {"axes.edgecolor": "#dddddd"})

# %%
# Hellinger Distance
# ------------------
#
# Hellinger distance quantifies the similarity between the two posterior
# probability distributions (class zero and class one).
#
# .. math:: H(\eta(X), 1-\eta(X)) = \frac{1}{\sqrt{2}} \; \bigl\|\sqrt{\eta(X)} - \sqrt{1-\eta(X)} \bigr\|_2
#
# With a binary class simulation as an example, this tutorial will show
# how to use ``treeple`` to calculate the statistic.

# %%
# Create a simulation with two gaussians
# --------------------------------------


# create a binary class simulation with two gaussians
# 500 samples for each class, class zero is standard
# gaussian, and class one has a mean at one
X, y = make_trunk_classification(
    n_samples=1000,
    n_dim=1,
    mu_0=0,
    mu_1=1,
    n_informative=1,
    seed=1,
)


fig, ax = plt.subplots(figsize=(6, 6))
fig.tight_layout()
ax.tick_params(labelsize=15)

# histogram plot the samples
ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
ax.set_xlabel("X", fontsize=15)
ax.set_ylabel("Likelihood", fontsize=15)
plt.legend(frameon=False, fontsize=15)
plt.show()

# %%
# Fit the model
# -------------


# initialize the forest with 100 trees
est = HonestForestClassifier(
    n_estimators=100,
    max_samples=1.6,
    max_features=0.3,
    bootstrap=True,
    stratify=True,
    random_state=1,
)

# fit the model and obtain the tree posteriors
_, observe_proba = build_hyppo_oob_forest(est, X, y)

# generate forest posteriors for the two classes
observe_proba = np.nanmean(observe_proba, axis=0)


fig, ax = plt.subplots(figsize=(6, 6))
fig.tight_layout()
ax.tick_params(labelsize=15)

# histogram plot the posterior probabilities for class one
ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
ax.set_ylabel("# of Samples", fontsize=15)
ax.set_xlabel("Class One Posterior", fontsize=15)
plt.legend(frameon=False, fontsize=15)
plt.show()

# %%
# Calculate the statistic
# -----------------------


def Calculate_hd(y_pred_proba) -> float:
    return np.sqrt(
        np.sum((np.sqrt(y_pred_proba[:, 1]) - np.sqrt(y_pred_proba[:, 0])) ** 2)
    ) / np.sqrt(2)


hd = Calculate_hd(observe_proba)
print("Hellinger distance =", round(hd, 2))
