ENH optimize & add treeple tutorial figures (#260)
* DOC update section titles for ordering

* DOC specify sphinx thumbnail figure for pAUC&S@98

* ENH optimize figure style

* ENH optimize figure styles for single view

* ENH optimize figures for multiview

* ENH optimize figures styles

* ENH add 2D plot for permuted data

* ENH add hellinger distance tutorial & DOC optimize text

* ENH add tight layout to prevent figure cutoff

* DOC modify code ref color

* DOC optimize figure size & legend
PSSF23 committed Apr 24, 2024
1 parent 638dd7c commit 920a819
Showing 10 changed files with 520 additions and 166 deletions.
6 changes: 3 additions & 3 deletions doc/_static/style.css
@@ -42,17 +42,17 @@ h4 {

/* Links in the Note boxes */
.note a {
-  color: white;
+  color: blue;
   text-decoration: underline;
}

.note a:hover {
-  color: white;
+  color: blue;
   font-weight: bold;
   text-decoration: underline;
}

/* Links in "Note" boxes */
.alert-info a code span {
-  color: white;
+  color: blue;
}
70 changes: 45 additions & 25 deletions examples/treeple/treeple_tutorial_CMI.py
@@ -1,18 +1,24 @@
"""
-====================================
-Treeple tutorial for calculating CMI
-====================================
+=====================
+2-1b: Calculating CMI
+=====================
"""

import matplotlib.pyplot as plt
import numpy as np
+import pandas as pd
+import seaborn as sns
from scipy.stats import entropy

from sktree.datasets import make_trunk_classification
from sktree.ensemble import HonestForestClassifier
from sktree.stats import build_hyppo_oob_forest
from sktree.tree import MultiViewDecisionTreeClassifier

+sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
+PALETTE = sns.color_palette("Set1")
+sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
+sns.set_style("white", {"axes.edgecolor": "#dddddd"})
# %%
# CMI
# ---
@@ -44,29 +50,32 @@
    seed=1,
)


# class one has a mean at two for X
X, y = make_trunk_classification(
    n_samples=1000,
    n_dim=1,
    mu_0=0,
    mu_1=2,
    n_informative=1,
-    seed=1,
+    seed=2,
)

+Z_X = np.hstack((Z, X))

-# scatter plot the samples for Z
-plt.hist(Z[:500], bins=15, alpha=0.6, color="blue", label="negative")
-plt.hist(Z[500:], bins=15, alpha=0.6, color="red", label="positive")
-plt.legend()
-plt.show()

+Z_X_y = np.hstack((Z_X, y.reshape(-1, 1)))
+Z_X_y = pd.DataFrame(Z_X_y, columns=["Z", "X", "y"])
+Z_X_y = Z_X_y.replace({"y": 0.0}, "Class Zero")
+Z_X_y = Z_X_y.replace({"y": 1.0}, "Class One")

-# scatter plot the samples for X
-plt.hist(X[:500], bins=15, alpha=0.6, color="blue", label="negative")
-plt.hist(X[500:], bins=15, alpha=0.6, color="red", label="positive")
-plt.legend()
-plt.show()
+fig, ax = plt.subplots(figsize=(6, 6))
+fig.tight_layout()
+ax.tick_params(labelsize=15)
+sns.scatterplot(data=Z_X_y, x="Z", y="X", hue="y", palette=PALETTE[:2], alpha=0.2)
+sns.kdeplot(data=Z_X_y, x="Z", y="X", hue="y", palette=PALETTE[:2], alpha=0.6)
+ax.set_ylabel("X", fontsize=15)
+ax.set_xlabel("Z", fontsize=15)
+plt.legend(frameon=False, fontsize=15)


# %%
@@ -86,16 +95,22 @@
)

# fit the model and obtain the tree posteriors
-_, observe_proba = build_hyppo_oob_forest(est, np.hstack((X, Z)), y)
+_, observe_proba = build_hyppo_oob_forest(est, Z_X, y)

# generate forest posteriors for the two classes
observe_proba = np.nanmean(observe_proba, axis=0)


-# scatter plot the posterior probabilities for class one
-plt.hist(observe_proba[:500][:, 1], bins=30, alpha=0.6, color="blue", label="negative")
-plt.hist(observe_proba[500:][:, 1], bins=30, alpha=0.6, color="red", label="positive")
-plt.legend()
+fig, ax = plt.subplots(figsize=(6, 6))
+fig.tight_layout()
+ax.tick_params(labelsize=15)
+
+# histogram plot the posterior probabilities for class one
+ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
+ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
+ax.set_ylabel("# of Samples", fontsize=15)
+ax.set_xlabel("Class One Posterior", fontsize=15)
+plt.legend(frameon=False, fontsize=15)
plt.show()

# %%
@@ -120,12 +135,17 @@
single_proba = np.nanmean(single_proba, axis=0)


-# scatter plot the posterior probabilities for class one
-plt.hist(single_proba[:500][:, 1], bins=30, alpha=0.6, color="blue", label="negative")
-plt.hist(single_proba[500:][:, 1], bins=30, alpha=0.6, color="red", label="positive")
-plt.legend()
-plt.show()
+fig, ax = plt.subplots(figsize=(6, 6))
+fig.tight_layout()
+ax.tick_params(labelsize=15)
+
+# histogram plot the posterior probabilities for class one
+ax.hist(single_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
+ax.hist(single_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
+ax.set_ylabel("# of Samples", fontsize=15)
+ax.set_xlabel("Class One Posterior", fontsize=15)
+plt.legend(frameon=False, fontsize=15)
+plt.show()

# %%
# Calculate the statistic
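The "Calculate the statistic" step is collapsed in this view. For reference, a minimal sketch of how a plug-in CMI estimate is typically formed from the two forests' posteriors, assuming the entropy-difference form I(X; y | Z) = H(y | Z) - H(y | Z, X) and the `single_proba` (Z-only forest) and `observe_proba` ((Z, X) forest) arrays computed above; this is an illustration, not the commit's exact code:

import numpy as np
from scipy.stats import entropy

# per-sample posterior entropies averaged over samples estimate the
# conditional entropies; their difference is the plug-in CMI (in nats)
H_yZ = np.mean(entropy(single_proba, axis=1))    # H(y | Z)
H_yZX = np.mean(entropy(observe_proba, axis=1))  # H(y | Z, X)
cmi = H_yZ - H_yZX
print("CMI =", round(cmi, 2))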
67 changes: 50 additions & 17 deletions examples/treeple/treeple_tutorial_GMM.py
@@ -1,20 +1,26 @@
"""
-============================================================
-Treeple tutorial for estimating true posteriors & statistics
-============================================================
+==========================================
+0: Estimating true posteriors & statistics
+==========================================
"""

+import warnings

import matplotlib.pyplot as plt
import numpy as np
+import seaborn as sns
from scipy.stats import entropy, multivariate_normal
-from sklearn.metrics import RocCurveDisplay, roc_auc_score, roc_curve
+from sklearn.metrics import roc_auc_score, roc_curve

from sktree.datasets import make_trunk_mixture_classification

+sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
+PALETTE = sns.color_palette("Set1")
+sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
+sns.set_style("white", {"axes.edgecolor": "#dddddd"})
+warnings.filterwarnings("ignore")


# %%
# True posterior estimation
# -------------------------
@@ -40,10 +46,16 @@
)


-# scatter plot the samples
-plt.hist(X[:5000], bins=15, alpha=0.6, color="blue", label="negative")
-plt.hist(X[5000:], bins=25, alpha=0.6, color="red", label="positive")
-plt.legend()
+fig, ax = plt.subplots(figsize=(6, 6))
+fig.tight_layout()
+ax.tick_params(labelsize=15)
+
+# histogram plot the samples
+ax.hist(X[:5000], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
+ax.hist(X[5000:], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
+ax.set_xlabel("X", fontsize=15)
+ax.set_ylabel("Likelihood", fontsize=15)
+plt.legend(frameon=False, fontsize=15)
plt.show()

# %%
@@ -85,6 +97,7 @@


def Calculate_SA(y_true, y_pred_proba, max_fpr=0.02) -> float:
+    """Calculate the sensitivity at a specific specificity"""
    # check the shape of true labels
    if y_true.squeeze().ndim != 1:
        raise ValueError(f"y_true must be 1d, not {y_true.shape}")
@@ -99,24 +112,34 @@ def Calculate_SA(y_true, y_pred_proba, max_fpr=0.02) -> float:
        y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False
    )
    sa98 = max([tpr for (fpr, tpr) in zip(fpr, tpr) if fpr <= max_fpr])
-    RocCurveDisplay(fpr=fpr, tpr=tpr).plot(label="ROC Curve")

+    fig, ax = plt.subplots(figsize=(6, 6))
+    fig.tight_layout()
+    ax.tick_params(labelsize=15)
+    ax.set_xlim([-0.005, 1.005])
+    ax.set_ylim([-0.005, 1.005])
+    ax.set_xlabel("False Positive Rate", fontsize=15)
+    ax.set_ylabel("True Positive Rate", fontsize=15)
+
+    ax.plot(fpr, tpr, label="ROC curve", color=PALETTE[1])

    spec = int((1 - max_fpr) * 100)
-    plt.axvline(
+    ax.axvline(
        x=max_fpr,
-        color="r",
+        color=PALETTE[0],
        ymin=0,
        ymax=sa98,
        label="S@" + str(spec) + " = " + str(round(sa98, 2)),
        linestyle="--",
    )
-    plt.axhline(y=sa98, xmin=0, xmax=max_fpr, color="r", linestyle="--")
-    plt.legend()
+    ax.axhline(y=sa98, xmin=0, xmax=max_fpr, color="r", linestyle="--")
+    ax.legend(frameon=False, fontsize=15)

    return sa98


sa98 = Calculate_SA(y, pos, max_fpr=0.02)
print("S@98 =", round(sa98, 2))

# %%
# Generate true statistic estimates: MI
@@ -157,21 +180,31 @@ def Calculate_pAUC(y_true, y_pred_proba, max_fpr=0.1) -> float:
    fpr, tpr, thresholds = roc_curve(
        y_true, y_pred_proba[:, 1], pos_label=2, drop_intermediate=False
    )
-    RocCurveDisplay(fpr=fpr, tpr=tpr).plot(label="ROC Curve")

+    fig, ax = plt.subplots(figsize=(6, 6))
+    fig.tight_layout()
+    ax.tick_params(labelsize=15)
+    ax.set_xlim([-0.005, 1.005])
+    ax.set_ylim([-0.005, 1.005])
+    ax.set_xlabel("False Positive Rate", fontsize=15)
+    ax.set_ylabel("True Positive Rate", fontsize=15)
+
+    ax.plot(fpr, tpr, label="ROC curve", color=PALETTE[1])
    # Calculate pAUC at the specific threshold
    pAUC = roc_auc_score(y_true, y_pred_proba[:, 1], max_fpr=max_fpr)

    pos = np.where(fpr == max_fpr)[0][-1]
-    plt.fill_between(
+    ax.fill_between(
        fpr[:pos],
        tpr[:pos],
-        color="r",
+        color=PALETTE[0],
        alpha=0.6,
        label="pAUC@90 = " + str(round(pAUC, 2)),
        linestyle="--",
    )
-    plt.legend()
+    ax.legend(frameon=False, fontsize=15)
    return pAUC


pAUC = Calculate_pAUC(y, pos, max_fpr=0.1)
print("pAUC@90 =", round(pAUC, 2))
108 changes: 108 additions & 0 deletions examples/treeple/treeple_tutorial_HD.py
@@ -0,0 +1,108 @@
"""
====================================
1-1d: Calculating Hellinger Distance
====================================
"""

import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns

from sktree.datasets import make_trunk_classification
from sktree.ensemble import HonestForestClassifier
from sktree.stats import build_hyppo_oob_forest

sns.set(color_codes=True, style="white", context="talk", font_scale=1.5)
PALETTE = sns.color_palette("Set1")
sns.set_palette(PALETTE[1:5] + PALETTE[6:], n_colors=9)
sns.set_style("white", {"axes.edgecolor": "#dddddd"})

# %%
# Hellinger Distance
# ------------------
#
# Hellinger distance quantifies the similarity between the two posterior
# probability distributions (class zero and class one).
#
# .. math:: H(\eta(X), 1-\eta(X)) = \frac{1}{\sqrt{2}} \; \bigl\|\sqrt{\eta(X)} - \sqrt{1-\eta(X)} \bigr\|_2
#
# With a binary class simulation as an example, this tutorial will show
# how to use ``treeple`` to calculate the statistic.

# %%
# Create a simulation with two gaussians
# --------------------------------------


# create a binary class simulation with two gaussians
# 500 samples for each class, class zero is standard
# gaussian, and class one has a mean at one
X, y = make_trunk_classification(
    n_samples=1000,
    n_dim=1,
    mu_0=0,
    mu_1=1,
    n_informative=1,
    seed=1,
)


fig, ax = plt.subplots(figsize=(6, 6))
fig.tight_layout()
ax.tick_params(labelsize=15)

# histogram plot the samples
ax.hist(X[:500], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
ax.hist(X[500:], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
ax.set_xlabel("X", fontsize=15)
ax.set_ylabel("Likelihood", fontsize=15)
plt.legend(frameon=False, fontsize=15)
plt.show()

# %%
# Fit the model
# -------------


# initialize the forest with 100 trees
est = HonestForestClassifier(
    n_estimators=100,
    max_samples=1.6,
    max_features=0.3,
    bootstrap=True,
    stratify=True,
    random_state=1,
)

# fit the model and obtain the tree posteriors
_, observe_proba = build_hyppo_oob_forest(est, X, y)

# generate forest posteriors for the two classes
observe_proba = np.nanmean(observe_proba, axis=0)


fig, ax = plt.subplots(figsize=(6, 6))
fig.tight_layout()
ax.tick_params(labelsize=15)

# histogram plot the posterior probabilities for class one
ax.hist(observe_proba[:500][:, 1], bins=50, alpha=0.6, color=PALETTE[1], label="negative")
ax.hist(observe_proba[500:][:, 1], bins=50, alpha=0.3, color=PALETTE[0], label="positive")
ax.set_ylabel("# of Samples", fontsize=15)
ax.set_xlabel("Class One Posterior", fontsize=15)
plt.legend(frameon=False, fontsize=15)
plt.show()

# %%
# Calculate the statistic
# -----------------------


def Calculate_hd(y_pred_proba) -> float:
    return np.sqrt(
        np.sum((np.sqrt(y_pred_proba[:, 1]) - np.sqrt(y_pred_proba[:, 0])) ** 2)
    ) / np.sqrt(2)


hd = Calculate_hd(observe_proba)
print("Hellinger distance =", round(hd, 2))
