# All geometries
> Comparison to decision trees/random forests in Hyperboloid, Klein, and Poincare ball models

In [1]:
%load_ext autoreload
%autoreload 2

In [3]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from hyperdt.toy_data import wrapped_normal_mixture
from hyperdt.tree import HyperbolicDecisionTreeClassifier
from hyperdt.forest import HyperbolicRandomForestClassifier
from hyperdt.conversions import convert

import sys
from tqdm.notebook import tqdm
import time

sys.path.append("..")
from HoroRF.datasets.gaussian import get_training_data

from geomstats.geometry.hyperbolic import Hyperbolic
from geomstats.learning.pca import TangentPCA
from geomstats.learning.frechet_mean import FrechetMean

INFO: Using numpy backend


In [16]:
# %%prun

results = pd.DataFrame(columns=["dataset", "seed", "n_dim", "model", "geometry", "f1_score", "time"])

seeds = list(range(10))
dims = [2, 4, 8, 16]
clfs = [
    HyperbolicDecisionTreeClassifier,
    HyperbolicRandomForestClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
]
# geometries = ["euclidean", "tpca", "poincare", "hyperboloid", "klein"]
datasets = ["gaussian", "neuroseed", "polblogs"]
geometries = ["euclidean", "poincare", "hyperboloid", "klein"]
my_tqdm = tqdm(total=len(seeds) * np.sum(dims) * len(clfs) * len(geometries) * 5 * len(datasets))


for n_dim in dims:
    manifold = Hyperbolic(dim=n_dim, default_coords_type="extrinsic")
    metric = manifold.metric
    origin = np.array([1.0] + [0.0] * n_dim)

    for seed in seeds:
        my_tqdm.set_description(f"{n_dim}, {seed}")
        for dataset in datasets:
            if dataset == "gaussian":
                from HoroRF.datasets.gaussian import get_training_data
            elif dataset == "neuroseed":
                from HoroRF.datasets.neuroseed import get_training_data
            elif dataset == "polblogs":
                from HoroRF.datasets.polblogs_hypll import get_training_data

            if dataset != "polblogs":
                X, y = get_training_data(
                    class_label=n_dim, seed=seed, num_samples=int(800 / 0.8), convert_to_poincare=False
                )
                X, y = X.numpy(), y.numpy()
            else:
                X, y = get_training_data(class_label=n_dim, seed=seed)
                X = convert(X, initial="poincare", final="hyperboloid")


            for geometry in geometries:
                if geometry in ["poincare", "klein", "hyperboloid"]:
                    X_g = convert(X, initial="hyperboloid", final=geometry)
                elif geometry == "tpca":
                    # Tangent PCA
                    mean = FrechetMean(metric)
                    mean.fit(X)
                    tpca = TangentPCA(metric, n_components=n_dim)
                    try:
                        tpca.fit(X, base_point=mean.estimate_)
                        X_g = tpca.transform(X)
                    except:
                        my_tqdm.update(n_dim * 5 * len(clfs))
                        pass
                elif geometry == "euclidean":
                    # Euclidean : log map to origin tangent space
                    X_g = manifold.metric.log(X, base_point=origin)

                for clf_class in clfs:
                    if (
                        clf_class in [HyperbolicDecisionTreeClassifier, HyperbolicRandomForestClassifier]
                        and geometry != "hyperboloid"
                    ):
                        my_tqdm.update(n_dim * 5)
                        continue
                    my_tqdm.postfix = f"{clf_class.__name__}, {geometry}"
                    folds = KFold(n_splits=5, shuffle=True, random_state=seed)

                    for train_index, test_index in folds.split(X_g):
                        X_train, X_test = X_g[train_index], X_g[test_index]
                        y_train, y_test = y[train_index], y[test_index]

                        try:
                            t1 = time.time()
                            if clf_class in [RandomForestClassifier, HyperbolicRandomForestClassifier]:
                                clf = clf_class(max_depth=3, n_estimators=12, random_state=seed, min_samples_leaf=1)
                            else:
                                clf = clf_class(max_depth=3, min_samples_leaf=1)
                            clf.fit(X_train, y_train)
                            y_pred = clf.predict(X_test)
                            t2 = time.time()
                            score = f1_score(y_test, y_pred, average="micro")
                            results.loc[len(results)] = [dataset, seed, n_dim, clf_class.__name__, geometry, score, t2 - t1]
                        except ValueError:
                            pass  # Should happen when the model doesn't support the geometry

                        my_tqdm.update(n_dim)  # Using this makes our estimate more accurate, since it scales O(n_dim)

results.to_csv("../data/processed/all_geometries_all_datasets_800.csv")

  0%|          | 0/72000 [00:00<?, ?it/s]

  return func(x, *args, **kwargs)


In [4]:
# results.to_csv("../data/processed/all_geometries_6class.csv", index=False)
results = pd.read_csv("../data/processed/all_geometries_all_datasets_800.csv")

In [6]:
results

Unnamed: 0.1,Unnamed: 0,dataset,seed,n_dim,model,geometry,f1_score,time
0,0,gaussian,0,2,DecisionTreeClassifier,euclidean,0.937500,0.000921
1,1,gaussian,0,2,DecisionTreeClassifier,euclidean,0.937500,0.000858
2,2,gaussian,0,2,DecisionTreeClassifier,euclidean,0.931250,0.000831
3,3,gaussian,0,2,DecisionTreeClassifier,euclidean,0.962500,0.000896
4,4,gaussian,0,2,DecisionTreeClassifier,euclidean,0.925000,0.000837
...,...,...,...,...,...,...,...,...
5982,5982,polblogs,9,16,RandomForestClassifier,klein,0.770408,0.016497
5983,5983,polblogs,9,16,RandomForestClassifier,klein,0.739796,0.016386
5984,5984,polblogs,9,16,RandomForestClassifier,klein,0.770408,0.016284
5985,5985,polblogs,9,16,RandomForestClassifier,klein,0.775510,0.017323


In [10]:
pivot = (
    pd.pivot_table(results, index=["dataset", "n_dim"], columns=["model", "geometry"], values=["f1_score"], aggfunc=np.mean) * 100
)
# pivot.to_latex("../data/processed/all_geometries_pivot.tex", float_format="%.2f")
pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,f1_score,f1_score,f1_score,f1_score,f1_score,f1_score,f1_score,f1_score,f1_score
Unnamed: 0_level_1,model,DecisionTreeClassifier,DecisionTreeClassifier,DecisionTreeClassifier,DecisionTreeClassifier,HyperbolicDecisionTreeClassifier,HyperbolicRandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier,RandomForestClassifier
Unnamed: 0_level_2,geometry,euclidean,hyperboloid,klein,poincare,hyperboloid,hyperboloid,euclidean,hyperboloid,klein,poincare
dataset,n_dim,Unnamed: 2_level_3,Unnamed: 3_level_3,Unnamed: 4_level_3,Unnamed: 5_level_3,Unnamed: 6_level_3,Unnamed: 7_level_3,Unnamed: 8_level_3,Unnamed: 9_level_3,Unnamed: 10_level_3,Unnamed: 11_level_3
gaussian,2,91.8625,90.1375,91.8875,91.85,91.875,91.9875,91.7,89.325,91.925,92.275
gaussian,4,99.15,98.375,99.275,99.275,99.3,99.3625,99.1375,98.2125,99.4,99.3625
gaussian,8,99.9375,99.9,99.9625,99.9625,99.9625,99.9625,99.95,99.9125,100.0,99.9875
gaussian,16,99.975,99.9875,100.0,99.9875,100.0,100.0,100.0,99.9875,100.0,100.0
neuroseed,2,61.675,61.675,61.6875,61.6875,61.7,62.05,61.4375,59.75,61.7375,61.7375
neuroseed,4,84.4875,84.4875,84.5125,84.4875,84.4875,84.9625,82.275,82.025,82.875,82.875
neuroseed,8,80.7625,80.775,80.7625,80.775,80.7625,82.025,86.1875,86.3375,85.4625,85.4625
neuroseed,16,79.225,79.225,79.175,79.225,79.2125,82.375,84.4375,84.4375,85.675,85.675
polblogs,2,70.480534,70.348561,71.031816,71.145421,71.04202,71.410152,71.819362,71.645212,71.53192,71.522815
polblogs,4,71.614495,70.838409,71.542595,71.420722,71.522187,72.278441,71.951753,72.103925,72.51214,72.369963


In [13]:
# Flatten all multi-index columns and rows
pivot.columns = ["_".join(col) for col in pivot.columns]
pivot = pivot.reset_index()
pivot = pivot.set_index("dataset")
pivot = pivot.sort_index()
pivot

Unnamed: 0_level_0,n_dim,f1_score_DecisionTreeClassifier_euclidean,f1_score_DecisionTreeClassifier_hyperboloid,f1_score_DecisionTreeClassifier_klein,f1_score_DecisionTreeClassifier_poincare,f1_score_HyperbolicDecisionTreeClassifier_hyperboloid,f1_score_HyperbolicRandomForestClassifier_hyperboloid,f1_score_RandomForestClassifier_euclidean,f1_score_RandomForestClassifier_hyperboloid,f1_score_RandomForestClassifier_klein,f1_score_RandomForestClassifier_poincare
dataset,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
gaussian,2,91.8625,90.1375,91.8875,91.85,91.875,91.9875,91.7,89.325,91.925,92.275
gaussian,4,99.15,98.375,99.275,99.275,99.3,99.3625,99.1375,98.2125,99.4,99.3625
gaussian,8,99.9375,99.9,99.9625,99.9625,99.9625,99.9625,99.95,99.9125,100.0,99.9875
gaussian,16,99.975,99.9875,100.0,99.9875,100.0,100.0,100.0,99.9875,100.0,100.0
neuroseed,2,61.675,61.675,61.6875,61.6875,61.7,62.05,61.4375,59.75,61.7375,61.7375
neuroseed,4,84.4875,84.4875,84.5125,84.4875,84.4875,84.9625,82.275,82.025,82.875,82.875
neuroseed,8,80.7625,80.775,80.7625,80.775,80.7625,82.025,86.1875,86.3375,85.4625,85.4625
neuroseed,16,79.225,79.225,79.175,79.225,79.2125,82.375,84.4375,84.4375,85.675,85.675
polblogs,2,70.480534,70.348561,71.031816,71.145421,71.04202,71.410152,71.819362,71.645212,71.53192,71.522815
polblogs,4,71.614495,70.838409,71.542595,71.420722,71.522187,72.278441,71.951753,72.103925,72.51214,72.369963


In [17]:
# Print pivot as markdown
!pip install tabulate
print(pivot[
    [
        "f1_score_DecisionTreeClassifier_euclidean",
        "f1_score_DecisionTreeClassifier_hyperboloid",
        "f1_score_DecisionTreeClassifier_klein",
        "f1_score_DecisionTreeClassifier_poincare",
        "f1_score_HyperbolicDecisionTreeClassifier_hyperboloid",
    ]
].to_markdown(floatfmt=".2f"))
# ].to_markdown(floatfmt=".2f"))

| dataset   |   f1_score_DecisionTreeClassifier_euclidean |   f1_score_DecisionTreeClassifier_hyperboloid |   f1_score_DecisionTreeClassifier_klein |   f1_score_DecisionTreeClassifier_poincare |   f1_score_HyperbolicDecisionTreeClassifier_hyperboloid |
|:----------|--------------------------------------------:|----------------------------------------------:|----------------------------------------:|-------------------------------------------:|--------------------------------------------------------:|
| gaussian  |                                       91.86 |                                         90.14 |                                   91.89 |                                      91.85 |                                                   91.88 |
| gaussian  |                                       99.15 |                                         98.38 |                                   99.28 |                                      99.28 |                                               

In [18]:
# Print pivot as markdown
!pip install tabulate
print(pivot[
    [
        "f1_score_RandomForestClassifier_euclidean",
        "f1_score_RandomForestClassifier_hyperboloid",
        "f1_score_RandomForestClassifier_klein",
        "f1_score_RandomForestClassifier_poincare",
        "f1_score_HyperbolicRandomForestClassifier_hyperboloid",
    ]
].to_markdown(floatfmt=".2f"))
# ].to_markdown(floatfmt=".2f"))

| dataset   |   f1_score_RandomForestClassifier_euclidean |   f1_score_RandomForestClassifier_hyperboloid |   f1_score_RandomForestClassifier_klein |   f1_score_RandomForestClassifier_poincare |   f1_score_HyperbolicRandomForestClassifier_hyperboloid |
|:----------|--------------------------------------------:|----------------------------------------------:|----------------------------------------:|-------------------------------------------:|--------------------------------------------------------:|
| gaussian  |                                       91.70 |                                         89.33 |                                   91.92 |                                      92.28 |                                                   91.99 |
| gaussian  |                                       99.14 |                                         98.21 |                                   99.40 |                                      99.36 |                                               

In [189]:
# We need to throw some of these out because the run failed

rg = results.groupby(["model", "seed", "geometry", "n_dim"]).count()["f1_score"]
rg[rg < 5]

model                             seed  geometry     n_dim
HyperbolicDecisionTreeClassifier  8     hyperboloid  16       1
HyperbolicRandomForestClassifier  8     hyperboloid  16       1
Name: f1_score, dtype: int64

In [190]:
print(len(results))
results2 = results[(results["seed"] != 8) | (results["n_dim"] != 16)]
# results2 = results[results["seed"] < ]
# results2 = results.copy()

# Only get rid of (seed=8, n_dim=16) because it's the only one that failed for all models
# This should get rid of
print(len(results2))

2382
2340


In [197]:
# A bunch of t-tests:
from scipy.stats import ttest_rel
from warnings import filterwarnings

# Get rid of the pandas UserWarning
filterwarnings("ignore")

# Clean up results: each (n_dim, model, trial) pair should have 5 entries

for model in results2["model"].unique():
    if "Hyperbolic" in model:
        continue
    for geom in results2["geometry"].unique():
        for n_dim in results2["n_dim"].unique():
            df_filtered = results2[
                (results["model"] == model) & (results["geometry"] == geom) & (results["n_dim"] == n_dim)
            ]
            # print(len(df_filtered))
            df_matched = results2[
                (results["model"] == f"Hyperbolic{model}")
                & (results["geometry"] == "hyperboloid")
                & (results["n_dim"] == n_dim)
            ]
            try:
                t, p = ttest_rel(df_filtered["f1_score"], df_matched["f1_score"])
            except ValueError as e:
                print(e)
            if p < 0.05 and df_filtered["f1_score"].mean() > df_matched["f1_score"].mean():
                print(f"{model} {geom} {n_dim}\t> Hyperbolic{model} with p={p:.4f}")
                # results_aggregated.loc[
                #     (results_aggregated["model"] == model) & (results_aggregated["geometry"] == geom), "test"
                # ] = "<"
            elif p < 0.05 and df_filtered["f1_score"].mean() < df_matched["f1_score"].mean():
                print(f"{model} {geom} {n_dim}\t< Hyperbolic{model} with p={p:.4f}")
                # results_aggregated.loc[
                #     (results_aggregated["model"] == model) & (results_aggregated["geometry"] == geom), "test"
                # ] = ">"
            else:
                continue

# results_aggregated

DecisionTreeClassifier hyperboloid 2	< HyperbolicDecisionTreeClassifier with p=0.0016
DecisionTreeClassifier hyperboloid 4	< HyperbolicDecisionTreeClassifier with p=0.0032
RandomForestClassifier tpca 2	> HyperbolicRandomForestClassifier with p=0.0173
RandomForestClassifier poincare 2	> HyperbolicRandomForestClassifier with p=0.0243
RandomForestClassifier hyperboloid 2	< HyperbolicRandomForestClassifier with p=0.0013
RandomForestClassifier hyperboloid 4	< HyperbolicRandomForestClassifier with p=0.0015


In [194]:
pivot.to_latex("../data/processed/all_geometries_pivot.tex")