# All geometries
> Comparison to decision trees/random forests in Hyperboloid, Klein, and Poincare ball models

In [1]:
# Enable IPython's autoreload extension in mode 2 so that edits to imported
# modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [27]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from hyperdt.toy_data import wrapped_normal_mixture
from hyperdt.tree import HyperbolicDecisionTreeClassifier
from hyperdt.forest import HyperbolicRandomForestClassifier
from hyperdt.conversions import convert

import sys
from tqdm.notebook import tqdm
import time

sys.path.append("..")
from HoroRF.datasets.gaussian import get_training_data

from geomstats.geometry.hyperboloid import Hyperboloid
from geomstats.learning.pca import TangentPCA
from geomstats.learning.frechet_mean import FrechetMean

In [83]:
# %%prun

# Benchmark: for every (n_dim, seed, CV fold, geometry, classifier) combination,
# fit the classifier on the training split and record its micro-averaged F1 score
# on the test split together with the wall-clock fit+predict time.
#
# Produces:
#   results - DataFrame with columns seed, n_dim, model, geometry, f1_score, time
#   skipped - list of (seed, n_dim, model, geometry, message) for runs that raised ValueError

n_splits = 5  # number of cross-validation folds per (n_dim, seed) dataset
seeds = list(range(10))
dims = [2, 4, 8, 16]
clfs = [
    HyperbolicDecisionTreeClassifier,
    HyperbolicRandomForestClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
]
# Classes that take the ensemble-specific arguments (n_estimators, random_state)
forest_classes = (RandomForestClassifier, HyperbolicRandomForestClassifier)
# geometries = ["euclidean", "tpca", "poincare", "hyperboloid", "klein"]
geometries = ["euclidean", "poincare", "hyperboloid", "klein"]

result_columns = ["seed", "n_dim", "model", "geometry", "f1_score", "time"]
records = []  # one row per successful run; turned into `results` once at the end
skipped = []  # runs that raised ValueError, kept so failures are not silent
my_tqdm = tqdm(total=len(seeds) * len(dims) * len(clfs) * len(geometries) * n_splits)


for n_dim in dims:
    # `Hyperboloid` is the class this notebook actually imports; the original call used
    # the never-imported name `Hyperbolic`, which fails on a fresh kernel.
    # NOTE(review): relies on Hyperboloid defaulting to extrinsic coordinates - confirm
    # against the installed geomstats version.
    manifold = Hyperboloid(dim=n_dim)
    metric = manifold.metric
    origin = np.array([1.0] + [0.0] * n_dim)

    for seed in seeds:
        my_tqdm.set_description(f"{n_dim}, {seed}")
        # X, y = get_training_data(class_label=n_dim, seed=seed, num_samples=int(800 / 0.8), convert_to_poincare=False)
        X, y = wrapped_normal_mixture(num_points=1000, n_dim=n_dim, num_classes=6, seed=seed, noise_std=2.0)
        # X, y = X.numpy(), y.numpy()

        # Both models like hyperboloids, so this is easy
        folds = KFold(n_splits=n_splits, shuffle=True, random_state=seed)

        for train_index, test_index in folds.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            for geometry in geometries:
                if geometry in ["poincare", "klein", "hyperboloid"]:
                    X_train_g = convert(X_train, initial="hyperboloid", final=geometry)
                    X_test_g = convert(X_test, initial="hyperboloid", final=geometry)
                elif geometry == "tpca":
                    # Tangent PCA
                    mean = FrechetMean(metric)
                    mean.fit(X_train)
                    tpca = TangentPCA(metric, n_components=n_dim)
                    tpca.fit(X_train, base_point=mean.estimate_)
                    X_train_g = tpca.transform(X_train)
                    X_test_g = tpca.transform(X_test)
                elif geometry == "euclidean":
                    # Euclidean : log map to origin tangent space
                    X_train_g = metric.log(X_train, base_point=origin)
                    X_test_g = metric.log(X_test, base_point=origin)
                else:
                    # Without this guard an unknown name would silently reuse the
                    # features computed for the previous geometry.
                    raise ValueError(f"Unknown geometry: {geometry}")

                for clf_class in clfs:
                    try:
                        t1 = time.perf_counter()
                        # Compare the *class* (not a leftover instance) so that forests
                        # really receive n_estimators and a seeded random_state.
                        if clf_class in forest_classes:
                            clf = clf_class(max_depth=3, n_estimators=12, random_state=seed, min_samples_leaf=1)
                        else:
                            clf = clf_class(max_depth=3, min_samples_leaf=1)
                        clf.fit(X_train_g, y_train)
                        y_pred = clf.predict(X_test_g)
                        t2 = time.perf_counter()
                        score = f1_score(y_test, y_pred, average="micro")
                        records.append([seed, n_dim, clf_class.__name__, geometry, score, t2 - t1])
                    except ValueError as err:
                        # Should happen when the model doesn't support the geometry;
                        # remember the run instead of dropping it without a trace.
                        skipped.append((seed, n_dim, clf_class.__name__, geometry, str(err)))

                    my_tqdm.update(1)

my_tqdm.close()

# Build the frame once instead of growing it row by row inside the loop.
results = pd.DataFrame(records, columns=result_columns)

  0%|          | 0/3200 [00:00<?, ?it/s]

In [92]:
# Rebind `results` to scores loaded from a CSV file on disk.
# NOTE(review): the commented-out save below targets "all_geometries_6class.csv" while the
# load reads "all_geometries.csv" - confirm the tables that follow come from the intended run.
# results.to_csv("../data/processed/all_geometries_6class.csv", index=False)
results = pd.read_csv("../data/processed/all_geometries.csv")

In [93]:
# Mean F1 score, as a percentage, for every (n_dim, geometry) row and model column.
# aggfunc="mean" yields the same numbers as np.mean but avoids the FutureWarning that
# recent pandas versions emit when a NumPy callable is passed as the aggregation.
pd.pivot_table(results, index=["n_dim", "geometry"], columns=["model"], values=["f1_score"], aggfunc="mean") * 100

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,f1_score,f1_score,f1_score
Unnamed: 0_level_1,model,DecisionTreeClassifier,HyperbolicDecisionTreeClassifier,HyperbolicRandomForestClassifier,RandomForestClassifier
n_dim,geometry,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
2,euclidean,91.8625,,,92.2125
2,hyperboloid,90.1,91.875,91.8,89.575
2,klein,91.8625,,,92.1125
2,poincare,91.85,,,92.25
4,euclidean,99.0875,,,99.2875
4,hyperboloid,98.35,99.3,99.4,98.3625
4,klein,99.2625,,,99.475
4,poincare,99.2875,,,99.4375
8,euclidean,99.95,,,99.975
8,hyperboloid,99.8875,99.9625,99.9625,99.9375


In [94]:
# Merge "model" and "geometry" into one label so that each table row is a single
# (classifier, input geometry) combination. `assign` returns a new frame, so
# `results` itself is left untouched and the cell can be re-run safely.
results_merged = results.assign(model=results["model"] + "_" + results["geometry"]).drop(columns=["geometry"])

# Mean F1 score, as a percentage, per merged label (rows) and dimension (columns).
# aggfunc="mean" yields the same numbers as np.mean but avoids the FutureWarning that
# recent pandas versions emit when a NumPy callable is passed as the aggregation.
pivot = (
    pd.pivot_table(results_merged, index=["model"], columns=["n_dim"], values=["f1_score"], aggfunc="mean") * 100
)
pivot

Unnamed: 0_level_0,f1_score,f1_score,f1_score,f1_score
n_dim,2,4,8,16
model,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2
DecisionTreeClassifier_euclidean,91.8625,99.0875,99.95,99.975
DecisionTreeClassifier_hyperboloid,90.1,98.35,99.8875,100.0
DecisionTreeClassifier_klein,91.8625,99.2625,99.95,100.0
DecisionTreeClassifier_poincare,91.85,99.2875,99.9625,99.9875
HyperbolicDecisionTreeClassifier_hyperboloid,91.875,99.3,99.9625,100.0
HyperbolicRandomForestClassifier_hyperboloid,91.8,99.4,99.9625,100.0
RandomForestClassifier_euclidean,92.2125,99.2875,99.975,100.0
RandomForestClassifier_hyperboloid,89.575,98.3625,99.9375,100.0
RandomForestClassifier_klein,92.1125,99.475,99.975,100.0
RandomForestClassifier_poincare,92.25,99.4375,99.9875,100.0


In [138]:
# Some runs failed and have to be thrown out: count the stored F1 scores for every
# (model, seed, geometry, n_dim) combination and list those with fewer than 5 of them.

rg = results.groupby(["model", "seed", "geometry", "n_dim"])["f1_score"].count()
rg.loc[rg.lt(5)]

model                             seed  geometry     n_dim
HyperbolicDecisionTreeClassifier  8     hyperboloid  16       1
HyperbolicRandomForestClassifier  8     hyperboloid  16       1
Name: f1_score, dtype: int64

In [144]:
# Number of rows before filtering
print(len(results))
# Only get rid of (seed=8, n_dim=16) because it's the only one that failed for all models.
# A row is kept unless it has both seed == 8 and n_dim == 16.
results2 = results.loc[~((results["seed"] == 8) & (results["n_dim"] == 16))]
# Number of rows after filtering; the difference from the count above is what was dropped
print(len(results2))

1987
1950


In [149]:
# A bunch of t-tests:
# For every Euclidean model, input geometry and dimension, run a paired t-test of its
# F1 scores against the matching Hyperbolic* model on the hyperboloid, and print the
# comparisons that are significant at the 5% level together with their direction.
import warnings

from scipy.stats import ttest_rel

alpha = 0.05  # significance threshold

for model in results2["model"].unique():
    if "Hyperbolic" in model:
        continue
    for geom in results2["geometry"].unique():
        for n_dim in results2["n_dim"].unique():
            # Build the masks from results2 itself. Masks taken from `results` only
            # worked through index re-alignment and triggered the pandas UserWarning
            # that used to be hidden by a global filterwarnings("ignore").
            same_dim = results2["n_dim"] == n_dim
            df_filtered = results2[(results2["model"] == model) & (results2["geometry"] == geom) & same_dim]
            # print(len(df_filtered))
            df_matched = results2[
                (results2["model"] == f"Hyperbolic{model}") & (results2["geometry"] == "hyperboloid") & same_dim
            ]
            try:
                # Suppress RuntimeWarnings for this call only, instead of silencing
                # every warning for the rest of the session.
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore", category=RuntimeWarning)
                    # NOTE(review): samples are paired by row position, which assumes both
                    # selections list the same (seed, fold) runs in the same order - confirm.
                    t, p = ttest_rel(df_filtered["f1_score"], df_matched["f1_score"])
            except ValueError as e:
                print(e)
                # No valid p-value for this combination; skip it rather than reuse
                # the one left over from the previous iteration.
                continue

            if not p < alpha:  # also skips NaN p-values
                continue
            mean_filtered = df_filtered["f1_score"].mean()
            mean_matched = df_matched["f1_score"].mean()
            if mean_filtered > mean_matched:
                print(f"{model} {geom} {n_dim}\t> Hyperbolic{model} with p={p:.4f}")
            elif mean_filtered < mean_matched:
                print(f"{model} {geom} {n_dim}\t< Hyperbolic{model} with p={p:.4f}")

DecisionTreeClassifier hyperboloid 2	< HyperbolicDecisionTreeClassifier with p=0.0012
DecisionTreeClassifier hyperboloid 4	< HyperbolicDecisionTreeClassifier with p=0.0036
RandomForestClassifier poincare 2	> HyperbolicRandomForestClassifier with p=0.0137
RandomForestClassifier hyperboloid 2	< HyperbolicRandomForestClassifier with p=0.0032
RandomForestClassifier hyperboloid 4	< HyperbolicRandomForestClassifier with p=0.0010
RandomForestClassifier klein 2	> HyperbolicRandomForestClassifier with p=0.0434
RandomForestClassifier klein 4	> HyperbolicRandomForestClassifier with p=0.0128
