# All geometries
> Comparison of hyperbolic decision trees/random forests with standard decision trees/random forests in the hyperboloid, Klein, and Poincaré ball models

In [1]:
# Enable IPython's autoreload extension in mode 2, so imported modules are
# reloaded automatically before each cell executes.
%load_ext autoreload
%autoreload 2

In [27]:
import sys
import time

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

from sklearn.model_selection import KFold
from sklearn.metrics import f1_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

from hyperdt.toy_data import wrapped_normal_mixture
from hyperdt.tree import HyperbolicDecisionTreeClassifier
from hyperdt.forest import HyperbolicRandomForestClassifier
from hyperdt.conversions import convert

from tqdm.notebook import tqdm

sys.path.append("..")
from HoroRF.datasets.gaussian import get_training_data

from geomstats.geometry.hyperbolic import Hyperbolic
from geomstats.geometry.hyperboloid import Hyperboloid
from geomstats.learning.pca import TangentPCA
from geomstats.learning.frechet_mean import FrechetMean

In [207]:
# %%prun

# Benchmark sweep: for every (n_dim, seed, dataset, geometry, classifier) combination,
# run 5-fold cross-validation and record the micro-averaged F1 score and the wall-clock
# time of constructing, fitting and predicting with the model on each fold.
#
# NOTE(review): the output recorded for this cell is
#   TypeError: get_training_data() got an unexpected keyword argument 'num_samples'
# so at least one HoroRF loader does not accept the keyword arguments passed below.
# Reconcile the loader signatures before relying on a re-run.

N_SPLITS = 5  # folds per cross-validation run
# "dataset" is appended last so the leading columns keep their existing order;
# without it, rows coming from different datasets cannot be told apart.
RESULT_COLUMNS = ["seed", "n_dim", "model", "geometry", "f1_score", "time", "dataset"]

seeds = list(range(10))
dims = [2, 4, 8, 16]
clfs = [
    HyperbolicDecisionTreeClassifier,
    HyperbolicRandomForestClassifier,
    DecisionTreeClassifier,
    RandomForestClassifier,
]
hyperbolic_clfs = (HyperbolicDecisionTreeClassifier, HyperbolicRandomForestClassifier)
forest_clfs = (RandomForestClassifier, HyperbolicRandomForestClassifier)
geometries = ["euclidean", "tpca", "poincare", "hyperboloid", "klein"]
datasets = ["gaussian", "neuroseed", "polblogs"]

# Progress is counted in units of n_dim per fold, hence np.sum(dims) in the total.
my_tqdm = tqdm(total=len(seeds) * np.sum(dims) * len(clfs) * len(geometries) * N_SPLITS * len(datasets))

records = []  # one row per successfully evaluated fold
failed_runs = []  # (seed, n_dim, dataset, model-or-step, geometry, error message) for skipped work

for n_dim in dims:
    manifold = Hyperbolic(dim=n_dim, default_coords_type="extrinsic")
    metric = manifold.metric
    origin = np.array([1.0] + [0.0] * n_dim)

    for seed in seeds:
        my_tqdm.set_description(f"{n_dim}, {seed}")
        for dataset in datasets:
            # Each dataset module exposes its own get_training_data loader.
            if dataset == "gaussian":
                from HoroRF.datasets.gaussian import get_training_data
            elif dataset == "neuroseed":
                from HoroRF.datasets.neuroseed import get_training_data
            elif dataset == "polblogs":
                from HoroRF.datasets.polblogs import get_training_data

            X, y = get_training_data(class_label=n_dim, seed=seed, num_samples=int(800 / 0.8), convert_to_poincare=False)
            X, y = X.numpy(), y.numpy()

            for geometry in geometries:
                if geometry in ["poincare", "klein", "hyperboloid"]:
                    X_g = convert(X, initial="hyperboloid", final=geometry)
                elif geometry == "tpca":
                    # Tangent PCA around the Frechet mean of the data
                    mean = FrechetMean(metric)
                    mean.fit(X)
                    tpca = TangentPCA(metric, n_components=n_dim)
                    try:
                        tpca.fit(X, base_point=mean.estimate_)
                        X_g = tpca.transform(X)
                    except Exception as err:
                        # Skip this geometry entirely: falling through would score the
                        # previous geometry's X_g under the "tpca" label.
                        failed_runs.append((seed, n_dim, dataset, "TangentPCA", geometry, str(err)))
                        my_tqdm.update(n_dim * N_SPLITS * len(clfs))
                        continue
                elif geometry == "euclidean":
                    # Euclidean: log map to the tangent space at the origin
                    X_g = metric.log(X, base_point=origin)

                for clf_class in clfs:
                    # Hyperbolic models are only evaluated on hyperboloid coordinates.
                    if clf_class in hyperbolic_clfs and geometry != "hyperboloid":
                        my_tqdm.update(n_dim * N_SPLITS)
                        continue
                    my_tqdm.postfix = f"{clf_class.__name__}, {geometry}"
                    folds = KFold(n_splits=N_SPLITS, shuffle=True, random_state=seed)

                    for train_index, test_index in folds.split(X_g):
                        X_train, X_test = X_g[train_index], X_g[test_index]
                        y_train, y_test = y[train_index], y[test_index]

                        try:
                            t1 = time.time()
                            # Dispatch on the class being evaluated (clf_class), not on the
                            # previously built instance, so forests get their ensemble settings.
                            if clf_class in forest_clfs:
                                clf = clf_class(max_depth=3, n_estimators=12, random_state=seed, min_samples_leaf=1)
                            else:
                                clf = clf_class(max_depth=3, min_samples_leaf=1)
                            clf.fit(X_train, y_train)
                            y_pred = clf.predict(X_test)
                            t2 = time.time()
                            score = f1_score(y_test, y_pred, average="micro")
                            records.append([seed, n_dim, clf_class.__name__, geometry, score, t2 - t1, dataset])
                        except ValueError as err:
                            # Should happen when the model doesn't support the geometry;
                            # keep going, but remember the failure for later inspection.
                            failed_runs.append((seed, n_dim, dataset, clf_class.__name__, geometry, str(err)))

                        my_tqdm.update(n_dim)  # Using this makes our estimate more accurate, since it scales O(n_dim)

my_tqdm.close()

# Build the frame once instead of growing it row by row.
results = pd.DataFrame(records, columns=RESULT_COLUMNS)
results.to_csv("../data/processed/all_geometries_all_datasets_800.csv")

  0%|          | 0/90000 [00:00<?, ?it/s]



TypeError: get_training_data() got an unexpected keyword argument 'num_samples'

In [204]:
# Load benchmark results from a CSV file on disk.
# NOTE(review): this path differs from the file written by the sweep cell
# ("all_geometries_all_datasets_800.csv") - confirm it is the intended input.
results = pd.read_csv(filepath_or_buffer="../data/processed/all_geometries3.csv")

In [205]:
# Mean F1 score, in percent, for each (model, geometry) row and n_dim column.
# aggfunc is given as the string "mean": passing the np.mean callable triggers a
# FutureWarning in recent pandas versions, while the string keeps the same result.
pivot = (
    pd.pivot_table(results, index=["model", "geometry"], columns=["n_dim"], values=["f1_score"], aggfunc="mean") * 100
)
pivot

Unnamed: 0_level_0,Unnamed: 1_level_0,f1_score,f1_score,f1_score,f1_score
Unnamed: 0_level_1,n_dim,2,4,8,16
model,geometry,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2
DecisionTreeClassifier,euclidean,91.8625,99.15,99.9375,99.975
DecisionTreeClassifier,hyperboloid,90.1625,98.3375,99.9125,99.9875
DecisionTreeClassifier,klein,91.8875,99.2875,99.9625,99.9875
DecisionTreeClassifier,poincare,91.85,99.3,99.9625,100.0
DecisionTreeClassifier,tpca,91.8875,99.325,99.975,99.975
HyperbolicDecisionTreeClassifier,hyperboloid,91.875,99.3,99.9625,100.0
HyperbolicRandomForestClassifier,hyperboloid,91.9125,99.425,99.9625,100.0
RandomForestClassifier,euclidean,92.0875,99.275,99.975,100.0
RandomForestClassifier,hyperboloid,89.8,98.4,99.95,100.0
RandomForestClassifier,klein,92.05,99.4625,100.0,100.0


In [189]:
# Some runs failed part-way, so they have to be thrown out.
# Count the recorded f1_score entries for every (model, seed, geometry, n_dim)
# combination and show the combinations that have fewer than 5 of them.

rg = results.groupby(["model", "seed", "geometry", "n_dim"])["f1_score"].count()
rg.loc[rg.lt(5)]

model                             seed  geometry     n_dim
HyperbolicDecisionTreeClassifier  8     hyperboloid  16       1
HyperbolicRandomForestClassifier  8     hyperboloid  16       1
Name: f1_score, dtype: int64

In [190]:
print(len(results))

# Remove every row belonging to the (seed=8, n_dim=16) run. The filter is applied
# regardless of model, so all models keep the same set of (seed, n_dim) runs.
is_seed8_dim16 = (results["seed"] == 8) & (results["n_dim"] == 16)
results2 = results[~is_seed8_dim16]

print(len(results2))

2382
2340


In [197]:
# Paired t-tests: compare the F1 scores of each non-hyperbolic model, in each geometry
# and dimension, against the scores of its "Hyperbolic<model>" counterpart on the
# hyperboloid, and print the comparisons that are significant at the 5% level.
from scipy.stats import ttest_rel
from warnings import catch_warnings, filterwarnings

for model in results2["model"].unique():
    if "Hyperbolic" in model:
        continue  # hyperbolic models are the reference side of every comparison
    for geom in results2["geometry"].unique():
        for n_dim in results2["n_dim"].unique():
            # Build the masks from results2 itself. Masks built from `results` only work
            # through index realignment, which emits a pandas UserWarning.
            df_filtered = results2[
                (results2["model"] == model) & (results2["geometry"] == geom) & (results2["n_dim"] == n_dim)
            ]
            df_matched = results2[
                (results2["model"] == f"Hyperbolic{model}")
                & (results2["geometry"] == "hyperboloid")
                & (results2["n_dim"] == n_dim)
            ]
            try:
                # Suppress warnings for this call only instead of silencing them globally.
                with catch_warnings():
                    filterwarnings("ignore")
                    t, p = ttest_rel(df_filtered["f1_score"], df_matched["f1_score"])
            except ValueError as e:
                # No valid p-value for this combination: report it and move on, so the
                # comparison below never uses a stale or undefined p.
                print(e)
                continue

            if not p < 0.05:
                continue  # not significant (also covers a NaN p-value)

            mean_filtered = df_filtered["f1_score"].mean()
            mean_matched = df_matched["f1_score"].mean()
            if mean_filtered > mean_matched:
                print(f"{model} {geom} {n_dim}\t> Hyperbolic{model} with p={p:.4f}")
            elif mean_filtered < mean_matched:
                print(f"{model} {geom} {n_dim}\t< Hyperbolic{model} with p={p:.4f}")

DecisionTreeClassifier hyperboloid 2	< HyperbolicDecisionTreeClassifier with p=0.0016
DecisionTreeClassifier hyperboloid 4	< HyperbolicDecisionTreeClassifier with p=0.0032
RandomForestClassifier tpca 2	> HyperbolicRandomForestClassifier with p=0.0173
RandomForestClassifier poincare 2	> HyperbolicRandomForestClassifier with p=0.0243
RandomForestClassifier hyperboloid 2	< HyperbolicRandomForestClassifier with p=0.0013
RandomForestClassifier hyperboloid 4	< HyperbolicRandomForestClassifier with p=0.0015


In [194]:
# Write the pivot table to disk as a LaTeX table.
pivot.to_latex(buf="../data/processed/all_geometries_pivot.tex")