# HoroRF
> Comparing two hyperbolic random forest (RF) methods: HoroRF and our `HyperbolicRandomForestClassifier`

In [1]:
%load_ext autoreload
%autoreload 2

In [6]:
%%bash

# Run HoroRF training, then archive its log directory under a timestamped name.
# (The original note read "at 1000"; what that refers to is not recorded here -- TODO confirm.)
# Set HDT_PYTHON to use a different interpreter; the default is the original local env path.

# Stop immediately if the directory is missing, so the commands below never run in the wrong place.
cd HoroRF || exit 1
"${HDT_PYTHON:-/home/phil/mambaforge/envs/hdt/bin/python}" train_hyp_rf.py -h
# Quoted so the timestamped target is always passed to mv as a single argument.
mv ./logs/output "./logs/output_$(date +%Y%m%d_%H%M%S)_hororf"

[32;1m2023-09-18 09:03:44,089 [hororf.rf_trainer][0m 1 GPUs available
[32;1m2023-09-18 09:03:44,089 [hororf.rf_trainer][0m Using seed 17 on class 2
[32;1m2023-09-18 09:03:44,094 [hororf.utils][0m 977 datapoints in dataset 'datasets.polblogs_geomstats'
[32;1m2023-09-18 09:03:44,094 [hororf.utils][0m 977 datapoints in test dataset 'datasets.polblogs_geomstats'
[32;1m2023-09-18 09:03:44,095 [hororf.rf_trainer][0m 781 train and 196 test samples for fold 0
[32;1m2023-09-18 09:06:06,657 [hororf.rf_trainer][0m Hyperbolic tree f1 micro: 0.8878, f1 macro: 0.8870, AUPR: 0.0000. Mean depth of 6.00
[32;1m2023-09-18 09:06:06,658 [hororf.rf_trainer][0m 781 train and 196 test samples for fold 1
[32;1m2023-09-18 09:08:48,420 [hororf.rf_trainer][0m Hyperbolic tree f1 micro: 0.9337, f1 macro: 0.9332, AUPR: 0.0000. Mean depth of 6.00
[32;1m2023-09-18 09:08:48,421 [hororf.rf_trainer][0m 782 train and 195 test samples for fold 2
[32;1m2023-09-18 09:11:20,500 [hororf.rf_trainer][0m Hyper

In [30]:
# Summarize HoroRF benchmark logs: for each result file, print the mean and the
# standard deviation of the stored scores as percentages.
#
# Reference values for using hororf outputs directly instead of the log files:
# vals = [
#     0.8878, 0.9337, 0.9385, 0.8974, 0.9385
# ]

# Imported in this cell so it also runs on a fresh kernel; the notebook's main
# import cell comes later in the document.
import numpy as np

dim = 8
dataname = "neuroseed"

# One tab-delimited text file per result type under the benchmark log directory.
for suffix in ["hrf", "results_micro", "rf"]:
    vals = np.loadtxt(f"./HoroRF/logs/big_bench/hororf_{dataname}_{dim}/{suffix}.txt", delimiter="\t")
    print(suffix, f"{np.mean(vals) * 100:.2f}", f"{np.std(vals)*100:.2f}")

hrf 71.25 8.48
results_micro 67.50 12.12
rf 76.25 10.00


In [11]:
# For 16-dimensional embeddings, HoroRF had a micro-F1 score of 0.675. Let's try ours:

import numpy as np
import yaml

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from src.hyperdt.forest import HyperbolicRandomForestClassifier
from src.hyperdt.conversions import convert

# Read params from yml file


def evaluate_hdt():
    """Cross-validate a hyperbolic and a Euclidean random forest on a HoroRF dataset.

    Settings are read from ``HoroRF/params.yml``: the dataset loader
    (``dataset_file``), ``class_label``, ``seed`` and the forest hyperparameters
    (``num_trees``, ``max_depth``, ``min_samples_leaf``). The loader's training
    data is converted from Poincare to hyperboloid coordinates, and both models
    are fit on those same converted coordinates in each of 5 shuffled folds.
    Only the training split is used; scores come from cross-validation on it.

    Returns:
        tuple[list, list]: per-fold micro-F1 scores for
        ``HyperbolicRandomForestClassifier`` and for scikit-learn's
        ``RandomForestClassifier``, in that order.

    Raises:
        ValueError: if ``dataset_file`` is not one of the supported loaders.
    """
    import importlib

    # Context manager so the params file handle is closed deterministically.
    with open("HoroRF/params.yml", "r") as params_file:
        params = yaml.safe_load(params_file)

    # Dataset
    print(f"Using loader from file: {params['dataset_file']}")
    print()  # For tqdm compatibility
    supported_loaders = (
        "datasets.gaussian",
        "datasets.neuroseed",
        "datasets.polblogs_geomstats",
    )
    if params["dataset_file"] not in supported_loaders:
        # Fail here with a clear message instead of an UnboundLocalError further down.
        raise ValueError(
            f"Unsupported dataset_file {params['dataset_file']!r}; expected one of {supported_loaders}"
        )
    loader = importlib.import_module(f"HoroRF.{params['dataset_file']}")

    # Get data
    X_train, y_train = loader.get_training_data(class_label=params["class_label"], seed=params["seed"])
    X_train = convert(X_train.numpy(), "poincare", "hyperboloid")

    # Hyperparams shared by both forests
    args = {
        "n_estimators": params["num_trees"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
    }

    # 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=params["seed"])
    f1_scores_hrf = []
    f1_scores_rf = []
    for train_index, test_index in kf.split(X_train):
        # Hyperbolic
        hrf = HyperbolicRandomForestClassifier(**args)
        hrf.fit(X_train[train_index], y_train[train_index], use_tqdm=True, seed=params["seed"])
        y_pred = hrf.predict(X_train[test_index])
        f1_scores_hrf.append(f1_score(y_train[test_index], y_pred, average="micro"))

        # Euclidean (fit on the same hyperboloid coordinates as the hyperbolic model)
        rf = RandomForestClassifier(**args, random_state=params["seed"])
        rf.fit(X_train[train_index], y_train[train_index])
        y_pred = rf.predict(X_train[test_index])
        f1_scores_rf.append(f1_score(y_train[test_index], y_pred, average="micro"))

    return f1_scores_hrf, f1_scores_rf


# Run the cross-validated comparison once, then report mean +/- std of the
# per-fold micro-F1 scores for each model.
f1_scores_hrf, f1_scores_rf = evaluate_hdt()

hrf_mean, hrf_std = np.mean(f1_scores_hrf), np.std(f1_scores_hrf)
rf_mean, rf_std = np.mean(f1_scores_rf), np.std(f1_scores_rf)
print(f"Hyperbolic: {hrf_mean:.3f} +/- {hrf_std:.3f}")
print(f"Euclidean: {rf_mean:.3f} +/- {rf_std:.3f}")

Using loader from file: datasets.gaussian



100%|██████████| 24/24 [00:02<00:00, 11.45it/s]
100%|██████████| 24/24 [00:00<00:00, 959.58it/s]
100%|██████████| 24/24 [00:00<00:00, 979.62it/s]
100%|██████████| 24/24 [00:00<00:00, 952.35it/s]
100%|██████████| 24/24 [00:00<00:00, 937.57it/s]


Hyperbolic: 0.988 +/- 0.025
Euclidean: 0.988 +/- 0.025
