# HoroRF
> Comparing two hyperbolic random forest (RF) methods

In [6]:
# Enable IPython's autoreload extension in mode 2 so edits to imported modules are picked up without restarting the kernel.
%load_ext autoreload
%autoreload 2

In [6]:
%%bash

# Run the HoroRF training script, then archive its log directory under a
# timestamped name.
# NOTE(review): the earlier comment here read "Run HoroRF:at 1000"; what the
# "1000" referred to is not visible in this notebook.
# NOTE(review): the interpreter path below is machine-specific; adjust it for
# your environment.

# Stop if the directory is missing so the commands below never run from the
# wrong working directory.
cd HoroRF || exit 1
/home/phil/mambaforge/envs/hdt/bin/python train_hyp_rf.py -h
mv ./logs/output "./logs/output_$(date +%Y%m%d_%H%M%S)_hororf"

[32;1m2023-09-18 09:03:44,089 [hororf.rf_trainer][0m 1 GPUs available
[32;1m2023-09-18 09:03:44,089 [hororf.rf_trainer][0m Using seed 17 on class 2
[32;1m2023-09-18 09:03:44,094 [hororf.utils][0m 977 datapoints in dataset 'datasets.polblogs_geomstats'
[32;1m2023-09-18 09:03:44,094 [hororf.utils][0m 977 datapoints in test dataset 'datasets.polblogs_geomstats'
[32;1m2023-09-18 09:03:44,095 [hororf.rf_trainer][0m 781 train and 196 test samples for fold 0
[32;1m2023-09-18 09:06:06,657 [hororf.rf_trainer][0m Hyperbolic tree f1 micro: 0.8878, f1 macro: 0.8870, AUPR: 0.0000. Mean depth of 6.00
[32;1m2023-09-18 09:06:06,658 [hororf.rf_trainer][0m 781 train and 196 test samples for fold 1
[32;1m2023-09-18 09:08:48,420 [hororf.rf_trainer][0m Hyperbolic tree f1 micro: 0.9337, f1 macro: 0.9332, AUPR: 0.0000. Mean depth of 6.00
[32;1m2023-09-18 09:08:48,421 [hororf.rf_trainer][0m 782 train and 195 test samples for fold 2
[32;1m2023-09-18 09:11:20,500 [hororf.rf_trainer][0m Hyper

In [48]:
# For using hororf outputs
# vals = [
#     0.8878, 0.9337, 0.9385, 0.8974, 0.9385
# ]

import numpy as np

dim = 16
dataname = "gaussian"  # alternative: "neuroseed"

# For each result file, print its name followed by the mean and standard
# deviation of its values, both scaled to percentages.
for suffix in ("hrf", "results_micro", "rf"):
    results_path = f"./HoroRF/logs/big_bench/hororf_{dataname}_{dim}/{suffix}.txt"
    vals = np.loadtxt(results_path, delimiter="\t")
    mean_pct = np.mean(vals) * 100
    std_pct = np.std(vals) * 100
    print(suffix, f"{mean_pct:.2f}", f"{std_pct:.2f}")

hrf 99.79 0.25
results_micro 98.66 0.90
rf 99.79 0.25


In [13]:
# For 16-dimensional embeddings, HoroRF had a micro-F1 score of 0.675. Let's try ours:

import numpy as np
import yaml

from sklearn.metrics import f1_score
from sklearn.model_selection import KFold
from sklearn.ensemble import RandomForestClassifier
from src.hyperdt.forest import HyperbolicRandomForestClassifier
from src.hyperdt.conversions import convert

# Parameters are read from HoroRF/params.yml inside evaluate_hdt().


def evaluate_hdt():
    """Cross-validate a hyperbolic and a Euclidean random forest on one dataset.

    Settings come from ``HoroRF/params.yml``: ``dataset_file`` selects the
    loader, ``class_label`` and ``seed`` are forwarded to it, and
    ``num_trees``, ``max_depth`` and ``min_samples_leaf`` configure both
    forests. The training points are converted from Poincare to hyperboloid
    coordinates, and both forests are fit and scored on those same converted
    points using shuffled 5-fold cross-validation seeded with ``seed``.

    Returns:
        tuple[list, list]: per-fold micro-averaged F1 scores, first for
        ``HyperbolicRandomForestClassifier`` and then for scikit-learn's
        ``RandomForestClassifier``.

    Raises:
        ValueError: if ``dataset_file`` names a loader this function does not
            know how to import.
    """
    # Context manager so the config file handle is closed as soon as it is parsed.
    with open("HoroRF/params.yml", "r") as params_file:
        params = yaml.safe_load(params_file)

    # Dataset
    dataset_file = params["dataset_file"]
    print(f"Using loader from file: {dataset_file}")
    print()  # For tqdm compatibility
    if dataset_file == "datasets.gaussian":
        from HoroRF.datasets.gaussian import get_training_data
    elif dataset_file == "datasets.neuroseed":
        from HoroRF.datasets.neuroseed import get_training_data
    elif dataset_file == "datasets.polblogs_geomstats":
        from HoroRF.datasets.polblogs_geomstats import get_training_data
    else:
        # Fail with a clear message instead of an unbound-name error further down.
        raise ValueError(f"Unsupported dataset_file: {dataset_file!r}")

    # Get data. Only the training split is needed: all scoring below is
    # cross-validation over it.
    X_train, y_train = get_training_data(class_label=params["class_label"], seed=params["seed"])
    X_train = convert(X_train.numpy(), "poincare", "hyperboloid")

    # Hyperparams shared by both forests
    args = {
        "n_estimators": params["num_trees"],
        "max_depth": params["max_depth"],
        "min_samples_leaf": params["min_samples_leaf"],
    }

    # 5-fold cross-validation
    kf = KFold(n_splits=5, shuffle=True, random_state=params["seed"])
    f1_scores_hrf = []
    f1_scores_rf = []
    for train_index, test_index in kf.split(X_train):
        # Hyperbolic
        hrf = HyperbolicRandomForestClassifier(**args)
        hrf.fit(X_train[train_index], y_train[train_index], use_tqdm=True, seed=params["seed"])
        y_pred = hrf.predict(X_train[test_index])
        f1_scores_hrf.append(f1_score(y_train[test_index], y_pred, average="micro"))

        # Euclidean
        rf = RandomForestClassifier(**args, random_state=params["seed"])
        rf.fit(X_train[train_index], y_train[train_index])
        y_pred = rf.predict(X_train[test_index])
        f1_scores_rf.append(f1_score(y_train[test_index], y_pred, average="micro"))

    return f1_scores_hrf, f1_scores_rf


# Run the cross-validation and report mean +/- standard deviation of the
# per-fold micro-F1 scores for each forest.
f1_scores_hrf, f1_scores_rf = evaluate_hdt()

hrf_mean, hrf_std = np.mean(f1_scores_hrf), np.std(f1_scores_hrf)
rf_mean, rf_std = np.mean(f1_scores_rf), np.std(f1_scores_rf)

print(f"Hyperbolic: {hrf_mean:.3f} +/- {hrf_std:.3f}")
print(f"Euclidean: {rf_mean:.3f} +/- {rf_std:.3f}")

Using loader from file: datasets.neuroseed



100%|██████████| 24/24 [00:00<00:00, 28.18it/s]
100%|██████████| 24/24 [00:00<00:00, 28.01it/s]
100%|██████████| 24/24 [00:00<00:00, 30.22it/s]
100%|██████████| 24/24 [00:00<00:00, 28.65it/s]
100%|██████████| 24/24 [00:00<00:00, 29.08it/s]


Hyperbolic: 0.867 +/- 0.012
Euclidean: 0.885 +/- 0.016


In [12]:
from HoroRF.datasets.gaussian import get_training_data, get_testing_data

# Inspect the shape of the first element returned by the Gaussian loader.
gaussian_features = get_training_data(class_label=2, seed=0)[0]
gaussian_features.shape

torch.Size([1000, 2])

In [42]:
from HoroRF.datasets.neuroseed import get_training_data, get_testing_data

# Inspect the second element returned by the neuroseed loader (the labels).
neuroseed_labels = get_training_data(class_label=2, seed=0)[1]
neuroseed_labels

tensor([44, 44,  9, 22, 44, 44,  9, 44, 44,  9,  9, 22, 22, 44, 22, 44, 22, 22,
        44, 22,  3, 22, 22, 44, 44, 44,  2,  3, 22, 22, 22,  2, 22,  2, 44, 44,
        22,  9, 44, 44, 22, 44, 22, 22, 22,  9,  9, 44, 44, 22, 44,  3, 44, 22,
         9, 43, 44, 22, 22, 44,  9,  3,  9,  2, 44, 43, 44, 22,  9,  3, 44,  9,
        44, 44, 22,  9,  3,  3, 44,  2, 22,  2,  3, 22, 44,  3,  3,  3, 44, 44,
        44,  2,  2, 22, 22, 22,  3,  2,  9, 22])

In [39]:
# Keep only the entries whose "taxonomy_1" label occurs more than 1000 times.
# `adata` is not defined in the visible cells and must already be loaded.
labels = adata.var["taxonomy_1"]
labels_counts = labels.value_counts()
keep = labels_counts.loc[labels_counts.gt(1000)].index

labels_filtered = labels.loc[labels.isin(keep)]
labels_filtered.index

Index(['776992', '1050608', '190299', '358030', '239283', '4030157', '35786',
       '174924', '370251', '191389',
       ...
       '268328', '228988', '155616', '158709', '299059', '515774', '311952',
       '568082', '1112813', '562583'],
      dtype='object', length=32863)

In [41]:
import numpy as np

# Draw 125 distinct entries from the filtered label index.
# A seeded Generator replaces the unseeded global RNG so that re-running the
# cell selects the same entries; the seed value itself is arbitrary.
rng = np.random.default_rng(0)
indices = rng.choice(labels_filtered.index, 125, replace=False)
indices

array(['145236', '4442899', '112801', '269532', '95741', '301910',
       '74869', '1967053', '768535', '691952', '294040', '470879',
       '358439', '95522', '268755', '4448558', '593016', '318205',
       '4298060', '4475224', '1096766', '1108726', '3219862', '193763',
       '2545365', '252198', '516020', '271500', '354401', '241499',
       '4437436', '971971', '344456', '322087', '4371949', '554911',
       '202816', '4444213', '4416974', '548878', '164915', '370295',
       '4445508', '4321043', '4416763', '1087825', '997439', '4256699',
       '3862524', '47181', '174004', '407459', '683241', '4364083',
       '115049', '206331', '343699', '964799', '1667530', '4459355',
       '583472', '4377731', '1105919', '814570', '709691', '145786',
       '332210', '228043', '810672', '199344', '904468', '668257',
       '4322804', '4320437', '4367317', '807112', '280233', '147940',
       '1066654', '4469223', '563671', '2838675', '4468097', '4349553',
       '1074801', '1117187', '9985

In [9]:
# Figure out error with my method:

# Fixed inputs for reproducing the issue; `seed` is reused when fitting the forest in the next cell.
seed = 15
dim = 4
from HoroRF.datasets.neuroseed import get_training_data, get_testing_data

# `dim` is passed as class_label; presumably the neuroseed loader interprets it
# as the embedding dimension (TODO confirm against the loader).
# NOTE(review): evaluate_hdt() converts loader output from "poincare" to
# "hyperboloid" coordinates before fitting, whereas X is passed to hrf.fit
# below without that conversion; confirm whether this is intended.
X, y = get_training_data(class_label=dim, seed=seed, num_samples=800)

In [10]:
from src.hyperdt.forest import HyperbolicRandomForestClassifier

# Build the forest from explicit settings, then fit it on the debugging sample.
forest_settings = {"n_estimators": 24, "max_depth": 6}
hrf = HyperbolicRandomForestClassifier(**forest_settings)

hrf.fit(X, y, use_tqdm=True, seed=seed)

100%|██████████| 24/24 [00:01<00:00, 15.47it/s]


In [11]:
# Predict on the same points the forest was fit on (in-sample) to inspect the labels it produces.
hrf.predict(X)

array([ 9, 44,  9, 44, 44,  9, 44, 43,  9, 22, 44, 44, 44,  9, 44,  2,  2,
        9,  9, 22, 22, 44,  2, 22, 44, 22, 22,  9, 22, 22, 44, 22, 22, 44,
        9, 22, 22, 22,  3, 44, 22,  9, 22, 22,  3, 43, 44, 44, 22, 22,  9,
       22, 22, 44, 22, 22, 22, 44, 22, 22, 22, 22,  9,  2, 22, 44, 44,  3,
       22, 22, 44, 44, 44, 43, 44, 22, 22, 44, 22,  3, 22, 44, 44, 22, 44,
       22, 22, 44, 44, 44, 44, 22, 44, 44,  3, 44,  3,  9, 44, 44, 44, 44,
       22, 44, 44,  3, 22, 44, 22,  3, 22,  9, 44, 22, 22, 22, 22,  2, 44,
        9,  9, 44, 22, 22, 22, 44, 43,  3, 44, 22, 22, 44, 44,  9, 44, 22,
        9,  9,  9, 44, 22, 44, 22,  3, 22, 22, 22, 22,  9,  9,  9,  9,  9,
       22, 44, 22, 22, 22, 22, 44, 44, 22,  9, 44, 22, 44, 22, 22, 43,  3,
       44, 44, 44, 44, 44, 22, 44, 44, 44, 43, 44, 44,  9, 22, 22, 44,  3,
        9, 44,  2, 22, 44, 44, 22, 22, 44, 44, 44, 22,  9,  2, 22, 22, 44,
        3,  9, 22, 44, 22,  9, 22, 44, 44, 44,  9, 44,  3, 44, 22, 22, 44,
       44, 44, 22, 44, 22

In [12]:
# Requires the cell that defines evaluate_hdt to have been run in the current kernel session.
evaluate_hdt()

NameError: name 'evaluate_hdt' is not defined