# Regression

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from itertools import product

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split

from src.hyperdt.toy_data import wrapped_normal_mixture
from src.hyperdt.conversions import convert
from src.hyperdt.tree import HyperbolicDecisionTreeRegressor, DecisionTreeRegressor, DecisionTreeClassifier

INFO: Using numpy backend


In [3]:
def euclidean_distance_from_origin(X):
    """Compute the euclidean distance from the origin for each row in X.

    Parameters
    ----------
    X : array-like of shape (n_points, n_features)
        One point per row.

    Returns
    -------
    numpy.ndarray of shape (n_points,)
        The L2 norm ``sqrt(sum_j X[i, j] ** 2)`` of every row.
    """
    # The row-wise L2 norm is the actual distance; a plain sum of squares
    # would yield the *squared* distance instead.
    return np.linalg.norm(X, axis=1)

def poincare_distance_from_origin(X):
    """Hyperbolic distance between the origin and each row of X (Poincare ball model).

    For a row with squared euclidean norm ``s`` the value returned is
    ``arccosh(1 + 2 * s / (1 - s))``.

    Parameters
    ----------
    X : array of shape (n_points, n_features)
        One point per row, expressed in Poincare ball coordinates.

    Returns
    -------
    array of shape (n_points,)
        One distance per row of X.
    """
    # Squared euclidean norm of every row (not the norm itself).
    squared_norm = (X ** 2).sum(axis=1)
    ratio = 2 * squared_norm / (1 - squared_norm)
    return np.arccosh(1 + ratio)

In [4]:
def get_data(n_samples=1000, n_classes=2, noise_std=0.5, n_dim=3, seed=None):
    """Sample points and derive the two regression targets used in the experiments.

    Points are drawn with ``wrapped_normal_mixture`` (its second return value
    is discarded), mapped from "hyperboloid" to "poincare" coordinates with
    ``convert``, and the targets are computed on the converted points.

    Parameters
    ----------
    n_samples, n_classes, noise_std, n_dim, seed
        Forwarded to ``wrapped_normal_mixture`` as ``num_points``,
        ``num_classes``, ``noise_std``, ``n_dim`` and ``seed``.

    Returns
    -------
    tuple
        ``(X, y_edist, y_hdist)``: the sampled points exactly as returned by
        the sampler (not the converted ones), the output of
        ``euclidean_distance_from_origin`` and the output of
        ``poincare_distance_from_origin``.
    """
    sampler_kwargs = dict(
        num_points=n_samples,
        num_classes=n_classes,
        noise_std=noise_std,
        n_dim=n_dim,
        seed=seed,
    )
    points, _unused = wrapped_normal_mixture(**sampler_kwargs)

    # Both targets are measured in Poincare coordinates, while the features
    # handed back to the caller stay in the sampler's original coordinates.
    ball_points = convert(points, "hyperboloid", "poincare")
    edist_target = euclidean_distance_from_origin(ball_points)
    hdist_target = poincare_distance_from_origin(ball_points)

    return points, edist_target, hdist_target

In [5]:
def train_and_test_dts(X, y, seed=None, max_depth=8):
    """Fit a hyperbolic and a euclidean regression tree on one split and score both.

    Parameters
    ----------
    X, y
        Features and regression targets; split 80/20 into train and test.
    seed
        Passed to ``train_test_split`` as ``random_state``.
    max_depth
        Maximum depth given to both trees.

    Returns
    -------
    tuple
        ``(hdt_reg_mse, edt_reg_mse)``: test-set mean squared error of the
        hyperbolic tree followed by that of the euclidean tree.
    """
    X_fit, X_eval, y_fit, y_eval = train_test_split(
        X, y, test_size=0.2, random_state=seed
    )

    def _fit_and_score(model):
        # Train on the fit split, then measure the MSE on the held-out split.
        model.fit(X_fit, y_fit)
        return mean_squared_error(y_eval, model.predict(X_eval))

    # Build both estimators up front, hyperbolic first, then score them in
    # that same order.
    estimators = (
        HyperbolicDecisionTreeRegressor(max_depth=max_depth, candidates="data"),
        DecisionTreeRegressor(max_depth=max_depth),
    )
    return tuple(_fit_and_score(model) for model in estimators)

In [1]:
# Minimal reproduction of the "Points must lie on a hyperboloid" ValueError (n_dim=32, noise_std=0.1, seed=42)
from sklearn.model_selection import train_test_split

from src.hyperdt.toy_data import wrapped_normal_mixture
from src.hyperdt.tree import HyperbolicDecisionTreeRegressor

# Known-failing reproduction. The error is caught and shown instead of being
# left uncaught, so "Restart Kernel -> Run All" can continue past this cell.
# The saved output has no traceback, so which statement raises is not known
# here; the whole reproduction is therefore guarded.
try:
    X, y = wrapped_normal_mixture(num_points=1000, num_classes=2, noise_std=0.1, n_dim=32, seed=42)
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
    hdt_reg = HyperbolicDecisionTreeRegressor(max_depth=8, candidates="data")
    hdt_reg.fit(X_train, y_train)
except ValueError as err:
    # Print in the usual "ExceptionType: message" form.
    print("{}: {}".format(type(err).__name__, err))

INFO: Using numpy backend


ValueError: Points must lie on a hyperboloid: Lorentzian Inner Product does not equal the curvature of -1.

In [None]:
# Experiment grid: every dimensionality is paired with every seed.
n_dims = [32, 64, 128, 256, 512, 1024, 2, 4, 8, 16]
seeds = [42, 43, 44]
noise_std = 0.1
total = len(n_dims) * len(seeds)
results = []

with tqdm(product(n_dims, seeds), total=total) as pbar:
    for n_dim, seed in pbar:
        pbar.set_description('Dim={}, Seed={}'.format(n_dim, seed))

        # One dataset per (n_dim, seed); both targets share the same features.
        X, y_edist, y_hdist = get_data(seed=seed, n_dim=n_dim, noise_std=noise_std)

        # Target 1: euclidean distance from the origin.
        hdt_edist_mse, edt_edist_mse = train_and_test_dts(X, y_edist, seed=seed)

        # Target 2: hyperbolic distance from the origin.
        hdt_hdist_mse, edt_hdist_mse = train_and_test_dts(X, y_hdist, seed=seed)

        # One record per run; key order fixes the DataFrame column order.
        result = {
            'hdt_edist_mse': hdt_edist_mse,
            'edt_edist_mse': edt_edist_mse,
            'hdt_hdist_mse': hdt_hdist_mse,
            'edt_hdist_mse': edt_hdist_mse,
            'seed': seed,
            'n_dim': n_dim,
        }
        results.append(result)

# Collapse the list of per-run records into a single table.
results = pd.DataFrame(results)

In [None]:
# Display the table of MSEs: one row per (n_dim, seed) run, one column per model/target pair.
results