# Regression From Origin

In [1]:
%load_ext autoreload
%autoreload 2

In [2]:
from itertools import product

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from tqdm import tqdm
from sklearn.metrics import mean_squared_error, accuracy_score
from sklearn.model_selection import train_test_split

from src.hyperdt.toy_data import wrapped_normal_mixture
from src.hyperdt.conversions import convert
from src.hyperdt.tree import HyperbolicDecisionTreeRegressor, DecisionTreeRegressor, DecisionTreeClassifier

INFO: Using numpy backend


In [3]:
def euclidean_distance_from_origin(X):
    """Compute the euclidean distance from the origin for each row in X."""
    return np.sum(X ** 2, axis=1)

def poincare_distance_from_origin(X):
    """Compute the hyperbolic distance from the origin for each row in X in the
    Poincare ball model."""
    l2_norm = np.sum(X ** 2, axis=1)
    return np.arccosh(1 + 2 * l2_norm / (1 - l2_norm))

def hyperboloid_distance_from_origin(X, curvature, timelike_dim=0):
    """Compute the hyperbolic distance from the origin for each row in X in the
    hyperboloid model."""
    
    space_idxs = np.delete(np.arange(X.shape[1]), timelike_dim)
    X_space, X_time = X[:, space_idxs], X[:, timelike_dim]
    lorentzian_inner_product = (X_space @ X_space.T - X_time @ X_time.T).diagonal()
    return np.sqrt(1 / curvature) * np.arccosh(-curvature * lorentzian_inner_product)

In [4]:
def get_distance_from_origin(n_samples=1000, n_classes=2, noise_std=0.5, n_dim=3, seed=None):
    """Get data for regression experiments."""
    
    # generate X
    X_hyperboloid, _ = wrapped_normal_mixture(
        num_points=n_samples, num_classes=n_classes, noise_std=noise_std, n_dim=n_dim, seed=seed
    )
    X_poincare = convert(X_hyperboloid, "hyperboloid", "poincare")
    
    data = {
        ('poincare', 'euclidean'): (X_hyperboloid, euclidean_distance_from_origin(X_poincare)),
        ('poincare', 'poincare'): (X_hyperboloid, poincare_distance_from_origin(X_poincare)),
        ('hyperboloid', 'euclidean'): (X_hyperboloid, euclidean_distance_from_origin(X_hyperboloid)),
        ('hyperboloid', 'hyperboloid'): (X_hyperboloid, hyperboloid_distance_from_origin(X_hyperboloid, curvature=1)),
    }
        
    return data

In [5]:
small = True

params = {
    'max_depth': [5, 10, 20],
    'n_dim': [2, 4, 8, 16, 32],
    'seed': [42, 43, 44],
    'noise_std': [0.1, 0.5],
    'n_samples': [1000],
    'n_classes': [2, 5, 10],
    }

if small:
    params['max_depth'] = [3]
    params['n_dim'] = [2]
    params['seed'] = [42]

In [6]:
param_keys = list(params.keys())
param_values_list = list(product(*params.values()))
params = [{key: val for key, val in zip(param_keys, param_values)} for param_values in param_values_list]
results = []

with tqdm(params) as pbar:
    for params in pbar:
        data = get_distance_from_origin(
            seed=params['seed'],
            n_dim=params['n_dim'],
            noise_std=params['noise_std'],
            n_samples=params['n_samples'],
            n_classes=params['n_classes'],
            )
        for (geo_model, dist_func_name), (X, y) in data.items():
            pbar.set_description('Dim={}, Seed={}, max_depth={}'.format(
                params['n_dim'], params['seed'], params['max_depth']))
            
            # Split data
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=0.2, random_state=params['seed']
            )
            
            # Fit, predict, and score hyperbolic decision tree
            try:
                hdt_reg = HyperbolicDecisionTreeRegressor(max_depth=params['max_depth'], candidates="data")
                hdt_reg.fit(X_train, y_train)
                hdt_reg_mse = mean_squared_error(y_test, hdt_reg.predict(X_test))
            except ValueError as e:
                error_msg = str(e)
                if 'Points must lie on a hyperboloid:' in error_msg: # error due to data not being on hyperboloid
                    hdt_reg_mse = np.nan
                else:
                    raise e
                
            # Fit, predict, and score euclidean decision tree
            edt_reg = DecisionTreeRegressor(max_depth=params['max_depth'])
            edt_reg.fit(X_train, y_train)
            edt_reg_mse = mean_squared_error(y_test, edt_reg.predict(X_test))

            # save results
            # geo_model is the model of geometry that data is in when computing the distance 
            # don't worry, the data is always in the hyperboloid model when inputted into hdt
            result = {
                "model of geometry": geo_model,
                "distance function": dist_func_name,
                "hdt_reg_mse": hdt_reg_mse,
                "edt_reg_mse": edt_reg_mse,
            } | params
            results.append(result)
                
results = pd.DataFrame(results)

Dim=2, Seed=42, max_depth=3: 100%|██████████| 1/1 [00:04<00:00,  4.70s/it]


In [7]:
results

Unnamed: 0,model of geometry,distance function,hdt_reg_mse,edt_reg_mse,max_depth,n_dim,seed,noise_std,n_samples,n_classes
0,poincare,euclidean,0.002386806,218021.792915,3,2,42,0.1,1000,2
1,poincare,poincare,0.01878014,217062.466522,3,2,42,0.1,1000,2
2,hyperboloid,euclidean,15.28856,206246.044153,3,2,42,0.1,1000,2
3,hyperboloid,hyperboloid,1.077631e-07,188075.469753,3,2,42,0.1,1000,2


### Save Results

In [8]:
from pathlib import Path 

outpath = Path('results/origin_distance_mse.tsv')
if not outpath.parent.exists():
    outpath.parent.mkdir(parents=True)
results.to_csv(sep='\t', path_or_buf=outpath, index=False)

In [9]:
# read results back in for sanity check
results = pd.read_csv(outpath, sep='\t')
results

Unnamed: 0,model of geometry,distance function,hdt_reg_mse,edt_reg_mse,max_depth,n_dim,seed,noise_std,n_samples,n_classes
0,poincare,euclidean,0.002386806,218021.792915,3,2,42,0.1,1000,2
1,poincare,poincare,0.01878014,217062.466522,3,2,42,0.1,1000,2
2,hyperboloid,euclidean,15.28856,206246.044153,3,2,42,0.1,1000,2
3,hyperboloid,hyperboloid,1.077631e-07,188075.469753,3,2,42,0.1,1000,2
