# Classification on Clip

In [1]:
%load_ext autoreload
%autoreload 2

### Get Data

In [2]:
import pickle
import numpy as np

In [3]:
def get_time(X, curvature=1):
    curvature = abs(curvature)
    x_time = np.sqrt(1 / curvature + np.sum(X**2, axis=-1))
    return x_time

In [4]:
def get_data(seed=None, model_name='meru', projection=False, normal=False, curvature=1, subset=False):
    
    # Define path to load data from
    projection_str = 'projection' if projection else 'noprojection'
    normal_str = 'normal' if normal else 'nonormal'
    path = 'cifar100/{}_{}_{}'.format(model_name, projection_str, normal_str)
    
    # load data
    with open(path + '/image_feats.pickle', 'rb') as f:
        X_splits = pickle.load(f)
    with open(path + '/labels.pickle', 'rb') as f:
        y_splits = pickle.load(f)
    
    # convert to numpy
    X_splits = {split: X.numpy() for split, X in X_splits.items()}
    y_splits = {split: y.numpy() for split, y in y_splits.items()}

    # take subset
    if subset:
        X_splits = {split: X[:subset] for split, X in X_splits.items()}
        y_splits = {split: y[:subset] for split, y in y_splits.items()}
        
    # if meru, get time components on hyperboloid (we only loaded space
    # components)
    if model_name == 'meru':
        X_splits = {
            split: np.concatenate([get_time(X, curvature)[:, None], X], axis=-1)
            for split, X in X_splits.items()
            }
        
    return X_splits, y_splits

In [5]:
model_name = 'clip'
projection=False
normal=False
curvature = -0.10000000149011612
subset = False

X_splits, y_splits = get_data(
    model_name=model_name,
    projection=projection,
    normal=normal,
    curvature=curvature,
    subset=subset
    )

### Train Model

In [6]:
from sklearn.metrics import accuracy_score
from src.hyperdt.tree import HyperbolicDecisionTreeClassifier, DecisionTreeClassifier

In [None]:
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_val)
dt_acc = accuracy_score(y_val, y_pred)
dt_acc

In [None]:
hdt = HyperbolicDecisionTreeClassifier(max_depth=3, curvature=curvature)
hdt.fit(X_train, y_train)
y_pred = hdt.predict(X_val)
hdt_acc = accuracy_score(y_val, y_pred)
hdt_acc

In [10]:
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier

In [28]:
sdt = SklearnDecisionTreeClassifier(max_depth=15, random_state=42)
sdt.fit(X_splits['train'], y_splits['train'])
y_pred = sdt.predict(X_splits['val'])

In [29]:
std_train_acc = accuracy_score(y_splits['train'], sdt.predict(X_splits['train']))
sdt_val_acc = accuracy_score(y_splits['val'], sdt.predict(X_splits['val']))

print('sklearn train acc: ', std_train_acc)
print('sklearn val acc: ', sdt_val_acc)

sklearn train acc:  0.4078
sklearn val acc:  0.1802


max_depth = 3 -> sdt_acc = 0.0472 

| max_depth | train acc | val acc | time (sec) |
| 3 | nan | 0.0472 | 10|
| 10 | 0.226 | 0.16 | 34|
| 15 | 0.4078 | 0.18 | 47 |
| 20 | 0.579 | 0.1916 | 56.1 |

In [None]:
max_depth=6

In [7]:
from itertools import product

import pandas as pd
from tqdm import tqdm

In [13]:
model_params = {
    'max_depth': [6],
    'seed': [42, 43],
    }

data_params = {
    'model_name': ['clip'],
    'projection': [False],
    'normal': [False],
    'curvature': [-0.10000000149011612],
    'subset': [False],
}

In [14]:
param_keys = list(model_params.keys())
param_values_list = list(product(*model_params.values()))
params_list = [{key: val for key, val in zip(param_keys, param_values)} for param_values in param_values_list]
results = []

with tqdm(params_list) as pbar:
    for params in pbar:
        X_splits, y_splits = get_data(
            model_name=data_params['model_name'][0],
            projection=data_params['projection'][0],
            normal=data_params['normal'][0],
            curvature=data_params['curvature'][0],
            subset=data_params['subset'][0]
            )
        pbar.set_description('Seed={}, max_depth={}'.format(
            params['seed'], params['max_depth']))
            
        # unpack data
        X_train, X_val = X_splits['train'], X_splits['val']
        y_train, y_val = y_splits['train'], y_splits['val']
        
        # Fit, predict, and score sklearn decision tree
        dt = SklearnDecisionTreeClassifier(max_depth=params['max_depth'], random_state=params['seed'])
        dt.fit(X_train, y_train)
        dt_acc = accuracy_score(y_val, dt.predict(X_val))
            
        # Fit, predict, and score sklearn random forest
        rf = SklearnRandomForestClassifier(max_depth=params['max_depth'], random_state=params['seed'])
        rf.fit(X_train, y_train)
        rf_acc = accuracy_score(y_val, rf.predict(X_val))

        # save results
        # geo_model is the model of geometry that data is in when computing the distance 
        # don't worry, the data is always in the hyperboloid model when inputted into hdt
        result =model_params | data_params | {'dt_acc': dt_acc, 'rf_acc': rf_acc}
        results.append(result)
            
results = pd.DataFrame(results)

  0%|          | 0/2 [00:00<?, ?it/s]

Seed=43, max_depth=6: 100%|██████████| 2/2 [02:57<00:00, 88.65s/it]


In [16]:
results

Unnamed: 0,max_depth,seed,model_name,projection,normal,curvature,subset,dt_acc,rf_acc
0,[6],"[42, 43]",[clip],[False],[False],[-0.10000000149011612],[False],0.1046,0.3714
1,[6],"[42, 43]",[clip],[False],[False],[-0.10000000149011612],[False],0.1046,0.3676
