# Classification on Clip

In [1]:
%load_ext autoreload
%autoreload 2

### Get Data

In [2]:
import pickle
import numpy as np

In [3]:
def get_time(X, curvature=1):
    curvature = abs(curvature)
    x_time = np.sqrt(1 / curvature + np.sum(X**2, axis=-1))
    return x_time

In [4]:
def get_data(model_dir, binarize=True, curvature=1, subset=False):
    
    # Define path to load data from
    model_name = model_dir.split('/')[-1].split('_')[0]
    
    # load data
    with open(model_dir + '/image_feats.pickle', 'rb') as f:
        X_splits = pickle.load(f)
    with open(model_dir + '/labels.pickle', 'rb') as f:
        y_splits = pickle.load(f)
    
    # convert to numpy
    X_splits = {split: X.numpy() for split, X in X_splits.items()}
    y_splits = {split: y.numpy() for split, y in y_splits.items()}
    
    if binarize:
        # source for idxs: https://keras.io/api/datasets/cifar10/
        cat_idx, dog_idx = 3, 5
        X_splits = {
            split: X[(y_splits[split] == cat_idx) | (y_splits[split] == dog_idx)]
            for split, X in X_splits.items()
            }
        y_splits = {
            split: y[(y_splits[split] == cat_idx) | (y_splits[split] == dog_idx)]
            for split, y in y_splits.items()
        }

    # take subset
    if subset:
        X_splits = {split: X[:subset[split]] for split, X in X_splits.items()}
        y_splits = {split: y[:subset[split]] for split, y in y_splits.items()}
        
    # if meru, get time components on hyperboloid (we only loaded space
    # components)
    if model_name == 'meru':
        X_splits = {
            split: np.concatenate([get_time(X, curvature)[:, None], X], axis=-1)
            for split, X in X_splits.items()
            }
        
    return X_splits, y_splits

In [5]:
model_name = 'meru'
projection = True
norm = True
model_dir = '../hdt_meru/datasets/eval/cifar10/{}_E{}{}'.format(
    model_name,
    'P' if projection else 'X',
    'N' if norm else 'X'
    )

curvature = 0.10000000149011612
# subset = {'train': 100, 'val': 10, 'test': 20}
subset = False
binarize = False

X_splits, y_splits = get_data(
    model_dir, curvature=curvature, subset=subset
    )

### Hyperbolic RF

In [6]:
from sklearn.metrics import accuracy_score

from src.hyperdt.forest import HyperbolicRandomForestClassifier

In [7]:
from sklearn.metrics import confusion_matrix

def top1perclassaccuracy(y_pred, y_true):
    cm = confusion_matrix(y_true, y_pred, labels=np.unique(y_true))
    per_class = np.diag(cm) / np.sum(cm, axis=1)

    return np.mean(per_class)

In [8]:
hrf = HyperbolicRandomForestClassifier(random_state=42, curvature=curvature, n_estimators=10, max_depth=5)
hrf.fit(X_splits['train'], y_splits['train'], use_tqdm=True)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 213.35it/s]


In [14]:
y_pred = hrf.predict(X_splits['test'])
# hrf_acc = accuracy_score(y_splits['test'], y_pred)
hrf_acc = top1perclassaccuracy(y_pred, y_splits['test'])
hrf_acc

0.862

In [16]:
pickle.dump(hrf, open('/home/phil/hrf_cifar10.pickle', 'wb')) # phil because I'm running this as myself -phil

In [17]:
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier

In [18]:
erf = SklearnRandomForestClassifier(random_state=42, n_estimators=10, max_depth=5)
erf.fit(X_splits['train'], y_splits['train'])
y_pred = erf.predict(X_splits['test'])
# erf_acc = accuracy_score(y_splits['test'], y_pred)
erf_acc = top1perclassaccuracy(y_pred, y_splits['test'])
erf_acc

0.8505

### Train Model

In [12]:
from sklearn.metrics import accuracy_score
from src.hyperdt.tree import HyperbolicDecisionTreeClassifier, DecisionTreeClassifier

In [13]:
dt = DecisionTreeClassifier(max_depth=3)
dt.fit(X_train, y_train)
y_pred = dt.predict(X_val)
dt_acc = accuracy_score(y_val, y_pred)
dt_acc

NameError: name 'X_train' is not defined

In [None]:
hdt = HyperbolicDecisionTreeClassifier(max_depth=3, curvature=curvature)
hdt.fit(X_train, y_train)
y_pred = hdt.predict(X_val)
hdt_acc = accuracy_score(y_val, y_pred)
hdt_acc

In [None]:
from sklearn.tree import DecisionTreeClassifier as SklearnDecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier as SklearnRandomForestClassifier

In [None]:
sdt = SklearnDecisionTreeClassifier(max_depth=15, random_state=42)
sdt.fit(X_splits['train'], y_splits['train'])
y_pred = sdt.predict(X_splits['val'])

In [None]:
std_train_acc = accuracy_score(y_splits['train'], sdt.predict(X_splits['train']))
sdt_val_acc = accuracy_score(y_splits['val'], sdt.predict(X_splits['val']))

print('sklearn train acc: ', std_train_acc)
print('sklearn val acc: ', sdt_val_acc)

In [None]:
max_depth=6

In [None]:
from itertools import product

import pandas as pd
from tqdm import tqdm

In [None]:
model_params = {
    'max_depth': [6],
    'seed': [42, 43],
    }

data_params = {
    'model_name': ['clip'],
    'projection': [False],
    'normal': [False],
    'curvature': [-0.10000000149011612],
    'subset': [False],
}

In [None]:
param_keys = list(model_params.keys())
param_values_list = list(product(*model_params.values()))
params_list = [{key: val for key, val in zip(param_keys, param_values)} for param_values in param_values_list]
results = []

with tqdm(params_list) as pbar:
    for params in pbar:
        X_splits, y_splits = get_data(
            model_name=data_params['model_name'][0],
            projection=data_params['projection'][0],
            normal=data_params['normal'][0],
            curvature=data_params['curvature'][0],
            subset=data_params['subset'][0]
            )
        pbar.set_description('Seed={}, max_depth={}'.format(
            params['seed'], params['max_depth']))
            
        # unpack data
        X_train, X_val = X_splits['train'], X_splits['val']
        y_train, y_val = y_splits['train'], y_splits['val']
        
        # Fit, predict, and score sklearn decision tree
        dt = SklearnDecisionTreeClassifier(max_depth=params['max_depth'], random_state=params['seed'])
        dt.fit(X_train, y_train)
        dt_acc = accuracy_score(y_val, dt.predict(X_val))
            
        # Fit, predict, and score sklearn random forest
        rf = SklearnRandomForestClassifier(max_depth=params['max_depth'], random_state=params['seed'])
        rf.fit(X_train, y_train)
        rf_acc = accuracy_score(y_val, rf.predict(X_val))

        # save results
        # geo_model is the model of geometry that data is in when computing the distance 
        # don't worry, the data is always in the hyperboloid model when inputted into hdt
        result =model_params | data_params | {'dt_acc': dt_acc, 'rf_acc': rf_acc}
        results.append(result)
            
results = pd.DataFrame(results)

In [None]:
results