# hSVM and logistic regression
> Benchmarking two more hyperbolic classifiers

In [13]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# hSVM and hMLR benchmark:

This code should be run using the `hsvm` conda environment instead of the `hdt` conda environment.

In [47]:
import sys
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

# hSVM stuff
sys.path.append("../hsvm")
from hsvm import LinearHSVM

# hLR stuff
sys.path.append("../HyperbolicCV/code")
from lib.lorentz.layers.LMLR import LorentzMLR
from lib.lorentz.manifold import CustomLorentz
import torch

# Euclidean versions
from sklearn.svm import SVC
from sklearn.linear_model import LogisticRegression

# For benchmarking
# from hyperdt.toy_data import wrapped_normal_mixture
sys.path.append("../HoroRF")
from datasets.gaussian import get_training_data, get_testing_data

# from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score, KFold
from sklearn.metrics import f1_score
import time
from tqdm import tqdm_notebook as tqdm

In [15]:
# Suppress UserWarning from sklearn and FutureWarning from numba
import warnings

warnings.filterwarnings("ignore", category=UserWarning)
warnings.filterwarnings("ignore", category=FutureWarning)

In [16]:
# Train hMLR function


def train_hmlr(X, y, steps=1000):
    # Init class...
    hmlr = LorentzMLR(num_features=X.shape[1], num_classes=2, manifold=CustomLorentz())

    # hMLR outputs logits; labels are {0, 1}
    opt = torch.optim.Adam(hmlr.parameters(), lr=0.01)
    loss_fn = torch.nn.BCEWithLogitsLoss()

    for _ in range(steps):
        opt.zero_grad()
        logits = hmlr(X)
        loss = loss_fn(logits[:, 1], y)
        loss.backward()
        opt.step()

    return hmlr

In [54]:
results = pd.DataFrame(columns=["seed", "n_dim", "model", "f1_score", "time"])

seeds = [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]
dims = [2, 4, 8, 16]
my_tqdm = tqdm(total=len(seeds) * len(dims) * 4 * 5)

for n_dim in dims:
    for seed in seeds:
        # print(n_dim, seed)
        my_tqdm.set_description(f"{n_dim}, {seed}")
        X, y = get_training_data(class_label=n_dim, seed=seed, num_samples=int(800 / 0.8), convert_to_poincare=False)

        # Both models like hyperboloids, so this is easy
        folds = KFold(n_splits=5, shuffle=True, random_state=seed)

        for train_index, test_index in folds.split(X):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]

            X_train_torch = torch.tensor(X_train, dtype=torch.float)
            y_train_torch = torch.tensor(y_train, dtype=torch.float)
            X_test_torch = torch.tensor(X_test, dtype=torch.float)
            y_test_torch = torch.tensor(y_test, dtype=torch.float)

            # hSVM
            t1 = time.time()
            hsvm = LinearHSVM() # From grid search
            y_train_hsvm = y_train.detach().clone()
            y_train_hsvm[y_train_hsvm == 0] = -1
            hsvm.fit(X_train, y_train_hsvm)
            y_pred = hsvm.predict(X_test)
            t2 = time.time()
            y_pred[y_pred == -1] = 0  # hMLR outputs {-1, 1}, but we want {, 1}
            hsvm_score = f1_score(y_test, y_pred, average="micro")
            results.loc[len(results)] = [seed, n_dim, "hSVM", hsvm_score, t2 - t1]
            my_tqdm.update()

            # SVM
            t1 = time.time()
            svm = SVC(kernel="linear")
            svm.fit(X_train, y_train)
            y_pred = svm.predict(X_test)
            t2 = time.time()
            svm_score = f1_score(y_test, y_pred, average="micro")
            results.loc[len(results)] = [seed, n_dim, "SVM", svm_score, t2 - t1]

            # hMLR
            t1 = time.time()
            hmlr = train_hmlr(X_train_torch, y_train_torch)
            y_pred = hmlr(X_test_torch).argmax(dim=1).clone().detach().numpy()
            t2 = time.time()
            hmlr_score = f1_score(y_test, y_pred, average="micro")
            results.loc[len(results)] = [seed, n_dim, "hMLR", hmlr_score, t2 - t1]
            my_tqdm.update()

            # Logistic Regression
            t1 = time.time()
            lr = LogisticRegression()
            lr.fit(X_train, y_train)
            y_pred = lr.predict(X_test)
            t2 = time.time()
            lr_score = f1_score(y_test, y_pred, average="micro")
            results.loc[len(results)] = [seed, n_dim, "LR", lr_score, t2 - t1]

            # Postfix
            my_tqdm.set_postfix(hSVM=hsvm_score, hMLR=hmlr_score, SVM=svm_score, LR=lr_score)

results.to_csv("../data/processed/hsvm_hmlr_results.csv")

HBox(children=(IntProgress(value=0, max=800), HTML(value='')))

In [55]:
results.groupby(["model", "n_dim"]).mean()["f1_score"] * 100

# Compare to horoDT:
#   2   91.88
#   4   99.30
#   8   99.96
#  16   100.00

model  n_dim
LR     2         90.1125
       4         99.2000
       8         99.9750
       16        99.9875
SVM    2         90.1000
       4         99.2125
       8         99.9500
       16        99.9875
hMLR   2         89.5000
       4         98.6750
       8         99.9750
       16       100.0000
hSVM   2         81.4250
       4         97.1125
       8         99.9875
       16        99.0750
Name: f1_score, dtype: float64