In [1]:
import math
import multiprocessing
from functools import partial

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from sklearn.model_selection import GroupShuffleSplit
from sklearn.preprocessing import minmax_scale

#python -m pip install git+https://github.com/nalexai/hyperlib.git@main
from hyperlib.nn.layers.lin_hyp import LinearHyperbolic
from hyperlib.nn.optimizers.rsgd import RSGD
from hyperlib.manifold.poincare import Poincare

from train import make_X_y, encode_y, scale_X, grouped_train_test_split
from hyperlib_eval import get_all_pos_in_neg, get_pos_greater_than_quant

import tensorflow as tf
from tensorflow.data import Dataset
from tensorflow import keras
from tensorflow.keras.models import Model

from tensorflow_addons.optimizers import AdamW

# One shared seed for the NumPy and TensorFlow global random generators.
seed = 123
np.random.seed(seed=seed)
tf.random.set_seed(seed=seed)

In [2]:
# Load the sample parquet file (path is relative to the notebook's working
# directory) and show its (rows, columns) shape as the cell output.
clean_sample_path = "../../../../data/clean/clean_sample.parquet"
df = pd.read_parquet(clean_sample_path)
df.shape

(197305, 991)

In [3]:
# Features / labels from the raw frame.
X, y = make_X_y(df)

# The labels are passed twice to the grouped split.
# NOTE(review): presumably the third argument is the grouping key, so a label
# never appears on both sides of the split — confirm against train.py.
(X_train, X_test,
 y_train, y_test) = grouped_train_test_split(X, y, y, test_size=0.2)

# Scale train and test features together through the project helper.
X_train_scale, X_test_scale = scale_X(X_train, X_test)

# Number of distinct training labels; used as the width of the output layer.
num_classes = np.unique(y_train).size
y_train_encode = encode_y(y_train)

In [4]:
BATCH_SIZE = 256

# tf.data only shuffles within its buffer: a buffer much smaller than the
# dataset yields batches that still follow the original row order. Use a buffer
# as large as the training set so every epoch is fully shuffled.
SHUFFLE_BUFFER_SIZE = len(X_train_scale)

# Pair each scaled feature row with its encoded label, shuffle, then batch.
train_dataset = (
    Dataset.from_tensor_slices((X_train_scale, y_train_encode))
    .shuffle(SHUFFLE_BUFFER_SIZE)
    .batch(BATCH_SIZE)
)

In [5]:
# Five hidden hyperbolic layers of decreasing width, created in order; each one
# gets its own Poincare() instance and the same third argument (1).
(hyperbolic_layer_1,
 hyperbolic_layer_2,
 hyperbolic_layer_3,
 hyperbolic_layer_4,
 hyperbolic_layer_5) = (
    LinearHyperbolic(width, Poincare(), 1) for width in (978, 512, 128, 64, 32)
)

# Final layer: one unit per training class.
output_layer = LinearHyperbolic(num_classes, Poincare(), 1)

In [6]:
# Model architecture: the hyperbolic layers stacked in order, output layer last.
layer_stack = [
    hyperbolic_layer_1,
    hyperbolic_layer_2,
    hyperbolic_layer_3,
    hyperbolic_layer_4,
    hyperbolic_layer_5,
    output_layer,
]
model = tf.keras.models.Sequential(layer_stack)

# Riemannian SGD optimizer from hyperlib.
optimizer = RSGD(learning_rate=0.02)

# The loss is configured with from_logits=True, i.e. it treats the output
# layer's values as unnormalised logits; labels are integer class ids.
loss_fn = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
accuracy_metric = tf.keras.metrics.SparseCategoricalAccuracy()

model.compile(optimizer=optimizer, loss=loss_fn, metrics=[accuracy_metric])

In [7]:
EPOCHS = 20

# train_dataset is a tf.data.Dataset that is already batched with BATCH_SIZE.
# Keras rejects an explicit batch_size for dataset inputs (the dataset itself
# defines the batches), so only the epoch count and verbosity are passed.
model.fit(train_dataset,
          epochs=EPOCHS,
          verbose=1)

Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20

In [None]:
# Sub-model running from the network input up to the embedding layer.
#
# The layers are referenced through their notebook variables rather than by
# name: Keras auto-names instances 'linear_hyperbolic', 'linear_hyperbolic_1',
# ... (the first one has no suffix, and the counter keeps growing every time the
# layer cell is re-run), so a lookup by 'linear_hyperbolic_1' silently resolves
# to the *second* layer and the sub-model would skip the first one.
# NOTE(review): hyperbolic_layer_3 (128 units) is taken as the embedding layer,
# following the numbering used by the original name lookup — confirm.
embeddings_model = Model(inputs=hyperbolic_layer_1.input,
                         outputs=hyperbolic_layer_3.output)

# Embed the scaled test features; pass batch_size here if memory is a problem.
embedded = embeddings_model.predict(X_test_scale, verbose=1)

In [None]:
def take_sample(embedded, labs, n_groups=1000):
    """
    Draw a sample of the embeddings through the project's grouped split.

    `labs` is passed both as the target and as the third (grouping) argument
    of `grouped_train_test_split`, and `n_groups` is forwarded as `test_size`.
    Only the "test" side of the split is kept, so perturbagens are not split up.

    Returns
    -------
    tuple
        (embedded_sample, labs_sample): the second and fourth items returned
        by `grouped_train_test_split`.
    """
    _rest_embedded, embedded_sample, _rest_labs, labs_sample = grouped_train_test_split(
        embedded, labs, labs, test_size=n_groups
    )
    return embedded_sample, labs_sample

In [None]:
# Full evaluation frame: one row per test embedding, indexed by its label.
df_eval = pd.DataFrame(data=embedded, index=y_test)

# Smaller grouped sample of the test embeddings, framed the same way.
embedded_sample, labs_sample = take_sample(embedded, y_test, n_groups=20)
df_eval_sample = pd.DataFrame(data=embedded_sample, index=labs_sample)

In [None]:
#def vector_distance_batch(vector_1, vectors_all):
#    """
#    Return poincare distances between one vector and a set of other vectors.
#    Parameters
#    ----------
#    vector_1 : numpy.array
#        vector from which Poincare distances are to be computed.
#        expected shape (dim,)
#    vectors_all : numpy.array
#        for each row in vectors_all, distance from vector_1 is computed.
#        expected shape (num_vectors, dim)
#    Returns
#    -------
#    numpy.array
#        Contains Poincare distance between vector_1 and each row in vectors_all.
#        shape (num_vectors,)
#    """
#    euclidean_dists = np.linalg.norm(vector_1 - vectors_all, axis=1)
#    norm = np.linalg.norm(vector_1)
#    all_norms = np.linalg.norm(vectors_all, axis=1)
#    return np.arccosh(
#        1 + 2 * (
#            (euclidean_dists ** 2) / ((1 - norm ** 2) * (1 - all_norms ** 2))
#        )
#    )

In [None]:
import os

# Set CUDA_VISIBLE_DEVICES to '-1' in this process's environment.
# NOTE(review): presumably this is meant for the worker processes started
# later, which inherit the environment and then see no GPU — confirm.
os.environ.update(CUDA_VISIBLE_DEVICES='-1')

In [None]:
# Leave one core free, but never go below a single worker: on a one-core
# machine cpu_count() - 1 is 0 and splitting into 0 partitions raises.
num_cores = max(1, multiprocessing.cpu_count() - 1)
num_partitions = num_cores

# Split by row position and slice with .iloc instead of handing the DataFrame
# to np.array_split directly (that path goes through the deprecated
# DataFrame.swapaxes). Chunk boundaries are the same as np.array_split's.
row_chunks = np.array_split(np.arange(len(df_eval_sample)), num_partitions)
df_split = [df_eval_sample.iloc[rows] for rows in row_chunks]

# 'spawn' avoids CUDA_ERROR_NOT_INITIALIZED in the workers. The pool is sized
# to num_cores so it matches the number of partitions computed above.
with multiprocessing.get_context('spawn').Pool(processes=num_cores) as pool:
    # The parallel function must be imported (not defined in the notebook) to
    # avoid: AttributeError: Can't get attribute 'get_all_pos_in_neg' on
    # <module '__main__' (built-in)>
    pos_in_negs = np.concatenate(
        pool.map(partial(get_all_pos_in_neg, df=df_eval), df_split)
    )

In [None]:
# get_pos_greater_than_quant is already imported in the notebook's first cell,
# so it is not re-imported here.

# Spacing of the quantile grid on [0, 1].
incriment = 0.05

# np.linspace includes both end points by construction; np.arange with a float
# step and a "1 + step" stop depends on rounding to decide whether 1.0 is kept.
num_quants = int(round(1 / incriment)) + 1
quants = np.linspace(0, 1, num_quants)

# Value of the evaluation helper at each quantile, then the area under that
# curve by the trapezoidal rule.
pos_quant = get_pos_greater_than_quant(quants, pos_in_negs)
auc = np.trapz(pos_quant, quants)

In [None]:
# Quantile-vs-recall curve, drawn through the explicit Axes interface only.
fig, ax = plt.subplots()
ax.plot(quants, pos_quant)

# AUC annotation in a rounded, semi-transparent box; the position is given in
# axes coordinates (transform=ax.transAxes), independent of the data range.
props = dict(boxstyle='round', facecolor='white', alpha=0.5)
auc_lab = f"AUC {auc:.2f}"
ax.text(0.73, 0.1, auc_lab, transform=ax.transAxes, fontsize=14,
        verticalalignment='bottom', bbox=props)

# Title spelling fixed ("perturbagens"); the trailing ';' keeps the Text repr
# of the last call out of the cell output.
ax.set_title("Quantile/Recall for test set perturbagens")
ax.set_xlabel("Quantile")
ax.set_ylabel("Recall");