In [None]:
# Deep-learning stack.
import tensorflow as tf
import tensorflow_probability as tfp
from tensorflow.keras import backend as K
from tensorflow.python.client import device_lib

import os
# NOTE(review): this flag is set after `import tensorflow` above -- confirm
# that TensorFlow/XLA still honours it when set at this point.
os.environ['TF_XLA_FLAGS'] = '--tf_xla_enable_xla_devices'
import sys
import pandas as pd, numpy as np,  matplotlib.pyplot as plt
import optuna
from optuna.samplers import RandomSampler
import timeit
import joblib
from copy import deepcopy

from sklearn.metrics import accuracy_score, roc_auc_score

import pathlib
# Prepend the part of the current working directory path that precedes the
# first occurrence of 'src' (presumably the repository root), so that the
# `src` package imported below can be resolved.
sys.path.insert(0, os.path.abspath(os.getcwd()).split('src')[0])
from src import utils_multitask

from src.SoftTrees import losses
from src.SoftTrees import models_multitask
from src.SoftTrees import layers
import sparse_soft_trees

# NOTE(review): not referenced by any of the visible cells -- confirm it is still needed.
_DUMMY_RESPONSE = 1e8


2023-10-08 22:32:00.311053: I tensorflow/core/util/util.cc:169] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-10-08 22:32:05.921420: W tensorflow/stream_executor/platform/default/dso_loader.cc:64] Could not load dynamic library 'libcudart.so.11.0'; dlerror: libcudart.so.11.0: cannot open shared object file: No such file or directory
2023-10-08 22:32:05.967078: I tensorflow/stream_executor/cuda/cudart_stub.cc:29] Ignore above cudart dlerror if you do not have a GPU set up on your machine.


In [2]:
def get_available_gpus():
    """Return the names of all local devices whose ``device_type`` is ``'GPU'``.

    Devices are enumerated with ``device_lib.list_local_devices()``. The
    result is an empty list when none of the reported devices is a GPU.
    """
    gpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'GPU':
            gpu_names.append(device.name)
    return gpu_names

def get_available_cpus():
    """Return the names of all local devices whose ``device_type`` is ``'CPU'``.

    Devices are enumerated with ``device_lib.list_local_devices()``. The
    result is an empty list when none of the reported devices is a CPU.
    """
    cpu_names = []
    for device in device_lib.list_local_devices():
        if device.device_type == 'CPU':
            cpu_names.append(device.name)
    return cpu_names


In [3]:
# Synthetic benchmark data: uniform random features in [0, 1) and random
# binary labels (a uniform draw rounded to 0.0 or 1.0).
#
# The global NumPy RNG is seeded first so that the generated data -- and any
# later draw from `np.random` in this notebook -- is reproducible after
# Restart Kernel -> Run All.
SEED = 0
np.random.seed(SEED)

num_features = 22000
num_train_samples = 1000
num_valid_samples = 1000
num_test_samples = 1000

# Feature matrices, one row per sample.
x_train = np.random.uniform(size=(num_train_samples, num_features))
x_valid = np.random.uniform(size=(num_valid_samples, num_features))
x_test = np.random.uniform(size=(num_test_samples, num_features))

# Label vectors; values are the floats 0.0 / 1.0.
y_train = np.round(np.random.uniform(size=(num_train_samples, )))
y_valid = np.round(np.random.uniform(size=(num_valid_samples, )))
y_test = np.round(np.random.uniform(size=(num_test_samples, )))

In [4]:
# Sanity check: display the shapes of every feature matrix and label vector.
x_train.shape, y_train.shape, x_valid.shape, y_valid.shape, x_test.shape, y_test.shape

((1000, 22000), (1000,), (1000, 22000), (1000,), (1000, 22000), (1000,))

In [39]:
# --- Task setup -------------------------------------------------------------
num_tasks, num_classes = 1, 2

# --- Batching: the effective batch size is the base size times a scaler -----
constant_batch_size, batch_size_scaler = 64, 16
batch_size = batch_size_scaler * constant_batch_size

# --- Training schedule ------------------------------------------------------
epochs = 10
early_stopping = False
constant_learning_rate = 1e0
learning_rate_scheduling = True

# --- Regularisation ---------------------------------------------------------
kernel_l2 = 1e-1
use_annealing = True
# Raw constraint strength; it is rescaled by the number of features further
# down before being handed to the proximal constraint.
kernel_constraint = 100 if use_annealing else 1e2
if use_annealing:
    # Only defined (and only consumed) when annealing is enabled.
    temperature = 1e-3



# Optimizer steps per epoch: ceil(num_train_samples / batch_size).
# Integer arithmetic keeps the result an int in every case (the previous
# code produced a float whenever the sample count was an exact multiple of
# the batch size, which then leaked into the step counts below).
epoch_step = (num_train_samples + batch_size - 1) // batch_size

if learning_rate_scheduling:
    # Target learning rate: the base rate scaled by sqrt(batch_size).
    learning_rate = np.sqrt(batch_size)*constant_learning_rate
    max_steps = epoch_step * epochs
    # NOTE(review): positional arguments are presumably
    # (initial lr, peak lr, warm-up steps, total steps) -- confirm against
    # the class definition in `sparse_soft_trees`.
    lr_schedule = sparse_soft_trees.LinearEpochGradualWarmupPolynomialDecayLearningRate(
        1e-2,
        learning_rate,
        5*epoch_step,
        max_steps,
        power=2.0
    )
else:
    print("==============No LR scheduler, Epochs:", epochs, "Batch-size:", batch_size)
    print("==============epochs:", epochs)
    learning_rate = constant_learning_rate
    lr_schedule = sparse_soft_trees.ConstantLearningRate(
        learning_rate
    )

# Plain SGD driven by whichever schedule was selected above.
optim = tf.keras.optimizers.SGD(lr_schedule)

### Soft Decision Tree parameters
num_layers = 1
num_trees = 50
depth = 6

group_sparsity = 'GroupL0L2'
activation = tf.keras.activations.sigmoid

# Rescale the L2 coefficient by num_trees * (2**depth - 1) before building
# the Keras regularizer.
kernel_l2 /= num_trees * (2**depth - 1)
kernel_regularizer = tf.keras.regularizers.L2(kernel_l2)

# The constraint strength is normalised by the feature count in both modes;
# the scalar is then replaced by the constraint object that consumes it.
kernel_constraint /= num_features
if use_annealing:
    kernel_constraint = sparse_soft_trees.ProximalGroupL0(
        lr=lr_schedule,
        lam=kernel_constraint,
        temperature=temperature,
        use_annealing=True,
        name='ProximalGroupL0',
    )
else:
    kernel_constraint = sparse_soft_trees.ProximalGroupL0(
        lr=lr_schedule,
        lam=kernel_constraint,
        use_annealing=False,
        name='ProximalGroupL0',
    )
print("===========kernel_regularizer:", kernel_regularizer)
print("===========kernel_constraint:", kernel_constraint)

### Loss parameters
task_weights = np.ones(num_tasks)
# Activation applied on top of the tree ensemble output. With 'linear' the
# model emits raw logits, which matches `from_logits=True` in the loss below.
output_activation = 'linear'

### Optimization parameters
model_type = None

# Build the full-width model: input -> sparse soft-tree submodel -> activation.
leaf_dims = (num_classes, )
x = tf.keras.layers.Input(name='input', shape=(num_features,))
submodel = models_multitask.create_multitask_sparse_submodel(
    x,
    num_layers,
    num_trees,
    depth,
    num_tasks,
    leaf_dims,
    "Classification",
    activation=activation,
    model_type=model_type,
    kernel_regularizer=kernel_regularizer,
    kernel_constraint=kernel_constraint,
)
x = submodel.input
outputs = submodel(x)
ypred = tf.keras.layers.Activation(output_activation)(outputs)

# The loss is defined exactly once, here (an earlier assignment of a
# different loss was overwritten before ever being used and has been dropped).
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
model = tf.keras.Model(inputs=x, outputs=ypred)
model.summary()


# `monitor` is the quantity watched by the optional early-stopping callback.
monitor = 'val_accuracy'
metrics = ['accuracy']
model.compile(loss=loss, optimizer=optim, metrics=metrics)


Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 22000)]           0         
                                                                 
 Classification (Functional)  (None, 2)                69309550  
                                                                 
 activation_6 (Activation)   (None, 2)                 0         
                                                                 
Total params: 69,309,550
Trainable params: 69,309,550
Non-trainable params: 0
_________________________________________________________________


In [40]:
# Build a second, "compressed" model with the same tree configuration but an
# input that is `compression` times narrower (integer floor division).
compression = 620
xc = tf.keras.layers.Input(name='input', shape=(num_features // compression,))
# NOTE(review): `kernel_constraint` is the same object already attached to the
# full model -- confirm that sharing it is acceptable if `modelc` is ever trained.
submodelc = models_multitask.create_multitask_sparse_submodel(
    xc,
    num_layers,
    num_trees,
    depth,
    num_tasks,
    leaf_dims,
    "Classification",
    activation=activation,
    model_type=model_type,
    kernel_regularizer=kernel_regularizer,
    kernel_constraint=kernel_constraint,
)
xc = submodelc.input
outputsc = submodelc(xc)
ypredc = tf.keras.layers.Activation(output_activation)(outputsc)
loss = tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True)
modelc = tf.keras.Model(inputs=xc, outputs=ypredc)
modelc.summary()


monitor = 'val_accuracy'
metrics = ['accuracy']
# Use a dedicated optimizer instance so that `modelc` does not share the
# optimizer's internal state (e.g. its iteration counter) with `model`.
modelc.compile(
    loss=loss,
    optimizer=tf.keras.optimizers.SGD(lr_schedule),
    metrics=metrics,
)


Model: "model_7"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 35)]              0         
                                                                 
 Classification (Functional)  (None, 2)                119800    
                                                                 
 activation_7 (Activation)   (None, 2)                 0         
                                                                 
Total params: 119,800
Trainable params: 119,800
Non-trainable params: 0
_________________________________________________________________


In [41]:
# Callbacks: abort on NaN loss, record sparsity history, optionally early-stop.
cb = sparse_soft_trees.SparsityHistory()
callbacks = [
    tf.keras.callbacks.TerminateOnNaN(),
    cb
]
if early_stopping:
    callbacks.append(
        tf.keras.callbacks.EarlyStopping(
            monitor=monitor, patience=50, verbose=1, mode='auto', restore_best_weights=True
        ),
    )


def _fit_model(num_epochs):
    """Fit `model` on the training split for `num_epochs` epochs.

    Validation metrics are computed on (x_valid, y_valid) after every epoch.
    Returns the Keras ``History`` object produced by ``model.fit``.
    """
    return model.fit(
        x=x_train,
        y=y_train,
        epochs=num_epochs,
        batch_size=batch_size,
        shuffle=True,
        callbacks=callbacks,
        validation_data=(x_valid, y_valid),
        verbose=1,
    )


# Enumerate devices once and reuse the result for both the check and the
# device placement.
gpu_devices = get_available_gpus()
if gpu_devices:
    with tf.device(gpu_devices[0]):
        history = _fit_model(epochs)
else:
    # NOTE(review): without a GPU only a single epoch is run, although the
    # learning-rate schedule is sized for `epochs` epochs -- confirm this
    # shortcut is intentional.
    history = _fit_model(1)
number_of_epochs_it_ran = len(history.history['loss'])


2023-10-08 22:30:20.911449: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /device:GPU:0 with 30976 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:d8:00.0, compute capability: 7.0
2023-10-08 22:30:20.912650: I tensorflow/core/common_runtime/gpu/gpu_device.cc:1532] Created device /device:GPU:0 with 30976 MB memory:  -> device: 0, name: Tesla V100-PCIE-32GB, pci bus id: 0000:d8:00.0, compute capability: 7.0


Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


In [42]:
%%timeit
model.predict(x_valid, batch_size=num_test_samples)

179 ms ± 1.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [43]:
# Random ordering of all feature indices; the leading entries are used
# further down to pick the subset of features kept by the compressed model.
a = np.random.permutation(num_features)

In [44]:
# Pull the weights of the dense layer inside the trained model's submodel.
weights = model.layers[1].layers[1].dense_layer.get_weights()
beta = weights[0]
bias = weights[1]
# Keep only the rows of `beta` that belong to the first
# `num_features // compression` entries of the shuffled feature order `a`.
betac = beta[a[:num_features // compression], :]

In [45]:
# Display the shape of the row-subsampled weight matrix.
betac.shape

(35, 3150)

In [46]:
# Load the row-subsampled weights and the original bias into the matching
# dense layer of the compressed model.
compressed_dense_layer = modelc.layers[1].layers[1].dense_layer
compressed_dense_layer.set_weights([betac, bias])

In [47]:
# Print the layer / parameter summary of the full model.
# NOTE(review): this follows the weight transfer into `modelc` -- confirm
# whether `modelc.summary()` was intended here.
model.summary()

Model: "model_6"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 input (InputLayer)          [(None, 22000)]           0         
                                                                 
 Classification (Functional)  (None, 2)                69309550  
                                                                 
 activation_6 (Activation)   (None, 2)                 0         
                                                                 
Total params: 69,309,550
Trainable params: 69,309,550
Non-trainable params: 0
_________________________________________________________________


In [48]:
# Restrict the validation features to the same feature subset (the first
# `num_features // compression` entries of `a`) used for the compressed weights.
x_validc = x_valid[:, a[:num_features // compression]]

In [49]:
%%timeit
modelc.predict(x_validc, batch_size=num_test_samples)

45.2 ms ± 1.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
