In [1]:
import tensorflow as tf 
from utilities import softplus, get_max_with_structure, get_cross_logits_y

For a logit $x$ and a target $y \in \{0, 1\}$, the binary cross-entropy computed from logits expands as
$$
\begin{aligned}
L &= -y * \log\left(\frac{1}{1+e^{-x}}\right) - (1-y) * \log\left(1-\frac{1}{1+e^{-x}}\right) \\
  &= y * \log (1+e^{-x}) - (1-y) \log e^{-x} + (1-y) \log (1+e^{-x}) \\
  &= \log (1+e^{-x}) + (1-y)x
\end{aligned}
$$

In [2]:
# Tiny 2x2 sanity check: fixed 0/1 targets against random logits of very large magnitude.
y = tf.convert_to_tensor(
    [[1.0, 0.0],
     [0.0, 0.0]],
    dtype=tf.float32,
)
logits = tf.random.uniform(shape=[2, 2], minval=-100000, maxval=100000, dtype=tf.float32)
# Reference value from TensorFlow's built-in loss, fed with raw logits.
f = tf.losses.BinaryCrossentropy(from_logits=True)
print("official = {}".format(f(y, logits)))

def my_loss(y_true, logits):
    """Binary cross-entropy from logits, averaged over all entries.

    Evaluates ``softplus(logits) + (1 - y_true) * logits`` element-wise and
    returns the mean of the result.

    Args:
        y_true: targets, combined element-wise with ``logits``.
        logits: raw (pre-sigmoid) scores.

    Returns:
        The value of ``tf.reduce_mean`` over the element-wise loss.

    Note:
        ``softplus`` is the helper imported from the local ``utilities``
        module. For this expression to equal the cross-entropy it has to
        evaluate ``log(1 + exp(-x))`` -- TODO confirm against ``utilities``.
    """
    # Use the `y_true` argument: the previous version read the notebook-global
    # `y`, so whatever the caller passed as targets was silently ignored.
    return tf.reduce_mean(softplus(logits) + (1 - y_true) * logits)

# Evaluate the hand-written loss on the same targets and logits that were fed to
# tf.losses.BinaryCrossentropy, so the two printed values can be compared.
print("my loss = {}".format(my_loss(y_true=y, logits=logits)))

official = 1080.99609375
my loss = 1080.99609375


The special loss function developed by C-HMC is 
$$
\begin{aligned}
L_i &= -y_i \log \left(\max_{j\leq i}\frac{y_j}{1+e^{-x_j}}\right) - (1-y_i) \log \left(1 - \max_{j\leq i}\frac{1}{1+e^{-x_j}}\right) \\
&= -y_i \log \left(\frac{1}{1+e^{-\max_{j\leq i}z_j}}\right) - (1-y_i) \log \left(1 - \frac{1}{1+e^{-\max_{j\leq i}x_j}}\right)
\end{aligned}
$$
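where the second equality uses that the sigmoid is monotonically increasing (so the max can be moved inside it), and
$$
z_j = \begin{cases} x_j, & y_j=1 \\ -\infty, & y_j=0\end{cases}
$$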
Then 
$$
\begin{aligned}
L_i &= y_i \log (1+e^{-\max_{j\leq i}z_j}) - (1-y_i) \log e^{-\max_{j\leq i}x_j} + (1-y_i) \log(1+e^{-\max_{j\leq i}x_j}) \\
&= y_i \log (1+e^{-\max_{j\leq i}z_j}) + (1-y_i) \max_{j\leq i}x_j + (1-y_i) \log(1+e^{-\max_{j\leq i}x_j})
\end{aligned}
$$

In [76]:
from utils import datasets
from utils.parser import * 
import networkx as nx 

# Load the train / validation / test splits of the selected dataset.
dataset_name = 'seq_FUN'
train, valid, test = initialize_dataset(dataset_name, datasets)

# Cast the target matrix `Y` of every split to a float32 tensor.
for _split in (train, valid, test):
    _split.Y = tf.convert_to_tensor(_split.Y, tf.float32)
del _split  # keep the notebook namespace exactly as before

def get_structure_from_adajancency(adajancency):
    """Expand a direct-edge adjacency matrix into a reachability matrix.

    A directed graph is built with ``nx.DiGraph(adajancency)``. For every node
    index ``i`` the entries ``[i, j]`` of the result are set to 1 for each node
    ``j`` returned by ``nx.descendants(graph, i)``; all other entries, including
    rows of nodes without any reachable node, stay 0.

    Args:
        adajancency: square matrix holding the direct connections.

    Returns:
        NumPy array of zeros and ones with the same shape as ``adajancency``.
    """
    structure = np.zeros(adajancency.shape)
    graph = nx.DiGraph(adajancency)
    for node in range(len(adajancency)):
        # Edges run from a descendant class towards its ancestor, so the nodes
        # reachable from `node` (nx.descendants) are its ancestors in the
        # class hierarchy.
        reachable = list(nx.descendants(graph, node))
        if not reachable:
            continue
        structure[node, reachable] = 1
    return structure

# Reachability ("structure") matrix derived from the direct connections in
# train.A; it is the `structure` argument passed to loss_1 and loss_2.
structure = get_structure_from_adajancency(train.A)

In [81]:
def loss_1(y, logits, structure):
    """Probability-space reference implementation of the structured loss.

    With ``p = sigmoid(logits)`` the per-entry loss is
    ``-y_i * log(max_j y_j * p_j) - (1 - y_i) * log(1 - max_j p_j)``, where the
    max for label ``i`` runs over ``i`` itself and every ``j`` selected by row
    ``i`` of ``structure``. The mean over all entries is returned.

    Args:
        y: 2-D float tensor of targets, labels along axis 1 (assumed to be
            0/1 values -- TODO confirm).
        logits: tensor of raw scores with the same shape as ``y``.
        structure: square NumPy array with one row/column per label; a
            non-zero integer part in ``structure[i, j]`` puts label ``j`` into
            the max-set of label ``i``.

    Returns:
        Scalar tensor holding the mean loss.

    Notes:
        * The structured max is computed in NumPy via ``Tensor.numpy()`` and the
          result is rebuilt with ``tf.convert_to_tensor``, so this needs eager
          tensors and TensorFlow cannot differentiate through the max.
        * NOTE(review): the max-set of label ``i`` is *row* ``i`` of
          ``structure``. Whether that row lists sub- or super-classes depends
          on how ``structure`` was built -- confirm the orientation matches the
          ``max_{j <= i}`` of the derivation.
    """
    # Row i selects label i itself plus its related labels. The mask is built
    # once and tested with `>= 1` rather than `== 1`, so label i stays selected
    # even when `structure` already carries ones on its diagonal (1 + 1 == 2),
    # which previously emptied the selection and made np.max raise.
    mask = (structure + np.eye(structure.shape[0])).astype(np.int16) >= 1

    def max_with_structure(prob):
        """Column i of the result is the max of `prob` over the labels in mask[i]."""
        values = prob.numpy()
        maxima = np.zeros_like(values)
        for i in range(mask.shape[0]):
            maxima[:, i] = np.max(values[:, mask[i]], axis=1)
        return tf.convert_to_tensor(maxima, tf.float32)

    def mylog(x):
        """Logarithm with the argument clamped to at least 1e-20 to avoid log(0)."""
        x = tf.where(x < 1e-20, 1e-20, x)
        return tf.math.log(x)

    def cross_y_log(y, prob):
        """Return y * log(prob), forcing 0 wherever y == 0 (avoids 0 * log(0))."""
        return tf.where(y != 0, y * mylog(prob), 0)

    prob = tf.nn.sigmoid(logits)
    # Positive part: only labels that are themselves positive may contribute.
    part1 = cross_y_log(y, max_with_structure(y * prob))
    # Negative part: the structured max over all related labels must be small.
    part2 = (1 - y) * mylog(1 - max_with_structure(prob))
    return tf.reduce_mean(-part1 - part2)

# Random logits with the same shape as the training targets (drawn from
# minval=-1 up to maxval=1); reused as the shared input for the loss functions.
logits = tf.random.uniform(shape=train.Y.shape, minval=-1, maxval=1)
loss_1(train.Y, logits, structure)

<tf.Tensor: shape=(), dtype=float32, numpy=1.0505993>

In [82]:
def loss_2(y_true, logits, strucutre):
    """Logit-space form of the structured loss, averaged over all entries.

    Computes
    ``y * softplus(M(c(logits))) + (1 - y) * softplus(M(logits)) + (1 - y) * M(logits)``
    where ``c = get_cross_logits_y(y_true)`` and
    ``M = get_max_with_structure(strucutre)``, then takes the mean.

    Args:
        y_true: tensor of targets.
        logits: tensor of raw scores.
        strucutre: structure matrix handed to ``get_max_with_structure``. The
            misspelled parameter name is kept so existing callers keep working.

    Returns:
        The value of ``tf.reduce_mean`` over the summed loss terms.

    Note:
        ``softplus``, ``get_cross_logits_y`` and ``get_max_with_structure`` come
        from the local ``utilities`` module. Presumably ``c`` masks the logits
        of labels with ``y == 0`` and ``M`` takes the structured max -- verify
        against ``utilities``.
    """
    cross_logits_y = get_cross_logits_y(y_true)
    # Use the argument: the previous version ignored it and silently read the
    # notebook-global `structure` instead.
    max_with_structure = get_max_with_structure(strucutre)

    # The structured max of the raw logits is needed by two terms; compute it once.
    max_logits = max_with_structure(logits)

    loss1 = y_true * softplus(max_with_structure(cross_logits_y(logits)))
    loss2 = (1 - y_true) * softplus(max_logits)
    loss3 = (1 - y_true) * max_logits
    return tf.reduce_mean(loss1 + loss2 + loss3)
# Evaluate the logit-space loss on the same targets and structure used for loss_1.
# NOTE(review): the recorded outputs differ (loss_1: 1.0505993, loss_2: 0.83531344)
# although the derivation says both forms should agree. If `logits` is still the
# tensor drawn in the loss_1 cell, one implementation is off -- possibly the
# row/column orientation of `structure`; verify.
loss_2(train.Y, logits, structure)

<tf.Tensor: shape=(), dtype=float32, numpy=0.83531344>