In [2]:
import tensorflow as tf 
from utilities import softplus, get_max_with_structure, get_cross_logits_y
from models import HMCModel
from utils import datasets
from utils.parser import * 
import networkx as nx 

In [11]:

dataset_name = 'seq_FUN'
train, valid, test = initialize_dataset(dataset_name, datasets)

# Cast the label matrix Y of every split to a float32 tensor, in the same
# train -> valid -> test order as before.
for _split in (train, valid, test):
    _split.Y = tf.convert_to_tensor(_split.Y, tf.float32)

def get_structure_from_adajancency(adajancency):
    """Expand a direct-connection adjacency matrix into a reachability matrix.

    A directed graph is built from ``adajancency`` (e.g. ``train.A``, the
    matrix where the direct connections are stored). For every node ``i``,
    entry ``[i, j]`` of the result is set to 1 for each node ``j`` returned by
    ``nx.descendants(graph, i)``; every other entry stays 0.

    Per the original author's note, edges in this graph run from a descendant
    class towards its ancestor, which is why ``nx.descendants`` is the call
    that yields the *ancestors* of a class here.

    Returns:
        A float array with the same shape as ``adajancency``.
    """
    graph = nx.DiGraph(adajancency)
    reach = np.zeros(adajancency.shape)
    for node in range(len(adajancency)):
        reachable = list(nx.descendants(graph, node))
        # Nodes with nothing reachable keep an all-zero row.
        if reachable:
            reach[node, reachable] = 1
    return reach

def mymax1(prob, structure):
    """Masked maximum of ``prob`` along the structure matrix.

    The identity is added to ``structure`` so every node also counts itself.
    For each row ``b`` and column ``i`` the result is
    ``max_j prob[b, j] * mask[j, i]`` where ``mask = structure + I``.

    Args:
        prob: 2-D array/tensor of per-node values (rows x nodes).
        structure: square numpy array (nodes x nodes).

    Returns:
        A 2-D result with the same shape as ``prob``.
    """
    n_nodes = structure.shape[0]
    mask = (structure + np.eye(n_nodes)).astype(np.float32)
    # (rows, j, 1) * (1, j, i) -> (rows, j, i); the max then collapses axis j.
    spread = tf.expand_dims(prob, axis=2) * mask[np.newaxis, :, :]
    return tf.reduce_max(spread, axis=1)

def mylog(x):
    """Natural logarithm of ``x`` after raising entries below 1e-30 to 1e-30.

    The floor keeps the logarithm finite when ``x`` contains zeros.
    """
    floor = 1e-30
    clamped = tf.where(x < floor, floor, x)
    return tf.math.log(clamped)

def loss_fn(y_true, y_logits, structure):
    """Probability-space version of the hierarchy-constrained loss.

    Logits are squashed with a sigmoid, propagated through ``mymax1`` and fed
    to the clamped logarithm ``mylog``:

    * positive term: ``-y * log(max(prob * y))``
    * negative term: ``-(1 - y) * log(1 - max(prob))``

    Columns whose label is 1 in every row are treated as root nodes and
    multiplied by zero. Note that the final ``reduce_mean`` still divides by
    the total number of entries, masked columns included.

    Returns:
        A scalar tensor.
    """
    prob = tf.nn.sigmoid(y_logits)
    n_rows = y_true.shape[0]
    # 1.0 for columns that are not labelled 1 in every row, 0.0 otherwise.
    non_root = tf.cast(tf.reduce_sum(y_true, axis=0) != n_rows, tf.float32)
    pos_term = - y_true * mylog(mymax1(prob * y_true, structure))
    neg_term = - (1-y_true) * mylog(1 - mymax1(prob, structure))
    masked = (pos_term + neg_term) * non_root
    return tf.reduce_mean(masked)

def loss_fn_funny(y_true, y_logits, structure):
    """Hierarchy-constrained loss evaluated with Keras' binary cross-entropy.

    The constrained output ``mymax1(prob)`` is used wherever the label is 0 and
    ``mymax1(y_true * prob)`` wherever the label is 1. Columns whose label is 1
    in every row (root nodes) are dropped before the cross-entropy is taken.

    Args:
        y_true: 2-D float tensor of 0/1 labels (rows x nodes).
        y_logits: tensor of raw scores with the same shape as ``y_true``.
        structure: square numpy array passed through to ``mymax1``.

    Returns:
        A scalar tensor.
    """
    is_non_root = tf.reduce_sum(y_true, axis=0) != y_true.shape[0]
    # tf.where on a 1-D condition yields a (k, 1) index tensor. Squeeze only
    # that trailing axis: a bare tf.squeeze would also collapse k == 1 into a
    # scalar, and tf.gather with a scalar index drops the class axis entirely.
    idx = tf.squeeze(tf.where(is_non_root), axis=1)
    prob = tf.nn.sigmoid(y_logits)
    c_out = mymax1(prob, structure)
    t_out = mymax1(y_true * prob, structure)
    t_out = (1-y_true) * c_out + y_true * t_out
    y_true1 = tf.gather(y_true, idx, axis=1)
    t_out1 = tf.gather(t_out, idx, axis=1)
    loss = tf.losses.BinaryCrossentropy(from_logits=False)(
        y_true1, t_out1
    )
    return loss

from utilities import softplus

def get_z(y_true, logits):
    """Keep ``logits`` where ``y_true`` equals 1 and put ``-inf`` everywhere else."""
    is_positive = y_true == 1
    return tf.where(is_positive, logits, -np.inf)
def mymax2(logits, structure):
    """Masked maximum of ``logits`` along the structure matrix, in logit space.

    The identity is added to ``structure`` so every node also counts itself.
    Entries of that matrix equal to 1 become an additive 0 and all other
    entries become ``-inf``, so for each row ``b`` and column ``i`` the result
    is the maximum of ``logits[b, j]`` over the ``j`` whose mask entry
    ``[j, i]`` is 1.

    Args:
        logits: 2-D array/tensor of per-node values (rows x nodes).
        structure: square numpy array (nodes x nodes).

    Returns:
        A 2-D result with the same shape as ``logits``.
    """
    reach = (structure + np.eye(structure.shape[0])).astype(np.float32)
    # Additive mask: 0 keeps a logit in the running, -inf removes it.
    additive = np.where(reach == 1, 0.0, -np.inf).astype(np.float32)
    # (1, j, i) + (rows, j, 1) -> (rows, j, i); the max then collapses axis j.
    shifted = additive[np.newaxis, :, :] + tf.expand_dims(logits, axis=2)
    return tf.reduce_max(shifted, axis=1)
def loss_fn_logits(y_true, y_logits, structure):
    """Logit-space version of the hierarchy-constrained loss.

    Combines ``y * softplus(max z) + (1 - y) * max x + (1 - y) * softplus(max x)``
    where ``z`` is ``get_z(y_true, y_logits)`` and both maxima are taken with
    ``mymax2``. No column masking is applied; the mean runs over every entry.

    NOTE(review): ``softplus`` comes from the project's ``utilities`` module and
    is not shown here; the notebook's formulas pair it with
    ``log(1 + exp(-x))`` -- confirm against its definition.

    Returns:
        A scalar tensor.
    """
    max_x = mymax2(y_logits, structure)
    max_z = mymax2(get_z(y_true, y_logits), structure)
    # is_inf matches either sign, so every infinite entry becomes +inf here.
    # Presumably this keeps y_true * softplus(max_z) from evaluating 0 * inf
    # where no positive label feeds the maximum -- depends on softplus.
    max_z = tf.where(tf.math.is_inf(max_z), np.inf, max_z)
    negatives = 1 - y_true
    outputs = y_true * softplus(max_z) + negatives * max_x + negatives * softplus(max_x)
    return tf.reduce_mean(outputs)

structure = get_structure_from_adajancency(train.A)
model = HMCModel(structure, 500, [2000,2000], 0.7)
logits= model(train.X)

# Evaluate every loss formulation on the same labels and logits, one printed
# value per formulation, in the order logits-space / probability-space / Keras.
for _loss in (loss_fn_logits, loss_fn, loss_fn_funny):
    print(_loss(train.Y, logits, structure))

tf.Tensor(1392.6467, shape=(), dtype=float32)


The binary cross-entropy can be computed directly from a logit $x$ and a label $y$; expanding the sigmoid gives
$$
\begin{aligned}
L &= -y * \log\left(\frac{1}{1+e^{-x}}\right) - (1-y) * \log\left(1-\frac{1}{1+e^{-x}}\right) \\
  &= y * \log (1+e^{-x}) - (1-y) \log e^{-x} + (1-y) \log (1+e^{-x}) \\
  &= \log (1+e^{-x}) + (1-y)x
\end{aligned}
$$

In [2]:
# Tiny 2x2 example with very large-magnitude logits, scored with Keras'
# reference implementation. The logits are drawn without a seed, so the printed
# value changes from run to run.
y = tf.convert_to_tensor([[1.0, 0.0], [0.0, 0.0]], dtype=tf.float32)
logits = tf.random.uniform(shape=[2, 2], minval=-100000, maxval=100000, dtype=tf.float32)
f = tf.losses.BinaryCrossentropy(from_logits=True)
official_value = f(y, logits)
print("official = {}".format(official_value))

def my_loss(y_true, logits):
    """Hand-written binary cross-entropy from logits.

    Evaluates ``mean(softplus(logits) + (1 - y_true) * logits)``.

    The labels are taken from the ``y_true`` argument. The previous body read
    the notebook-level variable ``y`` instead, so the argument was silently
    ignored and the result depended on whatever ``y`` happened to hold.

    NOTE(review): ``softplus`` comes from the project's ``utilities`` module and
    is not shown here; the notebook's derivation pairs it with
    ``log(1 + exp(-x))`` -- confirm against its definition.

    Args:
        y_true: tensor of 0/1 labels.
        logits: tensor of raw scores, broadcastable against ``y_true``.

    Returns:
        A scalar tensor.
    """
    return tf.reduce_mean(softplus(logits) + (1 - y_true) * logits)

# Score the same labels and logits with the hand-derived formula so the value
# can be compared with the Keras reference printed above.
my_loss_value = my_loss(logits=logits, y_true=y)
print("my loss = {}".format(my_loss_value))

official = 1080.99609375
my loss = 1080.99609375


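The special loss function developed by C-HMC is given below for class $i$, where $j \leq i$ ranges over class $i$ itself and every class that has $i$ as an ancestor: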
$$
\begin{aligned}
L_i &= -y_i \log \left(\max_{j\leq i}\frac{y_j}{1+e^{-x_j}}\right) - (1-y_i) \log \left(1 - \max_{j\leq i}\frac{1}{1+e^{-x_j}}\right) \\
&= -y_i \log \left(\frac{1}{1+e^{-\max_{j\leq i}z_j}}\right) - (1-y_i) \log \left(1 - \frac{1}{1+e^{-\max_{j\leq i}x_j}}\right)
\end{aligned}
$$
where 
$$
z_j = \begin{cases} x_j, & y_j=1 \\ -\infty, & y_j=0\end{cases}
$$
Then 
$$
\begin{aligned}
L_i &= y_i \log (1+e^{-\max_{j\leq i}z_j}) - (1-y_i) \log e^{-\max_{j\leq i}x_j} + (1-y_i) \log(1+e^{-\max_{j\leq i}x_j}) \\
&= y_i \log (1+e^{-\max_{j\leq i}z_j}) + (1-y_i) \max_{j\leq i}x_j + (1-y_i) \log(1+e^{-\max_{j\leq i}x_j})
\end{aligned}
$$

In [25]:
def mylog(x):
    """Natural logarithm of ``x`` after raising entries below 1e-10 to 1e-10.

    This rebinds the name ``mylog``: once this cell has run, every later call
    (including calls made from functions defined in earlier cells) uses the
    1e-10 floor.
    """
    floor = 1e-10
    clamped = tf.where(x < floor, floor, x)
    return tf.math.log(clamped)
# Negative-label term of the loss computed two ways on the same logits:
#   a1 -- probability space, through sigmoid, mymax1 and the clamped log;
#   a2 -- logit space, through mymax2 and softplus.
neg_mask = 1 - train.Y
max_logits = mymax2(logits, structure)
a1 = -neg_mask * mylog(1 - mymax1(tf.nn.sigmoid(logits), structure))
a2 = neg_mask * max_logits + neg_mask * softplus(max_logits)


In [26]:
# Display the probability-space negative term (computed with the clamped log).
a1 

<tf.Tensor: shape=(1701, 500), dtype=float32, numpy=
array([[23.025852,  0.      , 23.025852, ..., -0.      , -0.      ,
        -0.      ],
       [23.025852,  0.      ,  0.      , ..., -0.      , -0.      ,
        -0.      ],
       [23.025852, 23.025852, 23.025852, ..., -0.      , -0.      ,
        -0.      ],
       ...,
       [23.025852, 23.025852, 23.025852, ..., -0.      , -0.      ,
        -0.      ],
       [23.025852, 23.025852, 23.025852, ..., -0.      , -0.      ,
        -0.      ],
       [23.025852, 23.025852, 23.025852, ..., -0.      , -0.      ,
        -0.      ]], dtype=float32)>

In [24]:
# Display the logit-space negative term (computed with mymax2 and softplus).
a2 

<tf.Tensor: shape=(1701, 500), dtype=float32, numpy=
array([[ 6857.3125 ,     0.     ,  4710.999  , ...,     0.     ,
            0.     ,     0.     ],
       [11204.918  ,     0.     ,     0.     , ...,     0.     ,
            0.     ,     0.     ],
       [ 4930.644  ,  2479.6807 ,  3386.169  , ...,     0.     ,
            0.     ,     0.     ],
       ...,
       [ 3874.703  ,  1948.2192 ,  2660.057  , ...,     0.     ,
            0.     ,     0.     ],
       [ 1734.8218 ,   872.83765,  1190.6101 , ...,     0.     ,
            0.     ,     0.     ],
       [ 5344.9736 ,  2688.7131 ,  3674.2456 , ...,     0.     ,
            0.     ,     0.     ]], dtype=float32)>

In [27]:
# Display the structure-masked maximum logit for every row and node.
mymax2(logits, structure)

<tf.Tensor: shape=(1701, 500), dtype=float32, numpy=
array([[ 6857.3125 ,  3451.138  ,  4710.999  , ..., -2666.988  ,
        -3162.3025 , -1660.6555 ],
       [11204.918  ,  5631.376  ,  7698.882  , ..., -4354.895  ,
        -5151.0503 , -2719.425  ],
       [ 4930.644  ,  2479.6807 ,  3386.169  , ..., -1917.3706 ,
        -2271.438  , -1196.0271 ],
       ...,
       [ 3874.703  ,  1948.2192 ,  2660.057  , ..., -1505.5505 ,
        -1781.5996 ,  -941.77686],
       [ 1734.8218 ,   872.83765,  1190.6101 , ...,  -674.4989 ,
         -799.27716,  -422.12793],
       [ 5344.9736 ,  2688.7131 ,  3674.2456 , ..., -2078.768  ,
        -2465.588  , -1297.5853 ]], dtype=float32)>