In [1]:
%matplotlib inline

In [2]:
from sklearn import datasets
import numpy as np
import matplotlib.pyplot as plt

_Iris dataset_ has **too few** instances. Let's take _Fashion MNIST_ instead.

In [3]:
import tensorflow as tf

In [4]:
a = tf.constant([0,1,2], dtype=tf.float32)
tf.math.reduce_sum(a)

<tf.Tensor: shape=(), dtype=float32, numpy=3.0>

In [5]:
import tensorflow.keras as keras
K = keras.backend
#K.math.reduce_sum(a)

```python
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-16-9c036f0ad0a4> in <module>
      1 import tensorflow.keras as keras
      2 K = keras.backend
----> 3 K.math.reduce_sum(a)

AttributeError: module 'tensorflow.keras.backend' has no attribute 'math'
```

#### Instead, just do `K.sum(a)`

In [6]:
K.sum(a)

<tf.Tensor: shape=(), dtype=float32, numpy=3.0>

In [7]:
def wicked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    s = tf.math.reduce_sum(y_pred)
    print(f"s = {s}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    print(f"tf.nn.softmax(y_pred) = {tf.nn.softmax(y_pred)}")
    print(f"first_probas = {first_probas}")
    yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    print(f"yy_pred = {yy_pred}")
    yy_true = tf.cond(tf.math.reduce_sum(y_true) < 0.01,
                      true_fn=lambda: tf.concat([y_true, [1]], 0),
                      false_fn=lambda: tf.concat([y_true, [0]], 0),
    )
    print(f"yy_true = {yy_true}")
    return tf.nn.softmax_cross_entropy_with_logits(
        labels=yy_true,
        logits=yy_pred,
    )

In [8]:
wicked_sigmoid(tf.constant([0,0,0], dtype=tf.float32), tf.constant([-500.0, -37.92, 99.99]))
#crooked_sigmoid(tf.constant([0,0,0], dtype=tf.float32), tf.constant([-500.0, -37.92, 99.99]))

s = -437.92999267578125
tf.math.sigmoid(s) = 0.0
last_proba = 1 - tf.math.sigmoid(s) = 1.0
tf.nn.softmax(y_pred) = [0. 0. 1.]
first_probas = [0. 0. 0.]
yy_pred = [0. 0. 0. 1.]
yy_true = [0. 0. 0. 1.]


<tf.Tensor: shape=(), dtype=float32, numpy=0.74366844>

### But...
We are not so sure about the shape of `y_true` and `y_pred` in keras. How can we make sure of that?

In [135]:
fashion_mnist = tf.keras.datasets.fashion_mnist
(train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
train_labels.shape

(60000,)

In [136]:
train_labels[0]

9

In [137]:
tf.one_hot(9, depth=10)

<tf.Tensor: shape=(10,), dtype=float32, numpy=array([0., 0., 0., 0., 0., 0., 0., 0., 0., 1.], dtype=float32)>

In [138]:
tf.one_hot(train_labels, depth=10)

<tf.Tensor: shape=(60000, 10), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [139]:
tf.one_hot(train_labels, depth=10)[:,:-1]

<tf.Tensor: shape=(60000, 9), dtype=float32, numpy=
array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)>

In [140]:
tf.one_hot(train_labels, depth=10)[:,:-1].shape

TensorShape([60000, 9])

In [141]:
type(train_images)

numpy.ndarray

In [142]:
from sklearn.preprocessing import OneHotEncoder

In [143]:
encoder = OneHotEncoder()
#hey = encoder.fit_transform(train_labels[np.newaxis])
hey = encoder.fit_transform(train_labels.reshape((-1,1)))
hey

<60000x10 sparse matrix of type '<class 'numpy.float64'>'
	with 60000 stored elements in Compressed Sparse Row format>

In [144]:
train_images = train_images / 255.0
test_images = test_images / 255.0

model = tf.keras.Sequential([
    tf.keras.layers.Flatten(input_shape=(28, 28)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Dense(10)
])

model.compile(optimizer='adam',
              loss=wicked_sigmoid,
              metrics=['accuracy'])

In [145]:
a

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([0., 1., 2.], dtype=float32)>

In [146]:
tf.ones(a.shape[0])

<tf.Tensor: shape=(3,), dtype=float32, numpy=array([1., 1., 1.], dtype=float32)>

In [147]:
tf.one_hot(train_labels, depth=10)[:,:-1].numpy()

array([[0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [148]:
hey.toarray().shape

(60000, 10)

In [149]:
hey.toarray()

array([[0., 0., 0., ..., 0., 0., 1.],
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       ...,
       [0., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]])

### We don't need `sklearn.preprocessing.OneHotEncoder`

In [150]:
tf.one_hot(train_labels, depth=10)[:,:-1].numpy().shape

(60000, 9)

In [151]:
model.fit(train_images, tf.one_hot(train_labels, depth=10)[:,:-1].numpy(), epochs=10)

Epoch 1/10
s = Tensor("wicked_sigmoid/Sum:0", shape=(), dtype=float32)
tf.math.sigmoid(s) = Tensor("wicked_sigmoid/Sigmoid_1:0", shape=(), dtype=float32)
last_proba = 1 - tf.math.sigmoid(s) = Tensor("wicked_sigmoid/sub:0", shape=(), dtype=float32)
tf.nn.softmax(y_pred) = Tensor("wicked_sigmoid/Softmax_1:0", shape=(32, 10), dtype=float32)
first_probas = Tensor("wicked_sigmoid/mul:0", shape=(32, 10), dtype=float32)


ValueError: in user code:

    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py:806 train_function  *
        return step_function(self, iterator)
    <ipython-input-7-ec2c57535c9c>:16 wicked_sigmoid  *
        yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/util/dispatch.py:201 wrapper  **
        return target(*args, **kwargs)
    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/ops/array_ops.py:1654 concat
        return gen_array_ops.concat_v2(values=values, axis=axis, name=name)
    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/ops/gen_array_ops.py:1221 concat_v2
        _, _, _op, _outputs = _op_def_library._apply_op_helper(
    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/framework/op_def_library.py:742 _apply_op_helper
        op = g._create_op_internal(op_type_name, inputs, dtypes=None,
    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/framework/func_graph.py:591 _create_op_internal
        return super(FuncGraph, self)._create_op_internal(  # pylint: disable=protected-access
    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:3477 _create_op_internal
        ret = Operation(
    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:1974 __init__
        self._c_op = _create_c_op(self._graph, node_def, inputs,
    /home/phunc20/.virtualenvs/tf2.3.0-py3.8/lib/python3.8/site-packages/tensorflow/python/framework/ops.py:1815 _create_c_op
        raise ValueError(str(e))

    ValueError: Shape must be rank 2 but is rank 1 for '{{node wicked_sigmoid/concat}} = ConcatV2[N=2, T=DT_FLOAT, Tidx=DT_INT32](wicked_sigmoid/mul, wicked_sigmoid/concat/values_1, wicked_sigmoid/concat/axis)' with input shapes: [32,10], [1], [].


In [152]:
help(tf.math.reduce_sum)

Help on function reduce_sum in module tensorflow.python.ops.math_ops:

reduce_sum(input_tensor, axis=None, keepdims=False, name=None)
    Computes the sum of elements across dimensions of a tensor.
    
    Reduces `input_tensor` along the dimensions given in `axis`.
    Unless `keepdims` is true, the rank of the tensor is reduced by 1 for each
    entry in `axis`. If `keepdims` is true, the reduced dimensions
    are retained with length 1.
    
    If `axis` is None, all dimensions are reduced, and a
    tensor with a single element is returned.
    
    For example:
    
    >>> # x has a shape of (2, 3) (two rows and three columns):
    >>> x = tf.constant([[1, 1, 1], [1, 1, 1]])
    >>> x.numpy()
    array([[1, 1, 1],
           [1, 1, 1]], dtype=int32)
    >>> # sum all the elements
    >>> # 1 + 1 + 1 + 1 + 1+ 1 = 6
    >>> tf.reduce_sum(x).numpy()
    6
    >>> # reduce along the first dimension
    >>> # the result is [1, 1, 1] + [1, 1, 1] = [2, 2, 2]
    >>> tf.reduce_sum(x, 

In [153]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")

In [154]:
y_true = tf.one_hot(train_labels, depth=10)[:,:-1].numpy()
y_pred = np.empty_like(y_true)

In [155]:
y_true.shape, y_pred.shape

((60000, 9), (60000, 9))

In [32]:
crooked_sigmoid(y_true, y_pred)

s = [-2.8311398e-09  1.0000000e+00  1.0000000e+00 ...  1.0000000e+00
  1.0000000e+00  1.0000000e+00]
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
last_proba = 1 - tf.math.sigmoid(s) = [0.5       0.2689414 0.2689414 ... 0.2689414 0.2689414 0.2689414]


In [33]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")

In [34]:
crooked_sigmoid(y_true, y_pred)

s = [-2.8311398e-09  1.0000000e+00  1.0000000e+00 ...  1.0000000e+00
  1.0000000e+00  1.0000000e+00]
s.shape = (60000,)
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
tf.math.sigmoid(s).shape = (60000,)
last_proba = 1 - tf.math.sigmoid(s) = [0.5       0.2689414 0.2689414 ... 0.2689414 0.2689414 0.2689414]
last_proba.shape = (60000,)


In [35]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    print(f"tf.nn.softmax(y_pred) = {tf.nn.softmax(y_pred)}")
    print(f"first_probas = {first_probas}")
    print(f"first_probas.shape = {first_probas.shape}")

```python
InvalidArgumentError: Incompatible shapes: [60000] vs. [60000,9] [Op:Mul]
```

In [37]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    first_probas = tf.math.multiply(tf.math.sigmoid(s), tf.nn.softmax(y_pred))
    print(f"tf.nn.softmax(y_pred) = {tf.nn.softmax(y_pred)}")
    print(f"first_probas = {first_probas}")
    print(f"first_probas.shape = {first_probas.shape}")

```python
InvalidArgumentError: Incompatible shapes: [60000] vs. [60000,9] [Op:Mul]
```

In [39]:
tf.constant([1,2,3]) * tf.constant([1,2,3])

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 4, 9], dtype=int32)>

In [40]:
help(tf.stack)

Help on function stack in module tensorflow.python.ops.array_ops:

stack(values, axis=0, name='stack')
    Stacks a list of rank-`R` tensors into one rank-`(R+1)` tensor.
    
    See also `tf.concat`, `tf.tile`, `tf.repeat`.
    
    Packs the list of tensors in `values` into a tensor with rank one higher than
    each tensor in `values`, by packing them along the `axis` dimension.
    Given a list of length `N` of tensors of shape `(A, B, C)`;
    
    if `axis == 0` then the `output` tensor will have the shape `(N, A, B, C)`.
    if `axis == 1` then the `output` tensor will have the shape `(A, N, B, C)`.
    Etc.
    
    For example:
    
    >>> x = tf.constant([1, 4])
    >>> y = tf.constant([2, 5])
    >>> z = tf.constant([3, 6])
    >>> tf.stack([x, y, z])
    <tf.Tensor: shape=(3, 2), dtype=int32, numpy=
    array([[1, 4],
           [2, 5],
           [3, 6]], dtype=int32)>
    >>> tf.stack([x, y, z], axis=1)
    <tf.Tensor: shape=(2, 3), dtype=int32, numpy=
    array([[1, 2,

In [41]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    #first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    print(f"y_pred[1] = {y_pred[1]}")
    print(f"tf.nn.softmax(y_pred)[1] = {tf.nn.softmax(y_pred)[1]}")
    #print(f"first_probas = {first_probas}")
    #print(f"first_probas.shape = {first_probas.shape}")
    #yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    #print(f"yy_pred = {yy_pred}")
    #yy_true = tf.cond(tf.math.reduce_sum(y_true) < 0.01,
    #                  true_fn=lambda: tf.concat([y_true, [1]], 0),
    #                  false_fn=lambda: tf.concat([y_true, [0]], 0),
    #)
    #print(f"yy_true = {yy_true}")
    #return tf.nn.softmax_cross_entropy_with_logits(
    #    labels=yy_true,
    #    logits=yy_pred,
    #)

In [42]:
crooked_sigmoid(y_true, y_pred)

s = [-2.8311398e-09  1.0000000e+00  1.0000000e+00 ...  1.0000000e+00
  1.0000000e+00  1.0000000e+00]
s.shape = (60000,)
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
tf.math.sigmoid(s).shape = (60000,)
last_proba = 1 - tf.math.sigmoid(s) = [0.5       0.2689414 0.2689414 ... 0.2689414 0.2689414 0.2689414]
last_proba.shape = (60000,)
y_pred[1] = [1. 0. 0. 0. 0. 0. 0. 0. 0.]
tf.nn.softmax(y_pred)[1] = [0.2536117  0.09329854 0.09329854 0.09329854 0.09329854 0.09329854
 0.09329854 0.09329854 0.09329854]


In [44]:
1 / (np.e + 8*1)

0.09329853571724647

In [45]:
np.e / (np.e + 8*1)

0.2536117142620283

In [47]:
tf.stack([np.arange(10)]*9)

<tf.Tensor: shape=(9, 10), dtype=int32, numpy=
array([[0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
       [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]], dtype=int32)>

In [48]:
tf.stack([np.arange(10)]*9, axis=1)

<tf.Tensor: shape=(10, 9), dtype=int64, numpy=
array([[0, 0, 0, 0, 0, 0, 0, 0, 0],
       [1, 1, 1, 1, 1, 1, 1, 1, 1],
       [2, 2, 2, 2, 2, 2, 2, 2, 2],
       [3, 3, 3, 3, 3, 3, 3, 3, 3],
       [4, 4, 4, 4, 4, 4, 4, 4, 4],
       [5, 5, 5, 5, 5, 5, 5, 5, 5],
       [6, 6, 6, 6, 6, 6, 6, 6, 6],
       [7, 7, 7, 7, 7, 7, 7, 7, 7],
       [8, 8, 8, 8, 8, 8, 8, 8, 8],
       [9, 9, 9, 9, 9, 9, 9, 9, 9]])>

In [50]:
tf.shape(np.arange(3*5).reshape((3,5)))

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([3, 5], dtype=int32)>

In [51]:
tf.shape([[1,2,3], [4,5,6]])

<tf.Tensor: shape=(2,), dtype=int32, numpy=array([2, 3], dtype=int32)>

In [54]:
tf.shape([[1,2,3], [4,5,6]])[0]

<tf.Tensor: shape=(), dtype=int32, numpy=2>

In [55]:
tf.shape([[1,2,3], [4,5,6]])[1]

<tf.Tensor: shape=(), dtype=int32, numpy=3>

In [56]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    #first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    first_probas = tf.stack([tf.math.sigmoid(s)]*tf.shape(y_pred)[1] , axis=1) * tf.nn.softmax(y_pred)
    print(f"y_pred[1] = {y_pred[1]}")
    print(f"tf.nn.softmax(y_pred)[1] = {tf.nn.softmax(y_pred)[1]}")
    #print(f"first_probas = {first_probas}")
    print(f"first_probas[1] = {first_probas[1]}")
    print(f"first_probas.shape = {first_probas.shape}")
    #yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    #print(f"yy_pred = {yy_pred}")
    #yy_true = tf.cond(tf.math.reduce_sum(y_true) < 0.01,
    #                  true_fn=lambda: tf.concat([y_true, [1]], 0),
    #                  false_fn=lambda: tf.concat([y_true, [0]], 0),
    #)
    #print(f"yy_true = {yy_true}")
    #return tf.nn.softmax_cross_entropy_with_logits(
    #    labels=yy_true,
    #    logits=yy_pred,
    #)

In [64]:
?tf.float32

In [65]:
tf.as_dtype(3)

tf.int32

In [66]:
tf.cast(3, tf.float32)

<tf.Tensor: shape=(), dtype=float32, numpy=3.0>

In [73]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    #first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    #print()
    first_probas = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1) * tf.nn.softmax(y_pred)
    print(f"y_pred[1] = {y_pred[1]}")
    print(f"tf.nn.softmax(y_pred)[1] = {tf.nn.softmax(y_pred)[1]}")
    #print(f"first_probas = {first_probas}")
    print(f"first_probas[1] = {first_probas[1]}")
    print(f"first_probas.shape = {first_probas.shape}")
    #yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    #print(f"yy_pred = {yy_pred}")
    #yy_true = tf.cond(tf.math.reduce_sum(y_true) < 0.01,
    #                  true_fn=lambda: tf.concat([y_true, [1]], 0),
    #                  false_fn=lambda: tf.concat([y_true, [0]], 0),
    #)
    #print(f"yy_true = {yy_true}")
    #return tf.nn.softmax_cross_entropy_with_logits(
    #    labels=yy_true,
    #    logits=yy_pred,
    #)

In [74]:
crooked_sigmoid(y_true, y_pred)

s = [-2.8311398e-09  1.0000000e+00  1.0000000e+00 ...  1.0000000e+00
  1.0000000e+00  1.0000000e+00]
s.shape = (60000,)
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
tf.math.sigmoid(s).shape = (60000,)
last_proba = 1 - tf.math.sigmoid(s) = [0.5       0.2689414 0.2689414 ... 0.2689414 0.2689414 0.2689414]
last_proba.shape = (60000,)
y_pred[1] = [1. 0. 0. 0. 0. 0. 0. 0. 0.]
tf.nn.softmax(y_pred)[1] = [0.2536117  0.09329854 0.09329854 0.09329854 0.09329854 0.09329854
 0.09329854 0.09329854 0.09329854]
first_probas[1] = [1.6686453 0.6138603 0.6138603 0.6138603 0.6138603 0.6138603 0.6138603
 0.6138603 0.6138603]
first_probas.shape = (60000, 9)


In [89]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    #first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    #print()
    #first_probas = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1) * tf.nn.softmax(y_pred)
    #sigmoid_s_good_shape = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1)
    print(f"tf.shape(y_pred) = {tf.shape(y_pred)}")
    print(f"tf.shape(y_pred)[1] = {tf.shape(y_pred)[1]}")
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"[tf.math.sigmoid(s)]*tf.shape(y_pred)[1] = {[tf.math.sigmoid(s)]*tf.shape(y_pred)[1]}")
    print(f"[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy() = {[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy()}")
    sigmoid_s_good_shape = tf.stack([tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy(), axis=1)
    print(f"sigmoid_s_good_shape = {sigmoid_s_good_shape}")
    print(f"sigmoid_s_good_shape.shape = {sigmoid_s_good_shape.shape}")
    print(f"tf.cast(tf.shape(y_pred)[1], tf.float32) = {tf.cast(tf.shape(y_pred)[1], tf.float32)}")
    first_probas = sigmoid_s_good_shape * tf.nn.softmax(y_pred)
    print(f"y_pred[1] = {y_pred[1]}")
    print(f"tf.nn.softmax(y_pred)[1] = {tf.nn.softmax(y_pred)[1]}")
    #print(f"first_probas = {first_probas}")
    print(f"first_probas[1] = {first_probas[1]}")
    print(f"first_probas.shape = {first_probas.shape}")
    #yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    #print(f"yy_pred = {yy_pred}")
    #yy_true = tf.cond(tf.math.reduce_sum(y_true) < 0.01,
    #                  true_fn=lambda: tf.concat([y_true, [1]], 0),
    #                  false_fn=lambda: tf.concat([y_true, [0]], 0),
    #)
    #print(f"yy_true = {yy_true}")
    #return tf.nn.softmax_cross_entropy_with_logits(
    #    labels=yy_true,
    #    logits=yy_pred,
    #)

In [90]:
crooked_sigmoid(y_true, y_pred)

s = [-2.8311398e-09  1.0000000e+00  1.0000000e+00 ...  1.0000000e+00
  1.0000000e+00  1.0000000e+00]
s.shape = (60000,)
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
tf.math.sigmoid(s).shape = (60000,)
last_proba = 1 - tf.math.sigmoid(s) = [0.5       0.2689414 0.2689414 ... 0.2689414 0.2689414 0.2689414]
last_proba.shape = (60000,)
tf.shape(y_pred) = [60000     9]
tf.shape(y_pred)[1] = 9
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
[tf.math.sigmoid(s)]*tf.shape(y_pred)[1] = [[0 0 0 ... 0 0 0]]
[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy() = [<tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5      , 0.7310586, 0.7310586, ..., 0.7310586, 0.7310586,
       0.7310586], dtype=float32)>, <tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5      , 0.7310586, 0.7310586, ..., 0.7310586, 0.7310586,
       0.7310586], dtype=float32)>, <tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5

In [92]:
tf.constant([3])[tf.newaxis]

<tf.Tensor: shape=(1, 1), dtype=int32, numpy=array([[3]], dtype=int32)>

In [100]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    #first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    #print()
    #first_probas = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1) * tf.nn.softmax(y_pred)
    #sigmoid_s_good_shape = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1)
    print(f"tf.shape(y_pred) = {tf.shape(y_pred)}")
    print(f"tf.shape(y_pred)[1] = {tf.shape(y_pred)[1]}")
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"[tf.math.sigmoid(s)]*tf.shape(y_pred)[1] = {[tf.math.sigmoid(s)]*tf.shape(y_pred)[1]}")
    print(f"[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy() = {[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy()}")
    sigmoid_s_good_shape = tf.stack([tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy(), axis=1)
    print(f"sigmoid_s_good_shape = {sigmoid_s_good_shape}")
    print(f"sigmoid_s_good_shape.shape = {sigmoid_s_good_shape.shape}")
    print(f"tf.cast(tf.shape(y_pred)[1], tf.float32) = {tf.cast(tf.shape(y_pred)[1], tf.float32)}")
    first_probas = sigmoid_s_good_shape * tf.nn.softmax(y_pred)
    print(f"y_pred[1] = {y_pred[1]}")
    print(f"tf.nn.softmax(y_pred)[1] = {tf.nn.softmax(y_pred)[1]}")
    #print(f"first_probas = {first_probas}")
    print(f"first_probas[1] = {first_probas[1]}")
    print(f"first_probas.shape = {first_probas.shape}")
    #yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    #print(f"last_probas.shape = {last_probas.shape}")
    yy_pred = tf.concat([first_probas, last_proba[...,tf.newaxis]], axis=1)
    print(f"yy_pred[:3] = {yy_pred[:3]}")
    #yy_true = tf.cond(tf.math.reduce_sum(y_true) < 0.01,
    #                  true_fn=lambda: tf.concat([y_true, [1]], 0),
    #                  false_fn=lambda: tf.concat([y_true, [0]], 0),
    #)
    #print(f"yy_true = {yy_true}")
    #return tf.nn.softmax_cross_entropy_with_logits(
    #    labels=yy_true,
    #    logits=yy_pred,
    #)

In [101]:
crooked_sigmoid(y_true, y_pred)

s = [-2.8311398e-09  1.0000000e+00  1.0000000e+00 ...  1.0000000e+00
  1.0000000e+00  1.0000000e+00]
s.shape = (60000,)
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
tf.math.sigmoid(s).shape = (60000,)
last_proba = 1 - tf.math.sigmoid(s) = [0.5       0.2689414 0.2689414 ... 0.2689414 0.2689414 0.2689414]
last_proba.shape = (60000,)
tf.shape(y_pred) = [60000     9]
tf.shape(y_pred)[1] = 9
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
[tf.math.sigmoid(s)]*tf.shape(y_pred)[1] = [[0 0 0 ... 0 0 0]]
[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy() = [<tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5      , 0.7310586, 0.7310586, ..., 0.7310586, 0.7310586,
       0.7310586], dtype=float32)>, <tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5      , 0.7310586, 0.7310586, ..., 0.7310586, 0.7310586,
       0.7310586], dtype=float32)>, <tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5

In [104]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    #first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    #print()
    #first_probas = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1) * tf.nn.softmax(y_pred)
    #sigmoid_s_good_shape = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1)
    print(f"tf.shape(y_pred) = {tf.shape(y_pred)}")
    print(f"tf.shape(y_pred)[1] = {tf.shape(y_pred)[1]}")
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"[tf.math.sigmoid(s)]*tf.shape(y_pred)[1] = {[tf.math.sigmoid(s)]*tf.shape(y_pred)[1]}")
    print(f"[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy() = {[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy()}")
    sigmoid_s_good_shape = tf.stack([tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy(), axis=1)
    print(f"sigmoid_s_good_shape = {sigmoid_s_good_shape}")
    print(f"sigmoid_s_good_shape.shape = {sigmoid_s_good_shape.shape}")
    print(f"tf.cast(tf.shape(y_pred)[1], tf.float32) = {tf.cast(tf.shape(y_pred)[1], tf.float32)}")
    first_probas = sigmoid_s_good_shape * tf.nn.softmax(y_pred)
    print(f"y_pred[1] = {y_pred[1]}")
    print(f"tf.nn.softmax(y_pred)[1] = {tf.nn.softmax(y_pred)[1]}")
    #print(f"first_probas = {first_probas}")
    print(f"first_probas[1] = {first_probas[1]}")
    print(f"first_probas.shape = {first_probas.shape}")
    #yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    #print(f"last_probas.shape = {last_probas.shape}")
    yy_pred = tf.concat([first_probas, last_proba[...,tf.newaxis]], axis=1)
    print(f"yy_pred[:3] = {yy_pred[:3]}")
    print(f"tf.math.reduce_sum((yy_pred[:3], axis=1) = {tf.math.reduce_sum(yy_pred[:3], axis=1)}")
    #yy_true = tf.cond(tf.math.reduce_sum(y_true) < 0.01,
    #                  true_fn=lambda: tf.concat([y_true, [1]], 0),
    #                  false_fn=lambda: tf.concat([y_true, [0]], 0),
    #)
    #print(f"yy_true = {yy_true}")
    #return tf.nn.softmax_cross_entropy_with_logits(
    #    labels=yy_true,
    #    logits=yy_pred,
    #)


In [105]:
crooked_sigmoid(y_true, y_pred)

s = [-2.8311398e-09  1.0000000e+00  1.0000000e+00 ...  1.0000000e+00
  1.0000000e+00  1.0000000e+00]
s.shape = (60000,)
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
tf.math.sigmoid(s).shape = (60000,)
last_proba = 1 - tf.math.sigmoid(s) = [0.5       0.2689414 0.2689414 ... 0.2689414 0.2689414 0.2689414]
last_proba.shape = (60000,)
tf.shape(y_pred) = [60000     9]
tf.shape(y_pred)[1] = 9
tf.math.sigmoid(s) = [0.5       0.7310586 0.7310586 ... 0.7310586 0.7310586 0.7310586]
[tf.math.sigmoid(s)]*tf.shape(y_pred)[1] = [[0 0 0 ... 0 0 0]]
[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy() = [<tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5      , 0.7310586, 0.7310586, ..., 0.7310586, 0.7310586,
       0.7310586], dtype=float32)>, <tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5      , 0.7310586, 0.7310586, ..., 0.7310586, 0.7310586,
       0.7310586], dtype=float32)>, <tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.5

In [106]:
0.05555*9

0.49995

In [108]:
0.0682067*8 + 0.2689 + 0.1854

0.9999536

In [109]:
y_true = [[1,0,0], [0,0,0], [0,0,1]]

In [113]:
tf.math.reduce_sum(y_true, axis=1) < tf.constant(1)

<tf.Tensor: shape=(3,), dtype=bool, numpy=array([False,  True, False])>

In [116]:
tf.math.reduce_sum(y_true, axis=1) < 1

<tf.Tensor: shape=(3,), dtype=bool, numpy=array([False,  True, False])>

In [120]:
help(tf.boolean_mask)

Help on function boolean_mask_v2 in module tensorflow.python.ops.array_ops:

boolean_mask_v2(tensor, mask, axis=None, name='boolean_mask')
    Apply boolean mask to tensor.
    
    Numpy equivalent is `tensor[mask]`.
    
    ```python
    # 1-D example
    tensor = [0, 1, 2, 3]
    mask = np.array([True, False, True, False])
    boolean_mask(tensor, mask)  # [0, 2]
    ```
    
    In general, `0 < dim(mask) = K <= dim(tensor)`, and `mask`'s shape must match
    the first K dimensions of `tensor`'s shape.  We then have:
      `boolean_mask(tensor, mask)[i, j1,...,jd] = tensor[i1,...,iK,j1,...,jd]`
    where `(i1,...,iK)` is the ith `True` entry of `mask` (row-major order).
    The `axis` could be used with `mask` to indicate the axis to mask from.
    In that case, `axis + dim(mask) <= dim(tensor)` and `mask`'s shape must match
    the first `axis + dim(mask)` dimensions of `tensor`'s shape.
    
    See also: `tf.ragged.boolean_mask`, which can be applied to both dense and
    ragge

In [124]:
help(tf.math.logical_or)

Help on function logical_or in module tensorflow.python.ops.gen_math_ops:

logical_or(x, y, name=None)
    Returns the truth value of x OR y element-wise.
    
    *NOTE*: `math.logical_or` supports broadcasting. More about broadcasting
    [here](http://docs.scipy.org/doc/numpy/user/basics.broadcasting.html)
    
    Args:
      x: A `Tensor` of type `bool`.
      y: A `Tensor` of type `bool`.
      name: A name for the operation (optional).
    
    Returns:
      A `Tensor` of type `bool`.



In [125]:
y_true

[[1, 0, 0], [0, 0, 0], [0, 0, 1]]

In [127]:
y_true = tf.constant(y_true)
y_true

<tf.Tensor: shape=(3, 3), dtype=int32, numpy=
array([[1, 0, 0],
       [0, 0, 0],
       [0, 0, 1]], dtype=int32)>

In [128]:
tf.math.reduce_sum(y_true, axis=1)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([1, 0, 1], dtype=int32)>

In [129]:
1- tf.math.reduce_sum(y_true, axis=1)

<tf.Tensor: shape=(3,), dtype=int32, numpy=array([0, 1, 0], dtype=int32)>

In [131]:
tf.concat([y_true, (1- tf.math.reduce_sum(y_true, axis=1))[...,tf.newaxis]], axis=1)

<tf.Tensor: shape=(3, 4), dtype=int32, numpy=
array([[1, 0, 0, 0],
       [0, 0, 0, 1],
       [0, 0, 1, 0]], dtype=int32)>

In [160]:
def crooked_sigmoid(y_true, y_pred):
    """
    Our customized loss function
    Example: (4-class classification)
        y_pred = [ -500.0, -37.92, 99.99]
        y_true = [0, 0, 0]  # meaning that the true class is the fourth one.
    """
    # I guess y_pred.shape equals (m, 9) for Fashion MNIST
    s = tf.math.reduce_sum(y_pred, axis=1)
    print(f"s = {s}")
    print(f"s.shape = {s.shape}")
    last_proba = 1 - tf.math.sigmoid(s)
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"tf.math.sigmoid(s).shape = {tf.math.sigmoid(s).shape}")
    print(f"last_proba = 1 - tf.math.sigmoid(s) = {last_proba}")
    print(f"last_proba.shape = {last_proba.shape}")
    #first_probas = tf.math.sigmoid(s) * tf.nn.softmax(y_pred)
    #print()
    #first_probas = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1) * tf.nn.softmax(y_pred)
    #sigmoid_s_good_shape = tf.stack([tf.math.sigmoid(s)]*tf.cast(tf.shape(y_pred)[1], tf.float32) , axis=1)
    print(f"tf.shape(y_pred) = {tf.shape(y_pred)}")
    print(f"tf.shape(y_pred)[1] = {tf.shape(y_pred)[1]}")
    print(f"tf.math.sigmoid(s) = {tf.math.sigmoid(s)}")
    print(f"[tf.math.sigmoid(s)]*tf.shape(y_pred)[1] = {[tf.math.sigmoid(s)]*tf.shape(y_pred)[1]}")
    print(f"[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy() = {[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy()}")
    sigmoid_s_good_shape = tf.stack([tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy(), axis=1)
    print(f"sigmoid_s_good_shape = {sigmoid_s_good_shape}")
    print(f"sigmoid_s_good_shape.shape = {sigmoid_s_good_shape.shape}")
    print(f"tf.cast(tf.shape(y_pred)[1], tf.float32) = {tf.cast(tf.shape(y_pred)[1], tf.float32)}")
    first_probas = sigmoid_s_good_shape * tf.nn.softmax(y_pred)
    print(f"y_pred[1] = {y_pred[1]}")
    print(f"tf.nn.softmax(y_pred)[1] = {tf.nn.softmax(y_pred)[1]}")
    #print(f"first_probas = {first_probas}")
    print(f"first_probas[1] = {first_probas[1]}")
    print(f"first_probas.shape = {first_probas.shape}")
    #yy_pred = tf.concat([first_probas, [last_proba]], axis=0)
    #print(f"last_probas.shape = {last_probas.shape}")
    yy_pred = tf.concat([first_probas, last_proba[...,tf.newaxis]], axis=1)
    print(f"yy_pred[:3] = {yy_pred[:3]}")
    print(f"tf.math.reduce_sum((yy_pred[:3], axis=1) = {tf.math.reduce_sum(yy_pred[:3], axis=1)}")
    ## Wrong!
    #yy_true = tf.cond(tf.math.reduce_sum(y_true, axis=1) < 1,
    #                  true_fn=lambda: tf.concat([y_true, [1]], 0),
    #                  false_fn=lambda: tf.concat([y_true, [0]], 0),
    #)
    yy_true = tf.concat([y_true, (1- tf.math.reduce_sum(y_true, axis=1))[...,tf.newaxis]], axis=1)
    print(f"yy_true[:4] =\n{yy_true[:4]}")
    return tf.nn.softmax_cross_entropy_with_logits(
        labels=yy_true,
        logits=yy_pred,
    )

In [161]:
y_true = tf.one_hot(train_labels, depth=10)[:,:-1].numpy()
y_pred = np.empty_like(y_true)

In [162]:
crooked_sigmoid(y_true, y_pred)

s = [-1.4926147e-07  1.0000000e+00  1.0000000e+00 ...  1.0000000e+00
  1.0000000e+00  1.0000000e+00]
s.shape = (60000,)
tf.math.sigmoid(s) = [0.49999997 0.7310586  0.7310586  ... 0.7310586  0.7310586  0.7310586 ]
tf.math.sigmoid(s).shape = (60000,)
last_proba = 1 - tf.math.sigmoid(s) = [0.5       0.2689414 0.2689414 ... 0.2689414 0.2689414 0.2689414]
last_proba.shape = (60000,)
tf.shape(y_pred) = [60000     9]
tf.shape(y_pred)[1] = 9
tf.math.sigmoid(s) = [0.49999997 0.7310586  0.7310586  ... 0.7310586  0.7310586  0.7310586 ]
[tf.math.sigmoid(s)]*tf.shape(y_pred)[1] = [[0 0 0 ... 0 0 0]]
[tf.math.sigmoid(s)]*tf.shape(y_pred)[1].numpy() = [<tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.49999997, 0.7310586 , 0.7310586 , ..., 0.7310586 , 0.7310586 ,
       0.7310586 ], dtype=float32)>, <tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([0.49999997, 0.7310586 , 0.7310586 , ..., 0.7310586 , 0.7310586 ,
       0.7310586 ], dtype=float32)>, <tf.Tensor: shape=(60000,), dtype=fl

<tf.Tensor: shape=(60000,), dtype=float32, numpy=
array([1.9125932, 2.2194638, 2.2194638, ..., 2.2194638, 2.2194638,
       2.2194638], dtype=float32)>