In [11]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

In [12]:
from functools import reduce

In [13]:
# The label alphabet: sequences are built from the digits 0..9, so there are
# ten classes and S is the set of all admissible array elements.
n_classes = 10
S = {digit for digit in range(n_classes)}

In [14]:
# Size of the full training set: for each sequence length k in 2..10, count
# the ordered selections of k distinct digits (10 * 9 * ... * (10-k+1)).
sum(
    reduce(lambda acc, factor: acc * factor, range(10, 10 - k, -1))
    for k in range(2, 10 + 1)
)

9864090

In [15]:
from itertools import combinations, permutations

In [16]:
def one_hot(array, depth=n_classes):
    """
    One-hot encode a 1-D collection of integer class labels.

    array is an ndarray/list of integer labels of shape (None,);
    depth is the number of classes (the width of each encoded row).

    Returns an ndarray with one row per label: row k is the array[k]-th
    row of the depth x depth identity matrix.
    """
    identity = np.eye(depth)
    encoded = identity[array, :]
    return encoded

In [17]:
import math

def train_generator(batch_size=10):
    """
    Yield (X, Y) training batches for the digit-sorting task.

    The sequence lengths 2..10 are visited in random order.  For each length,
    every ordered selection of distinct digits from S is enumerated: X holds
    the sequence itself and Y the one-hot encoding of the same digits in
    ascending order.

    Parameters
    ----------
    batch_size : int
        Number of sequences per yielded batch.  Instances left over after the
        last full batch of a given length are not yielded.

    Yields
    ------
    (X_batch, Y_batch) : tuple of ndarrays
        X_batch has shape (batch_size, length, 1) and dtype float32;
        Y_batch has shape (batch_size, length, n_classes).
    """
    shuffled_lengths = list(range(2, 10+1))
    np.random.shuffle(shuffled_lengths)
    for length in shuffled_lengths:
        # Number of ordered selections of `length` distinct digits out of 10.
        n_instances = reduce(lambda x, y: x*y, range(10, 10-length, -1))
        n_permutations = math.factorial(length)
        X = np.empty((n_instances, length, 1))
        Y = np.empty((n_instances, length, n_classes))
        for i, c in enumerate(combinations(S, length)):
            # Rows [start, stop) hold every permutation of the combination c.
            start = i * n_permutations
            stop = start + n_permutations
            c_sorted = np.array(sorted(c))  # shape (length,)
            # All permutations of c share the same sorted target; the
            # (length, n_classes) one-hot block is broadcast over the rows.
            Y[start:stop, ...] = one_hot(c_sorted)
            for j, p in enumerate(permutations(c)):
                # Write exactly one row per permutation.  The earlier slice
                # assignment X[start : start + j] overwrote all previous rows
                # and never filled the last row of the block.
                X[start + j, :, 0] = p
        # Throw one batch after another to the model
        for k in range(n_instances // batch_size):
            batch = slice(k * batch_size, (k + 1) * batch_size)
            yield X[batch].astype(np.float32), Y[batch]

## Seq-to-Seq Model
This is a seq-to-seq model in which the output sequence's length equals the input sequence's length, so we had better use **bidirectional RNNs**.

In [18]:
# A bidirectional LSTM that returns the full sequence, followed by a softmax
# over the n_classes digits applied independently at every timestep.
seq2seq_LSTM_model = keras.models.Sequential()
seq2seq_LSTM_model.add(
    keras.layers.Bidirectional(
        keras.layers.LSTM(7, return_sequences=True, input_shape=[None, 1], dropout=0.2)
    )
)
seq2seq_LSTM_model.add(
    keras.layers.TimeDistributed(
        keras.layers.Dense(n_classes, activation="softmax")
    )
)

# "sparse_categorical_crossentropy" would fail here, so the plain
# categorical cross-entropy is used instead. Cf.
# https://stackoverflow.com/questions/49161174/tensorflow-logits-and-labels-must-have-the-same-first-dimension
seq2seq_LSTM_model.compile(optimizer="adam",
                           loss="categorical_crossentropy")

Before we jump into the potentially time-consuming training (via the `fit()` method), we can ask our model to make a few predictions first to see whether we implemented everything right.
01. With `dropout=0.2` in the first layer, predicting on `np.array([0,9,8,3])` produced
    ```
    TypeError: Input 'b' of 'MatMul' Op has type float32 that does not match type int64 of argument 'a'.
    ```
    And if we execute the cell again, we will get
    ```
    TypeError: 'NoneType' object is not callable
    ```
    Quite weird behaviour.
    - Actually, not `dropout=0.2`'s fault, the same error remains when we erase that input arg.
02. Once the input has been corrected to an ndarray whose `dtype` is `float` and whose shape is `(n_batches, length (aka n_timesteps), 1)`, prediction works without problems.

In [19]:
# Sanity check before training: predict on the single sequence 0, 9, 8, 3,
# given as a float array of shape (1, 4, 1) = (batch, timesteps, features).
output = seq2seq_LSTM_model.predict(
    np.array([[[0.], [9.], [8.], [3.]]]),
    batch_size=1,
)
output

array([[[0.09104336, 0.09207433, 0.09087513, 0.08971423, 0.10470831,
         0.08043717, 0.09903564, 0.08690628, 0.08403429, 0.18117124],
        [0.10100501, 0.06889137, 0.1236333 , 0.12173836, 0.1426885 ,
         0.09310558, 0.0617233 , 0.05455162, 0.08056537, 0.15209763],
        [0.09729038, 0.06632143, 0.14005709, 0.12067135, 0.15129161,
         0.09217019, 0.06692744, 0.05764613, 0.07746279, 0.1301616 ],
        [0.09301692, 0.06743786, 0.15756966, 0.10293435, 0.14172545,
         0.08238933, 0.09952792, 0.07167357, 0.07685323, 0.10687171]]],
      dtype=float32)

In [20]:
# Shape of the prediction array returned for the single 4-step input sequence.
output.shape

(1, 4, 10)

In [21]:
# Most probable class at each timestep of the first (only) sequence.
output[0].argmax(axis=1)

array([9, 9, 4, 2])

In [22]:
# Predict on a small random batch: bs sequences, each made of l distinct
# digits drawn from 0..9, to confirm the model accepts batched input.
l = 4
bs = 7
X_new = np.empty((bs, l, 1))
for i in range(bs):
    X_new[i, :, 0] = np.random.choice(range(0, 9+1), l, replace=False)
output = seq2seq_LSTM_model.predict(X_new)
output.shape

(7, 4, 10)

Let's try the training.

In [23]:
# List the GPU devices visible to TensorFlow before starting the training.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

```
    steps_per_epoch: Integer or `None`.
        Total number of steps (batches of samples)
        before declaring one epoch finished and starting the
        next epoch. When training with input tensors such as
        TensorFlow data tensors, the default `None` is equal to
        the number of samples in your dataset divided by
        the batch size, or 1 if that cannot be determined. If x is a
        `tf.data` dataset, and 'steps_per_epoch'
        is None, the epoch will run until the input dataset is exhausted.
        When passing an infinitely repeating dataset, you must specify the
        `steps_per_epoch` argument. This argument is not supported with
        array inputs.
```

In [None]:
# Train for a single epoch of 60000 batches drawn from train_generator(),
# which is called with its default batch_size.
seq2seq_LSTM_model.fit(train_generator(), 
                       steps_per_epoch=60000,
                       epochs=1,
                       verbose=True)



In [21]:
# Repeat the sanity-check prediction on the sequence 0, 9, 8, 3 and show the
# most probable class at each timestep.
output = seq2seq_LSTM_model.predict(
    np.array([[[0.], [9.], [8.], [3.]]]),
    batch_size=1,
)
output[0].argmax(axis=1)

array([9, 9, 4, 2])

In [10]:
# Scratch arithmetic: integer quotient of 10**4 by 60.
10**4 // 60

166