In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np

In [2]:
from functools import reduce

In [3]:
# Alphabet of the task: the ten digits 0..9 that may appear in a sequence.
S = set(range(10))
# One output class per digit, i.e. the array elements are allowed to be 0..9.
n_classes = 10

In [4]:
# Total number of training sequences: for every length 2..10, count the
# ordered arrangements of `length` distinct digits,
# i.e. 10 * 9 * ... * (10 - length + 1).
sum(
    reduce(lambda acc, factor: acc * factor, range(10, 10 - length, -1))
    for length in range(2, 11)
)

9864090

In [5]:
from itertools import combinations, permutations

In [6]:
def one_hot(array, depth=n_classes):
    """
    One-hot encode a sequence of integer class indices.

    array is an ndarray/list of integer indices, of shape (None,)
    depth is the number of classes, i.e. the width of every encoded row

    Returns an ndarray of shape (len(array), depth) whose k-th row is all
    zeros except for a 1 in column array[k].
    """
    identity = np.eye(depth)
    # Picking row k of the identity matrix gives the one-hot vector of class k.
    return identity[array, :]

In [7]:
import math

def train_generator(batch_size=10):
    """
    Yield (X, Y) training batches for the "sort the digits" task.

    For every sequence length from 2 up to len(S), visited in random order,
    each ordered arrangement of `length` distinct elements of S is one input
    sequence. Its target is the same elements in ascending order, one-hot
    encoded with `one_hot`.

    The generator makes a single pass over the data and then stops. All
    instances of one length are materialised in memory before the first
    batch of that length is yielded.

    batch_size is the maximum number of sequences per batch; must be >= 1.

    Yields tuples (X, Y) with
        X: float32 ndarray of shape (b, length, 1)
        Y: ndarray of shape (b, length, n_classes)
    where b == batch_size, except that the last batch of a given length may
    be shorter when batch_size does not divide the number of instances.

    Raises ValueError if batch_size < 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be a positive integer, got %r" % (batch_size,))

    symbols = sorted(S)          # fixed, deterministic order of the alphabet
    n_symbols = len(symbols)

    shuffled_lengths = list(range(2, n_symbols + 1))
    np.random.shuffle(shuffled_lengths)
    for length in shuffled_lengths:
        n_permutations = math.factorial(length)
        # Number of ordered arrangements of `length` distinct symbols:
        # n_symbols * (n_symbols - 1) * ... * (n_symbols - length + 1)
        n_instances = reduce(lambda x, y: x*y,
                             range(n_symbols, n_symbols - length, -1))
        X = np.empty((n_instances, length, 1), dtype=np.float32)
        Y = np.empty((n_instances, length, n_classes))
        for i, c in enumerate(combinations(symbols, length)):
            start = i * n_permutations
            c_sorted = np.array(sorted(c))  # shape (length,)
            # Every permutation of `c` shares the same sorted target.
            Y[start : start + n_permutations, ...] = one_hot(c_sorted)
            for j, p in enumerate(permutations(c)):
                # Write exactly ONE row per permutation. (Assigning to the
                # slice start:start+j would overwrite earlier rows and leave
                # the last row of the block uninitialised.)
                X[start + j, :, 0] = p
        # Throw one batch after another to the model, including the final
        # partial batch so that no instance is silently dropped.
        for k in range(0, n_instances, batch_size):
            yield X[k : k + batch_size], Y[k : k + batch_size]

## Seq-to-Seq Model
This is a seq-to-seq model in which the output sequence's length equals the input sequence's length. We had better use **bidirectional RNNs**.

In [8]:
# Two stacked recurrent layers followed by a per-timestep softmax over the
# n_classes digits. Every recurrent layer returns the full sequence so that
# the model emits one prediction per input timestep.
seq2seq_LSTM_model = keras.models.Sequential([
    # `input_shape` is given to the Bidirectional wrapper, which is the
    # model's first layer. When it is passed to the wrapped LSTM instead,
    # Sequential does not see it and the model stays unbuilt until first use.
    # [None, 1] = variable number of timesteps, 1 feature per timestep.
    keras.layers.Bidirectional(
        keras.layers.LSTM(10, return_sequences=True, dropout=0.2),
        input_shape=[None, 1]),
    keras.layers.LSTM(10, return_sequences=True, dropout=0.2),
    # Alternative second layer (bidirectional as well), currently disabled:
    #keras.layers.Bidirectional(keras.layers.LSTM(10,
    #    return_sequences=True, dropout=0.2)),
    keras.layers.TimeDistributed(keras.layers.Dense(n_classes, activation="softmax")),
])

# The targets produced by train_generator() are one-hot encoded, hence
# "categorical_crossentropy".
#seq2seq_LSTM_model.compile(loss="sparse_categorical_crossentropy", optimizer="adam")
# sparse_categorical_crossentropy would fail. Cf.
# https://stackoverflow.com/questions/49161174/tensorflow-logits-and-labels-must-have-the-same-first-dimension
seq2seq_LSTM_model.compile(loss="categorical_crossentropy",
                           optimizer="adam")

Before we jump into the potentially time-consuming training (via the `fit()` method), we can ask our model to make a few predictions first to see if we implemented everything right.
01. With `dropout=0.2` in the first layer, predicting on `np.array([0,9,8,3])` produced
    ```
    TypeError: Input 'b' of 'MatMul' Op has type float32 that does not match type int64 of argument 'a'.
    ```
    And if we execute the cell again, we will get
    ```
    TypeError: 'NoneType' object is not callable
    ```
    Quite weird behaviour.
    - Actually, this is not `dropout=0.2`'s fault: the same error remains when we remove that argument.
02. Once the input has been corrected to an ndarray with a `float` `dtype` and shape `(n_batches, length (aka n_timesteps), 1)`, prediction works without problems.

In [9]:
# Smoke test: a single sequence (batch of 1) with 4 timesteps and 1 feature
# per timestep, i.e. an input of shape (1, 4, 1).
output = seq2seq_LSTM_model.predict(
    np.array([[[0.], [9.], [8.], [3.]]]),
    batch_size=1,
)
output

array([[[0.09993166, 0.09932965, 0.09745076, 0.09866813, 0.09772769,
         0.10073274, 0.0979954 , 0.10381538, 0.10080783, 0.10354077],
        [0.10095181, 0.09967674, 0.10383955, 0.08978332, 0.10082579,
         0.09037179, 0.09406106, 0.10066985, 0.10486231, 0.11495784],
        [0.10162369, 0.09830114, 0.10850812, 0.08265334, 0.10245721,
         0.08325569, 0.09107159, 0.0989607 , 0.10985056, 0.12331801],
        [0.1018417 , 0.09701502, 0.10919326, 0.07962976, 0.10274579,
         0.08135111, 0.09086252, 0.09932285, 0.11399698, 0.12404103]]],
      dtype=float32)

In [10]:
# (batch, timesteps, n_classes): one softmax vector per input timestep.
output.shape

(1, 4, 10)

In [11]:
# Most probable class at each timestep of the first (and only) sequence.
output[0].argmax(axis=1)

array([7, 9, 9, 9])

In [12]:
# Predict on a random batch: `bs` sequences of `l` distinct digits each,
# arranged as an array of shape (bs, l, 1).
l = 4
bs = 7
X_new = np.stack([
    np.random.choice(range(10), l, replace=False) for _ in range(bs)
]).astype(np.float64)[..., np.newaxis]
output = seq2seq_LSTM_model.predict(X_new)
output.shape

(7, 4, 10)

Let's try the training.

In [13]:
# List the GPU devices visible to TensorFlow before starting the training.
tf.config.list_physical_devices('GPU')

[PhysicalDevice(name='/physical_device:GPU:0', device_type='GPU')]

In [None]:
# Train for a single epoch on whatever train_generator() yields, using the
# generator's default batch size.
seq2seq_LSTM_model.fit(
    x=train_generator(),
    #steps_per_epoch=60_000,
    verbose=True,
    epochs=1,
)

At 100 batches/sec, 1 million batches require $10^4$ sec, i.e. (in minutes):

In [15]:
10_000 // 60  # seconds -> whole minutes

166

In [17]:
# Same 4-digit probe as before training, decoded to the most probable class
# at each timestep.
output = seq2seq_LSTM_model.predict(
    np.array([[[0.], [9.], [8.], [3.]]]),
    batch_size=1,
)
output[0].argmax(axis=1)

array([0, 1, 2, 3])