# Dataset4
This dataset is essentially the same as `../01-vanilla_NN/07-dataset3.ipynb` with the following exceptions:

- input to RNN models are still like `[0.45, 0.23, -0.08, -1.54, 1.12, -1.82, -1.25, 0.45, -0.39, -0.34]`, `[1.7435, -3.14159, 2.71827]`, etc.; what is new here is that the number of timesteps of the input can now vary, no longer fixed to `10`
  - we no longer need to pad our sequences to a fixed length
- we shall use the seq-to-seq model with the same number of time steps for the output as for the input

In [22]:
n_classes = 10
max_length = 10

In [2]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
from functools import reduce
from itertools import combinations, permutations
from math import factorial
import sys

In [3]:
n_instances = sum([reduce(lambda x, y: x*y, range(n_classes,n_classes-length,-1)) for length in range(2, max_length+1)])
n_instances

9864090

In [8]:
tp = 0.2
def train_set_generator(test_proportion=tp):
    S = set(range(0, 9+1))
    index_instance = 0
    for length in range(2, max_length+1):    
        for c in combinations(S, length):
            for p in permutations(c):
                if index_instance % 10 <= 10*test_proportion:
                    index_instance += 1
                    continue
                x = tf.constant(p, dtype=tf.float32)
                #x = tf.ragged.constant(p, dtype=tf.float32)
                y = tf.argsort(p)
                #y = tf.ragged.constant(tf.argsort(p))
                yield x, y
                index_instance += 1

def test_set_generator(test_proportion=tp):
    S = set(range(0, 9+1))
    index_instance = 0
    for length in range(2, max_length+1):    
        for c in combinations(S, length):
            for p in permutations(c):
                if index_instance % 10 > 10*test_proportion:
                    index_instance += 1
                    continue
                x = tf.constant(p, dtype=tf.float32)
                y = tf.argsort(p)
                yield x, y
                index_instance += 1

**N.B.** It seems that we needn't use `RaggedTensor` here, because we return at each iteration a normal
`Tensor`; what is different is the number of time steps of each iteration's sequence. But this can be
handled alone by `tf.data.Dataset.from_generator()` with normal spec `tf.TensorSpec`.

In [9]:
train_set = tf.data.Dataset.from_generator(
    train_set_generator,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.float32),
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
    )
)

In [16]:
# Let's show that this generator can really generate tensors of diff shapes
prev_time_steps = 1
for x, y in train_set.take(1000):
    current_time_steps = x.shape[0]
    if current_time_steps > prev_time_steps:
        print(f"x =\n{x}")
        print(f"y =\n{y}")
    prev_time_steps = current_time_steps

x =
[2. 0.]
y =
[1 0]
x =
[1. 2. 0.]
y =
[2 0 1]
x =
[0. 2. 3. 1.]
y =
[0 3 1 2]


In [17]:
for x, y in train_set.take(3):
    print(f"x.dtype =\n{x.dtype}")
    print(f"y.dtype =\n{y.dtype}")

x.dtype =
<dtype: 'float32'>
y.dtype =
<dtype: 'int32'>
x.dtype =
<dtype: 'float32'>
y.dtype =
<dtype: 'int32'>
x.dtype =
<dtype: 'float32'>
y.dtype =
<dtype: 'int32'>


In [30]:
np.array(5).shape

()

In [18]:
test_set = tf.data.Dataset.from_generator(
    test_set_generator,
    output_signature=(
        tf.TensorSpec(shape=(None,), dtype=tf.float32),
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
    )
)

In [21]:
# Let's show that test set can also generate tensors of diff shapes
prev_time_steps = 1
for x, y in test_set.take(1000):
    current_time_steps = x.shape[0]
    if current_time_steps > prev_time_steps:
        print(f"x =\n{x}")
        print(f"y =\n{y}")
    prev_time_steps = current_time_steps

x =
[0. 1.]
y =
[0 1]
x =
[0. 1. 2.]
y =
[0 1 2]
x =
[0. 1. 2. 3.]
y =
[0 1 2 3]


## RNN Model


In [27]:
inputs = keras.layers.Input(shape=(None,))
inputs.shape

TensorShape([None, None])

In [28]:
inputs.shape[0]

I realize that, to produce one-hot-vector-like output at each RNN neuron output, we have to let the model
know the number of time steps (i.e. the length of the to-be-sorted array). I guess there are two possible
ways to achieve this.

01. Somehow extract the number of time steps. I don't know if this is possible in `keras`
02. Reconstruct our `tf.data.Dataset` to provide also the number of time steps

Let's try the 2nd way.

In [29]:
x = tf.constant([1,2,3])
type(x.shape)

tensorflow.python.framework.tensor_shape.TensorShape

In [35]:
tp = 0.2
def train_set_generator(test_proportion=tp):
    S = set(range(0, 9+1))
    index_instance = 0
    for length in range(2, max_length+1):    
        for c in combinations(S, length):
            for p in permutations(c):
                if index_instance % 10 <= 10*test_proportion:
                    index_instance += 1
                    continue
                #x = tf.ragged.constant(p, dtype=tf.float32)
                #x = tf.constant(p, dtype=tf.float32)
                X = (tf.constant(p, dtype=tf.float32),
                     tf.constant(len(p), dtype=tf.int32))
                y = tf.argsort(p)
                #y = tf.ragged.constant(tf.argsort(p))
                yield X, y
                index_instance += 1

def test_set_generator(test_proportion=tp):
    S = set(range(0, 9+1))
    index_instance = 0
    for length in range(2, max_length+1):    
        for c in combinations(S, length):
            for p in permutations(c):
                if index_instance % 10 > 10*test_proportion:
                    index_instance += 1
                    continue
                #x = tf.constant(p, dtype=tf.float32)
                X = (tf.constant(p, dtype=tf.float32),
                     tf.constant(len(p), dtype=tf.int32))
                y = tf.argsort(p)
                yield X, y
                index_instance += 1

In [36]:
train_set = tf.data.Dataset.from_generator(
    train_set_generator,
    output_signature=(
        (tf.TensorSpec(shape=(None,), dtype=tf.float32),
         tf.TensorSpec(shape=(), dtype=tf.float32),),
        tf.TensorSpec(shape=(None,), dtype=tf.int32),
    )
)

In [38]:
for X, y in train_set.take(3):
    print(f"X = {X}")
    print(f"y.dtype = {y.dtype}")

X = (<tf.Tensor: shape=(2,), dtype=float32, numpy=array([2., 0.], dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=2.0>)
y.dtype = <dtype: 'int32'>
X = (<tf.Tensor: shape=(2,), dtype=float32, numpy=array([0., 3.], dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=2.0>)
y.dtype = <dtype: 'int32'>
X = (<tf.Tensor: shape=(2,), dtype=float32, numpy=array([3., 0.], dtype=float32)>, <tf.Tensor: shape=(), dtype=float32, numpy=2.0>)
y.dtype = <dtype: 'int32'>


#### Alert
We have to give up the idea of outputing simple one-hot-vector-like sequences, because none of `LSTM`, `Dense`
accepts a variable number of neurons (which is in turn because the number of parameters in a layer is fixed).
In other words, even if we are with RNNs, we still have to restrict an integer upper bound for the length
of our input, to-be-sorted arrays.

In [45]:
type(10**6)

int

In [47]:
max_length = 10**6
L1 = keras.layers.LSTM(10, return_sequences=True)
o1 = L1(input_seq)
#L2 = keras.layers.TimeDistributed(keras.layers.Dense(n_time_steps, activation="softmax"))
L2 = keras.layers.TimeDistributed(keras.layers.Dense(max_length, activation="softmax"))
#L2 = keras.layers.Dense(n_time_steps, activation="softmax")
o2 = L2(o1)

model = keras.Model(inputs=[input_seq, n_time_steps],
                    outputs=[o2])

In [54]:
?keras.layers.Lambda

In [56]:
n_time_steps = keras.layers.Input(shape=(), dtype=tf.int32)
input_seq = keras.layers.Input(shape=(None,1), dtype=tf.float32)

max_length = 10**6
L1 = keras.layers.LSTM(10, return_sequences=True)
o1 = L1(input_seq)
#L2 = keras.layers.TimeDistributed(keras.layers.Dense(n_time_steps, activation="softmax"))
#L2 = keras.layers.TimeDistributed(keras.layers.Dense(max_length, activation="softmax"))
#L2 = keras.layers.Dense(n_time_steps, activation="softmax")
L2 = keras.layers.TimeDistributed(keras.layers.Lambda(lambda x: x[:n_time_steps]))
o2 = L2(o1)

model = keras.Model(inputs=[input_seq, n_time_steps],
                    outputs=[o2])

TypeError: Only integers, slices (`:`), ellipsis (`...`), tf.newaxis (`None`) and scalar tf.int32/tf.int64 tensors are valid indices, got <tf.Tensor 'Placeholder:0' shape=(None,) dtype=int32>

## New Ideas
01. Maybe we can restrict the range of our array elements, say, to $[-1, 1]$ because any array `A` can be always carried into such by dividing by `max(np.abs(A))`.

In [None]:
tf.float32.max

In [None]:
PADDER < tf.float32.max

The error occurs because `tf.argsort()` always output a tensor of `dtype=tf.int32`

In [None]:
tf.range(2, max_length)

In [None]:
tf.concat(([-100,-99], tf.range(2, max_length)), axis=0)

It seems that we cannot combine the above two generator functions into a single one because the first arg of `tf.data.Dataset.from_generator()` has to be the generator itself, without parenthese.

In [None]:
2 <= 10*0.2

In [None]:
train_set = tf.data.Dataset.from_generator(
    train_set_generator,
    output_signature=(
        tf.TensorSpec(shape=(max_length,), dtype=tf.float32),
        tf.TensorSpec(shape=(max_length,), dtype=tf.int32),
    )
)

In [None]:
for x, y in train_set.take(3):
    print(f"x =\n{x}")
    print(f"y =\n{y}")

In [None]:
for x, y in train_set.take(3):
    print(f"x.dtype =\n{x.dtype}")
    print(f"y.dtype =\n{y.dtype}")

In [None]:
test_set = tf.data.Dataset.from_generator(
    test_set_generator,
    output_signature=(
        tf.TensorSpec(shape=(max_length,), dtype=tf.float32),
        tf.TensorSpec(shape=(max_length,), dtype=tf.int32),
    )
)

In [None]:
for x, y in test_set.take(3):
    print(f"x =\n{x}")
    print(f"y =\n{y}")

In [None]:
for x, y in train_set.take(3):
    print(f"x.shape =\n{x.shape}")
    print(f"y.shape =\n{y.shape}")

In [None]:
train_set = train_set.batch(32, drop_remainder=True)

In [None]:
output_shape = (max_length, max_length)
model = keras.models.Sequential([
    #keras.layers.Flatten(input_shape=max_length),
    #keras.layers.Dense(20, input_shape=(max_length,), activation="relu"),
    keras.layers.Dense(np.product(output_shape),
                       input_shape=(max_length,),
                       #activation=None,
    ),
    #keras.layers.Dense(40, activation="relu"),
    keras.layers.Dense(np.product(output_shape), activation="relu"),
    keras.layers.Dense(np.product(output_shape)),
    #keras.layers.Dense(np.product(output_shape), activation="tanh"),
    keras.layers.Reshape(output_shape),
    keras.layers.Softmax(axis=-1),
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["acc"],
)

In [None]:
checkpoint_cb = keras.callbacks.ModelCheckpoint("dataset3_ANN.h5")

model.fit(train_set,
          batch_size=32,
          #validation_split=0.2,
          epochs=1,
          callbacks=[checkpoint_cb],
)

**(?)** Why `loss` decreases along with `acc` as time goes by?<br>


## Bad Performance? Improvement.
**Rmk.** Usually, the accuracy will start to climb at the beginning of 1st epoch, reaching around `acc = 0.5` before the accuracy stops increasing and starts to decrease. Even when we add multiple dense layers in between, it only helped the model to climb up until `acc = 0.69` (faster, i.e. in fewer steps), and then accuracy starts to decrease (and this time it has more steps to decrease.)

Looks like the model had difficulty keeping raising the accuracy. Maybe it's because the model does not know what `PADDER` means. Here are a few improving ideas:

01. Assume all array elements are $\ge 0\,.$ And pick `PADDER = -1` and hopefully it will better understand what `PADDER` is.
  - Add as the input layer of the model an activation layer to render all `-1`'s to `0`'s

## Seeing Is Believing
Let's watch the sorting in action.

In [None]:
np.float32.max

In [None]:
type(np.float32.max)

In [None]:
tf.float32.max

In [None]:
np.finfo('d').max

In [None]:
np.finfo('float32').max

In [None]:
np.concatenate(([1], np.ones(3, dtype=np.float32))).dtype

In [None]:
A = [9, 7, 6, 0, 1]
A = np.concatenate((A, np.finfo('float32').max * np.ones(max_length - len(A))))
A = np.array([A], dtype=np.float32)
A.dtype

In [None]:
model.predict(A)

In [None]:
np.argmax(model.predict(A), axis=-1)

In [None]:
s = np.argmax(model.predict(A), axis=-1)[0]

In [None]:
A[0][s]

In [None]:
A

In [None]:
def sort(X_batch, correction=False):
    """
    args
        X_batch, ndarray of shape (batch_size, max_length)
            e.g. [[9, 2, float_max, float_max, ..., float_max],
                  [9, 8, 7, 6, 5, 4, 3, 2, 1, 0],
                  [6, 1, 2, 9, 4, 5, 3, 7, 0, 8]]
            is a case in which batch_size equals 3, max_length equals 10.

        correction, bool
            sorted_indices, due to the fact that we take only softmax,
            can contain repeated and missing indices. If correction == False,
            we will not correct this; otherwise, we correct this and make the
            indices unique and full-blown.
    
    return
        sorted_arrays, ndarray of shape (batch_size, max_length)
            e.g. (if correction=True)
                 [[2, 9, float_max, float_max, ..., float_max],
                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9],
                  [0, 1, 2, 3, 4, 5, 6, 7, 8, 9]]
    """
    batch_size, max_length = X_batch.shape
    y_pred = model.predict(X_batch)  # shape (batch_size, max_length, max_length)
    if not correction:
        sorted_indices = np.argmax(y_pred, axis=-1)  # shape (batch_size, max_length)
    else:
        sorted_indices = np.empty((batch_size, max_length), dtype=np.int32)
        for k, y_pred_k in enumerate(y_pred):
            # y_pred_k.shape equals (max_length, max_length)
            sorted_indices[k, 0] = np.argmax(y_pred_k[0])
            for i in range(1, max_length):
                possible_index = np.argmax(y_pred_k[i])
                while possible_index in sorted_indices[k, :i]:
                    y_pred_k[i, possible_index] = -1
                    possible_index = np.argmax(y_pred_k[i])
                sorted_indices[k, i] = possible_index
    #sorted_arrays = X_batch[:, sorted_indices]  # wrong
    #sorted_arrays = X_batch[range(X_batch.shape[0]), sorted_indices]  # wrong
    sorted_arrays = X_batch[np.repeat(np.arange(batch_size), max_length),
                            sorted_indices.reshape(-1)
                           ].reshape((batch_size, max_length))
    return sorted_arrays

In [None]:
sort(A)

In [None]:
A[0][s]

In [None]:
B = np.array([
    [9,8,7,6,5,4,3,2,1,0],
    [1,2,3,4,5,9,8,7,0,6],
])

In [None]:
sort(B)

In [None]:
sort(B, correction=True)

Here below is what I have searched to make the function `sort()` work.

In [None]:
C = np.arange(3*5).reshape((3,5))
C

If, say, we want to take

- the 1st row with column `0,1,2`
- the 2nd row with column `3,0,4`
- the 3rd row with column `4,3,2`

we can do as follows.

In [None]:
C[[0,0,0,  1,1,1,  2,2,2], [0,1,2,  3,0,4,  4,3,2]]

In [None]:
C[[0,0,0,  1,1,1,  2,2,2], np.ravel([[0,1,2],  [3,0,4],  [4,3,2]])]

In [None]:
np.ravel([[0,1,2],  [3,0,4],  [4,3,2]])

In [None]:
np.ravel([[0,1,2],  [3,0,4],  [4,3,2]], order="F")

dunno which is faster: `reshape` or `ravel`. Or maybe of the same speed.

In [None]:
np.array([[0,1,2],  [3,0,4],  [4,3,2]]).reshape((-1,))

In [None]:
np.repeat(np.arange(3), 4)

In [None]:
A.reshape(-1)

In [None]:
np.arange(10).dtype

In [None]:
np.empty((3, 10), dtype=np.int8)

# `tf.data.Dataset`
In previous notebooks, we have this code cell which is a memory hog (the `X`) and took long time to run.
Here in this notebook, our objective is to construct the same dataset by using `tf` operations
instead of `numpy` ones, hoping to reduce both memory usage and time (i.e. dataset construction time.)
```python
%%time
S = set(range(0, 9+1))
index_instance = 0
for length in range(2, max_length+1):    
    n_permutations = factorial(length)
    for c in combinations(S, length):
        for p in permutations(c):
            X[index_instance, :length, :] = one_hot(np.array(p))
            Y[index_instance, :] = np.concatenate((np.argsort(p), np.arange(length, max_length)))
            index_instance += 1
```

## Workaround
Maybe we should abandon the idea of using `tf.data.Dataset.from_tensor_slices(X)`, because that direction might always have to first allocate large memory.

We start small and try to use `tf.data.Dataset`'s method to construct an equivalent datset.

**(?)** You've already seen in `ageron`'s homl2e that a dataset is able to contain tensors of diff shapes. Try to make an example yourself.

In [None]:
lengths = tf.range(2, max_length+1)
dataset = tf.data.Dataset.from_tensor_slices(lengths)
dataset = dataset.map(lambda x: tf.range(x))

In [None]:
for tensor in dataset:
    print(tensor)

**(?)** A big question that you haven't understood is: Should a `tf.data.Dataset` instance contain both `X` and `y`, i.e. data and labels, for supervised training? If so, how do we arrange `X` and `y`?

### First try: `tf.data.Dataset.from_generator()`
As I imagine, we can keep the original code, keep the `for` loop, but instead of filling in each "row" of `X`, we make it a generator using the keyword `yield`. After implementing the generator using numpy, we pass the generator into `tf.data.Dataset.from_generator()` and we're done.

In [None]:
def dataset_generator():
    S = set(range(0, 9+1))
    index_instance = 0
    for length in range(2, max_length+1):    
        n_permutations = factorial(length)
        for c in combinations(S, length):
            for p in permutations(c):
                x = np.zeros((max_length, n_classes), dtype=np.float32)
                x[:length, :] = tf.one_hot(np.array(p),
                                           depth=n_classes).numpy()
                y = np.concatenate((np.argsort(p),
                                    np.arange(length, max_length)))
                yield x, y
                index_instance += 1

In [None]:
dataset = tf.data.Dataset.from_generator(
    dataset_generator,
    output_types=(tf.float32, tf.float32),
    output_shapes=([max_length, n_classes], [max_length]),
)

**Rmk**. Had we forgotten to specify `output_shapes`, the following cells will still be able to run, up until
`model.fit()`, which will generate the following error:
```
ValueError : as_list() is not defined on an unknown TensorShape
```
`model.fit()` is able to run once we specify both `output_types` and `output_shapes`.

In the above, we have also provided (and disactivated) an equivalent cell using `output_signature` instead of the `(output_types, output_shapes)` pair, which is to be deprecated in the future.

In [None]:
for x, y in dataset.take(3):
    print(f"x =\n{x}")
    print(f"y =\n{y}")

**Pros**

01. We do not have to wait two to six minutes for `X` to be constructed any more
02. Computers with little RAM can also run this code. Otherwise, they won't be able to even allocate enough memory for `X`.
03. Compared to building a `tf.data.Dataset` completely from its methods, this `from_generator()` has the advantage of being a lot easier to implement. Actually, we almost only replaced the assignment of rows of `X` by `yield`

**Cons**

01. We must think of a way to split the dataset into Training/Validation/Test sets because we no longer have the entire `X` to apply `train_test_split` from `sklearn`.

In [None]:
dataset = dataset.batch(32, drop_remainder=True)

In [None]:
for x, y in dataset.take(3):
    print(f"x.shape =\n{x.shape}")
    print(f"y.shape =\n{y.shape}")

In [None]:
#https://keras.io/api/layers/reshaping_layers/reshape/
#https://keras.io/api/layers/activation_layers/softmax/
input_shape = (max_length, n_classes)
product_input_shape = np.product((max_length, n_classes))
model = keras.models.Sequential([
    keras.layers.Flatten(input_shape=input_shape),
    keras.layers.Dense(product_input_shape, activation="relu"),
    #keras.layers.Dense(2*product_input_shape, activation="relu"),
    keras.layers.Dense(product_input_shape),
    keras.layers.Reshape(input_shape),
    keras.layers.Softmax(axis=-1),
])

model.compile(loss="sparse_categorical_crossentropy",
              optimizer="adam",
              metrics=["acc"],
)

In [None]:
checkpoint_cb = keras.callbacks.ModelCheckpoint("dataset3_ANN.h5")
model.fit(dataset,
          batch_size=32,
          callbacks=[checkpoint_cb],
)

In [None]:
# labels
Y = np.empty((n_instances, max_length), dtype=np.float32)  

In [None]:
%%time
#X[...] = 0
S = set(range(0, 9+1))
index_instance = 0
#for length in tqdm(range(2, max_length+1)):
for length in range(2, max_length+1):    
    n_permutations = factorial(length)
    #n_combinations = n_instances // n_permutations
    #for i, c in enumerate(combinations(S, length)):
    for c in combinations(S, length):
        #for j, p in enumerate(permutations(c)):
        for p in permutations(c):
            #print(f"(index_instance/n_instances = {index_instance}/{n_instances})", end="\r")
            #print(f"np.array(p) = {np.array(p)}")
            X[index_instance, :length, :] = one_hot(np.array(p))#[..., np.newaxis]
            Y[index_instance, :] = np.concatenate((np.argsort(p), np.arange(length, max_length)))
            index_instance += 1

### Train/Validation/Test Split

In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train_val, X_test, Y_train_val, Y_test = train_test_split(X, Y, test_size=0.2)
X_train_val.shape, X_test.shape

## Model

We might be able to use less neurons and still arrive at a similar performance. Running out of time, I had not tried to tune the model; instead, I had spent most of the time trying to implement more solutions.

In [None]:
model = keras.models.load_model("vanilla_NN_model.h5")
model.summary()

In [None]:
model.evaluate(X_test, Y_test)

## Evaluation on `X_test`
We certainly would like to have performance measures like accuracy, precision/recall, etc. But we must first write some convenience functions to facilitate the operations.

In [None]:
class Sorter:
    def __init__(self, model):
        self.model = model

    def lenlen(self, x):
        somme = np.sum(x, axis=-1)
        first_zero_index = -1
        for i, s in enumerate(somme):
            if s > 10**(-6):
                first_zero_index = i
        if first_zero_index == -1:
            length = 10
        else:
            length = first_zero_index + 1
        return length

    def prettier(self, x, y):
        """
        x.shape = (10,10)
        """
        length = self.lenlen(x)
        xx = np.argmax(x[:length], axis=-1)
        sort_indices = y.astype(int)[:length]
        yy = xx[sort_indices]
        return xx, yy
    
    def evaluate(self, X, Y):
        Y_pred = self.model.predict(X)  # of shape (n_instances, 10, 10)
        Y = Y.astype(int)               # of shape (n_instances, 10)
        m = X.shape[0]
        n_correct = 0
        for i, x in enumerate(X):
            length = self.lenlen(x)
            y_pred = Y_pred[i]
            y_pred_sparse = np.argmax(y_pred, axis=-1)
            n_correct += np.array_equal(Y[i], y_pred_sparse)
        print(f"acc = {n_correct/m}")


In [None]:
sorter = Sorter(model)

In [None]:
%%time
sorter.evaluate(X_test, Y_test)