# Diff Dataset
Recall that in `../01-vanilla_NN/01-vanilla_NN.ipynb` we devised a dataset in which all sequences have length
`10` (shorter ones are zero-padded), making the dataset easier to split into Train/Val/Test sets. The same dataset can,
of course, be fed to sequential models, and that's exactly what I plan to do in this notebook.

In [1]:
import tensorflow as tf
import tensorflow.keras as keras
import numpy as np
from functools import reduce
from itertools import combinations, permutations
from math import factorial

The following `X` will be our dataset (including training/validation/test sets).

In [2]:
# Dataset dimensions: class ids are 0..n_classes-1 and every sequence is
# stored in max_length slots.
n_classes = 10
max_length = 10
# The number of ordered selections of `length` distinct classes is
# n_classes! / (n_classes - length)!; add these up for every length in
# 2..max_length.
n_instances = sum(
    factorial(n_classes) // factorial(n_classes - length)
    for length in range(2, max_length + 1)
)
n_instances

9864090

In [3]:
# One-hot inputs, one (max_length, n_classes) slab per instance, initialised
# to zero. As float32 this buffer takes n_instances * max_length * n_classes * 4
# bytes (~3.9 GB when n_instances is 9864090).
X = np.zeros((n_instances, max_length, n_classes), dtype=np.float32)

I have said in `README.md` that a CNN is of little use here because we are not dealing with images. However, the shape of `X` does look like a single-channel image. Still, using a CNN to extract local features makes little sense, so we will probably stick to our plan: maybe the first layer of our vanilla NN would be a `keras.layers.Flatten`, followed by a few fully connected layers.

In [4]:
def one_hot(array, depth=n_classes):
    """
    One-hot encode integer class ids.

    array is an ndarray of shape (None,) holding ids in [0, depth).
    Returns the rows of a depth x depth identity matrix selected by `array`,
    i.e. an array of shape (len(array), depth).
    """
    identity = np.eye(depth)
    # Row k of the identity matrix is exactly the one-hot vector for class k.
    return identity[array, :]

In [5]:
# labels
# One row of max_length target values per instance, stored as float32.
# np.empty leaves the buffer uninitialised, so every row has to be written
# before it is read.
Y = np.empty((n_instances, max_length), dtype=np.float32)

In [6]:
%%time
# Enumerate every ordered selection of 2..max_length distinct classes.
#   X[i] : one-hot rows of the sequence; rows past `length` stay all-zero (padding).
#   Y[i] : argsort of the sequence, followed by the indices length..max_length-1
#          for the padded positions.
S = range(n_classes)                            # class ids, in a fixed order
identity = np.eye(n_classes, dtype=X.dtype)     # built once; row k is the one-hot of class k
index_instance = 0
for length in range(2, max_length+1):
    # The labels of the padded tail depend only on `length`, so compute them once.
    padding_labels = np.arange(length, max_length)
    for c in combinations(S, length):
        for p in permutations(c):
            X[index_instance, :length, :] = identity[list(p)]
            Y[index_instance, :length] = np.argsort(p)
            Y[index_instance, length:] = padding_labels
            index_instance += 1

# Y was allocated with np.empty: make sure no row was left unwritten.
assert index_instance == n_instances, (
    f"filled {index_instance} instances, expected {n_instances}"
)

CPU times: user 2min 29s, sys: 1.57 s, total: 2min 31s
Wall time: 2min 32s


### Train/Validation/Test Split

In [7]:
from sklearn.model_selection import train_test_split

In [8]:
# Hold out 20% of the instances as the test set. The seed is fixed so that
# re-running the notebook reproduces the same split.
X_train_val, X_test, Y_train_val, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)
X_train_val.shape, X_test.shape

((7891272, 10, 10), (1972818, 10, 10))

## Model

In [9]:
# Number of scalar values per instance (product of the non-batch dimensions of X).
# np.prod replaces np.product, which is deprecated and removed in NumPy 2.0.
np.prod(X.shape[1:])

100

In [17]:
# Sequence-to-sequence tagger: for every position of the input it outputs a
# distribution over target indices.
model = keras.models.Sequential([
    # Each timestep is a one-hot row with n_classes features. The input shape
    # is declared on the outer Bidirectional wrapper; the original put
    # input_shape=[None, 1] on the wrapped LSTM, which does not match X.
    keras.layers.Bidirectional(
        keras.layers.LSTM(10, return_sequences=True, dropout=0),
        input_shape=[None, n_classes],
    ),
    keras.layers.LSTM(10, return_sequences=True, dropout=0),
    # The targets in Y are positions in [0, max_length), so the softmax needs
    # max_length outputs (numerically the same as n_classes here: both are 10).
    keras.layers.TimeDistributed(keras.layers.Dense(max_length, activation="softmax")),
])

In [18]:
# The targets in Y are integer positions rather than one-hot vectors, hence
# the sparse variant of categorical cross-entropy.
model.compile(
    optimizer="adam",
    loss="sparse_categorical_crossentropy",
)

In [None]:
# Checkpoint callback: with save_best_only=True the file is only overwritten
# when the callback's monitored quantity improves.
checkpoint_cb = keras.callbacks.ModelCheckpoint(
    filepath="dataset2_LSTM_model.h5",
    save_best_only=True,
)

# Train for a single epoch, holding out 20% of the train/val data for validation.
model.fit(
    x=X_train_val,
    y=Y_train_val,
    epochs=1,
    validation_split=0.2,
    verbose=True,
    callbacks=[checkpoint_cb],
)

## Evaluation

In [None]:
class Sorter:
    """Decode and score a sequence-sorting model on padded one-hot inputs.

    The wrapped ``model`` only needs a ``predict(X)`` method. ``evaluate``
    takes the argmax over the last axis of its output as the predicted index
    for every position.
    """

    def __init__(self, model):
        self.model = model

    def lenlen(self, x):
        """Return the length of the sequence stored in ``x``.

        ``x`` is a 2-D array with one row per position. A row counts as
        occupied when its sum exceeds ``1e-6``; the length is the index of the
        last occupied row plus one. When no row is occupied, the total number
        of rows is returned (this was a hard-coded ``10`` before).
        """
        somme = np.sum(x, axis=-1)
        n_rows = len(somme)
        # Scan from the end so we can stop at the last occupied row.
        for i in range(n_rows - 1, -1, -1):
            if somme[i] > 10**(-6):
                return i + 1
        return n_rows

    def prettier(self, x, y):
        """Turn one encoded instance into readable class-id sequences.

        x.shape = (10,10)  -- one-hot rows, padded with all-zero rows
        y               -- per-position indices; only the first ``length``
                           entries are used, cast to int

        Returns ``(xx, yy)``: ``xx`` holds the class ids of the unpadded
        input and ``yy`` is ``xx`` reordered by those indices.
        """
        length = self.lenlen(x)
        xx = np.argmax(x[:length], axis=-1)
        sort_indices = y.astype(int)[:length]
        yy = xx[sort_indices]
        return xx, yy

    def evaluate(self, X, Y):
        """Print the exact-match accuracy of the model on ``(X, Y)``.

        X -- inputs passed to ``self.model.predict``; ``X.shape[0]`` is the
             number of instances
        Y -- 2-D array of target indices, shape (n_instances, n_positions)

        An instance counts as correct only when the predicted index matches
        the target at every position, padded positions included.

        Raises ValueError when the predicted indices and ``Y`` differ in shape.
        """
        Y_pred = self.model.predict(X)              # (n_instances, n_positions, n_outputs)
        Y_pred_sparse = np.argmax(Y_pred, axis=-1)  # (n_instances, n_positions)
        Y = Y.astype(int)                           # (n_instances, n_positions)
        if Y_pred_sparse.shape != Y.shape:
            raise ValueError(
                f"prediction shape {Y_pred_sparse.shape} does not match label shape {Y.shape}"
            )
        m = X.shape[0]
        # Vectorised row-wise equality; replaces a Python loop that also
        # computed an unused per-instance length.
        n_correct = int((Y_pred_sparse == Y).all(axis=1).sum())
        print(f"acc = {n_correct/m}")


In [None]:
# Wrap the model so its predictions can be decoded and scored.
sorter = Sorter(model)

In [None]:
# Prints the exact-match accuracy on the test split.
sorter.evaluate(X_test, Y_test)