# Sentence Reconstruction

The purpose of this project is to take in input a sequence of words corresponding to a random permutation of a given english sentence, and reconstruct the original sentence.

The otuput can be either produced in a single shot, or through an iterative (autoregressive) loop generating a single token at a time.


CONSTRAINTS:
* No pretrained model can be used.
* The neural network models should have less the 20M parameters.
* No postprocessing should be done (e.g. no beamsearch)
* You cannot use additional training data.


BONUS PARAMETERS:

A bonus of 0-2 points will be attributed to incentivate the adoption of models with a low number of parameters.

# Dataset

The dataset is composed by sentences taken from the generics_kb dataset of hugging face. We restricted the vocabolary to the 10K most frequent words, and only took sentences making use of this vocabulary.

In [1]:
!pip install datasets

Collecting datasets
  Downloading datasets-2.19.2-py3-none-any.whl (542 kB)
[?25l     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m0.0/542.1 kB[0m [31m?[0m eta [36m-:--:--[0m[2K     [91m━━━━━━[0m[91m╸[0m[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m92.2/542.1 kB[0m [31m2.7 MB/s[0m eta [36m0:00:01[0m[2K     [91m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m[90m╺[0m [32m532.5/542.1 kB[0m [31m9.0 MB/s[0m eta [36m0:00:01[0m[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m542.1/542.1 kB[0m [31m7.3 MB/s[0m eta [36m0:00:00[0m
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Downloading dill-0.3.8-py3-none-any.whl (116 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m116.3/116.3 kB[0m [31m10.7 MB/s[0m eta [36m0:00:00[0m
Collecting requests>=2.32.1 (from datasets)
  Downloading requests-2.32.3-py3-none-any.whl (64 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m64.9/64.9 kB[0m [31m4.5 MB/s[0m

Download the dataset

In [2]:
from datasets import load_dataset
from keras.layers import TextVectorization
import tensorflow as tf
import numpy as np
np.random.seed(42)
ds = load_dataset('generics_kb',trust_remote_code=True)['train']

Filter row with length greater than 8.


In [3]:
ds = ds.filter(lambda row: len(row["generic_sentence"].split(" ")) > 8 )
corpus = [ '<start> ' + row['generic_sentence'].replace(","," <comma>") + ' <end>' for row in ds ]
corpus = np.array(corpus)


Create a tokenizer and Detokenizer

In [4]:
tokenizer=TextVectorization( max_tokens=10000, standardize="lower_and_strip_punctuation", encoding="utf-8",) #con il max prende le piu frequenti. ordina i token del vocab dal piu frequente al meno frequente
tokenizer.adapt(corpus)

class TextDetokenizer:
    def __init__(self, vectorize_layer):
        self.vectorize_layer = vectorize_layer
        vocab = self.vectorize_layer.get_vocabulary()
        self.index_to_word = {index: word for index, word in enumerate(vocab)}

    def __detokenize_tokens(self, tokens):
        def check_token(t):
          if t == 3:
            s="<start>"
          elif t == 2:
            s="<end>"
          elif t == 7:
            s="<comma>"
          else:
            s=self.index_to_word.get(t, '[UNK]')
          return s

        return ' '.join([ check_token(token) for token in tokens if token != 0])

    def __call__(self, batch_tokens):
       return [self.__detokenize_tokens(tokens) for tokens in batch_tokens]


detokenizer = TextDetokenizer( tokenizer )
sentences = tokenizer( corpus ).numpy()

Remove from corpus the sentences where any unknow word appears

In [5]:
mask = np.sum( (sentences==1), axis=1) >= 1
original_data = np.delete( sentences, mask , axis=0)

In [6]:
original_data.shape

(241236, 28)

Shuffle the sentences

In [7]:
def extract_full_data(generator):
    x_list = []
    y_list = []
    for i in range(len(generator)):
        x_batch, y_batch = generator[i]
        x_list.append(x_batch)
        y_list.append(y_batch)
    x = np.concatenate(x_list, axis=0)
    y = np.concatenate(y_list, axis=0)
    return x, y

In [8]:
from tensorflow.keras.utils import Sequence

class DataGenerator(Sequence):
    def __init__(self, data, batch_size=32, shuffle=True, seed=42):
        self.data = data
        self.batch_size = batch_size
        self.shuffle = shuffle
        self.seed = seed
        self.on_epoch_end()


    def __len__(self):
        return int(np.floor(len(self.data) / self.batch_size))

    def __getitem__(self, index):
        indexes = self.indexes[index*self.batch_size:(index+1)*self.batch_size]

        data_batch = np.array([self.data[k] for k in indexes])
        #copy of ordered sequences
        result = np.copy(data_batch)
        #shuffle only the relevant positions for each batch
        for i in range(data_batch.shape[0]):
          np.random.shuffle(data_batch[i,1:data_batch[i].argmin() - 1])

        return data_batch , result

    def on_epoch_end(self):
        self.indexes = np.arange(len(self.data))
        if self.shuffle:
            if self.seed is not None:
                np.random.seed(self.seed)
            np.random.shuffle(self.indexes)

In [9]:
# Make a random permutation of training and test set
np.random.seed(42)
# Shuffle the all data
shuffled_indices = np.random.permutation(len(original_data))
shuffled_data = original_data[shuffled_indices]

In [10]:
train_generator = DataGenerator(shuffled_data[:220000])
test_generator = DataGenerator(shuffled_data[220000:])

In [11]:
x, y = test_generator.__getitem__(1)
x = detokenizer(x)
y = detokenizer(y)

for i in range(7):
  print("original: ", y[i])
  print("shuffled: ", x[i])
  print("\n")

original:  <start> ranchers clear large areas of rainforest to become pastures for their cattle <end>
shuffled:  <start> large their areas for cattle ranchers rainforest clear pastures become to of <end>


original:  <start> some earwigs have stripes on the thorax and abdomen <end>
shuffled:  <start> stripes thorax some and the earwigs on abdomen have <end>


original:  <start> magnetic manipulation can turn molecules in a liquid into computing such devices <end>
shuffled:  <start> into in magnetic such a liquid molecules can manipulation computing turn devices <end>


original:  <start> healthy wetlands means cleaner water <comma> reduced flooding and more places for recreation <end>
shuffled:  <start> reduced wetlands and recreation for water places healthy cleaner flooding <comma> means more <end>


original:  <start> market share is the percent share in sales one company controls in a particular market <end>
shuffled:  <start> company percent share one controls a sales in market is

In [12]:
from difflib import SequenceMatcher

def score(s,p):
  match = SequenceMatcher(None, s, p).find_longest_match()
  return (match.size/max(len(p),len(s)))

In [13]:
original = "at first henry wanted to be friends with the king of france"
generated = "henry wanted to be friends with king of france at the first"

print("your score is ",score(original,generated))

your score is  0.5423728813559322


In [14]:
x_train, y_train = extract_full_data(train_generator)
x_train.shape, y_train.shape

((220000, 28), (220000, 28))

In [15]:
x_test, y_test = extract_full_data(test_generator)
x_test.shape, y_test.shape

((21216, 28), (21216, 28))

In [16]:
vocabulary = tokenizer.get_vocabulary()

In [17]:
x_train, y_train = extract_full_data(train_generator)
x_train.shape, x_train.shape

((220000, 28), (220000, 28))

In [19]:
from keras.utils import pad_sequences
def prepare_dataset(X, Y):
  c_set = pad_sequences(np.array([s[1:] for s in X]), maxlen=28, padding='post')
  x_set = Y
  y_set = pad_sequences(np.array([s[1:] for s in Y]), maxlen=28, padding='post')
  context = []
  labels = []
  inputs= []

  for j,x in enumerate(x_set):
    non_null_count = sum(x>0)-2
    for i in range(non_null_count):
      context.append(c_set[j])
      inputs.append(pad_sequences([x[:i+1]], maxlen=28, padding='post')[0])
      labels.append(y_set[j])

  return np.array(context), np.array(inputs), np.array(labels)


In [20]:
x_train.shape

(220000, 28)

In [21]:
context_train, inputs_train, labels_train = prepare_dataset(x_train[:200000], y_train[:200000])
context_val, inputs_val, labels_val = prepare_dataset(x_train[200000:], y_train[200000:])
context_test, inputs_test, labels_test = prepare_dataset(x_test, y_test)

In [22]:
len(context_train)

2466788

In [25]:
import numpy as np
import tensorflow as tf
import keras.backend as k
def positional_encoding(length, depth):
    depth = depth/2

    positions = np.arange(length)[:, np.newaxis]     # (seq, 1)
    depths = np.arange(depth)[np.newaxis, :]/depth   # (1, depth)

    angle_rates = 1 / (10000**depths)         # (1, depth)
    angle_rads = positions * angle_rates      # (pos, depth)

    pos_encoding = np.concatenate(
        [np.sin(angle_rads), np.cos(angle_rads)],
        axis=-1)

    return tf.cast(pos_encoding, dtype=tf.float32)
class BaseAttention(tf.keras.layers.Layer):
    def __init__(self, **kwargs):
        super().__init__()
        self.mha = tf.keras.layers.MultiHeadAttention(**kwargs)
        self.layernorm = tf.keras.layers.LayerNormalization(epsilon=1e-6)
        self.add = tf.keras.layers.Add()
class CrossAttention(BaseAttention):
    def call(self, x, context):
        attn_output, att_scores = self.mha(
            query=x,
            key=context,
            value=context,
            return_attention_scores=True
        )
        self.last_attn_scores = att_scores

        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x

class PositionalEmbedding(tf.keras.layers.Layer):
    def __init__(self, embedder, vocab_size, d_model):
        super().__init__()
        self.d_model = d_model
        self.embedding = embedder
        self.pos_encoding = positional_encoding(length=2048, depth=d_model)
    def compute_mask(self, *args, **kwargs):
        return self.embedding.compute_mask(*args, **kwargs)
    def call(self, x):
        length = tf.shape(x)[1]
        x = self.embedding(x)
        x *= tf.math.sqrt(tf.cast(self.d_model, tf.float32))
        x += self.pos_encoding[tf.newaxis, :length, :]
        return x
class GlobalSelfAttention(BaseAttention):
    def call(self, x):
        attn_output = self.mha(
            query=x,
            value=x,
            key=x
        )
        x = self.add([x, attn_output])
        x = self.layernorm(x)
        return x
class CausalSelfAttention(BaseAttention):
  def call(self, x):
    attn_output = self.mha(
        query=x,
        value=x,
        key=x,
        use_causal_mask = True)
    x = self.add([x, attn_output])
    x = self.layernorm(x)
    return x

class FeedForward(tf.keras.layers.Layer):
  def __init__(self, d_model, dff, dropout_rate=0.1):
    super().__init__()
    self.seq = tf.keras.Sequential([
      tf.keras.layers.Dense(dff, activation='relu'),
      tf.keras.layers.Dense(d_model),
      tf.keras.layers.Dropout(dropout_rate)
    ])
    self.add = tf.keras.layers.Add()
    self.layer_norm = tf.keras.layers.LayerNormalization()

  def call(self, x):
    x = self.add([x, self.seq(x)])
    x = self.layer_norm(x)
    return x

class EncoderLayer(tf.keras.layers.Layer):
  def __init__(self,*, d_model, num_heads, dff, dropout_rate=0.1):
    super().__init__()

    self.self_attention = GlobalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x):
    x = self.self_attention(x)
    x = self.ffn(x)
    return x

class Encoder(tf.keras.layers.Layer):
  def __init__(self, embedder, num_layers, d_model, num_heads,
               dff, vocab_size, dropout_rate=0.1):
    super().__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.embedding = embedder

    self.enc_layers = [
        EncoderLayer(d_model=d_model,
                     num_heads=num_heads,
                     dff=dff,
                     dropout_rate=dropout_rate)
        for _ in range(num_layers)]
    self.dropout = tf.keras.layers.Dropout(dropout_rate)

  def call(self, x):
    # `x` is token-IDs shape: (batch, seq_len)
    x = self.embedding(x)  # Shape `(batch_size, seq_len, d_model)`.

    # Add dropout.
    x = self.dropout(x)

    for i in range(self.num_layers):
      x = self.enc_layers[i](x)

    return x

class DecoderLayer(tf.keras.layers.Layer):
  def __init__(self,
               *,
               d_model,
               num_heads,
               dff,
               dropout_rate=0.1):
    super(DecoderLayer, self).__init__()

    self.causal_self_attention = CausalSelfAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.cross_attention = CrossAttention(
        num_heads=num_heads,
        key_dim=d_model,
        dropout=dropout_rate)

    self.ffn = FeedForward(d_model, dff)

  def call(self, x, context):
    x = self.causal_self_attention(x=x)
    x = self.cross_attention(x=x, context=context)

    # The last attention scores are cached for later plotting
    self.last_attn_scores = self.cross_attention.last_attn_scores

    x = self.ffn(x)  # Shape `(batch_size, seq_len, d_model)`.
    return x

class Decoder(tf.keras.layers.Layer):
  def __init__(self, embedder, num_layers, d_model, num_heads, dff, vocab_size,
               dropout_rate=0.1):
    super(Decoder, self).__init__()

    self.d_model = d_model
    self.num_layers = num_layers

    self.pos_embedding = PositionalEmbedding(embedder, vocab_size=vocab_size, d_model=d_model)

    self.dropout = tf.keras.layers.Dropout(dropout_rate)
    self.dec_layers = [
        DecoderLayer(d_model=d_model, num_heads=num_heads,
                     dff=dff, dropout_rate=dropout_rate)
        for _ in range(num_layers)]

    self.last_attn_scores = None

  def call(self, x, context):
    # `x` is token-IDs shape (batch, target_seq_len)
    x = self.pos_embedding(x)  # (batch_size, target_seq_len, d_model)

    x = self.dropout(x)

    for i in range(self.num_layers):
      x  = self.dec_layers[i](x, context)

    self.last_attn_scores = self.dec_layers[-1].last_attn_scores

    # The shape of x is (batch_size, target_seq_len, d_model).
    return x
  
import keras

@keras.saving.register_keras_serializable()
class Transformer(tf.keras.Model):
  def __init__(self, *, num_layers, d_model, num_heads, dff,
               input_vocab_size, target_vocab_size, dropout_rate=0.1):
    super().__init__()
    self.embedder = tf.keras.layers.Embedding(input_vocab_size, d_model, mask_zero=True)
    self.encoder = Encoder(self.embedder,num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=input_vocab_size,
                           dropout_rate=dropout_rate)

    self.decoder = Decoder(self.embedder,num_layers=num_layers, d_model=d_model,
                           num_heads=num_heads, dff=dff,
                           vocab_size=target_vocab_size,
                           dropout_rate=dropout_rate)

    self.final_layer = tf.keras.layers.Dense(target_vocab_size)

  def call(self, inputs):
    # All inputs must be passed in the first argument to use '.fit'

    context, x  = inputs
    context = self.encoder(context)  # (batch_size, context_len, d_model)

    x = self.decoder(x, context)  # (batch_size, target_len, d_model)

    # Final linear layer output.
    logits = self.final_layer(x)  # (batch_size, target_len, target_vocab_size)

    try:
      # Keras mask is dropped, so it doesn't scale with losses or metrics.
      del logits._keras_mask
    except AttributeError:
      pass

    # Return the final output and the attention weights.
    return logits

@keras.saving.register_keras_serializable()
class CustomSchedule(tf.keras.optimizers.schedules.LearningRateSchedule):
  def __init__(self, d_model, warmup_steps=4000):
    super().__init__()

    self.d_model = d_model
    self.d_model = tf.cast(self.d_model, tf.float32)

    self.warmup_steps = warmup_steps

  def __call__(self, step):
    step = tf.cast(step, dtype=tf.float32)
    arg1 = tf.math.rsqrt(step)
    arg2 = step * (self.warmup_steps ** -1.5)

    return tf.math.rsqrt(self.d_model) * tf.math.minimum(arg1, arg2)
  def get_config(self):
        return {
            'd_model': self.d_model,
            'warmup_steps': self.warmup_steps
        }



# num_layers = 8
num_layers = 8
# d_model = 128
d_model = 128
# dff = 512
dff = 128
# num_heads = 8
num_heads = 4
dropout_rate = 0.2
transformer = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=10_000,
    target_vocab_size=10_000,
    dropout_rate=dropout_rate)
learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)

K_VALUE = 0.97

# Defining a custom loss function that works directly on tokens
def custom_masked_loss(label, pred):

    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_object(label, pred)

    final_array = tf.pow(K_VALUE,tf.cast(tf.range(1,28+1),tf.float32))

    mask = tf.cast(mask, dtype=loss.dtype)
    mask*=final_array

    loss *= mask

    loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
    return loss

# Defining a custom metric that works directly on tokens
def masked_accuracy(label, pred):
    pred = tf.argmax(pred, axis=2)
    label = tf.cast(label, pred.dtype)
    match = label == pred

    mask = label != 0

    match = match & mask

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(match)/tf.reduce_sum(mask)

transformer.compile(
    loss=custom_masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

transformer.build(input_shape = [(None, 28), (None, 28)])

transformer.summary()



In [43]:
# result = transformer.predict((np.array([context_train[0]]), np.array([inputs_train[0]])))
# tf.argmax(result, axis=2), context_train[0]

In [23]:
# print(custom_masked_loss(y_train[0],result))
transformer.fit(
    (context_train[:1], inputs_train[:1]),
    labels_train[:1],
    epochs=1,
    batch_size=1,
    # callbacks = [es],
    validation_data = ((context_val[:1], inputs_val[:1]), labels_val[:1]))
transformer.summary()



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m37s[0m 37s/step - loss: 9.2709 - masked_accuracy: 0.0000e+00 - val_loss: 9.2399 - val_masked_accuracy: 0.0000e+00


In [26]:
print("Num GPUs Available: ", len(tf.config.list_physical_devices('GPU')))

Num GPUs Available:  0


In [27]:
context_train.shape, context_val.shape, context_test.shape

((2466788, 28), (247184, 28), (262026, 28))

In [28]:
from keras.callbacks import EarlyStopping, ModelCheckpoint

# Callbacks
es = EarlyStopping(monitor='val_masked_accuracy', mode='max', verbose=1, patience=2)

epochs = 20
batch_size= 128

checkpoint_filepath = '/content/drive/MyDrive/UNIBO_DEEP_LEARNING/latest.weights.h5'
checkpoint_filepath = './latest.weights.h5'
checkpoint_callback = ModelCheckpoint(
    filepath=checkpoint_filepath,
    save_weights_only=True, 
    save_best_only=True,
    monitor='masked_accuracy',# Only save the weights
    save_freq=1000,          # Save every 1000 weight updates
    verbose=1                # Verbosity level (optional)
)
history = transformer.fit(
    (context_train, inputs_train),
    labels_train,
    epochs=epochs,
    batch_size=batch_size,
    callbacks=[checkpoint_callback, es],
    validation_data = ((context_val, inputs_val), labels_val)
)

Epoch 1/20




[1m  999/19272[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m9:06:59[0m 2s/step - loss: 7.8880 - masked_accuracy: 0.1356
Epoch 1: masked_accuracy improved from -inf to 0.20870, saving model to ./latest.weights.h5
[1m 1221/19272[0m [32m━[0m[37m━━━━━━━━━━━━━━━━━━━[0m [1m9:02:13[0m 2s/step - loss: 7.6311 - masked_accuracy: 0.1509

In [26]:
transformer.save_weights("weights_final.weights.h5", overwrite=True)


In [130]:


transformer2 = Transformer(
    num_layers=num_layers,
    d_model=d_model,
    num_heads=num_heads,
    dff=dff,
    input_vocab_size=10_000,
    target_vocab_size=10_000,
    dropout_rate=dropout_rate)

learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)


transformer2.compile(
    loss=custom_masked_loss,
    optimizer=optimizer,
    metrics=[masked_accuracy])

transformer2.build(input_shape = [(None, 28), (None, 28)])
history = transformer2.fit(
    (context_train[:2], inputs_train[:2]),
    labels_train[:2],
    epochs=epochs,
    batch_size=batch_size,
    # callbacks = [es],
    validation_data = ((context_val[:1], inputs_val[:1]), labels_val[:1])
)
transformer2.load_weights('weights_final.weights.h5')



Epoch 1/10




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m20s[0m 20s/step - loss: 9.2510 - masked_accuracy: 0.0000e+00 - val_loss: 9.1760 - val_masked_accuracy: 0.0000e+00
Epoch 2/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 193ms/step - loss: 9.2510 - masked_accuracy: 0.0000e+00 - val_loss: 9.1760 - val_masked_accuracy: 0.0000e+00
Epoch 3/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 172ms/step - loss: 9.2506 - masked_accuracy: 0.0000e+00 - val_loss: 9.1760 - val_masked_accuracy: 0.0000e+00
Epoch 4/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 165ms/step - loss: 9.2498 - masked_accuracy: 0.0000e+00 - val_loss: 9.1760 - val_masked_accuracy: 0.0000e+00
Epoch 5/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 161ms/step - loss: 9.2486 - masked_accuracy: 0.0000e+00 - val_loss: 9.1761 - val_masked_accuracy: 0.0000e+00
Epoch 6/10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 164ms/step - loss: 9.2470 - m

In [136]:
transformer2.get_weights()

[array([[-0.04497169,  0.02384892, -0.02589808, ..., -0.01803808,
          0.00213627, -0.03898999],
        [ 0.02015269,  0.01465249, -0.00814648, ...,  0.02828521,
          0.00210916, -0.01975038],
        [ 0.04286704, -0.04347075,  0.02920267, ..., -0.01061537,
         -0.03098737,  0.03457798],
        ...,
        [ 0.02833286, -0.0064625 ,  0.0176892 , ...,  0.0235562 ,
          0.04772631,  0.02877942],
        [ 0.01668406, -0.02471398, -0.00863882, ..., -0.04173609,
         -0.02960943, -0.02536497],
        [-0.03886903, -0.03026325, -0.03144257, ...,  0.03853064,
         -0.01046933, -0.01145511]], dtype=float32),
 array([[[ 0.00857338, -0.00964614,  0.00957859, ..., -0.00292487,
          -0.01693475, -0.01432201],
         [-0.00220412,  0.00602238,  0.009244  , ...,  0.01629256,
           0.01416738, -0.01534653],
         [ 0.0028951 ,  0.01874839, -0.00639832, ...,  0.01106662,
          -0.00563039,  0.01842386],
         [ 0.00924975,  0.01325698, -0.0056750

In [137]:

np.argmax(transformer.predict((np.array([context_train[0]]), np.array([inputs_train[0]]))),axis=-1) == np.argmax(transformer2.predict((np.array([context_train[0]]), np.array([inputs_train[0]]))),axis=-1)




[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 574ms/step
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 580ms/step


array([[ True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True,  True,  True,  True,  True,  True,  True,  True,  True,
         True]])

In [None]:
import tensorflow as tf
from keras.callbacks import EarlyStopping, ModelCheckpoint
import os

# Callbacks
# es = EarlyStopping(monitor='val_masked_accuracy', mode='max', verbose=1, patience=10)

# Define a custom callback for model saving
class SaveModelEveryEpoch(ModelCheckpoint):
    def __init__(self, filepath, **kwargs):
        super(SaveModelEveryEpoch, self).__init__(filepath, save_weights_only=True, **kwargs)

    def on_epoch_end(self, epoch, logs=None):
        if (epoch + 1) % self.save_freq == 0:
            # Save the model weights with epoch number in the filename
            filename = self.filepath.format(epoch=epoch+1)  # Epoch starts from 0
            self.model.save_weights(filename, overwrite=True)
            print()
            print(f"Model weights saved to {filename}")

# Define the path to save the models (replace with your desired location)
model_dir = '/content/gdrive/MyDrive/UNIBO_DEEP_LEARNING/'

# Create the directory to store models if it doesn't exist
if not os.path.exists(model_dir):
    os.makedirs(model_dir)

epochs = 100
batch_size = 64


checkpoint = ModelCheckpoint(
    'best.weights.h5',
    monitor="val_loss",
    verbose=0,
    save_best_only=True,
    save_weights_only=True,
    mode="auto",
    save_freq=1)
# Training phase with callbacks
history = transformer.fit(
    (context_train[:100], inputs_train[:100]),
    labels_train[:100],
    epochs=epochs,
    batch_size=batch_size,
    validation_data=((context_val[:100], inputs_val[:100]), labels_val[:100]),
    callbacks=[
        # es,
        checkpoint]
)


Epoch 1/100


ValueError: Cannot take the length of shape with unknown rank.

In [27]:
def score(s, p):
    s_np = s.numpy() if isinstance(s, tf.Tensor) else s
    p_np = p.numpy() if isinstance(p, tf.Tensor) else p
    match = SequenceMatcher(None, s_np, p_np).find_longest_match(0, len(s_np), 0, len(p_np))
    return match.size / max(len(p_np), len(s_np))

def masked_accuracy2(label, pred):
    pred = tf.argmax(pred, axis=-1)
    mask = label > 2
    m = tf.reduce_sum(tf.cast(mask, tf.int32))
    return score(label[:m], pred[:m].numpy())

x,y = x_train, y_train
for i in range(10):
    context = pad_sequences(np.array([x[i][1:]]), maxlen=28, padding='post')[0]
    current_input = pad_sequences(np.array([x[i][:1]]), maxlen=28, padding='post')[0]
    result = transformer.predict((np.array([context]), np.array([current_input])))
    label = pad_sequences(np.array([y[i][1:]]), maxlen=28, padding='post')[0]

    print(masked_accuracy2(label,result[0]))

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 1s/step
0.3333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 83ms/step
0.36363636363636365
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 73ms/step
0.38461538461538464
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
0.3333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 34ms/step
0.3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 76ms/step
0.3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
0.3333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 31ms/step
0.3333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 71ms/step
0.2727272727272727
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
0.2727272727272727


In [None]:

result = transformer.predict((np.array([context_train[0]]), np.array([inputs_train[0]])))



[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 3s/step


In [None]:
np.argmax(result,axis=2), labels_train[0]

(array([[3941, 6775,  114,  685,   21, 6429,   16, 1112,  969,   47,   10,
          224,  679,   11, 2756,    2,    2,    2,    2,    2,    2,    2,
            2,    2,  101,  101, 1112, 1112]]),
 array([3941, 6775,  114,  685,   21, 6429,   16, 1112,  969,   47,   10,
         224,  679,   11, 2756,    2,    0,    0,    0,    0,    0,    0,
           0,    0,    0,    0,    0,    0], dtype=int32))

In [36]:
score(x[i],y[i])

0.5357142857142857

In [28]:
import numpy as np
from keras.preprocessing.sequence import pad_sequences

# Example data
x, y = x_train, y_train

# Function to compute masked accuracy outside the model
def compute_masked_accuracy(label, pred):
    pred = tf.argmax(pred, axis=-1)
    mask = label > 2
    m = tf.reduce_sum(tf.cast(mask, tf.int32))
    return score(label[:m], pred[:m].numpy())

gs = []
rs = []
num_examples = 1000
for i in range(num_examples):
    context = pad_sequences(np.array([x[i][1:]]), maxlen=28, padding='post')
    current_input = pad_sequences(np.array([x[i][:1]]), maxlen=28, padding='post')
    result = transformer.predict((context, current_input))
    label = pad_sequences(np.array([y[i][1:]]), maxlen=28, padding='post')[0]
    random_accuracy = score(y[i][:sum(y[i]>0)], x[i][:sum(x[i]>0)])
    rs.append(random_accuracy)
    accuracy = compute_masked_accuracy(label, result[0])
    print(f'Example {i}: Masked Accuracy = {accuracy}')
    gs.append(accuracy)

sum(rs)/num_examples, sum(gs)/num_examples

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 41ms/step
Example 0: Masked Accuracy = 0.3333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 33ms/step
Example 1: Masked Accuracy = 0.36363636363636365
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Example 2: Masked Accuracy = 0.38461538461538464
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
Example 3: Masked Accuracy = 0.3333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 88ms/step
Example 4: Masked Accuracy = 0.3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
Example 5: Masked Accuracy = 0.3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 56ms/step
Example 6: Masked Accuracy = 0.3333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 42ms/step
Example 7: Masked Accuracy = 0.3333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/s

(0.1351170592449544, 0.27586692298499665)

In [31]:
# result = transformer.predict((np.array([context_train[1]]), np.array([inputs_train[1]])))
# score(labels_train[1],tf.argmax(result, axis=2)[0])

In [32]:
# context_train[0]

In [33]:
# labels_train[0]

In [None]:
available_tokens = context_train[0][1:sum((context_train[0]>0))-1].tolist()
print(available_tokens)
relevant_logits = result[0, 0, available_tokens]
print(relevant_logits)
available_tokens[np.argmax(relevant_logits,axis=-1)]

[6775, 114, 10, 685, 16, 2756, 1112, 11, 679, 47, 6429, 21, 3941, 969]
[ 6.204188   5.558384  -0.7421779  7.5927677  1.9305757  2.8521326
  7.6264124 -0.3337785  1.5587531  1.852251   6.119429   0.523014
 14.912247   7.537262 ]


3941

In [None]:
vocabulary_size = len(vocabulary)
batch_size = 32

In [None]:
test_generator = ModifiedDataGenerator((x_test,y_test), batch_size=batch_size, vocabulary_size=vocabulary_size)
train_generator = ModifiedDataGenerator((x_train,y_train), batch_size=batch_size, vocabulary_size=vocabulary_size)


In [29]:
from difflib import SequenceMatcher
from keras.preprocessing.sequence import pad_sequences

def score(s, p):
    match = SequenceMatcher(None, s, p).find_longest_match(0, len(s), 0, len(p))
    return match.size / max(len(p), len(s))

# Limit the number of examples for testing


def get_score(x, y, model):

  max_count = sum(x > 3)
  generated = pad_sequences(np.array([x[:1]]), maxlen=28, padding='post')[0]
  available_tokens = x[1:sum((x>0))-1].tolist()
  original = y[1:]
  original = original[:sum(original>0)-1]

  for count in range(max_count):
    current_context = x
    prediction = model.predict((np.array([current_context]), np.array([generated])), verbose=0)

    mask = x > 0

    relevant_logits = prediction[0, count, available_tokens]

    generated_index = np.argmax(relevant_logits, axis=-1)
    generated_token = available_tokens[generated_index]

    available_tokens.remove(generated_token)
    generated[count+1] = generated_token

  generated = generated[1:sum(generated>0)]

  return score(original, generated)

num_examples = 100
rs = []
gs = []
x,y = x_train[20000:], y_train[20000:]
for i in range(num_examples):
  a = x[i][1:sum(x[i]>0)-1]
  b = y[i][1:sum(y[i]>0)-1]

  rs.append(score(b, a))
  gs.append(get_score(x[i],y[i], transformer))
  print(f'{i+1}.Random: {rs[-1]}')
  print(f'{i+1}.Generated: {gs[-1]}')
  
sum(rs)/num_examples, sum(gs)/num_examples

1.Random: 0.13333333333333333
1.Generated: 0.26666666666666666
2.Random: 0.09090909090909091
2.Generated: 0.18181818181818182
3.Random: 0.1111111111111111
3.Generated: 0.2222222222222222
4.Random: 0.09090909090909091
4.Generated: 0.2727272727272727
5.Random: 0.3333333333333333
5.Generated: 0.3333333333333333
6.Random: 0.125
6.Generated: 0.25
7.Random: 0.17647058823529413
7.Generated: 0.17647058823529413
8.Random: 0.07142857142857142
8.Generated: 0.14285714285714285
9.Random: 0.36363636363636365
9.Generated: 0.2727272727272727
10.Random: 0.08333333333333333
10.Generated: 0.25
11.Random: 0.16666666666666666
11.Generated: 0.5
12.Random: 0.15384615384615385
12.Generated: 0.38461538461538464
13.Random: 0.16666666666666666
13.Generated: 0.4166666666666667
14.Random: 0.058823529411764705
14.Generated: 0.23529411764705882
15.Random: 0.18181818181818182
15.Generated: 0.6363636363636364
16.Random: 0.2222222222222222
16.Generated: 0.4444444444444444
17.Random: 0.08333333333333333
17.Generated: 0.

(0.15757758027146557, 0.3208199359549978)

In [30]:
num_examples = 100
rs = []
gs = []
x,y = x_test[2000:], y_test[2000:]
for i in range(num_examples):
  a = x[i][1:sum(x[i]>0)-1]
  b = y[i][1:sum(y[i]>0)-1]

  rs.append(score(b, a))
  gs.append(get_score(x[i],y[i], transformer))
  print(f'{i+1}.Random: {rs[-1]}')
  print(f'{i+1}.Generated: {gs[-1]}')

sum(rs)/num_examples, sum(gs)/num_examples

1.Random: 0.15384615384615385
1.Generated: 0.15384615384615385
2.Random: 0.1875
2.Generated: 0.125
3.Random: 0.11764705882352941
3.Generated: 0.11764705882352941
4.Random: 0.18181818181818182
4.Generated: 0.36363636363636365
5.Random: 0.3
5.Generated: 0.5
6.Random: 0.23076923076923078
6.Generated: 0.38461538461538464
7.Random: 0.13333333333333333
7.Generated: 0.13333333333333333
8.Random: 0.15384615384615385
8.Generated: 0.3076923076923077
9.Random: 0.07692307692307693
9.Generated: 0.23076923076923078
10.Random: 0.125
10.Generated: 0.1875
11.Random: 0.1111111111111111
11.Generated: 0.1111111111111111
12.Random: 0.09090909090909091
12.Generated: 1.0
13.Random: 0.058823529411764705
13.Generated: 0.11764705882352941
14.Random: 0.07692307692307693
14.Generated: 0.38461538461538464
15.Random: 0.09090909090909091
15.Generated: 0.09090909090909091
16.Random: 0.1
16.Generated: 0.4
17.Random: 0.07692307692307693
17.Generated: 0.23076923076923078
18.Random: 0.09090909090909091
18.Generated: 0.18

(0.13646110562983613, 0.29228569340875826)

# Metrics

Let s be the source string and p your prediction. The quality of the results will be measured according to the following metric:

1.  look for the longest substring w between s and p
2.  compute |w|/max(|s|,|p|)

If the match is exact, the score is 1.

When computing the score, you should NOT consider the start and end tokens.



The longest common substring can be computed with the SequenceMatcher function of difflib, that allows a simple definition of our metric.

Let's do an example.

In [None]:
original = "at first henry wanted to be friends with the king of france"
generated = "henry wanted to be friends with king of france at the first"

print("your score is ",score(original,generated))

your score is  0.5423728813559322


The score must be computed as an average of at least 3K random examples taken form the test set.

# What to deliver

You are supposed to deliver a single notebook, suitably commented.
The notebook should describe a single model, although you may briefly discuss additional attempts you did.

The notebook should contain a full trace of the training.
Weights should be made available on request.

You must also give a clear assesment of the performance of the model, computed with the metric that has been given to you.

# Good work!

In [29]:
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, GRU, Dense

embedding_dim = 50
gru_units = 100

model = Sequential([
    Embedding(input_dim=10000, output_dim=embedding_dim, input_length=28),
    GRU(gru_units, return_sequences=True),
    Dense(10000, activation='softmax')
])
K_VALUE = 0.97

# Defining a custom loss function that works directly on tokens
def custom_masked_loss(label, pred):

    mask = label != 0
    loss_object = tf.keras.losses.SparseCategoricalCrossentropy(
        from_logits=True, reduction='none')
    loss = loss_object(label, pred)

    final_array = tf.pow(K_VALUE,tf.cast(tf.range(1,28+1),tf.float32))

    mask = tf.cast(mask, dtype=loss.dtype)
    mask*=final_array

    loss *= mask

    loss = tf.reduce_sum(loss)/tf.reduce_sum(mask)
    return loss

# Defining a custom metric that works directly on tokens
def masked_accuracy(label, pred):
    pred = tf.argmax(pred, axis=2)
    label = tf.cast(label, pred.dtype)
    match = label == pred

    mask = label != 0

    match = match & mask

    match = tf.cast(match, dtype=tf.float32)
    mask = tf.cast(mask, dtype=tf.float32)
    return tf.reduce_sum(match)/tf.reduce_sum(mask)


model.compile(optimizer='adam', loss=custom_masked_loss, metrics=[masked_accuracy])
model.summary()


Model: "sequential_17"
_________________________________________________________________
 Layer (type)                Output Shape              Param #   
 embedding_2 (Embedding)     (None, 28, 50)            500000    
                                                                 
 gru_1 (GRU)                 (None, 28, 100)           45600     
                                                                 
 dense_34 (Dense)            (None, 28, 10000)         1010000   
                                                                 
Total params: 1555600 (5.93 MB)
Trainable params: 1555600 (5.93 MB)
Non-trainable params: 0 (0.00 Byte)
_________________________________________________________________


In [39]:
inputs_train[0]

array([3, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 0, 0], dtype=int32)

In [20]:
y_train[0]

array([   3, 1142,  907,    8,  155,   23, 3875,  724, 1142,    6, 4083,
         11,   40,  885,    9, 1858,    4, 4548,  160,    2,    0,    0,
          0,    0,    0,    0,    0,    0])

In [22]:
original_target[0]

array([[0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.],
       ...,
       [1., 0., 0., ..., 0., 0., 0.],
       [1., 0., 0., ..., 0., 0., 0.],
       [0., 0., 0., ..., 0., 0., 0.]], dtype=float32)

In [21]:
# def prepare_target_data(sequences, vocab_size):
#     targets = np.zeros((len(sequences), 28, vocab_size), dtype=np.float32)
#     for i, seq in enumerate(sequences):
#         for t, word_id in enumerate(seq):
#             if t > 0:
#                 targets[i, t-1, word_id] = 1
#     return targets

# original_target = prepare_target_data(y_train[:100], 10000)
# original_target.shape

(100, 28, 10000)

In [30]:
batch_size = 128
epochs = 10

model.fit(x_train[:10000], y_train[:10000], batch_size=batch_size, epochs=epochs, validation_split=0.2)


Epoch 1/10


  output, from_logits = _get_logits(


Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10

KeyboardInterrupt: 

In [31]:
r = model.predict(np.array([x_train[0]]))
r.shape



(1, 28, 10000)

In [32]:
np.argmax(r, axis=-1)

array([[ 3, 25,  8,  8,  4,  4,  4,  4,  4,  4,  4,  2,  2,  4,  4,  2,
         2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2,  2]])

In [42]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense

embedding_dim = 128
gru_units = 100

# Encoder
encoder_inputs = Input(shape=(28,))
encoder_embedding = Embedding(input_dim=10000, output_dim=embedding_dim, input_length=28)(encoder_inputs)
encoder_gru = GRU(gru_units, return_state=True)
_, encoder_state = encoder_gru(encoder_embedding)

# Decoder
decoder_inputs = Input(shape=(28,))
decoder_embedding = Embedding(input_dim=10000, output_dim=embedding_dim, input_length=28)(decoder_inputs)
decoder_gru = GRU(gru_units, return_sequences=True, return_state=True)
decoder_outputs, _ = decoder_gru(decoder_embedding, initial_state=encoder_state)
decoder_dense = Dense(10000, activation='softmax')
decoder_outputs = decoder_dense(decoder_outputs)

# Seq2Seq Model
model = Model([encoder_inputs, decoder_inputs], decoder_outputs)

learning_rate = CustomSchedule(d_model)

optimizer = tf.keras.optimizers.Adam(learning_rate, beta_1=0.9, beta_2=0.98,
                                     epsilon=1e-9)
model.compile(optimizer=optimizer, loss=custom_masked_loss, metrics=[masked_accuracy])
model.summary()


Model: "model_3"
__________________________________________________________________________________________________
 Layer (type)                Output Shape                 Param #   Connected to                  
 input_7 (InputLayer)        [(None, 28)]                 0         []                            
                                                                                                  
 input_8 (InputLayer)        [(None, 28)]                 0         []                            
                                                                                                  
 embedding_9 (Embedding)     (None, 28, 128)              1280000   ['input_7[0][0]']             
                                                                                                  
 embedding_10 (Embedding)    (None, 28, 128)              1280000   ['input_8[0][0]']             
                                                                                            

In [None]:
batch_size = 64
epochs = 20

model.fit([context_train[:100], inputs_train[:100]], labels_train[:100], batch_size=batch_size, epochs=epochs, validation_data=([context_val[:10], inputs_val[:10]], labels_val[:10]))


Epoch 1/20
Epoch 2/20
Epoch 3/20
Epoch 4/20
Epoch 5/20
Epoch 6/20
Epoch 7/20
Epoch 8/20
Epoch 9/20
Epoch 10/20
Epoch 11/20
Epoch 12/20
Epoch 13/20
Epoch 14/20
Epoch 15/20
Epoch 16/20
Epoch 17/20
Epoch 18/20
Epoch 19/20
Epoch 20/20


In [38]:
from difflib import SequenceMatcher
from keras.preprocessing.sequence import pad_sequences

def score(s, p):
    match = SequenceMatcher(None, s, p).find_longest_match(0, len(s), 0, len(p))
    return match.size / max(len(p), len(s))

# Limit the number of examples for testing


def get_score(x, y, model):

  max_count = sum(x > 3)
  generated = pad_sequences(np.array([x[:1]]), maxlen=28, padding='post')[0]
  available_tokens = x[1:sum((x>0))-1].tolist()
  original = y[1:]
  original = original[:sum(original>0)-1]

  for count in range(max_count):
    current_context = x
    prediction = model.predict((np.array([current_context]), np.array([generated])), verbose=0)

    mask = x > 0

    relevant_logits = prediction[0, count, available_tokens]

    generated_index = np.argmax(relevant_logits, axis=-1)
    generated_token = available_tokens[generated_index]

    available_tokens.remove(generated_token)
    generated[count+1] = generated_token

  generated = generated[1:sum(generated>0)]

  return score(original, generated)

num_examples = 100
rs = []
gs = []
x,y = x_train, y_train
for i in range(num_examples):
  a = x[i][1:sum(x[i]>0)-1]
  b = y[i][1:sum(y[i]>0)-1]

  rs.append(score(b, a))
  gs.append(get_score(x[i],y[i], transformer))
  print(f'Random: {rs[-1]}')
  print(f'Generated: {gs[-1]}')

sum(rs)/num_examples, sum(gs)/num_examples

Random: 0.1111111111111111
Generated: 0.05555555555555555
Random: 0.09090909090909091
Generated: 0.09090909090909091
Random: 0.23076923076923078
Generated: 0.15384615384615385
Random: 0.1111111111111111
Generated: 0.2222222222222222
Random: 0.2
Generated: 0.3
Random: 0.1
Generated: 0.1
Random: 0.13333333333333333
Generated: 0.06666666666666667


KeyboardInterrupt: 