## Translating Human Readable Dates Into Machine Readable Dates

* The model we will build here could be used to translate from one language to another, such as translating from English to Hindi. 
* However, language translation requires massive datasets and usually takes days of training on GPUs. 
* So, we will perform a simpler "date translation" task. 
* The network will input a date written in a variety of possible formats (*e.g. "the 29th of August 1958", "03/30/1968", "24 JUNE 1987"*) 
* The network will translate them into standardized, machine readable dates (*e.g. "1958-08-29", "1968-03-30", "1987-06-24"*). 
* We will have the network learn to output dates in the common machine-readable format YYYY-MM-DD. 

In [51]:
import numpy as np
import random
import tensorflow as tf
from tqdm import tqdm
from faker import Faker
from babel.dates import format_date

from tensorflow.keras.models import Model, load_model
from tensorflow.keras.layers import (
    Input, LSTM, Dense, Activation, RepeatVector, Bidirectional,
    Concatenate, Dot, Permute, Multiply, Lambda
)
from tensorflow.keras.optimizers import Adam
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import backend as K

import os
import json
import pickle

In [46]:
# Seed every random source used in this notebook so that a fresh
# "Restart & Run All" reproduces the generated dates as well as the
# model's initial weights.
SEED = 12345
Faker.seed(SEED)          # fake dates drawn in load_date()
random.seed(SEED)         # random.choice(FORMATS) in load_date()
np.random.seed(SEED)      # NumPy-level randomness
tf.random.set_seed(SEED)  # TensorFlow-level randomness (e.g. Keras weight initialisation)

In [47]:
# Babel formats/patterns that load_date() samples from with random.choice().
# 'full' appears ten times so that it is drawn more often than the other entries,
# and 'd MMMM yyy' appears twice for the same reason.
#
# The year field is lower-case 'y' (calendar year). Upper-case 'Y' is the ISO
# week-numbering year, which differs from the calendar year for some dates
# around 1 January and would then contradict the dt.isoformat() target.
FORMATS = (
    ['short', 'medium', 'long']
    + ['full'] * 10
    + [
        'd MMM yyy',
        'd MMMM yyy',
        'dd MMM yyy',
        'd MMM, yyy',
        'd MMMM, yyy',
        'dd, MMM yyy',
        'd MM yy',
        'd MMMM yyy',
        'MMMM d yyy',
        'MMMM d, yyy',
        'dd.MM.yy',
    ]
)

In [48]:
# One generator shared by every call: building a new Faker() for each of the
# thousands of dates is repeated setup work that the loop does not need.
_FAKER = Faker()


def load_date():
    """
    Draw one fake date and render it in a randomly chosen human-readable format.

    The human-readable form is produced by Babel's format_date with a format
    picked from FORMATS (en_US locale), then lower-cased with commas removed.
    The machine-readable form is the date's ISO string (YYYY-MM-DD).

    :returns: tuple (human_readable, machine_readable, date_object), or
              (None, None, None) if formatting raises AttributeError
    """
    dt = _FAKER.date_object()

    try:
        human_readable = format_date(dt, format=random.choice(FORMATS), locale='en_US')
        human_readable = human_readable.lower().replace(',', '')
        machine_readable = dt.isoformat()
    except AttributeError:
        # Best-effort: the caller skips examples reported as None.
        return None, None, None

    return human_readable, machine_readable, dt

In [49]:
def load_dataset(m):
    """
    Generate a dataset of fake dates together with its character vocabularies.

    :m: the number of examples to attempt to generate (examples for which
        load_date() returns None are skipped, so the dataset can be shorter)
    :returns: tuple (dataset, human, machine, inv_machine) where
        dataset     -- list of (human_readable, machine_readable) string pairs
        human       -- dict mapping each input character to an index, with
                       '<unk>' and '<pad>' appended as the last two entries
        machine     -- dict mapping each output character to an index
        inv_machine -- dict mapping each output index back to its character
    """
    human_vocab = set()
    machine_vocab = set()
    dataset = []

    for _ in tqdm(range(m)):
        # Distinct local names: the original rebound the parameter `m` here.
        human_date, machine_date, _date_obj = load_date()
        if human_date is not None:
            dataset.append((human_date, machine_date))
            human_vocab.update(human_date)
            machine_vocab.update(machine_date)

    # Sorted characters first, then the two special tokens.
    human_tokens = sorted(human_vocab) + ['<unk>', '<pad>']
    human = {token: index for index, token in enumerate(human_tokens)}

    inv_machine = dict(enumerate(sorted(machine_vocab)))
    machine = {char: index for index, char in inv_machine.items()}

    return dataset, human, machine, inv_machine

In [50]:
# Generate m fake (human, machine) date pairs together with the character
# vocabularies built from them.
m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|██████████| 10000/10000 [04:02<00:00, 41.18it/s]


In [53]:
# Cache the generated data under ./data so it can be read back from disk.
os.makedirs("data", exist_ok=True)

# The dataset is a list of (human, machine) string tuples; it is stored with pickle.
with open("data/dataset.pkl", "wb") as f:
    pickle.dump(dataset, f)

# The vocabularies are plain dicts and are written as JSON. JSON object keys are
# always strings, so the integer keys of inv_machine_vocab are saved as "0", "1", ...
for json_path, mapping in (
    ("data/human_vocab.json", human_vocab),
    ("data/machine_vocab.json", machine_vocab),
    ("data/inv_machine_vocab.json", inv_machine_vocab),
):
    with open(json_path, "w") as f:
        json.dump(mapping, f)

In [54]:
# Load dataset
# NOTE: pickle.load can execute arbitrary code; only read a dataset.pkl that
# this notebook wrote itself.
with open("data/dataset.pkl", "rb") as f:
    loaded_dataset = pickle.load(f)

# Load vocabularies
# All keys come back from JSON as strings, so the inverse machine vocabulary is
# keyed by '0', '1', ... rather than by integers.
with open("data/human_vocab.json", "r") as f:
    loaded_human_vocab = json.load(f)

with open("data/machine_vocab.json", "r") as f:
    loaded_machine_vocab = json.load(f)

with open("data/inv_machine_vocab.json", "r") as f:
    loaded_inv_machine_vocab = json.load(f)

In [55]:
# From here on, work with the copies read back from disk. After the JSON
# round-trip inv_machine_vocab has string keys, so lookups must use str(index).
dataset = loaded_dataset
human_vocab = loaded_human_vocab
machine_vocab = loaded_machine_vocab
inv_machine_vocab = loaded_inv_machine_vocab

In [56]:
# Quick sanity check of what was loaded: size, a few pairs, and each vocabulary.
print("Total examples loaded:", len(dataset))

print("\nSample data pairs:")
for i in range(5):
    human_str, machine_str = dataset[i]
    print(f"  {i+1}. Human: {human_str}  ->  Machine: {machine_str}")

# The three vocabularies are reported in the same way: size, then the first five items.
for heading, mapping in (
    ("\nInput vocabulary (human_vocab):", human_vocab),
    ("\nOutput vocabulary (machine_vocab):", machine_vocab),
    ("\nInverse machine vocab (inv_machine_vocab):", inv_machine_vocab),
):
    print(heading)
    print(f"  Size: {len(mapping)}")
    print(f"  Example mapping: {list(mapping.items())[:5]}")

Total examples loaded: 10000

Sample data pairs:
  1. Human: 9 may 1998  ->  Machine: 1998-05-09
  2. Human: 10.11.19  ->  Machine: 2019-11-10
  3. Human: 9/10/70  ->  Machine: 1970-09-10
  4. Human: monday august 19 2024  ->  Machine: 2024-08-19
  5. Human: saturday april 28 1990  ->  Machine: 1990-04-28

Input vocabulary (human_vocab):
  Size: 37
  Example mapping: [(' ', 0), ('.', 1), ('/', 2), ('0', 3), ('1', 4)]

Output vocabulary (machine_vocab):
  Size: 11
  Example mapping: [('-', 0), ('0', 1), ('1', 2), ('2', 3), ('3', 4)]

Inverse machine vocab (inv_machine_vocab):
  Size: 11
  Example mapping: [('0', '-'), ('1', '0'), ('2', '1'), ('3', '2'), ('4', '3')]


In [57]:
def string_to_int(string, length, vocab):
    """
    Converts a string to a fixed-length list of integers based on the provided vocabulary.

    The string is lower-cased and stripped of commas, then cut to `length`
    characters. Characters missing from `vocab` are mapped to the index of
    '<unk>', and short strings are right-padded with the index of '<pad>'.

    Arguments:
    string -- Input string (e.g., 'Wed 10 Jul 2007')
    length -- Desired length of output sequence (pads or cuts accordingly)
    vocab -- Dictionary mapping characters to integer indices

    Returns:
    rep -- List of integers of size 'length', representing the string

    Raises:
    KeyError -- if a character is not in `vocab` and `vocab` has no '<unk>'
                entry, or if padding is needed and `vocab` has no '<pad>' entry
    """
    string = string.lower().replace(',', '')[:length]

    # Look the fallback up lazily: vocabularies without special tokens (such as
    # the machine vocabulary) must still work when every character is known.
    unk_index = vocab.get('<unk>')

    rep = []
    for char in string:
        index = vocab.get(char, unk_index)
        if index is None:
            raise KeyError(
                f"character {char!r} is not in vocab and vocab has no '<unk>' entry"
            )
        rep.append(index)

    missing = length - len(rep)
    if missing > 0:
        rep.extend([vocab['<pad>']] * missing)

    return rep

In [58]:
def preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty):
    """
    Turns the string pairs of a dataset into index arrays and one-hot arrays.

    Arguments:
    dataset       -- List of (human_readable, machine_readable) string pairs
    human_vocab   -- Dictionary mapping input characters to integer indices
    machine_vocab -- Dictionary mapping output characters to integer indices
    Tx            -- Fixed length of every input sequence
    Ty            -- Fixed length of every output sequence

    Returns:
    X     -- np.array of shape (m, Tx), input sequences as integer indices
    Y     -- np.array of shape (m, Ty), output sequences as integer indices
    Xoh   -- np.array of shape (m, Tx, len(human_vocab)), one-hot encoded inputs
    Yoh   -- np.array of shape (m, Ty, len(machine_vocab)), one-hot encoded outputs
    """
    human_texts, machine_texts = zip(*dataset)

    def encode(texts, seq_len, vocab):
        # Index form first, then one one-hot matrix per sequence.
        indices = np.array([string_to_int(text, seq_len, vocab) for text in texts])
        one_hot = np.array(
            [to_categorical(row, num_classes=len(vocab)) for row in indices]
        )
        return indices, one_hot

    X, Xoh = encode(human_texts, Tx, human_vocab)
    Y, Yoh = encode(machine_texts, Ty, machine_vocab)

    return X, Y, Xoh, Yoh

In [59]:
# Fixed sequence lengths: string_to_int cuts inputs longer than Tx and pads shorter ones.
Tx = 30
Ty = 10  # YYYY-MM-DD is 10 characters long.

X, Y, Xoh, Yoh = preprocess_data(dataset, human_vocab, machine_vocab, Tx, Ty)

# Report the shape of each array produced above.
for label, array in (
    ("X.shape:", X),
    ("Y.shape:", Y),
    ("Xoh.shape:", Xoh),
    ("Yoh.shape:", Yoh),
):
    print(label, array.shape)

X.shape: (10000, 30)
Y.shape: (10000, 10)
Xoh.shape: (10000, 30, 37)
Yoh.shape: (10000, 10, 11)


In [60]:
# Show one example at every stage: raw strings, index sequences, one-hot matrices.
index = 0
source_text, target_text = dataset[index]

print("Source date:", source_text)
print("Target date:", target_text)
print()
print("Source after preprocessing (indices):", X[index])
print("Target after preprocessing (indices):", Y[index])
print()
print("Source after preprocessing (one-hot):", Xoh[index])
print("Target after preprocessing (one-hot):", Yoh[index])

Source date: 9 may 1998
Target date: 1998-05-09

Source after preprocessing (indices): [12  0 24 13 34  0  4 12 12 11 36 36 36 36 36 36 36 36 36 36 36 36 36 36
 36 36 36 36 36 36]
Target after preprocessing (indices): [ 2 10 10  9  0  1  6  0  1 10]

Source after preprocessing (one-hot): [[0. 0. 0. ... 0. 0. 0.]
 [1. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
Target after preprocessing (one-hot): [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 1. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 1. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]]


## 2 - Neural Machine Translation with Attention

* If we had to translate a book's paragraph from French to English, we would not read the whole paragraph, then close the book and translate. 
* Even during the translation process, we would read/re-read and focus on the parts of the French paragraph corresponding to the parts of the English we are writing down. 
* The attention mechanism tells a Neural Machine Translation model where it should pay attention to at any step. 

### 2.1 - Attention Mechanism

In this part, we will implement the attention mechanism.  


* The diagram on the left shows the attention model. 
* The diagram on the right shows what one "attention" step does to calculate the attention variables $\alpha^{\langle t, t' \rangle}$.
* The attention variables $\alpha^{\langle t, t' \rangle}$ are used to compute the context variable $context^{\langle t \rangle}$ for each timestep in the output ($t=1, \ldots, T_y$). 

<table>
<td> 
<img src="images/attn_model.png" style="width:500;height:500px;"> <br>
</td> 
<td> 
<img src="images/attn_mechanism.png" style="width:500;height:500px;"> <br>
</td> 
</table>
<caption><center>Neural machine translation with attention</center></caption>


#### Pre-attention and Post-attention LSTMs on both sides of the attention mechanism
- There are two separate LSTMs in this model (see diagram on the left): pre-attention and post-attention LSTMs.
- The *pre-attention* Bi-LSTM, at the bottom of the picture, is a bidirectional LSTM that comes *before* the attention mechanism.
    - The attention mechanism is shown in the middle of the left-hand diagram.
    - The pre-attention Bi-LSTM goes through $T_x$ time steps
- The *post-attention* LSTM, at the top of the diagram, comes *after* the attention mechanism. 
    - The post-attention LSTM goes through $T_y$ time steps. 

- The post-attention LSTM passes the hidden state $s^{\langle t \rangle}$ and cell state $c^{\langle t \rangle}$ from one time step to the next. 

#### Each time step does not use predictions from the previous time step
* The post-attention LSTM at time $t$ does not take the previous time step's prediction $y^{\langle t-1 \rangle}$ as input.
* The post-attention LSTM at time $t$ only takes the context $context^{\langle t \rangle}$ together with the previous hidden state $s^{\langle t-1\rangle}$ and cell state $c^{\langle t-1\rangle}$ as input. 
* This design works because, unlike language generation (where adjacent characters are highly correlated), there isn't as strong a dependency between the previous character and the next character in a YYYY-MM-DD date.

In [61]:
def softmax(x, axis=1):
    """Softmax activation that can normalise along a chosen axis.

    # Arguments
        x : Tensor.
        axis: Integer, axis along which the softmax normalization is applied.
            Only used when `x` has more than two dimensions; for a 2D tensor
            the call is delegated to `K.softmax(x)` without passing `axis`.
    # Returns
        Tensor, output of softmax transformation.
    # Raises
        ValueError: In case `x` has fewer than two dimensions.
    """
    rank = K.ndim(x)

    if rank < 2:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')

    if rank == 2:
        return K.softmax(x)

    # Subtract the per-axis maximum before exponentiating for numerical stability.
    shifted = x - K.max(x, axis=axis, keepdims=True)
    exps = K.exp(shifted)
    return exps / K.sum(exps, axis=axis, keepdims=True)

In [62]:
# Layers used by one_step_attention(). They are created once here, at module
# level, so every call to one_step_attention() reuses the same layer objects.
# Copies the decoder state Tx times so it can be paired with each encoder step.
repeator = RepeatVector(Tx)
# Joins encoder activations and the repeated decoder state along the feature axis.
concatenator = Concatenate(axis=-1)
# Small two-layer network that turns each concatenated vector into one scalar energy.
densor1 = Dense(10, activation = "tanh")
densor2 = Dense(1, activation = "relu")
activator = Activation(softmax, name='attention_weights') # We are using a custom softmax(axis = 1) loaded in this notebook
# Dot product over axis 1 of both inputs: the attention-weighted sum of encoder states.
dotor = Dot(axes = 1)

In [63]:
def one_step_attention(a, s_prev):
    """
    Performs one attention step and returns the resulting context vector.

    The previous decoder state is paired with every encoder time step, scored
    by the shared dense layers, normalised with the custom softmax over the
    time axis, and used to take a weighted sum of the encoder states.

    Arguments:
    a -- Encoder hidden states, shape (m, Tx, 2*n_a)
    s_prev -- Previous decoder hidden state, shape (m, n_s)

    Returns:
    context -- Context vector, shape (m, 1, 2*n_a)
    """
    s_tiled = repeator(s_prev)                            # (m, Tx, n_s)
    joined = concatenator([a, s_tiled])                   # (m, Tx, 2*n_a + n_s)
    scores = densor2(densor1(joined))                     # (m, Tx, 10) -> (m, Tx, 1)
    attention_weights = activator(scores)                 # (m, Tx, 1)
    return dotor([attention_weights, a])                  # (m, 1, 2*n_a)

In [64]:
n_a = 32 # number of units for the pre-attention, bi-directional LSTM's hidden state 'a'
n_s = 64 # number of units for the post-attention LSTM's hidden state "s"

# Post-attention LSTM, created once and reused at every output time step in modelf().
# return_state=True makes it return its hidden and cell states along with its output.
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
# Shared output layer: one probability per character of the machine vocabulary.
output_layer = Dense(len(machine_vocab), activation=softmax)

In [65]:
def modelf(Tx, Ty, n_a, n_s, human_vocab_size, machine_vocab_size=None):
    """
    Assembles the attention-based sequence-to-sequence Keras model.

    A bidirectional LSTM encodes the one-hot input. For each of the Ty output
    steps, one_step_attention() produces a context vector from the encoder
    states and the current decoder state, the shared post-attention LSTM
    updates its states, and the shared output layer emits a distribution.

    Arguments:
    Tx -- input sequence length
    Ty -- output sequence length
    n_a -- Bi-LSTM hidden units
    n_s -- post-attention LSTM hidden units
    human_vocab_size -- size of input vocabulary
    machine_vocab_size -- (optional) size of output vocabulary; not used by
                          this function, the output size is fixed by the
                          module-level `output_layer`

    Returns:
    model -- Keras model instance taking [X, s0, c0] and returning a list of
             Ty outputs
    """
    X = Input(shape=(Tx, human_vocab_size))
    s0 = Input(shape=(n_s,), name='s0')
    c0 = Input(shape=(n_s,), name='c0')

    # Pre-attention encoder over the whole input sequence.
    encoder = Bidirectional(LSTM(n_a, return_sequences=True))
    a = encoder(X)

    hidden, cell = s0, c0
    outputs = []
    for _step in range(Ty):
        context = one_step_attention(a, hidden)
        # Only the updated states are needed; the LSTM's output is discarded.
        _seq_out, hidden, cell = post_activation_LSTM_cell(
            context, initial_state=[hidden, cell]
        )
        outputs.append(output_layer(hidden))

    return Model(inputs=[X, s0, c0], outputs=outputs)

In [66]:
# Build the attention model. The output vocabulary size is not passed here: it
# is determined by the module-level `output_layer` that modelf() uses.
model = modelf(Tx, Ty, n_a, n_s, len(human_vocab))

In [67]:
# Print the model's layers, output shapes, parameter counts and connections.
model.summary()

Model: "functional_3"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_2 (InputLayer)            [(None, 30, 37)]     0                                            
__________________________________________________________________________________________________
s0 (InputLayer)                 [(None, 64)]         0                                            
__________________________________________________________________________________________________
bidirectional_1 (Bidirectional) (None, 30, 64)       17920       input_2[0][0]                    
__________________________________________________________________________________________________
repeat_vector_1 (RepeatVector)  (None, 30, 64)       0           s0[0][0]                         
                                                                 lstm_2[0][1]          

In [68]:
# `learning_rate` is the supported keyword for the step size; `lr` is a
# deprecated alias of it.
# NOTE(review): `decay` is kept as in the original; confirm that the installed
# Keras optimizer still accepts it.
opt = Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.999, decay=0.01)
# One categorical cross-entropy loss (and accuracy metric) per output time step.
model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy'])

In [69]:
# Swap the example axis and the time axis of the one-hot targets,
# (m, Ty, vocab) -> (Ty, m, vocab), and display the resulting shape.
a = Yoh.swapaxes(0, 1)
a.shape

(10, 10000, 11)

In [70]:
# Take the number of examples from the data itself so the zero initial states
# always line up with Xoh, whatever the size of the loaded dataset.
m = Xoh.shape[0]
s0 = np.zeros((m, n_s))  # initial hidden state of the post-attention LSTM
c0 = np.zeros((m, n_s))  # initial cell state of the post-attention LSTM
# The model has one output per time step, so the targets are split into a list
# of Ty arrays, each of shape (m, len(machine_vocab)).
outputs = list(Yoh.swapaxes(0, 1))

In [71]:
# Train for a single epoch on the one-hot inputs plus the zero initial states.
model.fit([Xoh, s0, c0], outputs, epochs=1, batch_size=100)



<tensorflow.python.keras.callbacks.History at 0x739d6a498ed0>

In [72]:
# Replace the weights from the one-epoch fit above with those stored in
# models/model.h5 (presumably a longer-trained checkpoint -- confirm the file exists).
model.load_weights('models/model.h5')

We can now see the results on new examples.

In [73]:
# Hand-written dates used to try the model out.
EXAMPLES = [
    '3 May 1979',
    '5 April 09',
    '21th of August 2016',
    'Tue 10 Jul 2007',
    'Saturday May 9 2018',
    'March 3 2001',
    'March 3rd 2001',
    '1 March 2001',
]

# Zero initial decoder states for a batch containing a single example.
s00 = np.zeros((1, n_s))
c00 = np.zeros((1, n_s))

for example in EXAMPLES:
    # Encode the text as indices, one-hot encode it, and add a batch axis: (1, Tx, vocab).
    source = to_categorical(
        string_to_int(example, Tx, human_vocab), num_classes=len(human_vocab)
    )[np.newaxis, ...]

    # The model returns one distribution per output step; keep the most likely index of each.
    prediction = np.argmax(model.predict([source, s00, c00]), axis=-1)

    # inv_machine_vocab was read from JSON, so its keys are strings.
    output = [inv_machine_vocab[str(i)] for i in prediction.flatten()]

    print("source:", example)
    print("output:", ''.join(output), "\n")

source: 3 May 1979
output: 1979-05-33 

source: 5 April 09
output: 2009-04-05 

source: 21th of August 2016
output: 2016-08-20 

source: Tue 10 Jul 2007
output: 2007-07-10 

source: Saturday May 9 2018
output: 2018-05-09 

source: March 3 2001
output: 2001-03-03 

source: March 3rd 2001
output: 2001-03-03 

source: 1 March 2001
output: 2001-03-01 

