In [1]:
!pip install faker

Collecting faker
  Downloading Faker-30.6.0-py3-none-any.whl.metadata (15 kB)
Downloading Faker-30.6.0-py3-none-any.whl (1.8 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m21.2 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: faker
Successfully installed faker-30.6.0


In [2]:
from faker import Faker
import numpy as np
import random
from babel.dates import format_date
from tqdm import tqdm
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.layers import Bidirectional, Concatenate, Permute, Dot, Input, LSTM, Multiply
from tensorflow.keras.layers import RepeatVector, Dense, Activation, Lambda
import tensorflow.keras.backend as K
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import Adam

In [3]:
Faker.seed(12345)
random.seed(12345)
fake=Faker()
LOCALES = ['en_US']

In [4]:
fake.date_object()

datetime.date(1992, 10, 30)

In [5]:
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY',
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']

In [6]:
format_date(fake.date_object(),format=random.choice(FORMATS),locale="en_US")

'23 Jul 1970'

In [7]:
fake.date_object().isoformat()

'2015-03-22'

In [8]:
def softmax(x, axis=1):

    ndim = K.ndim(x)
    if ndim == 2:
        return K.softmax(x)
    elif ndim > 2:
        e = K.exp(x - K.max(x, axis=axis, keepdims=True))
        s = K.sum(e, axis=axis, keepdims=True)
        return e / s
    else:
        raise ValueError('Cannot apply softmax to a tensor that is 1D')

In [9]:
def load_data():
  dt=fake.date_object()
  try :
    human_readable=format_date(dt,format=random.choice(FORMATS),locale="en_US")
    human_readable=human_readable.lower()
    human_readable=human_readable.replace(",","")
    machine_readable=dt.isoformat()
  except AttributeError as e :
    return None,None,None
  return human_readable,machine_readable,dt



In [10]:
def load_dataset(m):
  human_vocab=set()
  machine_vocab=set()
  dataset=[]
  for i in tqdm(range(m)):
    h,m,_=load_data()
    if h is not None :
      dataset.append((h,m))
      human_vocab.update((h))
      machine_vocab.update((m))
  human=dict(zip(sorted(human_vocab)+["<unk>","<pad>"],list(range(len(human_vocab)+2))))
  inv_machine=dict(enumerate(sorted(machine_vocab)))
  machine={v:k for k,v in inv_machine.items()}
  return dataset,human,machine,inv_machine




In [11]:
load_dataset(5)

100%|██████████| 5/5 [00:00<00:00, 7747.14it/s]


([('14.05.86', '1986-05-14'),
  ('3/10/90', '1990-03-10'),
  ('tuesday august 12 1980', '1980-08-12'),
  ('saturday january 6 2001', '2001-01-06'),
  ('saturday november 11 1978', '1978-11-11')],
 {' ': 0,
  '.': 1,
  '/': 2,
  '0': 3,
  '1': 4,
  '2': 5,
  '3': 6,
  '4': 7,
  '5': 8,
  '6': 9,
  '7': 10,
  '8': 11,
  '9': 12,
  'a': 13,
  'b': 14,
  'd': 15,
  'e': 16,
  'g': 17,
  'j': 18,
  'm': 19,
  'n': 20,
  'o': 21,
  'r': 22,
  's': 23,
  't': 24,
  'u': 25,
  'v': 26,
  'y': 27,
  '<unk>': 28,
  '<pad>': 29},
 {'-': 0,
  '0': 1,
  '1': 2,
  '2': 3,
  '3': 4,
  '4': 5,
  '5': 6,
  '6': 7,
  '7': 8,
  '8': 9,
  '9': 10},
 {0: '-',
  1: '0',
  2: '1',
  3: '2',
  4: '3',
  5: '4',
  6: '5',
  7: '6',
  8: '7',
  9: '8',
  10: '9'})

In [12]:
m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|██████████| 10000/10000 [00:01<00:00, 9797.06it/s]


In [13]:
dataset[:10]

[('saturday october 23 1976', '1976-10-23'),
 ('22 sep 1993', '1993-09-22'),
 ('20 oct 2000', '2000-10-20'),
 ('sunday july 22 1979', '1979-07-22'),
 ('wednesday april 26 2000', '2000-04-26'),
 ('tuesday june 13 1989', '1989-06-13'),
 ('3 jul 2022', '2022-07-03'),
 ('thursday january 2 1975', '1975-01-02'),
 ('19 august 2023', '2023-08-19'),
 ('1 august 1992', '1992-08-01')]

In [14]:
def string_to_int(string,lenght,vocab):
  string=string.lower()
  string=string.replace(",","")
  if len(string)>lenght:
    string=string[:lenght]
  rep=list(map(lambda x :vocab.get(x,"<unk>"),string))
  if len(string)<lenght:
    rep+=[vocab["<pad>"]]*(lenght-len(string))
  return rep

In [15]:
def preprocess_data(dataset,tx,ty):
  X,Y=zip(*dataset)
  X=np.array([string_to_int(x,tx,human_vocab) for x in X])
  Y=np.array([string_to_int(y,ty,machine_vocab) for y in Y])
  Xoh=np.array(list(map(lambda x :to_categorical(x,num_classes=len(human_vocab)),X)))
  Yoh=np.array(list(map(lambda y :to_categorical(y,num_classes=len(machine_vocab)),Y)))
  return X,Y,Xoh,Yoh


In [16]:
X,Y,Xoh,Yoh=preprocess_data(dataset,30,10)

In [17]:
index = 0
Tx = 30
Ty = 10
print("Source date:", dataset[index][0])
print("Target date:", dataset[index][1])
print()
print("Source after preprocessing (indices):", X[index])
print("Target after preprocessing (indices):", Y[index])
print()
print("Source after preprocessing (one-hot):", Xoh[index])
print("Target after preprocessing (one-hot):", Yoh[index])

Source date: saturday october 23 1976
Target date: 1976-10-23

Source after preprocessing (indices): [29 13 30 31 28 16 13 34  0 26 15 30 26 14 17 28  0  5  6  0  4 12 10  9
 36 36 36 36 36 36]
Target after preprocessing (indices): [ 2 10  8  7  0  2  1  0  3  4]

Source after preprocessing (one-hot): [[0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 [0. 0. 0. ... 0. 0. 0.]
 ...
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]
 [0. 0. 0. ... 0. 0. 1.]]
Target after preprocessing (one-hot): [[0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 0. 0. 0. 1.]
 [0. 0. 0. 0. 0. 0. 0. 0. 1. 0. 0.]
 [0. 0. 0. 0. 0. 0. 0. 1. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 1. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 1. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [1. 0. 0. 0. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 1. 0. 0. 0. 0. 0. 0. 0.]
 [0. 0. 0. 0. 1. 0. 0. 0. 0. 0. 0.]]


## Attention layer

In [18]:
repetator=RepeatVector(Tx)
concatinator =Concatenate(axis=-1)
densor1=Dense(10,activation="tanh")
densor2=Dense(1,activation="relu")
activation=Activation(softmax,name="attention_weights")
dotor=Dot(axes=1)

In [19]:
def one_step_attention(a,s_previous):
  s_prev=repetator(s_previous)
  concat=concatinator([a,s_prev])
  e=densor1(concat)
  energies=densor2(e)
  alphas=activation(energies)
  context=dotor([alphas,a])
  return context



## Model constaction

In [20]:
n_a = 32 # number of units for the pre-attention, bi-directional LSTM's hidden state 'a'
n_s = 64 # number of units for the post-attention LSTM's hidden state "s"

# this is the post attention LSTM cell.
post_activation_LSTM_cell = LSTM(n_s, return_state = True)
output_layer = Dense(len(machine_vocab), activation=softmax)

In [28]:
def modelf(tx,ty,n_a,n_s,human_vocab,machine_vocab):
  x=Input(shape=(tx,len(human_vocab)))
  s0=Input(shape=(n_s,),name="s0")
  c0=Input(shape=(n_s,),name="c0")
  s=s0
  c=c0
  outputs=[]

  a=Bidirectional(LSTM(n_a,return_sequences=True),input_shape=(m, tx, n_a*2))(x)
  print(a)
  for i in range (ty):
    context=one_step_attention(a,s)
    s,_,c=post_activation_LSTM_cell(context,initial_state =[s,c])
    out=output_layer(s)
    outputs.append(out)

  model=Model(inputs=[x,s0,c0],outputs=outputs)
  return model


In [29]:
model=modelf(Tx,Ty,n_a,n_s,human_vocab,machine_vocab)

<KerasTensor shape=(None, 30, 64), dtype=float32, sparse=False, name=keras_tensor_103>


In [23]:
model.summary()

In [30]:
opt =Adam(learning_rate=0.005, beta_1=0.9, beta_2=0.999,decay=0.01)
model.compile(optimizer=opt,loss="categorical_crossentropy",metrics=["accuracy"]*10)

In [25]:
Yoh.swapaxes(0,1).shape

(10, 10000, 11)

In [32]:
s0 = np.zeros((m, n_s))
c0 = np.zeros((m, n_s))
Yoh= list(Yoh.swapaxes(0,1))

In [33]:
model.fit([Xoh,s0,c0],Yoh,epochs=10,batch_size=100)

Epoch 1/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m12s[0m 24ms/step - dense_2_accuracy: 0.7753 - dense_2_accuracy_1: 0.7621 - dense_2_accuracy_2: 0.4263 - dense_2_accuracy_3: 0.2410 - dense_2_accuracy_4: 0.9851 - dense_2_accuracy_5: 0.8187 - dense_2_accuracy_6: 0.3490 - dense_2_accuracy_7: 0.9179 - dense_2_accuracy_8: 0.3987 - dense_2_accuracy_9: 0.2484 - loss: 12.0107
Epoch 2/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 24ms/step - dense_2_accuracy: 0.9936 - dense_2_accuracy_1: 0.9939 - dense_2_accuracy_2: 0.9548 - dense_2_accuracy_3: 0.9633 - dense_2_accuracy_4: 0.9998 - dense_2_accuracy_5: 0.9727 - dense_2_accuracy_6: 0.8870 - dense_2_accuracy_7: 1.0000 - dense_2_accuracy_8: 0.8626 - dense_2_accuracy_9: 0.9135 - loss: 1.4176
Epoch 3/10
[1m100/100[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m3s[0m 32ms/step - dense_2_accuracy: 0.9991 - dense_2_accuracy_1: 0.9998 - dense_2_accuracy_2: 0.9997 - dense_2_accuracy_3: 0.9978 - dense_2_accurac

<keras.src.callbacks.history.History at 0x7b64fcff20e0>

In [34]:
EXAMPLES = ['3 May 1979', '5 April 09', '21th of August 2016', 'Tue 10 Jul 2007', 'Saturday May 9 2018', 'March 3 2001', 'March 3rd 2001', '1 March 2001']

In [35]:
s00=np.zeros((1,n_s))
c00=np.zeros((1,n_s))

In [70]:
for example in EXAMPLES :
  source=string_to_int(example,Tx,human_vocab)
  source=np.array(list(map(lambda x : to_categorical(x,num_classes=len(human_vocab)),source)))

  source=np.expand_dims(source,axis=0)
  predictions =model.predict([source,s00,c00])
  prediction=np.argmax(predictions,axis=-1)
  output=[inv_machine_vocab[x[0]] for x in prediction]
  output="".join(output)
  print("source :",example)
  print("output :",output)


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
source : 3 May 1979
output : 1979-05-03
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
source : 5 April 09
output : 2009-04-05
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
source : 21th of August 2016
output : 2016-08-01
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
source : Tue 10 Jul 2007
output : 2007-07-10
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 23ms/step
source : Saturday May 9 2018
output : 2018-05-09
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
source : March 3 2001
output : 2001-03-03
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 22ms/step
source : March 3rd 2001
output : 2001-03-03
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 24ms/step
source : 1 March 2001
output : 2001-03-01
