In [16]:
import os
import string
from datetime import datetime, timedelta, timezone

import numpy as np
import tensorflow as tf
from tensorflow import keras

In [17]:
# Input vocabulary: digits, lowercase ASCII letters and the '-' separator.
# A character's position in this string is its id (see string_to_ids).
POSSIBLE_INPUT_CHARS = f'{string.digits}{string.ascii_lowercase}-'
# Output vocabulary: digits and the '-' separator only.
POSSIBLE_OUTPUT_CHARS = f'{string.digits}-'
# Reference day from which the generated dates count backwards.
# datetime.utcnow() is deprecated since Python 3.12; this is the documented
# replacement. tzinfo is stripped so the value stays a naive UTC datetime,
# exactly like the one utcnow() returned.
CURRENT_DAY = datetime.now(timezone.utc).replace(tzinfo=None)
# Number of (input, target) rows to generate.
ROW_COUNT = 10000


def string_to_ids(s: str, chars: str) -> list[int]:
    """Map every character of ``s`` to its position in ``chars``.

    ``s`` is lower-cased first, so the lookup is case-insensitive as long as
    ``chars`` itself only contains lower-case characters.

    Args:
        s: Text to encode.
        chars: Vocabulary; a character's index in it is that character's id.

    Returns:
        One id per character of the lower-cased ``s``. Characters that do not
        occur in ``chars`` are encoded as ``-1``.
    """
    ids = []

    for char in s.lower():
        try:
            ids.append(chars.index(char))
        except ValueError:
            # Only "character not in vocabulary" is an expected failure.
            # Anything else (e.g. a wrong ``chars`` argument, Ctrl-C) must
            # surface instead of being silently encoded as -1.
            ids.append(-1)

    return ids


def shuffle(vals: tf.RaggedTensor, targets: tf.RaggedTensor) -> (tf.RaggedTensor, tf.RaggedTensor):
    """Reorder the rows of ``vals`` and ``targets`` with one shared random permutation.

    The same permutation is applied to both tensors, so row ``i`` of the first
    result still corresponds to row ``i`` of the second.

    Args:
        vals: Tensor whose first dimension indexes the samples.
        targets: Tensor gathered with the same indices as ``vals``; the
            permutation length is taken from ``vals`` only.

    Returns:
        ``(shuffled_vals, shuffled_targets)``.
    """
    row_count = vals.shape[0]
    permutation = tf.random.shuffle(tf.range(row_count))
    # gather_nd takes one index vector per output row, hence the column shape.
    gather_indices = tf.reshape(permutation, (row_count, 1))

    return tf.gather_nd(vals, gather_indices), tf.gather_nd(targets, gather_indices)


def pad_year(year: int) -> str:
    """Render ``year`` as text, left-filled with ``0`` up to four characters.

    Values whose text is already four characters or longer are returned
    unchanged. The fill is purely positional (not sign-aware).
    """
    return str(year).rjust(4, '0')


def get_date_pairs() -> (np.ndarray, np.ndarray):
    """Generate ``ROW_COUNT`` synthetic (input, target) date pairs as id sequences.

    Row ``i`` takes its month and day from ``CURRENT_DAY - i days`` and its
    year from a random permutation of the zero-padded numbers
    ``0 .. ROW_COUNT - 1``. The input is ``<year>-<month name>-<day>`` encoded
    with ``POSSIBLE_INPUT_CHARS``; the target is ``<year>-<month>-<day>``
    encoded with ``POSSIBLE_OUTPUT_CHARS``.

    Returns:
        ``(inputs, targets)`` exactly as returned by ``shuffle`` when given the
        two stacked ragged tensors. Note that, despite the annotation, these
        are not NumPy arrays.
    """
    # One distinct year string per row, handed out in random order.
    years_padded = np.array([pad_year(year) for year in range(ROW_COUNT)])
    np.random.shuffle(years_padded)

    input_rows = []
    target_rows = []

    for days_back, year in enumerate(years_padded):
        date = CURRENT_DAY - timedelta(days=days_back)
        # Only the month name and the day are kept from the real date ...
        _, month_name, day = date.strftime('%Y-%B-%d').split('-')
        # ... and the "mm-dd" tail of the ISO form; the real year is dropped.
        month_and_day = date.strftime('%Y-%m-%d')[5:]

        input_text = f'{year}-{month_name}-{day}'
        target_text = f'{year}-{month_and_day}'

        input_rows.append(tf.constant(string_to_ids(input_text, POSSIBLE_INPUT_CHARS)))
        target_rows.append(tf.constant(string_to_ids(target_text, POSSIBLE_OUTPUT_CHARS)))

    # Month names differ in length, so the inputs are stacked as ragged rows.
    ragged_inputs = tf.ragged.stack(input_rows, axis=0)
    ragged_targets = tf.ragged.stack(target_rows, axis=0)

    return shuffle(ragged_inputs, ragged_targets)


X, y = get_date_pairs()
# Shift every input id up by one before densifying: unknown characters (-1)
# become 0, which is also the value to_tensor() fills short rows with.
X = (X + 1).to_tensor()
y = y.to_tensor()

# 70 % train / 20 % validation / 10 % test, taken in row order.
row_total = X.shape[0]
seventy_percent_count = int(row_total * .7)
ninety_percent_count = int(row_total * .9)

train_rows = slice(None, seventy_percent_count)
valid_rows = slice(seventy_percent_count, ninety_percent_count)
test_rows = slice(ninety_percent_count, None)

X_train, y_train = X[train_rows], y[train_rows]
X_valid, y_valid = X[valid_rows], y[valid_rows]
X_test, y_test = X[test_rows], y[test_rows]

In [18]:
def run_basic_model(model: keras.Model, name: str, patience: int=5):
    """Compile ``model`` and train it on the notebook's train/validation split.

    Each call gets its own run index so that TensorBoard logs
    (``./tensor_logs/<name>_<index>``) and the checkpoint file
    (``./saved_models/<name>_<index>.h5``) of different runs do not overwrite
    each other. The index is one above the highest ``<name>_<number>``
    directory already present under ``./tensor_logs``.

    Args:
        model: Model to compile and fit. It is compiled here with
            sparse categorical cross-entropy, Nadam and an accuracy metric.
        name: Run name, used as prefix of the log directory and checkpoint.
        patience: Number of epochs without ``val_loss`` improvement after
            which early stopping ends the training.

    Returns:
        The value returned by ``model.fit`` (the training history).
    """
    model_dir = os.path.join(os.curdir, 'saved_models')
    run_logdir_root = os.path.join(os.curdir, 'tensor_logs')

    # Create the output directories up front; os.listdir below would
    # otherwise raise FileNotFoundError on a fresh checkout.
    os.makedirs(model_dir, exist_ok=True)
    os.makedirs(run_logdir_root, exist_ok=True)

    # Collect the indices of earlier runs of *this* model only. The loop
    # variable must not be called ``name``: that would shadow the parameter
    # and make the prefix test compare every entry with itself.
    run_prefix = f'{name}_'
    previous_run_indices = [
        int(entry[len(run_prefix):])
        for entry in os.listdir(run_logdir_root)
        if entry.startswith(run_prefix)
        and entry[len(run_prefix):].isdecimal()
        and os.path.isdir(os.path.join(run_logdir_root, entry))
    ]
    # Highest index + 1 (rather than a plain count) cannot collide with an
    # existing directory even when the indices on disk have gaps.
    run_index = max(previous_run_indices, default=-1) + 1

    run_logdir = os.path.join(run_logdir_root, f'{name}_{run_index}')
    early_stopping = keras.callbacks.EarlyStopping(monitor='val_loss', patience=patience)
    model_checkpoint = keras.callbacks.ModelCheckpoint(os.path.join(model_dir, f'{name}_{run_index}.h5'), save_best_only=True)
    tensorboard = keras.callbacks.TensorBoard(run_logdir, histogram_freq=1, profile_batch=10)

    optimizer = keras.optimizers.Nadam()
    model.compile(loss="sparse_categorical_crossentropy", optimizer=optimizer, metrics=["accuracy"])

    return model.fit(X_train, y_train, validation_data=(X_valid, y_valid), epochs=40, callbacks=[early_stopping, model_checkpoint, tensorboard])


# Size of the vector each input character id is embedded into.
embedding_size = 32
# Length of the (dense) target sequences; the encoder output is repeated this
# many times to feed the decoder.
max_output_length = y.shape[1]

# NOTE(review): the dataset was already generated and shuffled in the previous
# cell, i.e. before these seeds are set, so the data itself is not seeded.
# Consider moving the seeding above get_date_pairs().
SEED = 42
np.random.seed(SEED)
tf.random.set_seed(SEED)

In [19]:
# Input ids were shifted up by one before padding, so the embedding needs one
# more row than there are input characters.
input_vocab_size = len(POSSIBLE_INPUT_CHARS) + 1
# The softmax has one more class than there are output characters.
output_class_count = len(POSSIBLE_OUTPUT_CHARS) + 1

# Encoder: embed the character ids, then reduce the whole sequence to the
# LSTM's final output vector.
encoder = keras.models.Sequential()
encoder.add(keras.layers.Embedding(input_dim=input_vocab_size,
                                   output_dim=embedding_size,
                                   input_shape=[None]))
encoder.add(keras.layers.LSTM(128))

# Decoder: one class distribution per output time step.
decoder = keras.models.Sequential()
decoder.add(keras.layers.LSTM(128, return_sequences=True))
decoder.add(keras.layers.Dense(output_class_count, activation="softmax"))

# The encoder vector is repeated once per output position and fed to the decoder.
model = keras.models.Sequential()
model.add(encoder)
model.add(keras.layers.RepeatVector(max_output_length))
model.add(decoder)


run_basic_model(model, 'basic_encoder_model_lstm')

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40


<keras.callbacks.History at 0x27d6ba0e460>

In [20]:
# Same encoder / repeat / decoder layout as the LSTM model, with GRU cells.
# Input ids were shifted up by one before padding, so the embedding needs one
# more row than there are input characters.
input_vocab_size = len(POSSIBLE_INPUT_CHARS) + 1
# The softmax has one more class than there are output characters.
output_class_count = len(POSSIBLE_OUTPUT_CHARS) + 1

# Encoder: embed the character ids, then reduce the whole sequence to the
# GRU's final output vector.
encoder = keras.models.Sequential()
encoder.add(keras.layers.Embedding(input_dim=input_vocab_size,
                                   output_dim=embedding_size,
                                   input_shape=[None]))
encoder.add(keras.layers.GRU(128))

# Decoder: one class distribution per output time step.
decoder = keras.models.Sequential()
decoder.add(keras.layers.GRU(128, return_sequences=True))
decoder.add(keras.layers.Dense(output_class_count, activation="softmax"))

# The encoder vector is repeated once per output position and fed to the decoder.
model = keras.models.Sequential()
model.add(encoder)
model.add(keras.layers.RepeatVector(max_output_length))
model.add(decoder)


run_basic_model(model, 'basic_encoder_model_gru')

Epoch 1/40
Epoch 2/40
Epoch 3/40
Epoch 4/40
Epoch 5/40
Epoch 6/40
Epoch 7/40
Epoch 8/40
Epoch 9/40
Epoch 10/40
Epoch 11/40
Epoch 12/40
Epoch 13/40
Epoch 14/40
Epoch 15/40
Epoch 16/40
Epoch 17/40
Epoch 18/40
Epoch 19/40
Epoch 20/40
Epoch 21/40
Epoch 22/40
Epoch 23/40
Epoch 24/40
Epoch 25/40
Epoch 26/40
Epoch 27/40
Epoch 28/40
Epoch 29/40
Epoch 30/40
Epoch 31/40
Epoch 32/40
Epoch 33/40
Epoch 34/40
Epoch 35/40
Epoch 36/40
Epoch 37/40
Epoch 38/40
Epoch 39/40
Epoch 40/40


<keras.callbacks.History at 0x27fe11e3f40>

In [21]:
def ids_to_date_strs(ids: list[int]) -> str:
    """Decode a sequence of output ids into the text it represents.

    Every id is looked up in ``POSSIBLE_OUTPUT_CHARS``. Decoding all positions
    the same way means a predicted separator id inside the year or day part is
    shown as ``-`` rather than as the raw number ``10``, and sequences of any
    length are accepted.

    Args:
        ids: Predicted class ids, one per output position.

    Returns:
        The decoded string. Ids outside the output vocabulary (the softmax
        layer has one more class than the vocabulary has characters) are
        rendered as ``?`` instead of raising ``IndexError``.
    """
    vocab_size = len(POSSIBLE_OUTPUT_CHARS)

    return ''.join(
        POSSIBLE_OUTPUT_CHARS[char_id] if 0 <= char_id < vocab_size else '?'
        for char_id in ids
    )


def zero_pad_right(vec: list[int], max_length: int) -> list[int]:
    """Return a new list: ``vec`` followed by zeros up to ``max_length`` items.

    A ``vec`` that is already ``max_length`` long or longer comes back with
    the same contents (nothing is truncated).
    """
    missing = max_length - len(vec)
    padding = [0 for _ in range(missing)]

    return vec + padding

In [22]:
# Encode two hand-written dates exactly like the training inputs: character
# ids shifted up by one, then zero-padded to the training input width.
sample_dates = ['1999-June-30', '1874-September-08']
X_new = []

for date_str in sample_dates:
    shifted_ids = [char_id + 1 for char_id in string_to_ids(date_str, POSSIBLE_INPUT_CHARS)]
    X_new.append(zero_pad_right(shifted_ids, X.shape[1]))

# ``model`` is whichever model was built last (the GRU variant when the cells
# are run top to bottom). The most probable class is taken per output position.
class_probabilities = model.predict(X_new)
chosen_ids_array = np.argmax(class_probabilities, axis=-1)

for row in chosen_ids_array:
    print(ids_to_date_strs(row.tolist()))

1999-06-30
1874-09-08
