<a href="https://colab.research.google.com/github/mojtaba732/ML_Practice/blob/main/neural_machine_translation.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Neural Machine Translation**


This notebook builds a Neural Machine Translation (NMT) model that translates human-readable dates ("25th of June, 2009") into machine-readable dates ("2009-06-25") using an attention model.

**The following cells define helper functions.**

In [2]:
!pip3 install faker

Collecting faker
  Downloading Faker-24.1.0-py3-none-any.whl (1.8 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m1.8/1.8 MB[0m [31m11.7 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: faker
Successfully installed faker-24.1.0


In [3]:
import numpy as np
from faker import Faker
import random
from tqdm import tqdm
from babel.dates import format_date
from keras.utils import to_categorical
import keras.backend as K
import matplotlib.pyplot as plt

In [5]:
# One seed for every random source used below, so the generated data is reproducible.
SEED = 12345

fake = Faker()
Faker.seed(SEED)   # called on the Faker class, not on the `fake` instance
random.seed(SEED)  # stdlib RNG; load_date draws the date format with random.choice

In [6]:
# Define format of the data we would like to generate.
#
# load_date draws one entry with random.choice, so an entry that appears
# several times (e.g. 'full') is sampled proportionally more often.
#
# Custom patterns use lowercase 'y' (calendar year). Uppercase 'Y' is the ISO
# week-numbering year, which differs from the calendar year for some dates
# around 1 January and would then disagree with the dt.isoformat() target.
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM yyy',
           'd MMMM yyy',
           'dd MMM yyy',
           'd MMM, yyy',
           'd MMMM, yyy',
           'dd, MMM yyy',
           'd MM yy',
           'd MMMM yyy',
           'MMMM d yyy',
           'MMMM d, yyy',
           'dd.MM.yy']

In [7]:
# Locales for date generation; change this if you want it to work with another language.
# NOTE(review): this only takes effect if load_date passes a locale from this list
# to format_date — confirm.
LOCALES = ['en_US']

In [12]:
def load_date():
    """
        Generates one random date in human-readable and machine-readable form.

        The date comes from ``fake.date_object()``. The human-readable text is
        produced by ``format_date`` using a format drawn from FORMATS and a
        locale taken from LOCALES, then lower-cased and stripped of commas.

        :returns: tuple ``(human_readable, machine_readable, dt)`` where
                  ``machine_readable`` is ``dt.isoformat()``; returns
                  ``(None, None, None)`` if formatting raises AttributeError
    """
    dt = fake.date_object()

    date_format = random.choice(FORMATS)
    # Honour the LOCALES setting instead of a hard-coded 'en_US'. With a single
    # configured locale no extra random draw is made, so the seeded sequence of
    # formats is the same as with the previous hard-coded locale.
    locale = LOCALES[0] if len(LOCALES) == 1 else random.choice(LOCALES)

    try:
        human_readable = format_date(dt, format=date_format, locale=locale)
        human_readable = human_readable.lower().replace(',', '')
        machine_readable = dt.isoformat()
    except AttributeError:
        # Best effort: signal an unusable example with None values rather than raising.
        return None, None, None

    return human_readable, machine_readable, dt

In [28]:
def load_dataset(m):
    """
        Builds a dataset of generated date pairs plus character vocabularies.

        Calls load_date() m times; calls that return None for the human-readable
        string are skipped, so the dataset may hold fewer than m pairs.

        :m: the number of examples to generate
        :returns: tuple (dataset, human, machine, inv_machine)
                  dataset     - list of (human_readable, machine_readable) pairs
                  human       - dict: character seen in the human-readable strings
                                -> index, with '<unk>' and '<pad>' appended last
                  machine     - dict: character seen in the machine-readable
                                strings -> index
                  inv_machine - dict: index -> machine character
    """
    human_vocab = set()
    machine_vocab = set()
    dataset = []

    for _ in tqdm(range(m)):
        # Use distinct names so the parameter m is not rebound inside the loop.
        human_readable, machine_readable, _dt = load_date()
        if human_readable is not None:
            dataset.append((human_readable, machine_readable))
            human_vocab.update(human_readable)
            machine_vocab.update(machine_readable)

    # Indices follow sorted character order; the two special tokens come last.
    human_chars = sorted(human_vocab) + ['<unk>', '<pad>']
    human = {char: index for index, char in enumerate(human_chars)}
    inv_machine = dict(enumerate(sorted(machine_vocab)))
    machine = {char: index for index, char in inv_machine.items()}

    return dataset, human, machine, inv_machine



In [34]:
# Scratch check: dict(enumerate(sorted(...))) maps positions 0..n-1 to the values in
# ascending order, the same construction load_dataset uses for inv_machine.
dict(enumerate(sorted([14,5,21,9])))

{0: 5, 1: 9, 2: 14, 3: 21}

In [37]:
# Scratch check: iterating an enumerate object yields (index, value) tuples,
# printed here one per line.
t = enumerate(sorted([14,5,21,9]))
for x in t:
  print(x)

(0, 5)
(1, 9)
(2, 14)
(3, 21)


In [29]:
# Smoke test with 5 examples. load_dataset returns (dataset, human, machine, inv_machine):
#   a = dataset, b = human vocabulary, c = machine vocabulary, d = inverse machine vocabulary
a,b,c,d = load_dataset(5)

100%|██████████| 5/5 [00:00<00:00, 889.19it/s]

04.08.82
1982-08-04
1982-08-04
26 apr 2017
2017-04-26
2017-04-26
9/22/81
1981-09-22
1981-09-22
6/15/13
2013-06-15
2013-06-15
january 30 2000
2000-01-30
2000-01-30





In [31]:
# Generated (human-readable, machine-readable) date pairs
a

[('04.08.82', '1982-08-04'),
 ('26 apr 2017', '2017-04-26'),
 ('9/22/81', '1981-09-22'),
 ('6/15/13', '2013-06-15'),
 ('january 30 2000', '2000-01-30')]

In [32]:
# Human vocabulary: character -> index, with '<unk>' and '<pad>' appended last
b

{' ': 0,
 '.': 1,
 '/': 2,
 '0': 3,
 '1': 4,
 '2': 5,
 '3': 6,
 '4': 7,
 '5': 8,
 '6': 9,
 '7': 10,
 '8': 11,
 '9': 12,
 'a': 13,
 'j': 14,
 'n': 15,
 'p': 16,
 'r': 17,
 'u': 18,
 'y': 19,
 '<unk>': 20,
 '<pad>': 21}

In [33]:
# Machine vocabulary: character -> index
c

{'-': 0,
 '0': 1,
 '1': 2,
 '2': 3,
 '3': 4,
 '4': 5,
 '5': 6,
 '6': 7,
 '7': 8,
 '8': 9,
 '9': 10}