In [23]:
import tqdm
from faker import Faker
import random
from babel.dates import format_date

In [24]:
fake = Faker()
fake.seed(12345)
random.seed(12345)

In [25]:
# Define format of the data we would like to generate
FORMATS = ['short',
           'medium',
           'long',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'full',
           'd MMM YYY', 
           'd MMMM YYY',
           'dd MMM YYY',
           'd MMM, YYY',
           'd MMMM, YYY',
           'dd, MMM YYY',
           'd MM YY',
           'd MMMM YYY',
           'MMMM d YYY',
           'MMMM d, YYY',
           'dd.MM.YY']

In [26]:
# change this if you want it to work with another language
LOCALES = ['en_US']

In [27]:
def load_date():
    """
        Loads some fake dates 
        :returns: tuple containing human readable string, machine readable string, and date object
    """
    dt = fake.date_object()

    try:
        human_readable = format_date(dt, format=random.choice(FORMATS),  locale='en_US') # locale=random.choice(LOCALES))
        human_readable = human_readable.lower()
        human_readable = human_readable.replace(',','')
        machine_readable = dt.isoformat()
        
    except AttributeError as e:
        return None, None, None

    return human_readable, machine_readable, dt

In [28]:
def load_dataset(m):
    """
        Loads a dataset with m examples and vocabularies
        :m: the number of examples to generate
    """
    
    human_vocab = set()
    machine_vocab = set()
    dataset = []
    Tx = 30
    

    for i in tqdm.tqdm(range(m)):
        h, m, _ = load_date()
        if h is not None:
            dataset.append((h, m))
            human_vocab.update(tuple(h))
            machine_vocab.update(tuple(m))
    
    human = dict(zip(sorted(human_vocab) + ['<unk>', '<pad>'], 
                     list(range(len(human_vocab) + 2))))
    inv_machine = dict(enumerate(sorted(machine_vocab)))
    machine = {v:k for k,v in inv_machine.items()}
 
    return dataset, human, machine, inv_machine

In [32]:
m = 10000
dataset, human_vocab, machine_vocab, inv_machine_vocab = load_dataset(m)

100%|██████████| 10000/10000 [00:00<00:00, 18487.25it/s]


In [33]:
dataset

[('27 august 1994', '1994-08-27'),
 ('thursday july 20 1972', '1972-07-20'),
 ('monday march 8 1971', '1971-03-08'),
 ('11/17/12', '2012-11-17'),
 ('monday september 22 1986', '1986-09-22'),
 ('sunday november 23 2003', '2003-11-23'),
 ('25 feb 2012', '2012-02-25'),
 ('23 06 71', '1971-06-23'),
 ('saturday may 22 2004', '2004-05-22'),
 ('thursday december 29 1988', '1988-12-29'),
 ('11 july 2018', '2018-07-11'),
 ('saturday december 19 1981', '1981-12-19'),
 ('thursday june 13 1996', '1996-06-13'),
 ('01 jun 1998', '1998-06-01'),
 ('tuesday june 29 1982', '1982-06-29'),
 ('friday september 28 2012', '2012-09-28'),
 ('april 6 2003', '2003-04-06'),
 ('may 26 1988', '1988-05-26'),
 ('april 14 2017', '2017-04-14'),
 ('november 2 2013', '2013-11-02'),
 ('03 dec 1992', '1992-12-03'),
 ('aug 14 1986', '1986-08-14'),
 ('june 19 1981', '1981-06-19'),
 ('9 june 1981', '1981-06-09'),
 ('thursday september 10 1987', '1987-09-10'),
 ('oct 10 1988', '1988-10-10'),
 ('thursday february 26 2004', '200

In [34]:
with open('/home/marcin/dates_valid.csv', 'w') as f:
    for hd, md in dataset:
        f.write(hd + ',' + md + '\n')