In [2]:
from datetime import datetime
import random
from pathlib import Path
from typing import Tuple

import pandas as pd

In [12]:
class DateGenerator:
    """
    Generate a date string dataset with dates in a random string format and the standard format.

    Every sample pairs one random calendar date rendered with ``std_format``
    (the target) with the same date rendered with one randomly chosen entry
    of ``formats`` (the input), plus the index of that entry.

    All randomness comes from the module-level ``random`` generator, so call
    ``random.seed(...)`` before generating if the dataset must be reproducible.
    """

    # Canonical target representation, e.g. ``1769-03-13``.
    std_format = '%Y-%m-%d'

    # Candidate input representations; a sample's ``format`` value is an index
    # into this list, so the order of the entries is significant.
    # ``%-d`` / ``%-m`` mean day / month without zero padding. They are a
    # platform extension of the C library's strftime rather than a standard
    # directive, so ``_render`` expands them itself.
    formats = [
        '%Y-%-m-%-d',  # year-month-day, e.g. 1920-1-12
        '%-d-%-m-%Y',  # day-month-year, e.g. 11-6-2017
        '%-m-%-d-%Y',  # month-day-year, e.g. 3-2-1427

        '%Y/%-m/%-d',  # year/month/day, e.g. 1683/1/31
        '%-d/%-m/%Y',  # day/month/year, e.g. 10/4/1613
        '%-m/%-d/%Y',  # month/day/year, e.g. 3/13/1769

        '%b %-d, %Y',  # abbreviated month, day, year, e.g. Oct 7, 1864
        # NOTE(review): two-digit year, so the century cannot be recovered
        # from the input string alone -- confirm this is intended.
        '%b %-d, %y',
        '%B %-d, %Y',  # full month, day, year

        '%A %B %-d, %Y',  # day of week, full month, day, year
        '%A %B %-d %Y',

        '%b %-d %Y',  # abbreviated month, day, year

        '%B %-d %Y',  # full month, day, year
    ]

    @staticmethod
    def _render(date: datetime, fmt: str) -> str:
        """Format `date` with `fmt`, expanding ``%-d`` / ``%-m`` portably.

        The unpadded day and month are substituted as plain digits before the
        remaining directives are handed to ``strftime``. Digits contain no
        ``%``, so the substitution cannot create new directives. A literal
        ``%%-d`` / ``%%-m`` in `fmt` is not supported (none of ``formats``
        uses one).
        """
        portable_fmt = fmt.replace('%-d', str(date.day)).replace('%-m', str(date.month))
        return date.strftime(portable_fmt)

    def random_datetime(self) -> datetime:
        """Get a random datetime object.

        Year (1401-2100), day (1-31) and month (1-12) are drawn independently
        and the draw is repeated until the combination is a real calendar
        date. Retrying in a loop (rather than recursing) keeps the stack flat
        no matter how many draws are rejected.
        """
        while True:
            # Draw order (year, day, month) is kept so that a seeded run
            # consumes the random stream exactly as before.
            rand_year = random.randint(1401, 2100)
            rand_day = random.randint(1, 31)
            rand_month = random.randint(1, 12)
            try:
                return datetime(rand_year, rand_month, rand_day)
            except ValueError:
                # Impossible date such as 31 February: draw again.
                continue

    def get_sample(self) -> Tuple[str, str, int]:
        """
        Get a random sample.
        Returns the date in (standard_format, random_format, format_index),
        where `format_index` is the position in ``formats`` of the format
        used for the second element.
        """
        date = self.random_datetime()
        # Pick the index directly instead of choosing a format and searching
        # for it afterwards: no linear scan, and correct even if ``formats``
        # were ever to contain duplicate entries.
        format_index = random.randrange(len(self.formats))
        std_str = date.strftime(self.std_format)
        rand_str = self._render(date, self.formats[format_index])
        return (std_str, rand_str, format_index)

    def generate_df(self, n: int) -> pd.DataFrame:
        """Generate a df with `n` samples.

        Columns: ``output`` (date in ``std_format``), ``input`` (same date in
        a random format) and ``format`` (index of that format in ``formats``).
        Samples are drawn independently, so duplicate rows are possible.
        """
        return pd.DataFrame(
            [self.get_sample() for _ in range(n)],
            columns=['output', 'input', 'format']
        )

In [13]:
# Seed the stdlib RNG that DateGenerator draws from, so that
# Restart Kernel -> Run All regenerates exactly the same splits.
SEED = 42
random.seed(SEED)

date_generator = DateGenerator()
# NOTE(review): the three splits are sampled independently of each other, so
# the same (date, format) pair can occur in more than one split.
train_df = date_generator.generate_df(100000)
val_df = date_generator.generate_df(10000)
test_df = date_generator.generate_df(10000)

In [14]:
# Output location for the generated splits. The path is relative, so it is
# resolved against the kernel's current working directory.
_DATA_PATH = Path('../data')
_RAW_PATH = _DATA_PATH / 'raw'
# exist_ok=True makes the cell idempotent without a separate exists() check,
# which could race with another process creating the directory.
_RAW_PATH.mkdir(parents=True, exist_ok=True)

# index=False: the row index carries no information, keep it out of the CSVs.
train_df.to_csv(_RAW_PATH / 'train.csv', index=False)
val_df.to_csv(_RAW_PATH / 'valid.csv', index=False)
test_df.to_csv(_RAW_PATH / 'test.csv', index=False)

In [15]:
# Eyeball ten randomly drawn training rows.
train_df.sample(n=10)

Unnamed: 0,output,input,format
39698,1769-03-13,3/13/1769,5
63598,2017-06-11,11-6-2017,1
11101,1688-07-28,"Wednesday July 28, 1688",9
76356,1683-01-31,1683/1/31,3
12084,1920-01-12,1920-1-12,0
32016,1613-04-10,10/4/1613,4
83998,1500-01-05,1500/1/5,3
21520,1427-03-02,3-2-1427,2
49364,1864-10-07,"Oct 7, 1864",6
1229,1994-08-31,1994/8/31,3


In [11]:
# Summary statistics of the character length of the input strings.
train_df['input'].map(len).describe()

count    100000.000000
mean         12.274350
std           4.917943
min           8.000000
25%           9.000000
50%          10.000000
75%          14.000000
max          28.000000
Name: input, dtype: float64