In [1]:
from datetime import datetime
import random
from pathlib import Path
from typing import Tuple

import pandas as pd

In [2]:
class DateGenerator:
    """
    Generate a date string dataset with dates in a random string format and the standard format.

    Each sample is a tuple ``(standard_string, random_string, format_index)``:
    the same random date rendered with `std_format`, rendered with one pattern
    picked at random from `formats`, and the position of that pattern in
    `formats`.

    Notes:
        * `%b`, `%B` and `%A` render month/weekday names in the current locale.
        * Pattern 7 (``'%b %-d, %y'``) emits a two-digit year, so the century
          of the target date cannot be recovered from the input string alone.
        * `%-d` / `%-m` (no zero padding) are platform-specific strftime
          flags; they are expanded by `_format_date` so the generator also
          works where the C library does not support them.
    """

    std_format = '%Y-%m-%d'
    formats = [
        '%Y-%-m-%-d',
        '%-d-%-m-%Y',
        '%-m-%-d-%Y',

        '%Y/%-m/%-d',
        '%-d/%-m/%Y',
        '%-m/%-d/%Y',

        '%b %-d, %Y',
        '%b %-d, %y',
        '%B %-d, %Y',

        '%A %B %-d, %Y',  # day of week, full month, day, year
        '%A %B %-d %Y',

        '%b %-d %Y',  # abbreviated month, day, year

        '%B %-d %Y',  # full month, day, year

    ]

    # Inclusive range of years that random dates are drawn from.
    min_year = 1401
    max_year = 2100

    # Source of randomness. Defaults to the module-level `random` functions so
    # that `random.seed(...)` keeps controlling the output; replaced by a
    # private `random.Random` when a seed is passed to the constructor.
    _rng = random

    def __init__(self, seed=None) -> None:
        """
        Args:
            seed: Optional seed for a private random number generator. When
                omitted (the default) the module-level `random` state is used.
        """
        if seed is not None:
            self._rng = random.Random(seed)

    @staticmethod
    def _format_date(date: datetime, fmt: str) -> str:
        """Format `date` with `fmt`, expanding `%-d` / `%-m` without relying on the platform."""
        day, month = str(date.day), str(date.month)
        # Split on the literal-percent escape so a pattern such as '%%-d'
        # (a percent sign followed by the text "-d") is left untouched.
        portable = '%%'.join(
            piece.replace('%-d', day).replace('%-m', month)
            for piece in fmt.split('%%')
        )
        return date.strftime(portable)

    def random_datetime(self) -> datetime:
        """
        Get a random datetime object.

        Returns:
            A `datetime` at midnight whose year lies in
            ``[min_year, max_year]``.
        """
        # Rejection sampling: year, day and month are drawn independently and
        # impossible combinations (e.g. 31 February) are redrawn. A loop is
        # used instead of recursion so retries never grow the call stack.
        while True:
            rand_year = self._rng.randint(self.min_year, self.max_year)
            rand_day = self._rng.randint(1, 31)
            rand_month = self._rng.randint(1, 12)
            try:
                return datetime(rand_year, rand_month, rand_day)
            except ValueError:
                continue

    def get_sample(self) -> Tuple[str, str, int]:
        """
        Get a random sample.

        Returns:
            The date as ``(standard_format, random_format, format_index)``,
            where ``format_index`` is the index into `formats` of the pattern
            used for ``random_format``.
        """
        date = self.random_datetime()
        # Draw the index directly: avoids a second linear search through
        # `formats` and stays correct even if a pattern appears twice.
        format_index = self._rng.randrange(len(self.formats))
        rand_format = self.formats[format_index]
        std_str = self._format_date(date, self.std_format)
        rand_str = self._format_date(date, rand_format)
        return (std_str, rand_str, format_index)

    def generate_df(self, n: int) -> pd.DataFrame:
        """
        Generate a df with `n` samples.

        Args:
            n: Number of rows to generate.

        Returns:
            A DataFrame with columns ``output`` (standard format), ``input``
            (random format) and ``format`` (index into `formats`).
        """
        return pd.DataFrame(
            [self.get_sample() for _ in range(n)],
            columns=['output', 'input', 'format']
        )

In [3]:
# Seed the global RNG so that Restart Kernel -> Run All regenerates exactly
# the same train/validation/test splits.
SEED = 42
random.seed(SEED)

# Number of samples per split.
N_TRAIN = 100_000
N_VALID = 10_000
N_TEST = 10_000

date_generator = DateGenerator()
train_df = date_generator.generate_df(N_TRAIN)
val_df = date_generator.generate_df(N_VALID)
test_df = date_generator.generate_df(N_TEST)

In [4]:
# Raw splits are written below ../data/raw, relative to the notebook's
# working directory.
_DATA_PATH = Path('../data')
_RAW_PATH = _DATA_PATH / 'raw'
# `exist_ok=True` makes the cell idempotent and avoids the check-then-create
# race of a separate `exists()` test.
_RAW_PATH.mkdir(parents=True, exist_ok=True)

# One CSV per split; the positional index carries no information, so drop it.
train_df.to_csv(_RAW_PATH / 'train.csv', index=False)
val_df.to_csv(_RAW_PATH / 'valid.csv', index=False)
test_df.to_csv(_RAW_PATH / 'test.csv', index=False)

In [5]:
# Eyeball ten randomly chosen training rows (output, input, format index).
train_df.sample(n=10)

Unnamed: 0,output,input,format
70784,1849-04-09,"Apr 9, 1849",6
90435,1576-12-11,12/11/1576,5
30262,1765-05-17,5-17-1765,2
58601,2031-05-26,May 26 2031,12
39221,1549-12-18,1549/12/18,3
2242,1514-07-23,Jul 23 1514,11
1550,1650-09-26,9/26/1650,5
1491,1466-12-01,1466/12/1,3
78931,1953-06-12,Friday June 12 1953,10
30127,1509-05-19,19/5/1509,4


In [6]:
# Summary statistics of the character length of the randomly formatted inputs.
train_df['input'].map(len).describe()

count    100000.00000
mean         12.29574
std           4.93885
min           8.00000
25%           9.00000
50%          10.00000
75%          14.00000
max          28.00000
Name: input, dtype: float64