In [2]:
import random
from datetime import datetime
from pathlib import Path
from typing import Optional, Tuple

import pandas as pd

In [7]:
class DateGenerator:
    """
    Generate a date string dataset with dates in a random string format and the standard format.

    Every sample is one calendar date rendered twice: once with
    ``std_format`` (the target) and once with a format picked at random
    from ``formats`` (the input).

    Args:
        seed: Optional seed for a private random number generator. When
            given, the samples drawn from this instance are reproducible and
            do not touch the global ``random`` state. When ``None`` (the
            default) the module-level ``random`` functions are used, so
            ``random.seed(...)`` still controls the output.
    """

    std_format = '%Y-%m-%d'

    # ``%-d`` / ``%-m`` stand for "day / month without zero padding". They are
    # a platform-specific strftime extension, so they are never handed to
    # ``strftime`` directly; see ``_strftime``.
    # Month and weekday names (%b, %B, %A) follow the active locale.
    formats = [
        '%Y-%-m-%-d',
        '%-d-%-m-%Y',
        '%-m-%-d-%Y',

        '%Y/%-m/%-d',
        '%-d/%-m/%Y',
        '%-m/%-d/%Y',

        '%b %-d, %Y',
        # NOTE(review): a two-digit year cannot be mapped back to a single
        # century within min_year..max_year -- confirm this is intended.
        '%b %-d, %y',
        '%B %-d, %Y',

        '%A %B %-d, %Y',  # day of week, full month, day, year
        '%A %B %-d %Y',

        '%b %-d %Y',  # abbreviated month, day, year

        '%B %-d %Y',  # full month, day, year
    ]

    # Inclusive range of years the random dates are drawn from.
    min_year = 1401
    max_year = 2100

    # Source of randomness: anything exposing ``randint`` and ``choice``.
    # Defaults to the ``random`` module itself; replaced per instance when a
    # seed is supplied.
    _rng = random

    def __init__(self, seed: Optional[int] = None) -> None:
        if seed is not None:
            self._rng = random.Random(seed)

    @staticmethod
    def _strftime(date: datetime, fmt: str) -> str:
        """Format `date` with `fmt`, expanding ``%-d`` and ``%-m`` portably."""
        # The substituted text is digits only, so it cannot introduce new
        # ``%`` directives. Only the two no-padding flags used in `formats`
        # are handled here.
        portable_fmt = (
            fmt
            .replace('%-d', str(date.day))
            .replace('%-m', str(date.month))
        )
        return date.strftime(portable_fmt)

    def random_datetime(self) -> datetime:
        """
        Get a random datetime object.

        The date is uniformly distributed over every calendar day from
        January 1st of ``min_year`` to December 31st of ``max_year``
        (both inclusive); the time part is midnight.
        """
        # Drawing an ordinal can only produce valid dates, so no retry on
        # impossible day/month combinations (e.g. February 30th) is needed.
        first_ordinal = datetime(self.min_year, 1, 1).toordinal()
        last_ordinal = datetime(self.max_year, 12, 31).toordinal()
        return datetime.fromordinal(self._rng.randint(first_ordinal, last_ordinal))

    def get_sample(self) -> Tuple[str, str]:
        """
        Get a random sample.
        Returns the date in (standard_format, random_format)
        """
        date = self.random_datetime()
        rand_format = self._rng.choice(self.formats)
        std_str = self._strftime(date, self.std_format)
        rand_str = self._strftime(date, rand_format)
        return (std_str, rand_str)

    def generate_df(self, n: int) -> pd.DataFrame:
        """
        Generate a df with `n` samples.

        The frame has the columns ``output`` (standard format) and ``input``
        (random format), in that order. ``n <= 0`` yields an empty frame
        with the same columns.
        """
        return pd.DataFrame(
            [self.get_sample() for _ in range(n)],
            columns=['output', 'input']
        )

In [8]:
# Split sizes and seed are kept together so they are easy to tune.
SEED = 42
N_TRAIN = 100_000
N_VAL = 10_000
N_TEST = 10_000

# Seed the global `random` module, which DateGenerator draws from, so the
# three splits are identical on every Restart & Run All.
random.seed(SEED)

# NOTE(review): the splits are sampled independently of each other, so the
# same (input, output) pair can occur in more than one split.
date_generator = DateGenerator()
train_df = date_generator.generate_df(N_TRAIN)
val_df = date_generator.generate_df(N_VAL)
test_df = date_generator.generate_df(N_TEST)

In [9]:
_DATA_PATH = Path('../data')
_RAW_PATH = _DATA_PATH / 'raw'
# exist_ok=True makes the creation idempotent and removes the gap between a
# separate exists() check and the mkdir() call.
_RAW_PATH.mkdir(parents=True, exist_ok=True)

# Persist each split as CSV without the row index. Note that the validation
# split is stored as 'valid.csv'.
for _file_name, _split_df in (
    ('train.csv', train_df),
    ('valid.csv', val_df),
    ('test.csv', test_df),
):
    _split_df.to_csv(_RAW_PATH / _file_name, index=False)

In [10]:
# Show ten randomly chosen training rows as a quick visual sanity check.
train_df.sample(n=10)


Unnamed: 0,output,input
31627,1886-11-17,Nov 17 1886
146,1888-03-14,"Mar 14, 88"
92956,1707-06-24,Friday June 24 1707
64794,1856-11-12,"November 12, 1856"
80943,1579-12-20,1579/12/20
86087,1679-11-26,11/26/1679
44971,1732-07-15,Tuesday July 15 1732
42342,1783-07-08,"Jul 8, 83"
90278,1757-06-23,"Jun 23, 1757"
56959,1985-06-30,"Jun 30, 85"


In [11]:
# Summary statistics of the character length of the `input` strings.
(
    train_df['input']
    .map(len)
    .describe()
)

count    100000.000000
mean         12.274350
std           4.917943
min           8.000000
25%           9.000000
50%          10.000000
75%          14.000000
max          28.000000
Name: input, dtype: float64