In [6]:
from datetime import datetime
import random
from pathlib import Path
from typing import Tuple

import pandas as pd

In [66]:
class DateGenerator:
    """
    Generate a date string dataset with dates in a random string format and the standard format.

    Each sample pairs one calendar date rendered with ``std_format`` (the
    target) with the same date rendered with a format picked at random from
    ``formats`` (the input).

    Month and weekday names (``%b``, ``%B``, ``%A``) are produced by
    ``strftime`` and therefore follow the active locale.

    Args:
        seed: Optional seed. When given, the instance draws from its own
            ``random.Random(seed)`` so its output is reproducible and does not
            touch the global generator. When ``None`` (the default) the
            module-level ``random`` functions are used, so ``random.seed(...)``
            still controls the output.
    """

    std_format = '%Y-%m-%d'
    formats = [
        '%Y-%m-%d',
        '%d-%m-%Y',
        '%m-%d-%Y',

        '%Y/%m/%d',
        '%d/%m/%Y',
        '%m/%d/%Y',

        # NOTE: formats using %y keep only the last two digits of the year, so
        # the century cannot be recovered from the input string alone.
        '%b %d, %Y',
        '%b %d, %y',
        '%B %d, %Y',

        '%A %B %d, %Y',  # day of week, full month, day, year
        '%A %B %d %Y',

        '%b %d %Y',  # abbreviated month, day, year
        '%b %d %y',

        '%B %d %Y',  # full month, day, year
        '%B %d %y',

    ]

    # Inclusive range of years that dates are drawn from.
    min_year = 1401
    max_year = 2100

    # Source of randomness; the module-level generator unless a seed is given.
    _rng = random

    def __init__(self, seed=None) -> None:
        """Create a generator, optionally with its own seeded random source."""
        if seed is not None:
            self._rng = random.Random(seed)

    def random_datetime(self) -> datetime:
        """
        Get a random datetime object.

        The date is drawn uniformly from every calendar day between
        January 1 of ``min_year`` and December 31 of ``max_year`` (inclusive);
        the time part is midnight.
        """
        # Drawing a proleptic Gregorian ordinal always yields a valid date, so
        # no retry is needed for combinations such as February 30.
        first_ordinal = datetime(self.min_year, 1, 1).toordinal()
        last_ordinal = datetime(self.max_year, 12, 31).toordinal()
        return datetime.fromordinal(self._rng.randint(first_ordinal, last_ordinal))

    def get_sample(self) -> Tuple[str, str]:
        """
        Get a random sample.
        Returns the date in (standard_format, random_format)
        """
        date = self.random_datetime()
        rand_format = self._rng.choice(self.formats)
        std_str = date.strftime(self.std_format)
        rand_str = date.strftime(rand_format)
        return (std_str, rand_str)

    def generate_df(self, n: int) -> pd.DataFrame:
        """
        Generate a df with `n` samples.

        The returned frame has two string columns: ``output`` (the date in
        ``std_format``) and ``input`` (the same date in a random format).
        """
        return pd.DataFrame(
            [self.get_sample() for _ in range(n)],
            columns=['output', 'input']
        )

In [67]:
# Split sizes and seed are named here so they are easy to find and tune.
SEED = 42
N_TRAIN = 100_000
N_VAL = 10_000
N_TEST = 10_000

# DateGenerator() draws from the module-level `random` generator, so seeding it
# makes a fresh "Restart & Run All" reproduce the same three datasets.
random.seed(SEED)

date_generator = DateGenerator()
# NOTE: the splits are sampled independently of each other, so an identical
# (output, input) row may appear in more than one split.
train_df = date_generator.generate_df(N_TRAIN)
val_df = date_generator.generate_df(N_VAL)
test_df = date_generator.generate_df(N_TEST)

In [68]:
# Output location for the generated CSV files (relative to the notebook's
# working directory).
_DATA_PATH = Path('../data')
_RAW_PATH = _DATA_PATH / 'raw'
# exist_ok=True makes the cell safe to re-run and removes the separate
# exists() check, which could race with another process creating the folder.
_RAW_PATH.mkdir(parents=True, exist_ok=True)

# Write each split to its own CSV file without the DataFrame index column.
for file_name, split_df in (
    ('train.csv', train_df),
    ('valid.csv', val_df),
    ('test.csv', test_df),
):
    split_df.to_csv(_RAW_PATH / file_name, index=False)