In [5]:
import re
from datetime import datetime

import pandas as pd

In [10]:
# Sample inputs for the parsing experiments. Order matters: the loops that
# consume this list print one line per entry, in list order.
date_strings = [
    "2023-01-15",               # ISO, zero-padded
    "15 January 2023",          # day, full month name, year
    "01/15/2023",               # US-style month/day/year
    "January 15, 2023",         # month name first, comma before the year
    "15th Jan 2023",            # ordinal day, abbreviated month
    "2023.01.15",               # dot-separated
    "2023 01 15",               # space-separated
    "1st February 2024",        # ordinal day, full month name
    "2024-2-5",                 # ISO without zero padding
    "Monday 01 January 2024",   # leading weekday
    "Monday, 01 January 2024",  # leading weekday followed by a comma
    "monday, 01 January 2024",  # lower-case weekday
    "monday, 01 january 2024",  # lower-case weekday and month
    "monday, 01 jan 2024",      # lower-case weekday, abbreviated month
]

# strftime/strptime pattern for the canonical year-month-day form.
preferred_format = "%Y-%m-%d"


def remove_day(date_str):
    """Strip a leading weekday name from ``date_str``.

    Only a full English weekday name at the very start of the string
    (optionally preceded by whitespace) is removed, matched without regard
    to case. An optional comma and any whitespace directly after the name
    are removed with it. Strings that do not start with a weekday name are
    returned unchanged.
    """
    leading_weekday = re.compile(
        r'(?i)^\s*(monday|tuesday|wednesday|thursday|friday|saturday|sunday),?\s*'
    )
    return leading_weekday.sub('', date_str)

def remove_ordinal_suffix(date_str):
    """Drop English ordinal suffixes that directly follow a number.

    ``"15th Jan 2023"`` becomes ``"15 Jan 2023"`` and ``"1st"`` becomes
    ``"1"``. Matching ignores case, so ``"22ND"`` is handled as well.

    Args:
        date_str (str): Text that may contain ordinals such as ``1st``,
            ``2nd``, ``3rd`` or ``15th``.

    Returns:
        str: ``date_str`` with the suffixes removed; digits are kept.
    """
    # The negative lookahead keeps the match from eating the start of a word
    # that happens to be glued to a number (e.g. "15thursday").
    return re.sub(
        r'(\d+)(?:st|nd|rd|th)(?![a-z])',
        r'\1',
        date_str,
        flags=re.IGNORECASE,
    )

def clean_date_string(date_str):
    """Normalise a loosely formatted date string to dash-separated tokens.

    Steps, in order:

    1. Remove a leading weekday name (``remove_day``).
    2. Remove ordinal suffixes such as ``st``/``th`` (``remove_ordinal_suffix``).
    3. Replace every run of separator characters (dash, whitespace, slash,
       dot, comma) with a single ``-``.
    4. Drop any dash left at the start or end of the string.

    Args:
        date_str (str): Raw date text, e.g. ``"Monday, 1st February 2024"``.

    Returns:
        str: The cleaned text, e.g. ``"1-February-2024"``.
    """
    preferred_separator = "-"
    date_str = remove_day(date_str).strip()
    date_str = remove_ordinal_suffix(date_str)
    # Collapse whole runs in one pass. Replacing "--" with "-" a single time
    # still leaves "--" behind when three or more separators are adjacent.
    date_str = re.sub(r"[-\s/.,]+", preferred_separator, date_str)
    # A trailing "." or "," would otherwise leave a dangling dash that no
    # strptime pattern matches.
    return date_str.strip(preferred_separator)

def fuzzy_strptime(date_str, target_format="%Y-%m-%d"):
    """Parse a loosely formatted date string into a ``datetime``.

    The input is first normalised with ``clean_date_string`` and then tried
    against a fixed list of ``strptime`` patterns; the first pattern that
    matches wins.

    Args:
        date_str (str): Raw date text.
        target_format (str): Not used by the parser. It is accepted so that
            existing callers keep working; call ``.strftime(target_format)``
            on the returned value to obtain a formatted string.

    Returns:
        datetime | None: The parsed date (time fields are zero), or ``None``
        when ``date_str`` is not a string or matches none of the patterns.
    """
    if not isinstance(date_str, str):
        return None

    date_str = clean_date_string(date_str)

    # Order matters: "%m-%d-%Y" is tried before "%d-%m-%Y", so an ambiguous
    # all-numeric date such as "01-02-2023" is read month-first.
    possible_date_input_formats = (
        "%Y-%m-%d",
        "%d-%B-%Y",
        "%m-%d-%Y",
        "%B-%d-%Y",
        "%d-%b-%Y",
        # Whitespace in a strptime pattern matches any whitespace run, so this
        # entry still catches input whose whitespace the cleaning step did not
        # turn into dashes. NOTE(review): confirm whether that is still needed.
        "%Y %m %d",
        "%d-%m-%Y",
    )

    for fmt in possible_date_input_formats:
        try:
            return datetime.strptime(date_str, fmt)
        except ValueError:
            continue
    return None


# Run every sample through fuzzy_strptime and print the input next to the
# result. fuzzy_strptime returns a datetime (or None when nothing matches),
# so the printed value is the datetime's default string form.
for date_str in date_strings:
    std_date = fuzzy_strptime(date_str, preferred_format)
    print(f"Original: {date_str} -> Standardized: {std_date}")



Original: 2023-01-15 -> Standardized: 2023-01-15 00:00:00
Original: 15 January 2023 -> Standardized: 2023-01-15 00:00:00
Original: 01/15/2023 -> Standardized: 2023-01-15 00:00:00
Original: January 15, 2023 -> Standardized: 2023-01-15 00:00:00
Original: 15th Jan 2023 -> Standardized: 2023-01-15 00:00:00
Original: 2023.01.15 -> Standardized: 2023-01-15 00:00:00
Original: 2023 01 15 -> Standardized: 2023-01-15 00:00:00
Original: 1st February 2024 -> Standardized: 2024-02-01 00:00:00
Original: 2024-2-5 -> Standardized: 2024-02-05 00:00:00
Original: Monday 01 January 2024 -> Standardized: 2024-01-01 00:00:00
Original: Monday, 01 January 2024 -> Standardized: 2024-01-01 00:00:00
Original: monday, 01 January 2024 -> Standardized: 2024-01-01 00:00:00
Original: monday, 01 january 2024 -> Standardized: 2024-01-01 00:00:00
Original: monday, 01 jan 2024 -> Standardized: 2024-01-01 00:00:00


In [7]:
# Baseline for comparison: strict datetime.strptime with the single
# '%Y-%m-%d' pattern and no cleaning of the input.
for date_str in date_strings:
    try:
        print(f"Original: {date_str} -> datetime: {datetime.strptime(date_str, '%Y-%m-%d')}")
    except ValueError:
        # strptime raises ValueError when the text does not match the pattern.
        print(f"Original: {date_str} -> datetime: Not recognized")

Original: 2023-01-15 -> datetime: 2023-01-15 00:00:00
Original: 15 January 2023 -> datetime: Not recognized
Original: 01/15/2023 -> datetime: Not recognized
Original: January 15, 2023 -> datetime: Not recognized
Original: 15th Jan 2023 -> datetime: Not recognized
Original: 2023.01.15 -> datetime: Not recognized
Original: 2023 01 15 -> datetime: Not recognized
Original: 1st February 2024 -> datetime: Not recognized
Original: 2024-2-5 -> datetime: 2024-02-05 00:00:00
Original: Monday 01 January 2024 -> datetime: Not recognized
Original: Monday, 01 January 2024monday, 01 January 2024 -> datetime: Not recognized


In [11]:
from dateutil.parser import parse

# "Fuzzy" parsing of text containing a date: the date and time are picked out
# of a full sentence.
text = "The meeting is on March 15, 2024 at 3 PM"
dt = parse(text, fuzzy=True)

print(dt)
# Expected output: 2024-03-15 15:00:00

# Same sample list as the other experiments, this time parsed by dateutil.
for date_str in date_strings:
    try:
        dt = parse(date_str, fuzzy=True)
        print(f"Original: {date_str} -> Parsed datetime: {dt}")
    except ValueError:
        # Inputs the parser rejects with a ValueError are reported rather
        # than aborting the loop.
        print(f"Original: {date_str} -> Parsed datetime: Not recognized")

2024-03-15 15:00:00
Original: 2023-01-15 -> Parsed datetime: 2023-01-15 00:00:00
Original: 15 January 2023 -> Parsed datetime: 2023-01-15 00:00:00
Original: 01/15/2023 -> Parsed datetime: 2023-01-15 00:00:00
Original: January 15, 2023 -> Parsed datetime: 2023-01-15 00:00:00
Original: 15th Jan 2023 -> Parsed datetime: 2023-01-15 00:00:00
Original: 2023.01.15 -> Parsed datetime: 2023-01-15 00:00:00
Original: 2023 01 15 -> Parsed datetime: 2023-01-15 00:00:00
Original: 1st February 2024 -> Parsed datetime: 2024-02-01 00:00:00
Original: 2024-2-5 -> Parsed datetime: 2024-02-05 00:00:00
Original: Monday 01 January 2024 -> Parsed datetime: 2024-01-01 00:00:00
Original: Monday, 01 January 2024 -> Parsed datetime: 2024-01-01 00:00:00
Original: monday, 01 January 2024 -> Parsed datetime: 2024-01-01 00:00:00
Original: monday, 01 january 2024 -> Parsed datetime: 2024-01-01 00:00:00
Original: monday, 01 jan 2024 -> Parsed datetime: 2024-01-01 00:00:00


In [None]:
def fuzzy_date_str_to_date(date, month_input_format=None) -> pd.Timestamp | None:
    """Convert a date string to a datetime object.
    valid input formats:
    - '%d %B %Y' (e.g. '01 January 2024')
    - %A %d %B %Y' (e.g. 'Monday 01 January 2024') 
    - %A %d %m %Y' (e.g. 'Monday 01 01 2024')
    - %d %m %Y' (e.g. '01 01 2024')

    Args:
        date (str): The date string to convert.
        month_input_format (str, optional): The format of the month in the date string. 
        Can be "%B" for full month name or "%b" for abbreviated month name. 
        If None, format is inferred. Defaults to None.
    """

    remove_chars = [',']
    for char in remove_chars:
        date = date.replace(char, '')

    parts = date.strip().split()
    if not parts:
        return None

    valid_days = {
        "monday",
        "tuesday",
        "wednesday",
        "thursday",
        "friday",
        "saturday",
        "sunday",
    }

    parts = [part for part in parts if part.lower() not in valid_days]
    date_cleaned = ' '.join(parts)

    if month_input_format is None:
        if len(parts) < 3:
            return None
        month_part = parts[1]
        if month_part.isdigit():
            month_input_format = "%m"
        else:
            month_lower = month_part.lower()
            full_months = {
                "january",
                "february",
                "march",
                "april",
                "may",
                "june",
                "july",
                "august",
                "september",
                "october",
                "november",
                "december",
            }
            abbr_months = {
                "jan",
                "feb",
                "mar",
                "apr",
                "may",
                "jun",
                "jul",
                "aug",
                "sep",
                "oct",
                "nov",
                "dec",
            }
            if month_lower in full_months:
                month_input_format = "%B"
            elif month_lower in abbr_months:
                month_input_format = "%b"
            else:
                return None

    try:
        return pd.to_datetime(date_cleaned, format=f"%d {month_input_format} %Y")
    except ValueError:
        return None
