In [1]:
import re
import csv
from typing import List, Tuple, Optional
from datetime import datetime

class DateParser:
    def __init__(self):
        # Month mappings
        self.month_names = {
            'january': 1, 'jan': 1, 'february': 2, 'feb': 2, 'march': 3, 'mar': 3,
            'april': 4, 'apr': 4, 'may': 5, 'june': 6, 'jun': 6, 'july': 7, 'jul': 7,
            'august': 8, 'aug': 8, 'september': 9, 'sep': 9, 'sept': 9,
            'october': 10, 'oct': 10, 'november': 11, 'nov': 11, 'december': 12, 'dec': 12
        }

        # Ordinal number patterns (1st, 2nd, 3rd, etc.)
        self.ordinal_patterns = r'(?:1st|2nd|3rd|[4-9]th|1[0-9]th|2[01]st|22nd|23rd|2[4-9]th|30th|31st)'

        # Date patterns - ordered by specificity (most specific first)
        self.patterns = [
            # ISO format: YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD
            self._create_iso_patterns(),

            # DD Month YYYY: 21st June 2024, 21 June 2024
            self._create_dd_month_yyyy_patterns(),

            # Month DD, YYYY: June 21st, 2024, June 21, 2024
            self._create_month_dd_yyyy_patterns(),

            # DD/MM/YYYY, MM/DD/YYYY, DD-MM-YYYY, MM-DD-YYYY
            self._create_numeric_patterns(),

            # DD Month: 21st June, 21 June (current year assumed)
            self._create_dd_month_patterns(),

            # Month DD: June 21st, June 21 (current year assumed)
            self._create_month_dd_patterns(),

            # Month YYYY: June 2024 (1st day assumed)
            self._create_month_year_patterns(),

            # Year only: 2024 (January 1st assumed)
            self._create_year_only_patterns()
        ]

    def _create_iso_patterns(self):
        """ISO format patterns: YYYY-MM-DD, YYYY/MM/DD, YYYY.MM.DD"""
        return [
            {
                'pattern': r'\b((?:19|20)\d{2})[-/.]([0-1]?[0-9])[-/.]([0-3]?[0-9])\b',
                'groups': {'year': 1, 'month': 2, 'day': 3},
                'format': 'ISO'
            }
        ]

    def _create_dd_month_yyyy_patterns(self):
        """DD Month YYYY patterns"""
        month_pattern = '|'.join(self.month_names.keys())
        return [
            {
                'pattern': rf'\b({self.ordinal_patterns})\s+({month_pattern})[,\s]+(\d{{4}})\b',
                'groups': {'day': 1, 'month': 2, 'year': 3},
                'format': 'DD Month YYYY (ordinal)'
            },
            {
                'pattern': rf'\b([0-3]?[0-9])\s+({month_pattern})[,\s]+(\d{{4}})\b',
                'groups': {'day': 1, 'month': 2, 'year': 3},
                'format': 'DD Month YYYY'
            }
        ]

    def _create_month_dd_yyyy_patterns(self):
        """Month DD, YYYY patterns"""
        month_pattern = '|'.join(self.month_names.keys())
        return [
            {
                'pattern': rf'\b({month_pattern})\s+({self.ordinal_patterns})[,\s]+(\d{{4}})\b',
                'groups': {'month': 1, 'day': 2, 'year': 3},
                'format': 'Month DD, YYYY (ordinal)'
            },
            {
                'pattern': rf'\b({month_pattern})\s+([0-3]?[0-9])[,\s]+(\d{{4}})\b',
                'groups': {'month': 1, 'day': 2, 'year': 3},
                'format': 'Month DD, YYYY'
            }
        ]

    def _create_numeric_patterns(self):
        """Numeric date patterns"""
        return [
            # DD/MM/YYYY (European format)
            {
                'pattern': r'\b([0-3]?[0-9])[/\-.]([0-1]?[0-9])[/\-.](\d{4})\b',
                'groups': {'day': 1, 'month': 2, 'year': 3},
                'format': 'DD/MM/YYYY'
            },
            # MM/DD/YY or DD/MM/YY
            {
                'pattern': r'\b([0-1]?[0-9])[/\-.]([0-3]?[0-9])[/\-.](\d{2})\b',
                'groups': {'month': 1, 'day': 2, 'year': 3},
                'format': 'MM/DD/YY'
            }
        ]

    def _create_dd_month_patterns(self):
        """DD Month patterns (no year)"""
        month_pattern = '|'.join(self.month_names.keys())
        return [
            {
                'pattern': rf'\b({self.ordinal_patterns})\s+({month_pattern})\b',
                'groups': {'day': 1, 'month': 2},
                'format': 'DD Month (ordinal)'
            },
            {
                'pattern': rf'\b([0-3]?[0-9])\s+({month_pattern})\b',
                'groups': {'day': 1, 'month': 2},
                'format': 'DD Month'
            }
        ]

    def _create_month_dd_patterns(self):
        """Month DD patterns (no year)"""
        month_pattern = '|'.join(self.month_names.keys())
        return [
            {
                'pattern': rf'\b({month_pattern})\s+({self.ordinal_patterns})\b',
                'groups': {'month': 1, 'day': 2},
                'format': 'Month DD (ordinal)'
            },
            {
                'pattern': rf'\b({month_pattern})\s+([0-3]?[0-9])\b',
                'groups': {'month': 1, 'day': 2},
                'format': 'Month DD'
            }
        ]

    def _create_month_year_patterns(self):
        """Month YYYY patterns"""
        month_pattern = '|'.join(self.month_names.keys())
        return [
            {
                'pattern': rf'\b({month_pattern})\s+(\d{{4}})\b',
                'groups': {'month': 1, 'year': 2},
                'format': 'Month YYYY'
            }
        ]

    def _create_year_only_patterns(self):
        """Year only patterns"""
        return [
            {
                'pattern': r'\b((?:19|20)\d{2})\b',
                'groups': {'year': 1},
                'format': 'Year only'
            }
        ]

    def _extract_ordinal_day(self, ordinal_str: str) -> int:
        """Extract day number from ordinal string (21st -> 21)"""
        return int(re.sub(r'[a-zA-Z]', '', ordinal_str))

    def _month_name_to_number(self, month_str: str) -> int:
        """Convert month name to number"""
        return self.month_names.get(month_str.lower(), 1)

    def _normalize_year(self, year: int) -> int:
        """Normalize 2-digit years to 4-digit years"""
        if year < 50:
            return 2000 + year
        elif year < 100:
            return 1900 + year
        return year

    def _validate_date(self, day: int, month: int, year: int) -> bool:
        """Validate if the date is realistic"""
        try:
            if day < 1 or day > 31:
                return False
            if month < 1 or month > 12:
                return False
            if year < 1900 or year > 2100:
                return False

            # Check if day is valid for the month
            datetime(year, month, day)
            return True
        except ValueError:
            return False

    def _format_date(self, day: int, month: int, year: int) -> str:
        """Format date as DD/MM/YYYY"""
        return f"{day:02d}/{month:02d}/{year:04d}"

    def parse_single_date(self, text: str, default_year: Optional[int] = None) -> Optional[str]:
        """
        Parse a single date from text and return in DD/MM/YYYY format

        Args:
            text: Input text containing date
            default_year: Year to use when not specified (defaults to current year)

        Returns:
            Formatted date string or None if no date found
        """
        if default_year is None:
            default_year = datetime.now().year

        text = text.lower().strip()

        # Try each pattern group
        for pattern_group in self.patterns:
            for pattern_info in pattern_group:
                pattern = pattern_info['pattern']
                groups = pattern_info['groups']

                matches = list(re.finditer(pattern, text, re.IGNORECASE))

                for match in matches:
                    try:
                        # Extract components
                        day = 1  # default
                        month = 1  # default
                        year = default_year  # default

                        if 'day' in groups:
                            day_str = match.group(groups['day'])
                            if any(ord_pattern in day_str for ord_pattern in ['st', 'nd', 'rd', 'th']):
                                day = self._extract_ordinal_day(day_str)
                            else:
                                day = int(day_str)

                        if 'month' in groups:
                            month_str = match.group(groups['month'])
                            if month_str.isdigit():
                                month = int(month_str)
                            else:
                                month = self._month_name_to_number(month_str)

                        if 'year' in groups:
                            year = int(match.group(groups['year']))
                            year = self._normalize_year(year)

                        # Validate and format
                        if self._validate_date(day, month, year):
                            return self._format_date(day, month, year)

                    except (ValueError, IndexError):
                        continue

        return None

    def parse_multiple_dates(self, text: str, default_year: Optional[int] = None) -> List[str]:
        """
        Parse multiple dates from text

        Args:
            text: Input text containing dates
            default_year: Year to use when not specified

        Returns:
            List of formatted date strings
        """
        if default_year is None:
            default_year = datetime.now().year

        dates_found = []
        text = text.lower().strip()

        # Keep track of processed positions to avoid duplicates
        processed_positions = set()

        for pattern_group in self.patterns:
            for pattern_info in pattern_group:
                pattern = pattern_info['pattern']
                groups = pattern_info['groups']

                matches = list(re.finditer(pattern, text, re.IGNORECASE))

                for match in matches:
                    # Check if we've already processed this position
                    start, end = match.span()
                    if any(pos in range(start, end + 1) for pos in processed_positions):
                        continue

                    try:
                        # Extract components
                        day = 1
                        month = 1
                        year = default_year

                        if 'day' in groups:
                            day_str = match.group(groups['day'])
                            if any(ord_pattern in day_str for ord_pattern in ['st', 'nd', 'rd', 'th']):
                                day = self._extract_ordinal_day(day_str)
                            else:
                                day = int(day_str)

                        if 'month' in groups:
                            month_str = match.group(groups['month'])
                            if month_str.isdigit():
                                month = int(month_str)
                            else:
                                month = self._month_name_to_number(month_str)

                        if 'year' in groups:
                            year = int(match.group(groups['year']))
                            year = self._normalize_year(year)

                        # Validate and add to results
                        if self._validate_date(day, month, year):
                            formatted_date = self._format_date(day, month, year)
                            if formatted_date not in dates_found:
                                dates_found.append(formatted_date)
                                processed_positions.update(range(start, end + 1))

                    except (ValueError, IndexError):
                        continue

        return dates_found

def test_date_parser():
    """Test the date parser with various examples"""
    parser = DateParser()

    test_cases = [
        "I went to London on 21st June, 2024",
        "The meeting is scheduled for June 21st, 2024",
        "Born on 15/03/1990",
        "Event date: 2024-12-25",
        "December 31st, 2023",
        "5th of May 2022",
        "March 15th",
        "15 March",
        "June 2024",
        "2023",
        "Meeting on 1st Jan 2024 and follow-up on 15th Jan 2024",
        "Invalid date: 32nd March 2024",
        "No date in this text",
        "2024/02/29",  # leap year
        "29/02/2024",  # leap year
    ]

    print("=== DATE PARSER TEST CASES ===")
    print()

    for i, text in enumerate(test_cases, 1):
        print(f"Test {i}: {text}")

        single_date = parser.parse_single_date(text)
        multiple_dates = parser.parse_multiple_dates(text)

        print(f"  Single date: {single_date if single_date else 'None found'}")
        print(f"  All dates: {multiple_dates if multiple_dates else 'None found'}")
        print()

def process_csv_file(filename: str):
    """Process a CSV file with date parsing test cases"""
    parser = DateParser()

    try:
        with open(filename, 'r', encoding='utf-8') as file:
            csv_reader = csv.DictReader(file)

            print(f"=== PROCESSING {filename} ===")
            print()

            results = []

            for row_num, row in enumerate(csv_reader, 1):
                # Assume the CSV has columns like 'text', 'input', or similar
                text_columns = ['text', 'input', 'sentence', 'description']
                text = None

                for col in text_columns:
                    if col in row:
                        text = row[col]
                        break

                if not text:
                    # If no standard column found, use the first non-empty column
                    for key, value in row.items():
                        if value and value.strip():
                            text = value
                            break

                if text:
                    parsed_dates = parser.parse_multiple_dates(text)
                    results.append({
                        'row': row_num,
                        'text': text,
                        'parsed_dates': parsed_dates
                    })

                    print(f"Row {row_num}: {text}")
                    print(f"  Parsed dates: {parsed_dates if parsed_dates else 'None found'}")
                    print()

            # Save results
            output_filename = filename.replace('.csv', '_results.csv')
            with open(output_filename, 'w', newline='', encoding='utf-8') as outfile:
                fieldnames = ['row', 'text', 'parsed_dates', 'first_date']
                writer = csv.DictWriter(outfile, fieldnames=fieldnames)
                writer.writeheader()

                for result in results:
                    writer.writerow({
                        'row': result['row'],
                        'text': result['text'],
                        'parsed_dates': '; '.join(result['parsed_dates']) if result['parsed_dates'] else '',
                        'first_date': result['parsed_dates'][0] if result['parsed_dates'] else ''
                    })

            print(f"Results saved to {output_filename}")
            print(f"Processed {len(results)} rows")

    except FileNotFoundError:
        print(f"File '{filename}' not found. Please ensure the file exists in the current directory.")
        print("Creating a sample CSV file for testing...")

        # Create sample test file
        sample_data = [
            {'text': 'I went to London on 21st June, 2024'},
            {'text': 'Meeting scheduled for December 15th, 2023'},
            {'text': 'Born on 03/05/1995'},
            {'text': 'Event on 2024-01-01'},
            {'text': 'Conference from March 10th to March 12th, 2024'},
            {'text': 'No specific date mentioned here'},
            {'text': 'Multiple dates: Jan 1st 2024, Feb 14th 2024, and Dec 25th 2024'}
        ]

        with open('sample_date_parser_testcases.csv', 'w', newline='', encoding='utf-8') as sample_file:
            writer = csv.DictWriter(sample_file, fieldnames=['text'])
            writer.writeheader()
            writer.writerows(sample_data)

        print("Sample file 'sample_date_parser_testcases.csv' created!")

        # Process the sample file
        process_csv_file('sample_date_parser_testcases.csv')

def main():
    """Main function to run the date parser"""
    print("Date Parser - Rule-based text processing")
    print("=" * 50)

    # Run test cases
    test_date_parser()

    # Try to process the CSV file
    csv_filename = '/content/date_parser_testcases (1).csv'
    process_csv_file(csv_filename)

if __name__ == "__main__":
    main()

Date Parser - Rule-based text processing
=== DATE PARSER TEST CASES ===

Test 1: I went to London on 21st June, 2024
  Single date: 21/06/2024
  All dates: ['21/06/2024']

Test 2: The meeting is scheduled for June 21st, 2024
  Single date: 21/06/2024
  All dates: ['21/06/2024']

Test 3: Born on 15/03/1990
  Single date: 15/03/1990
  All dates: ['15/03/1990']

Test 4: Event date: 2024-12-25
  Single date: 25/12/2024
  All dates: ['25/12/2024']

Test 5: December 31st, 2023
  Single date: 31/12/2023
  All dates: ['31/12/2023']

Test 6: 5th of May 2022
  Single date: 01/05/2022
  All dates: ['01/05/2022']

Test 7: March 15th
  Single date: 15/03/2025
  All dates: ['15/03/2025']

Test 8: 15 March
  Single date: 15/03/2025
  All dates: ['15/03/2025']

Test 9: June 2024
  Single date: 01/06/2024
  All dates: ['01/06/2024']

Test 10: 2023
  Single date: 01/01/2023
  All dates: ['01/01/2023']

Test 11: Meeting on 1st Jan 2024 and follow-up on 15th Jan 2024
  Single date: 01/01/2024
  All dates: