Build a date parser using basic text processing and rules. (No ML models)

- Dataset:

[date_parser_testcases.csv](https://prod-files-secure.s3.us-west-2.amazonaws.com/2ad6026b-7cdc-4780-99a4-6e4e0034cf90/aabbc537-a7c4-478b-ba8a-2afc146a8d23/date_parser_testcases.csv)

- Given a piece of text, extract the day, month and year info and present it in DD/MM/YYYY format.
    - Example: “I went to London on 21st June, 2024” → 21/06/2024
- Use only default python packages and regex (no ML models OR external libraries)

In [1]:
import numpy as np
import pandas as pd
import re
from datetime import datetime

In [2]:
df = pd.read_csv('date_parser_testcases.csv')
df

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021
...,...,...
95,"We celebrate Independence Day on 2023-07-04, a...",04/07/2023
96,The final date for submission is 30th November...,30/11/2022
97,"The annual conference is on 15th October 2023,...",15/10/2023
98,"His birthdate, noted as 1990-05-20, is in the ...",20/05/1990


In [4]:
df.iloc[[50]] 
df.drop(50,inplace=True) 
df

Unnamed: 0,Input,Expected Output
0,"The event will take place on March 5, 2023.",05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022
3,We met on 1st of January 2000.,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021
...,...,...
95,"We celebrate Independence Day on 2023-07-04, a...",04/07/2023
96,The final date for submission is 30th November...,30/11/2022
97,"The annual conference is on 15th October 2023,...",15/10/2023
98,"His birthdate, noted as 1990-05-20, is in the ...",20/05/1990


Trying out all Inputs one by one:

In [5]:
s1 = 'The event will take place on March 5, 2023.'
re1 = r'([A-Za-z]+)\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})'

match = re.search(re1,s1)
print(match.groups())
day = match.group(2).zfill(2)
month_str = match.group(1)
year = match.group(3)
month = datetime.strptime(month_str, "%B").strftime("%m")
print(f"{day}/{month}/{year}")

('March', '5', '2023')
05/03/2023


In [6]:
s2 = 'We met on 1st of January, 2000.'
re2 = r'(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s*,?\s*(\d{4})'
match = re.search(re2,s2)
print(match.groups())
day = match.group(1).zfill(2)
month_str = match.group(3)
year = match.group(4)
month = datetime.strptime(month_str, "%B").strftime("%m")
print(f"{day}/{month}/{year}")

('1', 'st', 'January', '2000')
01/01/2000


In [7]:
s3 = 'The project started on 5/6/19.'
r3 = r'(\d{1,2})/(\d{1,2})/(\d{2})'
match = re.search(r3,s3)
print(match.groups())
day = match.group(1).zfill(2)
month = match.group(2)
year = match.group(3)
year = "20" + year if int(year) < 100 else year
# month = datetime.strptime(month_str, "%B").strftime("%m")
print(f"{day}/{month}/{year}")

('5', '6', '19')
05/6/2019


In [8]:
s4 = "Let's catch up on 02.04.2022."
r4 = r'(\d{2}).(\d{2}).(\d{4})' 
match = re.search(r4,s4)

print(match.groups())
day = match.group(3).zfill(2)
month = match.group(2)
year = match.group(1)
print(f"{day}/{month}/{year}")

('02', '04', '2022')
2022/04/02


In [9]:
import re
from datetime import datetime

# Regex pattern updated to handle both full and abbreviated month names
s5 = 'The concert is scheduled for 25th Dec 2024'
r5 = r'(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s*,?\s*(\d{4})'
match = re.search(r5, s5)

if match:
    print(match.groups())
    day = match.group(1).zfill(2)
    month_str = match.group(3)
    year = match.group(4)
    
    # Handle both full and abbreviated month names
    try:
        month = datetime.strptime(month_str, "%b").strftime("%m")  # Short month format 
    except ValueError:
        try:
            month = datetime.strptime(month_str, "%B").strftime("%m")  # Full month format
        except ValueError:
            month = "00" 
    
    print(f"{day}/{month}/{year}")
else:
    print("No match found.")

('25', 'th', 'Dec', '2024')
25/12/2024


Creating a function to extract and format all the dates

In [10]:
def Format(text):
    
    # all the combinations of Regex patterns
    patterns = [
        (r'([A-Za-z]+)\s+(\d{1,2})(?:st|nd|rd|th)?,?\s+(\d{4})', "%B %d, %Y"),
        (r'(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s*,?\s*(\d{4})', "%d of %B %Y"),
        (r'(\d{1,2})/(\d{1,2})/(\d{4})', "%d/%m/%Y"),
        (r'(\d{4})-(\d{2})-(\d{2})', "%Y-%m-%d"),
        (r'(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s*,?\s*(\d{4})', "%d %B, %Y"),
        (r'(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+)\s*?\s*(\d{4})', "%d %B, %Y"),
        (r'(\d{4}).(\d{2}).(\d{2})',"%Y-%m-%d"),
        (r'(\d{2}).(\d{2}).(\d{4})',"%d.%m.%Y"),
        (r'(\d{1,2})/(\d{1,2})/(\d{2})','%d/%m/%Y'),                
        (r'(\d{1,2})(st|nd|rd|th)?\s+([A-Za-z]+),?\s+including\s+(\d{4})', "%d %B %Y"), # Pattern for '25th Dec, including 2024'        
        (r'(\d{1,2})(st|nd|rd|th)?\s+of\s+([A-Za-z]+)\s+every\s+year,\s+including\s+(\d{4})', "%d %B %Y")# Pattern for '4th of July including 2022'
    ]

    for pattern, date_format in patterns:
        match = re.search(pattern, text)
        if match:
            if len(match.groups()) == 4: # Eg: "15th September, 2020"
                day = match.group(1).zfill(2)
                month_str = match.group(3)
                year = match.group(4)
                year = "20" + year if int(year) < 100 else year
                # For full and abbreviated names
                try:
                    month = datetime.strptime(month_str, "%B").strftime("%m")
                except ValueError:
                    try:
                        month = datetime.strptime(month_str, "%b").strftime("%m")
                    except ValueError:
                        month = "00"  
                        
            elif date_format == "%d/%m/%Y":                
                day, month, year = match.groups()
                year = "20" + year if int(year) < 100 else year
                if int(month) > 12:
                    day, month = month, day  # Swapping if month is invalid
                    
            elif date_format == "%Y-%m-%d":                
                year, month, day = match.groups()
                year = "20" + year if int(year) < 100 else year
                
            elif date_format == "%d.%m.%Y":                
                day, month, year = match.groups()
                year = "20" + year if int(year) < 100 else year
                
            else:
                # Other patterns
                day = match.group(2).zfill(2) if match.group(2) else match.group(1).zfill(2)
                month_str = match.group(3) if 'of' in pattern else match.group(1)
                year = match.group(4) if 'of' in pattern else match.group(3)
                year = "20" + year if int(year) < 100 else year
                
                # Attempt to parse the month with full or abbreviated names
                try:
                    month = datetime.strptime(month_str, "%B").strftime("%m")
                except ValueError:
                    try:
                        month = datetime.strptime(month_str, "%b").strftime("%m")
                    except ValueError:
                        month = "00"              

            formatted_date = f"{day.zfill(2)}/{month.zfill(2)}/{year}"
            return formatted_date
    return None 

df['Parsed Date'] = df['Input'].apply(Format)
df

Unnamed: 0,Input,Expected Output,Parsed Date
0,"The event will take place on March 5, 2023.",05/03/2023,05/03/2023
1,Her birthday is on 07/08/1990.,07/08/1990,07/08/1990
2,The deadline is 2022-12-31.,31/12/2022,31/12/2022
3,We met on 1st of January 2000.,01/01/2000,01/01/2000
4,"The concert is scheduled for 15th September, 2...",15/09/2021,15/09/2021
...,...,...,...
95,"We celebrate Independence Day on 2023-07-04, a...",04/07/2023,04/07/2023
96,The final date for submission is 30th November...,30/11/2022,30/11/2022
97,"The annual conference is on 15th October 2023,...",15/10/2023,15/10/2023
98,"His birthdate, noted as 1990-05-20, is in the ...",20/05/1990,20/05/1990


In [11]:
df['Match'] = df['Parsed Date'] == df['Expected Output']

# Calculating accuracy
accuracy = df['Match'].mean() * 100
print(f"Accuracy: {accuracy:.2f}%")

Accuracy: 100.00%
