# In [172]:
import re

# Reusable regex fragments (all matching is case-insensitive, see below).
_WEEKDAY = r'(?:sunday|monday|tuesday|wednesday|thursday|friday|saturday)'
_MONTH = (
    r'(?:january|february|march|april|may|june|july|august|september'
    r'|october|november|december)'
)
_MONTH_ABBR = r'(?:jan|feb|mar|apr|may|jun|jul|aug|sep|oct|nov|dec)'
_DAY_NUM = r'(?:(?:[1-9]|1[0-9]|2[0-9]|3[0-1])(?:st|nd|rd|th)?)'
_YEAR = r'(?:\d{4})'

# Ordered from most to least specific.  Text consumed by an earlier pattern is
# blanked out before the later, looser patterns run, so e.g. the year inside
# "January 5th, 2015" is not reported a second time by the year-only pattern.
_DATE_PATTERNS = tuple(
    re.compile(pattern, re.IGNORECASE)
    for pattern in (
        # American holidays.  Longer names come before their prefixes
        # ("christmas day" before "christmas").  The historical misspelling
        # "colombus" is still accepted alongside the correct "columbus".
        r"(?:new year's day|martin luther king, jr\. day"
        r"|george washington's birthday|memorial day|independence day"
        r"|labor day|col[ou]mbus day|veterans day|thanksgiving day"
        r"|christmas day|christmas|new year's|thanksgiving)",
        # "[the] 5th of May[, 2020]"
        r'(?:the\s)?' + _DAY_NUM + r'\sof\s' + _MONTH + r'[,.]?\s?' + _YEAR + r'?',
        # "Friday the 13th[, 2020]"
        _WEEKDAY + r'\sthe\s' + _DAY_NUM + r'[,.]?\s?' + _YEAR + r'?\b',
        # "Friday, December 21st, 2012" (day optional, year required)
        _WEEKDAY + r',?\s' + _MONTH + r'\s' + _DAY_NUM + r'?[,.]?\s?' + _YEAR,
        # "January 5th[, 2015]"
        r'\b' + _MONTH + r'\s' + _DAY_NUM + r'[,.]?\s?' + _YEAR + r'?\b',
        # "Jan 5th[, 2015]"
        r'\b' + _MONTH_ABBR + r'\s' + _DAY_NUM + r'[,.]?\s?' + _YEAR + r'?\b',
        # M/D/YYYY, MM/DD/YYYY or two-digit year.  The lookarounds stop the
        # match from starting or ending in the middle of a longer number.
        r'(?<![\d/])(?:0?[1-9]|1[0-2])/(?:0?[1-9]|[12][0-9]|3[01])/'
        r'(?:\d{4}|\d{2})(?!\d)',
        # A bare four-digit year.  Digit lookarounds (rather than \b) reject
        # slices of longer numbers while still accepting "1980s".
        r'(?<!\d)\d{4}(?!\d)',
        # "Monday, 2pm"
        _WEEKDAY + r',?\s(?:1[0-2]|[1-9])(?:a\.m|p\.m|am|pm)',
        # "Tuesday afternoon"
        _WEEKDAY + r'\s(?:morning|afternoon|evening|night)',
        # A bare weekday name.
        _WEEKDAY,
        # A bare month name; \b keeps "maybe" or "marching" from matching.
        r'\b' + _MONTH + r'\b',
    )
)


def _find_dates(text):
    """Return the date-like substrings of ``text``, grouped by pattern order."""
    dates = []
    for pattern in _DATE_PATTERNS:
        found = [match.group(0) for match in pattern.finditer(text)]
        if found:
            dates.extend(found)
            # Blank out what was just reported in a single pass.  A space
            # (not an empty string) keeps the text on either side of a match
            # from fusing into a new token for a later pattern.
            text = pattern.sub(' ', text)
    return dates


def extract_dates(input_file, output_file):
    """Extract date expressions from a text file and write them to another.

    The input is scanned with a fixed list of case-insensitive patterns
    (holidays, written-out dates, slash dates, years, weekdays, months),
    from most to least specific.  Matches are returned grouped by pattern,
    not in order of appearance, exactly as they appear in the text.

    Args:
        input_file: Path of the text file to scan.
        output_file: Path of the file to (over)write, one match per line.

    Returns:
        The list of matched strings, in the same order as written to
        ``output_file``.

    Raises:
        OSError: If either file cannot be opened.
    """
    with open(input_file, 'r') as f_in:
        text = f_in.read()

    dates = _find_dates(text)

    # The output is opened only after the input was read and processed, so a
    # failure while reading leaves any existing output file untouched.
    with open(output_file, 'w') as f_out:
        f_out.writelines(date + '\n' for date in dates)

    return dates

def main():
    """Run the extraction on the default input file and print the matches."""
    extracted_dates = extract_dates(
        input_file="../Data/Input/input.txt",
        output_file="../Data/Output/output.txt",
    )
    print(extracted_dates)

# Run the extraction only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()


# Recorded output of the cell above:
# ['Labor Day', 'Memorial day', 'the 13th of May. ', 'the 1st of January, 2020', 'the 5th of May', 'Friday, December 21st, 2012', 'January 5th, 2015', 'january 5th, 2015', 'January 15, 2029', 'May 13, 2007', 'September 11, 2001', 'October 3rd ', 'November 29th ', 'May 5th', 'Jan 5th, 2015', 'Jan 5th', 'jan 5th, 2015', '1/1/2020', '2/1/2020', '2015', '1935', '1948', 'Monday, 2pm', 'Tuesday afternoon', 'monday', 'Sunday', 'Monday', 'Monday', 'Monday', 'January', 'May', 'May', 'June', 'June']
