### Handling Unstructured Data with Python
**Description**: Extract structured data from unstructured text using Python.

**Steps**:
1. Load and analyze an unstructured text document.
2. Extract information using regex.

In [4]:
import re
import logging
import unittest # for unit tests
from datetime import datetime

# Configure logging
logging.basicConfig(level=logging.WARNING) # Consider logging.INFO or logging.DEBUG for more detail during development

def extract_email(text):
    """Extracts email addresses from text.  Returns None if no email is found, or is invalid"""
    email_pattern = r"[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}"
    match = re.search(email_pattern, text)
    if match:
        email = match.group(0)
        if validate_email(email):  # Added email validation
            return email
        else:
            logging.warning(f"Invalid email format found: {email}")
            return None
    else:
        logging.warning("No email found in text.")
        return None

def validate_email(email):
    """Validates email format using a more robust regex."""
    email_regex = r"^[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\.[a-zA-Z]{2,}$"
    return bool(re.match(email_regex, email))

def extract_phone_number(text):
  """Extracts phone numbers from text. Returns None if no phone number is found, or is invalid."""
  phone_pattern = r"\(?\d{3}\)?[-.\s]?\d{3}[-.\s]?\d{4}" #improved pattern
  match = re.search(phone_pattern, text)
  if match:
      phone_number = match.group(0)
      if validate_phone_number(phone_number):
          return phone_number
      else:
          logging.warning(f"Invalid phone number format: {phone_number}")
          return None
  else:
      logging.warning("No phone number found in text.")
      return None

def validate_phone_number(phone_number):
    """Validates phone number format.  Can be extended for more specific regional formats."""
    # Basic validation - adjust as needed for specific requirements
    phone_regex = r"^\(\d{3}\)[-.\s]?\d{3}[-.\s]?\d{4}$|^\d{3}[-.\s]?\d{3}[-.\s]?\d{4}$|^\d{10}$" # Allow (XXX) XXX-XXXX, XXX-XXX-XXXX, and XXXXXXXXXX

    return bool(re.match(phone_regex, phone_number))


def extract_name(text):
    """Extracts names from text. Returns None if no name is found."""
    # This is a simplified name extraction - can be significantly improved with NLP techniques
    name_pattern = r"[A-Z][a-z]+ [A-Z][a-z]+"
    match = re.search(name_pattern, text)
    if match:
        return match.group(0)
    else:
        logging.warning("No name found in text.")
        return None

def extract_date(text):
    """Extracts dates from text.  Returns None if no date is found, or is invalid"""
    date_pattern = r"\d{4}-\d{2}-\d{2}" #YYYY-MM-DD
    match = re.search(date_pattern, text)
    if match:
        date_string = match.group(0)
        try:
            date_object = datetime.strptime(date_string, "%Y-%m-%d") #validate & convert to date object
            return date_object
        except ValueError:
            logging.warning(f"Invalid date format found: {date_string}")
            return None
    else:
        logging.warning("No date found in text.")
        return None
# Unit Tests

class TestExtractionFunctions(unittest.TestCase):

    def test_extract_email_valid(self):
        self.assertEqual(extract_email("My email is test@example.com"), "test@example.com")

    def test_extract_email_invalid(self):
        self.assertIsNone(extract_email("My email is not an email"))
        self.assertIsNone(extract_email("My email is test@@example.com")) #invalid email

    def test_extract_email_none(self):
        self.assertIsNone(extract_email("No email here"))

    def test_extract_phone_number_valid(self):
        self.assertEqual(extract_phone_number("My number is 555-123-4567"), "555-123-4567")
        self.assertEqual(extract_phone_number("My number is (555) 123-4567"), "(555) 123-4567")
        self.assertEqual(extract_phone_number("My number is 5551234567"), "5551234567")

    def test_extract_phone_number_invalid(self):
        self.assertIsNone(extract_phone_number("My number is 123"))

    def test_extract_phone_number_none(self):
        self.assertIsNone(extract_phone_number("No phone number here"))

    def test_extract_name_valid(self):
        self.assertEqual(extract_name("My name is John Doe"), "John Doe")

    def test_extract_name_none(self):
        self.assertIsNone(extract_name("No name here"))

    def test_extract_date_valid(self):
        self.assertEqual(extract_date("The date is 2023-10-27"), datetime(2023, 10, 27))

    def test_extract_date_invalid(self):
        self.assertIsNone(extract_date("The date is 2023/10/27")) #wrong separator
        self.assertIsNone(extract_date("The date is 2023-33-27")) #invalid month

    def test_extract_date_none(self):
        self.assertIsNone(extract_date("No date here"))

if __name__ == '__main__':
    unittest.main() #Run the tests

#Example usage
text = "Contact John Doe at test@example.com or call (555) 123-4567. The date is 2023-11-15."
email = extract_email(text)
phone = extract_phone_number(text)
name = extract_name(text)
date = extract_date(text)

print(f"Email: {email}")
print(f"Phone: {phone}")
print(f"Name: {name}")
print(f"Date: {date}")

usage: ipykernel_launcher.py [-h] [-v] [-q] [--locals] [-f] [-c] [-b]
                             [-k TESTNAMEPATTERNS]
                             [tests ...]
ipykernel_launcher.py: error: argument -f/--failfast: ignored explicit argument '/home/vscode/.local/share/jupyter/runtime/kernel-v3772f8ee44c21399114f1f62c4e1771cd861dcda4.json'


SystemExit: 2