In [5]:
import nltk
import re
from nltk.tokenize import word_tokenize

# 📝 Sample text
text = """
Visit our site at http://www.uxlabs.co.uk or https://uxlabs.co.uk.
You can also check www.google.com or contact us at mailto:someone@google.com.

Send an email to fred@gmail.com, but not to bob @ aol.com or steve or mary@aolcom!

Important dates: 23/01/2021, 23-Jan-2021, 2021/01/23, and 23-01-2021.
"""

# ✂️ Step 1: Tokenize the text into words using NLTK
tokens = word_tokenize(text)

# 🔍 Step 2: Define regular expressions

# URL pattern
url_pattern = re.compile(r"(http[s]?://\S+|www\.\S+|mailto:\S+@\S+)", re.I)

# Email pattern
email_pattern = re.compile(r"\b[A-Za-z0-9._%+-]+@[A-Za-z0-9.-]+\.[A-Za-z]{2,}\b")

# Date pattern
date_pattern = re.compile(r"(\d{2}/\d{2}/\d{4}|\d{2}-\w{3}-\d{4}|\d{4}/\d{2}/\d{2}|\d{2}-\d{2}-\d{4})")

# 🕵️‍♂️ Step 3: Search for matches from the tokens
urls = [token for token in tokens if url_pattern.match(token)]
emails = [token for token in tokens if email_pattern.match(token)]
dates = [token for token in tokens if date_pattern.match(token)]

# 🎯 Step 4: Print Results
print("URLs found:", urls)
print("Emails found:", emails)
print("Dates found:", dates)


URLs found: ['www.google.com']
Emails found: []
Dates found: ['23/01/2021', '23-Jan-2021', '2021/01/23', '23-01-2021']
