In [8]:
# Importing necessary libraries
import re

import nltk
import spacy
from nltk.corpus import stopwords, wordnet
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize

# Download NLTK data.
# nltk.download() reports packages that are already installed as up-to-date,
# so re-running this cell is safe.
for resource in ('punkt', 'wordnet', 'stopwords', 'omw-1.4'):
    nltk.download(resource)

# Load spaCy English model
nlp = spacy.load("en_core_web_sm")

# Load dataset text from file.
# The path is kept in one named constant so it is easy to repoint when the
# notebook runs somewhere other than the original environment.
DATASET_PATH = "/content/mixed_emails_numbers_hashtags_dataset.txt"

# The encoding is given explicitly so the decoded text does not depend on the
# platform's locale default.
# NOTE(review): assumes the dataset file is UTF-8 (or plain ASCII) — confirm.
with open(DATASET_PATH, "r", encoding="utf-8") as file:
    text = file.read()

print("\n========== DATASET ==========\n")
# Preview only the first 1000 characters to keep the cell output short.
print(text[:1000])


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!




#Reminder History watch course collection someone where material early. Krista Patel (carolyn36@mann.net) confirmed the appointment at 969-821-0810x93513 on 2005-07-11 22:49. #Update Brother themselves be kind environmental grow which show room body. Call us at 379.828.0084 or visit our office at 321 Hayden Key Suite 642
New Matthew, MN 92506. You can reach out to Joshua Davis at freemanmarvin@ferrell.com for more information.

Call us at 2449701358 or visit our office at 49946 Derrick Loop
Christopherview, VT 95861. You can reach out to Lindsay Moore at baldwinsarah@daugherty.com for more information. Andrea Larson (garzajerry@yahoo.com) confirmed the appointment at 878.797.6239 on 1991-08-14 13:12. Call us at 010-663-7360 or visit our office at 39968 Donald Port
Leonardberg, CT 88381. Call us at +1-253-488-7482x2450 or visit our office at 66194 Tyler Islands
Lake Gary, UT 05767. Call us at (903)935-6013x3404 or visit our office at 337 Wheeler Well
Caroltown, AZ 61999. Paul Sullivan

In [9]:
print("\n========== TEXT PREPROCESSING (NLTK & spaCy) ==========\n")

# --- NLTK tokenization -------------------------------------------------------
# Split the raw dataset text into tokens; only the first 50 are printed so the
# cell output stays readable.
tokens = word_tokenize(text)
print("Tokens:\n", tokens[:50])

# --- NLTK stemming -----------------------------------------------------------
# Apply the Porter stemmer to every token.
stemmer = PorterStemmer()
stemmed_words = list(map(stemmer.stem, tokens))
print("\nStemmed Words:\n", stemmed_words[:50])

# --- NLTK lemmatization ------------------------------------------------------
# Each token is lemmatized without a part-of-speech argument, i.e. with the
# lemmatizer's default.
lemmatizer = WordNetLemmatizer()
lemmatized_words_nltk = list(map(lemmatizer.lemmatize, tokens))
print("\nLemmatized Words (NLTK):\n", lemmatized_words_nltk[:50])

# --- spaCy lemmatization -----------------------------------------------------
# Run the loaded spaCy pipeline over the full text and collect the lemma of
# each token it produces.
doc = nlp(text)
lemmatized_words_spacy = [spacy_token.lemma_ for spacy_token in doc]
print("\nLemmatized Words (spaCy):\n", lemmatized_words_spacy[:50])




Tokens:
 ['#', 'Reminder', 'History', 'watch', 'course', 'collection', 'someone', 'where', 'material', 'early', '.', 'Krista', 'Patel', '(', 'carolyn36', '@', 'mann.net', ')', 'confirmed', 'the', 'appointment', 'at', '969-821-0810x93513', 'on', '2005-07-11', '22:49', '.', '#', 'Update', 'Brother', 'themselves', 'be', 'kind', 'environmental', 'grow', 'which', 'show', 'room', 'body', '.', 'Call', 'us', 'at', '379.828.0084', 'or', 'visit', 'our', 'office', 'at', '321']

Stemmed Words:
 ['#', 'remind', 'histori', 'watch', 'cours', 'collect', 'someon', 'where', 'materi', 'earli', '.', 'krista', 'patel', '(', 'carolyn36', '@', 'mann.net', ')', 'confirm', 'the', 'appoint', 'at', '969-821-0810x93513', 'on', '2005-07-11', '22:49', '.', '#', 'updat', 'brother', 'themselv', 'be', 'kind', 'environment', 'grow', 'which', 'show', 'room', 'bodi', '.', 'call', 'us', 'at', '379.828.0084', 'or', 'visit', 'our', 'offic', 'at', '321']

Lemmatized Words (NLTK):
 ['#', 'Reminder', 'History', 'watch', 'cou

In [10]:

print("\n========== REGULAR EXPRESSIONS ==========\n")

# Extracting Email Addresses
# Local part, "@", then two or more dot-separated domain labels. Every dot in
# the domain must be followed by at least one label character, so a
# sentence-ending period ("... write to a@b.com.") is not captured as part of
# the address.
EMAIL_PATTERN = r'[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+'
emails = re.findall(EMAIL_PATTERN, text)
print("Email Addresses Found:\n", emails)

# Extracting Phone Numbers
# Covers the 10-digit formats that appear in the dataset:
#   2449701358, 010-663-7360, 379.828.0084, (903)935-6013,
#   +1-253-488-7482, each optionally followed by an "x<digits>" extension.
# The fixed 3-3-4 digit grouping keeps timestamps such as "2005-07-11 22:49"
# from being reported as phone numbers.
# Only non-capturing groups are used so findall() returns the whole match.
PHONE_PATTERN = re.compile(
    r'''
    (?<!\w)                      # not glued to a preceding letter or digit
    (?:(?:\+?1|001)[-. ]?)?      # optional country prefix: 1, +1 or 001
    (?:\(\d{3}\)|\d{3})[-. ]?    # area code, optionally in parentheses
    \d{3}[-. ]?                  # exchange
    \d{4}                        # line number
    (?:x\d+)?                    # optional extension
    (?!\d)                       # not followed by further digits
    ''',
    re.VERBOSE,
)
phones = PHONE_PATTERN.findall(text)
print("\nPhone Numbers Found:\n", phones)

# Extracting Hashtags
# A "#" immediately followed by one or more word characters.
hashtags = re.findall(r'#\w+', text)
print("\nHashtags Found:\n", hashtags)



Email Addresses Found:
 ['carolyn36@mann.net', 'freemanmarvin@ferrell.com', 'baldwinsarah@daugherty.com', 'garzajerry@yahoo.com', 'jennifer01@gordon.com', 'cthomas@yahoo.com', 'thuynh@yahoo.com', 'coleerin@yahoo.com', 'charles45@hotmail.com', 'stephanie14@buchanan.com', 'jacquelinephillips@gmail.com', 'kaylasmith@hall.info', 'crystal34@barnett-carroll.info', 'hopkinselizabeth@mcintosh.com', 'carsonamanda@klein.info', 'dwilson@hotmail.com', 'cheneric@yahoo.com', 'patrickcoleman@barnes.com', 'elizabeth38@yahoo.com', 'crystalrivers@wilson.com', 'whitedebbie@carey.org', 'hmiller@hotmail.com', 'christensenteresa@yahoo.com', 'daniel32@yahoo.com', 'tylergarcia@gmail.com', 'copelandkelly@mcclure.info', 'jmason@yahoo.com', 'dschroeder@jones.com', 'christy71@hodges.com', 'jenkinsedward@gmail.com', 'ucraig@guerra.com', 'vstevenson@gmail.com', 'tonifrey@welch.net', 'millsronald@mcdonald-parrish.com', 'csmith@wilson.com', 'sara01@hotmail.com', 'harrisshawn@yahoo.com', 'bcollins@gmail.com', 'willi