In [50]:
import re
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.stem.porter import PorterStemmer

tweet = "omg! I just 8 Red Lobster!🤭 So many #delicious items on their menu: @RedLobster https://www.redlobster.com/menu"
print("sample tweet:")
print(tweet)


sample tweet:
omg! I just 8 Red Lobster!🤭 So many #delicious items on their menu: @RedLobster https://www.redlobster.com/menu


In [51]:
# remove non-ASCII characters
tweet = re.sub(r'[^\x00-\x7F]+', ' ', tweet)
print(tweet)


omg! I just 8 Red Lobster!  So many #delicious items on their menu: @RedLobster https://www.redlobster.com/menu


In [52]:
# replace URLs, user mentions and hashtags
tweet = re.sub(r'http\S+', 'URL', tweet)
tweet = re.sub(r'@\w+', 'MENTION', tweet)
tweet = re.sub(r'#\w+', 'HASHTAG', tweet)
print(tweet)

omg! I just 8 Red Lobster!  So many HASHTAG items on their menu: MENTION URL


In [53]:
# remove numbers
tweet = re.sub(r'\d+', '', tweet)
print(tweet)

omg! I just  Red Lobster!  So many HASHTAG items on their menu: MENTION URL


In [54]:
# remove punctuation
tweet = re.sub(r'[^\w\s]', '', tweet)
print(tweet)

omg I just  Red Lobster  So many HASHTAG items on their menu MENTION URL


In [55]:
# lowercase
tweet = tweet.lower()
print(tweet)

omg i just  red lobster  so many hashtag items on their menu mention url


In [56]:
import nltk
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/mehwishahmed/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

Lemmatizing reduces words to their base or root form.

In [57]:
import nltk
from nltk.stem import WordNetLemmatizer

# download WordNet corpus
nltk.download('wordnet')
nltk.download('omw-1.4')

# given filtered tweet
filtered_tweet = ['omg', 'i', 'just', 'red', 'lobster', 'so', 'many', 'hashtag', 'items', 'on', 'their','menu','url']

# initialize WordNetLemmatizer
lemmatizer = WordNetLemmatizer()

# lemmatize each word in the filtered tweet
lemmatized_tweet = [lemmatizer.lemmatize(w) for w in filtered_tweet]

# results
print(lemmatized_tweet)


['omg', 'i', 'just', 'red', 'lobster', 'so', 'many', 'hashtag', 'item', 'on', 'their', 'menu', 'url']


[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/mehwishahmed/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/mehwishahmed/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


PorterStemmer() is an algorithm used for stemming English words. It’s part of the NLTK library.
stemmed_tweet is a list comprehension that applies the stemmer to each word in lemmatized_tweet.
stemmer.stem(w) applies the stemming process to each word w.
The result, stemmed_tweet, is a list of words where each word has been reduced to its stemmed form.
Finally, it prints out the list of stemmed words.
This step is often used in text preprocessing for machine learning models where the exact form of a word is less important than the type of action or idea it represents.

In [58]:
# stemming
stemmer = PorterStemmer()
stemmed_tweet = [stemmer.stem(w) for w in lemmatized_tweet]
print(stemmed_tweet)

['omg', 'i', 'just', 'red', 'lobster', 'so', 'mani', 'hashtag', 'item', 'on', 'their', 'menu', 'url']


In [59]:
# combine words back to sentence
preprocessed_tweet = ' '.join(stemmed_tweet)
print("\nfinal tweet:")
print(preprocessed_tweet)


final tweet:
omg i just red lobster so mani hashtag item on their menu url
