Text Preprocessing Techniques - Basic

In [126]:
# Sample text used by the demos below. It deliberately mixes emojis, HTML tags,
# a URL, misspellings and inconsistent casing. The emojis must be real Unicode
# emoji characters (not mis-decoded bytes) for the emoji-handling cells to
# recognise them.
data = "The quick brown fox 🦊 jumps <p>over the lazy dog. its been such a long day, I realy knead a break </p> 😴. Did you sea what happen'd at the store today? check out at: 'https://checknow.com' I cant beleive it, totaly wild! 🎉"

In [127]:
# Lowercasing
# Work on a lowercased copy (data_1); the raw `data` string is left untouched
# because later cells (punctuation via translate, tokenization) start from it again.
data_1 = data.lower()
data_1

"the quick brown fox 🦊 jumps <p>over the lazy dog. its been such a long day, i realy knead a break </p> 😴. did you sea what happen'd at the store today? check out at: 'https://checknow.com' i cant beleive it, totaly wild! 🎉"

In [128]:
# Remove HTML tags
import re
def remove_html_tags(text):
    """Strip HTML/XML-style tags from ``text``, keeping the text between them.

    Args:
        text: Input string that may contain markup such as ``<p>`` or ``</p>``.

    Returns:
        The string with every ``<...>`` span removed. Whitespace that
        surrounded a removed tag is left as is.
    """
    # ``[^>]*`` (rather than ``.*?``) also crosses line breaks, so a tag whose
    # attributes wrap onto the next line is removed as well.
    clean_text = re.sub(r'<[^>]*>', '', text)
    return clean_text

data_1 = remove_html_tags(data_1)
data_1

"the quick brown fox 🦊 jumps over the lazy dog. its been such a long day, i realy knead a break  😴. did you sea what happen'd at the store today? check out at: 'https://checknow.com' i cant beleive it, totaly wild! 🎉"

In [129]:
# Remove URLs
def remove_urls(text):
    """Remove web addresses from ``text``.

    A URL is recognised when it starts, at a word boundary, with ``http://``,
    ``https://`` or ``www.`` (in any letter case) and it extends up to the next
    whitespace character. Anything glued to the end of the URL, such as a
    closing quote or bracket, is therefore removed together with it.

    Args:
        text: Input string.

    Returns:
        The string with every recognised URL deleted. Surrounding whitespace
        is left as is.
    """
    # Requiring the scheme separator / the dot after "www" keeps ordinary words
    # that merely begin with "http" or "www" (e.g. "httpd") from being deleted.
    clean_text = re.sub(r'\b(?:https?://|www\.)\S+', '', text, flags=re.IGNORECASE)
    return clean_text

data_1 = remove_urls(data_1)
data_1

"the quick brown fox 🦊 jumps over the lazy dog. its been such a long day, i realy knead a break  😴. did you sea what happen'd at the store today? check out at: ' i cant beleive it, totaly wild! 🎉"

In [130]:
# Remove punctuation - Manually
import string
# Characters treated as punctuation by the punctuation-removal helpers.
exclude = string.punctuation


def remove_punctuation(text):
    """Return ``text`` with every character listed in ``exclude`` dropped.

    This is the straightforward character-by-character version; all other
    characters are kept in their original order.
    """
    kept = [ch for ch in text if ch not in exclude]
    return ''.join(kept)

data_1 = remove_punctuation(data_1)
data_1

'the quick brown fox 🦊 jumps over the lazy dog its been such a long day i realy knead a break  😴 did you sea what happend at the store today check out at  i cant beleive it totaly wild 🎉'

In [131]:
data_2 = data.lower()
# Remove punctuation - Using translate (faster)
def remv_punct_translate(text):
    """Strip punctuation from ``text`` using ``str.translate``.

    A translation table that deletes every character in the module-level
    ``exclude`` string is built and applied to the whole text in one call.
    """
    return text.translate(str.maketrans('', '', exclude))

remv_punct_translate(data_2)

'the quick brown fox 🦊 jumps pover the lazy dog its been such a long day i realy knead a break p 😴 did you sea what happend at the store today check out at httpschecknowcom i cant beleive it totaly wild 🎉'

In [132]:
# Chat Words Treatment
# Chat abbreviation -> expansion. Keys are lowercase.
chat_words_dict = {
    "u": "you",
    "ur": "your",
    "btw": "by the way",
    "idk": "I don't know",
    "imo": "in my opinion",
    "brb": "be right back",
    "r": "are",
}

chat_word_text = "idk what u r doing btw"
def replace_chat_words(text):
    """Expand chat abbreviations found in ``text``.

    The text is split on whitespace and each word is looked up in
    ``chat_words_dict``: first exactly as written, then in lowercase, so
    "BTW" and "Idk" are expanded just like "btw" and "idk". Words with no
    entry are kept unchanged.

    Args:
        text: Input string.

    Returns:
        The words, expanded where possible, joined by single spaces (runs of
        whitespace in the input are collapsed by the split/join).
    """
    words = text.split()
    new_words = [
        chat_words_dict.get(word, chat_words_dict.get(word.lower(), word))
        for word in words
    ]
    return ' '.join(new_words)

replace_chat_words(chat_word_text)

"I don't know what you are doing by the way"

In [133]:
# Spelling Correction
data_3 = "I cant beleive it, totaly wild! I realy knead a break. Did you sea what happen'd at the store today?"
from textblob import TextBlob
def correct_spelling(text):
    """Return ``text`` after running TextBlob's spelling correction on it.

    The corrected blob is converted back to a plain ``str``.
    """
    # NOTE: review the result before relying on it -- in the sample run shown
    # for this cell 'cant' became 'can' and 'knead' became 'head'.
    corrected = TextBlob(text).correct()
    return str(corrected)

correct_spelling(data_3)

"I can believe it, total wild! I really head a break. Did you sea what happen'd at the store today?"

In [134]:
# Removing the Stop Words
from nltk.corpus import stopwords
stopwords_list = stopwords.words('english')
def remove_stop_words(text):
    """Drop stop words from ``text``.

    Args:
        text: Input string; it is split on whitespace.

    Returns:
        The words that are not in ``stopwords_list``, joined by single spaces.
        Words that are kept retain their original casing.
    """
    # A set gives constant-time membership tests instead of scanning the whole
    # list once per word.
    stop_set = set(stopwords_list)
    words = text.split()
    # Compare in lowercase so capitalised stop words ("The", "I") are dropped
    # too -- assumes the entries of stopwords_list are lowercase.
    filtered_words = [word for word in words if word.lower() not in stop_set]
    return ' '.join(filtered_words)

remove_stop_words(data_1)

'quick brown fox 🦊 jumps lazy dog long day realy knead break 😴 sea happend store today check cant beleive totaly wild 🎉'

In [135]:
# handling Emojis - remove
import emoji
def remove_emojis(text):
    """Return ``text`` with all emojis removed.

    ``emoji.replace_emoji`` is used so that an emoji made of several code
    points (ZWJ sequences, flags, variation-selector forms) is removed as one
    unit. Filtering one character at a time against ``emoji.EMOJI_DATA`` can
    leave the joiner/selector characters of such emojis behind.

    Args:
        text: Input string.

    Returns:
        The string without emojis. Whitespace around a removed emoji is kept.
    """
    removed_emoji_text = emoji.replace_emoji(text, replace='')
    return removed_emoji_text

remove_emojis(data_1)

'the quick brown fox  jumps over the lazy dog its been such a long day i realy knead a break   did you sea what happend at the store today check out at  i cant beleive it totaly wild '

In [136]:
# handling Emojis - convert to text
def emojis_to_text(text):
    """Replace each emoji in ``text`` with its ``:name:`` form via ``emoji.demojize``."""
    described = emoji.demojize(text)
    return described

emojis_to_text(data_1)

'the quick brown fox :fox: jumps over the lazy dog its been such a long day i realy knead a break  :sleeping_face: did you sea what happend at the store today check out at  i cant beleive it totaly wild :party_popper:'

In [137]:
# Sentence Tokenization
# data_4 is just another name for the raw, uncleaned `data` string, so the HTML
# tags, the URL and the emojis are still present in the sentences shown below.
data_4 = data
from nltk.tokenize import sent_tokenize
sentences = sent_tokenize(data_4)
sentences

['The quick brown fox 🦊 jumps <p>over the lazy dog.',
 'its been such a long day, I realy knead a break </p> 😴.',
 "Did you sea what happen'd at the store today?",
 "check out at: 'https://checknow.com' I cant beleive it, totaly wild!",
 '🎉']

In [138]:
# Word Tokenization
# Tokenizes the same raw text (data_4); the resulting `words` list is reused by
# the stemming and lemmatization cells that follow.
from nltk.tokenize import word_tokenize
words = word_tokenize(data_4)
words

['The',
 'quick',
 'brown',
 'fox',
 '🦊',
 'jumps',
 '<',
 'p',
 '>',
 'over',
 'the',
 'lazy',
 'dog',
 '.',
 'its',
 'been',
 'such',
 'a',
 'long',
 'day',
 ',',
 'I',
 'realy',
 'knead',
 'a',
 'break',
 '<',
 '/p',
 '>',
 '😴',
 '.',
 'Did',
 'you',
 'sea',
 'what',
 'happen',
 "'d",
 'at',
 'the',
 'store',
 'today',
 '?',
 'check',
 'out',
 'at',
 ':',
 "'https",
 ':',
 '//checknow.com',
 "'",
 'I',
 'cant',
 'beleive',
 'it',
 ',',
 'totaly',
 'wild',
 '!',
 '🎉']

In [139]:
# Stemming
# Reduce every token in `words` to its Porter stem. In the output shown for this
# cell the stems are lowercased and are not always dictionary words
# ('lazi', 'reali').
from nltk.stem import PorterStemmer
ps = PorterStemmer()
stemmed_words = list(map(ps.stem, words))
stemmed_words

['the',
 'quick',
 'brown',
 'fox',
 '🦊',
 'jump',
 '<',
 'p',
 '>',
 'over',
 'the',
 'lazi',
 'dog',
 '.',
 'it',
 'been',
 'such',
 'a',
 'long',
 'day',
 ',',
 'i',
 'reali',
 'knead',
 'a',
 'break',
 '<',
 '/p',
 '>',
 '😴',
 '.',
 'did',
 'you',
 'sea',
 'what',
 'happen',
 "'d",
 'at',
 'the',
 'store',
 'today',
 '?',
 'check',
 'out',
 'at',
 ':',
 "'http",
 ':',
 '//checknow.com',
 "'",
 'i',
 'cant',
 'beleiv',
 'it',
 ',',
 'totali',
 'wild',
 '!',
 '🎉']

In [140]:
# Lemmatization
# Look up the WordNet lemma of every token in `words`. No part-of-speech tag is
# passed; in the output shown for this cell the original casing is preserved and
# forms such as 'been' come back unchanged.
from nltk.stem import WordNetLemmatizer
wnl = WordNetLemmatizer()
lemmatized_words = list(map(wnl.lemmatize, words))
lemmatized_words

['The',
 'quick',
 'brown',
 'fox',
 '🦊',
 'jump',
 '<',
 'p',
 '>',
 'over',
 'the',
 'lazy',
 'dog',
 '.',
 'it',
 'been',
 'such',
 'a',
 'long',
 'day',
 ',',
 'I',
 'realy',
 'knead',
 'a',
 'break',
 '<',
 '/p',
 '>',
 '😴',
 '.',
 'Did',
 'you',
 'sea',
 'what',
 'happen',
 "'d",
 'at',
 'the',
 'store',
 'today',
 '?',
 'check',
 'out',
 'at',
 ':',
 "'https",
 ':',
 '//checknow.com',
 "'",
 'I',
 'cant',
 'beleive',
 'it',
 ',',
 'totaly',
 'wild',
 '!',
 '🎉']