# NLP preprocessing techniques

## Lowercasing

In [1]:
# Sample sentence containing upper- and lowercase letters.
text = "Hello!!This is NLP"

# Fold every cased character to lowercase; punctuation and spacing are untouched.
lower_text = str.lower(text)
print(lower_text)

hello!!this is nlp


## Removing Punctuation

In [2]:
import string

# Sample sentence with a comma, an apostrophe and repeated terminal marks.
text = "Hello,there! How's it going??"

# Translation table mapping every ASCII punctuation character to None,
# which tells str.translate to delete it.
punct_table = str.maketrans(dict.fromkeys(string.punctuation))
text_no_punc = text.translate(punct_table)
print(text_no_punc)

Hellothere Hows it going


## Removing special characters

In [3]:
import re

# Sample sentence sprinkled with symbols (@, !, #, $, .).
text = "Hello @NLP! #MachineLearning $100 is amazing."

# Matches any single character that is not an ASCII letter, a digit or whitespace.
non_alnum_pattern = re.compile(r"[^a-zA-Z0-9\s]")

# Delete every match, leaving only letters, digits and whitespace.
text_clean = non_alnum_pattern.sub("", text)
print(text_clean)


Hello NLP MachineLearning 100 is amazing


## Removing stopwords

In [4]:
import string

import nltk
from nltk.corpus import stopwords

# Fetch the English stopword corpus and load it into a set for fast membership tests.
nltk.download("stopwords")
stop_words = set(stopwords.words("english"))

text = "This is an example of removing stopwords from a sentence."

# Tokens come from a plain whitespace split, so punctuation stays attached to
# them. Strip leading/trailing punctuation (for the comparison only) so that a
# stopword written as "is," or "it." is still recognised; tokens that are kept
# retain their original form, punctuation included.
filtered_text = " ".join(
    word
    for word in text.split()
    if word.strip(string.punctuation).lower() not in stop_words
)
print(filtered_text)


example removing stopwords sentence.


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


## Tokenization

In [5]:
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize

# Download the Punkt tokenizer resources before tokenizing.
for resource in ("punkt", "punkt_tab"):
    nltk.download(resource)

text = "Tokenization is an important step in NLP. It helps in processing text efficiently."

# Split the same text at two granularities: sentences and individual tokens.
sent_tokens = sent_tokenize(text)
word_tokens = word_tokenize(text)

print("Word Tokens:", word_tokens)
print("Sentence Tokens:", sent_tokens)


Word Tokens: ['Tokenization', 'is', 'an', 'important', 'step', 'in', 'NLP', '.', 'It', 'helps', 'in', 'processing', 'text', 'efficiently', '.']
Sentence Tokens: ['Tokenization is an important step in NLP.', 'It helps in processing text efficiently.']


[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


## Stemming

In [6]:
from nltk.stem import PorterStemmer

# Words to reduce to their stems.
words = ["running", "flies", "easily", "playing"]
stemmer = PorterStemmer()

# Apply the Porter stemmer to each word in turn, preserving order.
stemmed_words = list(map(stemmer.stem, words))
print(stemmed_words)


['run', 'fli', 'easili', 'play']


## Lemmatization

In [7]:
# Import nltk here so the cell does not depend on an earlier cell having
# imported it before nltk.download is called.
import nltk
from nltk.stem import WordNetLemmatizer

# Fetch the WordNet data used by the lemmatizer.
nltk.download("wordnet")

lemmatizer = WordNetLemmatizer()
words = ["running", "flies", "easily", "playing"]

# pos="v" asks the lemmatizer to treat every word as a verb.
lemmatized_words = [lemmatizer.lemmatize(word, pos="v") for word in words]
print(lemmatized_words)


[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


['run', 'fly', 'easily', 'play']
