**File:** Text cleaning.ipynb <br>
**Author:** Prem kumar Kamasani <br>
**Date:** 05/21/2024 <br>
**Description:** This python script shows the text cleaning techniques using NLKT library.
<ol>
    <li>Tokenization</li>
            <li>POS tagging and stop words removal.</li>
            <li>Stemming</li>
            <li>Lemmatization</li>
 </ol>

## Section 1
### Tokenization: 
<p> Breaking down a document or text into smaller units (tokens). </p>

In [2]:
import nltk

In [3]:
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\premk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.


True

In [25]:
text = "Natural language processing (NLP) is a subfield of artificial intelligence (AI) that allows computers to understand and process human language, both spoken and written. NLP has been around for over 50 years and has many real-world applications, including: Search engines, Business intelligence, Medical research, Language translation, and Chatbots."

In [26]:
tokens = nltk.word_tokenize(text)

In [27]:
tokens

['Natural',
 'language',
 'processing',
 '(',
 'NLP',
 ')',
 'is',
 'a',
 'subfield',
 'of',
 'artificial',
 'intelligence',
 '(',
 'AI',
 ')',
 'that',
 'allows',
 'computers',
 'to',
 'understand',
 'and',
 'process',
 'human',
 'language',
 ',',
 'both',
 'spoken',
 'and',
 'written',
 '.',
 'NLP',
 'has',
 'been',
 'around',
 'for',
 'over',
 '50',
 'years',
 'and',
 'has',
 'many',
 'real-world',
 'applications',
 ',',
 'including',
 ':',
 'Search',
 'engines',
 ',',
 'Business',
 'intelligence',
 ',',
 'Medical',
 'research',
 ',',
 'Language',
 'translation',
 ',',
 'and',
 'Chatbots',
 '.']

In [28]:
# Above mentioned approch considers punctuations as a token.
# This approch will remove punctuations.
from nltk.tokenize import RegexpTokenizer

tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(text)

In [29]:
print(tokens)

['Natural', 'language', 'processing', 'NLP', 'is', 'a', 'subfield', 'of', 'artificial', 'intelligence', 'AI', 'that', 'allows', 'computers', 'to', 'understand', 'and', 'process', 'human', 'language', 'both', 'spoken', 'and', 'written', 'NLP', 'has', 'been', 'around', 'for', 'over', '50', 'years', 'and', 'has', 'many', 'real', 'world', 'applications', 'including', 'Search', 'engines', 'Business', 'intelligence', 'Medical', 'research', 'Language', 'translation', 'and', 'Chatbots']


In [30]:
# Bigrams: A pair of consecutive tokens.
bigram_tokens = list(nltk.bigrams(tokens))

In [31]:
bigram_tokens

[('Natural', 'language'),
 ('language', 'processing'),
 ('processing', 'NLP'),
 ('NLP', 'is'),
 ('is', 'a'),
 ('a', 'subfield'),
 ('subfield', 'of'),
 ('of', 'artificial'),
 ('artificial', 'intelligence'),
 ('intelligence', 'AI'),
 ('AI', 'that'),
 ('that', 'allows'),
 ('allows', 'computers'),
 ('computers', 'to'),
 ('to', 'understand'),
 ('understand', 'and'),
 ('and', 'process'),
 ('process', 'human'),
 ('human', 'language'),
 ('language', 'both'),
 ('both', 'spoken'),
 ('spoken', 'and'),
 ('and', 'written'),
 ('written', 'NLP'),
 ('NLP', 'has'),
 ('has', 'been'),
 ('been', 'around'),
 ('around', 'for'),
 ('for', 'over'),
 ('over', '50'),
 ('50', 'years'),
 ('years', 'and'),
 ('and', 'has'),
 ('has', 'many'),
 ('many', 'real'),
 ('real', 'world'),
 ('world', 'applications'),
 ('applications', 'including'),
 ('including', 'Search'),
 ('Search', 'engines'),
 ('engines', 'Business'),
 ('Business', 'intelligence'),
 ('intelligence', 'Medical'),
 ('Medical', 'research'),
 ('research', 'La

In [32]:
# Trigrams: A group of three consecutive tokens.
trigram_tokens = list(nltk.trigrams(tokens))

In [33]:
trigram_tokens

[('Natural', 'language', 'processing'),
 ('language', 'processing', 'NLP'),
 ('processing', 'NLP', 'is'),
 ('NLP', 'is', 'a'),
 ('is', 'a', 'subfield'),
 ('a', 'subfield', 'of'),
 ('subfield', 'of', 'artificial'),
 ('of', 'artificial', 'intelligence'),
 ('artificial', 'intelligence', 'AI'),
 ('intelligence', 'AI', 'that'),
 ('AI', 'that', 'allows'),
 ('that', 'allows', 'computers'),
 ('allows', 'computers', 'to'),
 ('computers', 'to', 'understand'),
 ('to', 'understand', 'and'),
 ('understand', 'and', 'process'),
 ('and', 'process', 'human'),
 ('process', 'human', 'language'),
 ('human', 'language', 'both'),
 ('language', 'both', 'spoken'),
 ('both', 'spoken', 'and'),
 ('spoken', 'and', 'written'),
 ('and', 'written', 'NLP'),
 ('written', 'NLP', 'has'),
 ('NLP', 'has', 'been'),
 ('has', 'been', 'around'),
 ('been', 'around', 'for'),
 ('around', 'for', 'over'),
 ('for', 'over', '50'),
 ('over', '50', 'years'),
 ('50', 'years', 'and'),
 ('years', 'and', 'has'),
 ('and', 'has', 'many'),
 

In [34]:
# ngrams: A group of n consecutive tokens.
ngram_tokens = list(nltk.ngrams(tokens, 4))

In [35]:
ngram_tokens

[('Natural', 'language', 'processing', 'NLP'),
 ('language', 'processing', 'NLP', 'is'),
 ('processing', 'NLP', 'is', 'a'),
 ('NLP', 'is', 'a', 'subfield'),
 ('is', 'a', 'subfield', 'of'),
 ('a', 'subfield', 'of', 'artificial'),
 ('subfield', 'of', 'artificial', 'intelligence'),
 ('of', 'artificial', 'intelligence', 'AI'),
 ('artificial', 'intelligence', 'AI', 'that'),
 ('intelligence', 'AI', 'that', 'allows'),
 ('AI', 'that', 'allows', 'computers'),
 ('that', 'allows', 'computers', 'to'),
 ('allows', 'computers', 'to', 'understand'),
 ('computers', 'to', 'understand', 'and'),
 ('to', 'understand', 'and', 'process'),
 ('understand', 'and', 'process', 'human'),
 ('and', 'process', 'human', 'language'),
 ('process', 'human', 'language', 'both'),
 ('human', 'language', 'both', 'spoken'),
 ('language', 'both', 'spoken', 'and'),
 ('both', 'spoken', 'and', 'written'),
 ('spoken', 'and', 'written', 'NLP'),
 ('and', 'written', 'NLP', 'has'),
 ('written', 'NLP', 'has', 'been'),
 ('NLP', 'has', 

## Section 2
### POS tagging
<p> POS Tagging is the process of marking up a word in a text (corpus) as corresponding to a particular part of speech, based on its definition and its context.</p>

### Stop words removal
<p> Stop words are a set of commonly used words in a language. These are so commonly used, as they carry very little useful information. examples of stop words in English are, "a", "the", "in", "on", "are" </p>

In [36]:
nltk.download("averaged_perceptron_tagger")

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\premk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [38]:
for token in tokens:
    print(nltk.pos_tag([token]))

[('Natural', 'JJ')]
[('language', 'NN')]
[('processing', 'NN')]
[('NLP', 'NN')]
[('is', 'VBZ')]
[('a', 'DT')]
[('subfield', 'NN')]
[('of', 'IN')]
[('artificial', 'JJ')]
[('intelligence', 'NN')]
[('AI', 'NN')]
[('that', 'IN')]
[('allows', 'NNS')]
[('computers', 'NNS')]
[('to', 'TO')]
[('understand', 'NN')]
[('and', 'CC')]
[('process', 'NN')]
[('human', 'NN')]
[('language', 'NN')]
[('both', 'DT')]
[('spoken', 'NN')]
[('and', 'CC')]
[('written', 'VBN')]
[('NLP', 'NN')]
[('has', 'VBZ')]
[('been', 'VBN')]
[('around', 'IN')]
[('for', 'IN')]
[('over', 'IN')]
[('50', 'CD')]
[('years', 'NNS')]
[('and', 'CC')]
[('has', 'VBZ')]
[('many', 'JJ')]
[('real', 'JJ')]
[('world', 'NN')]
[('applications', 'NNS')]
[('including', 'VBG')]
[('Search', 'NN')]
[('engines', 'NNS')]
[('Business', 'NN')]
[('intelligence', 'NN')]
[('Medical', 'JJ')]
[('research', 'NN')]
[('Language', 'NN')]
[('translation', 'NN')]
[('and', 'CC')]
[('Chatbots', 'NNS')]


In [40]:
nltk.download('stopwords')

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\premk\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.


True

In [41]:
from nltk.corpus import stopwords
stop_words = stopwords.words('english')

In [43]:
filtered_data = [w for w in tokens if not w in stop_words]

In [44]:
filtered_data

['Natural',
 'language',
 'processing',
 'NLP',
 'subfield',
 'artificial',
 'intelligence',
 'AI',
 'allows',
 'computers',
 'understand',
 'process',
 'human',
 'language',
 'spoken',
 'written',
 'NLP',
 'around',
 '50',
 'years',
 'many',
 'real',
 'world',
 'applications',
 'including',
 'Search',
 'engines',
 'Business',
 'intelligence',
 'Medical',
 'research',
 'Language',
 'translation',
 'Chatbots']

## Section 3
### Stemming
<p> Stemming is the process of removing a partr of a word, or reducing a word to its stem or root form. Stemmed word may not be a word in dictionary.</p>
Types of Stemming in NLTK are,
<ol> 
<li>Porter Stemmer</li>
    <li>Snowball Stemmer</li>
    <li>lancaster Stemmer</li>
    <li>regex Stemmer</li>
</ol>

In [45]:
# Porter Stemmer
from nltk.stem import PorterStemmer

In [60]:
porter = PorterStemmer()
words = ['generous', 'generously', 'generate', 'generation', 'general', 'generalize', 'eat', 'ate', 'eats']
for word in words:
    print(f"{word} --> {porter.stem(word)}")

generous --> gener
generously --> gener
generate --> gener
generation --> gener
general --> gener
generalize --> gener
eat --> eat
ate --> ate
eats --> eat


In [61]:
# Snowball Stemmer
from nltk.stem import SnowballStemmer
snowball = SnowballStemmer(language = 'english')
for word in words:
    print(f"{word} --> {snowball.stem(word)}")

generous --> generous
generously --> generous
generate --> generat
generation --> generat
general --> general
generalize --> general
eat --> eat
ate --> ate
eats --> eat


In [62]:
# Lancaster Stemmer
from nltk.stem import LancasterStemmer
lan = LancasterStemmer()
for word in words:
    print(f"{word} --> {lan.stem(word)}")

generous --> gen
generously --> gen
generate --> gen
generation --> gen
general --> gen
generalize --> gen
eat --> eat
ate --> at
eats --> eat


In [64]:
# regular expression stemmer
from nltk.stem import RegexpStemmer
regexp = RegexpStemmer('ing$|s$|e$|able$', min=4)
for word in words:
    print(f"{word} --> {regexp.stem(word)}")

generous --> generou
generously --> generously
generate --> generat
generation --> generation
general --> general
generalize --> generaliz
eat --> eat
ate --> ate
eats --> eat


## Section 4
### Lemmatization
<p> Converting the words into the root word using Parts of Speech and context as a base. Root word will be a valid word in a dictionary</p>

In [65]:
from nltk.stem import WordNetLemmatizer

In [66]:
nltk.download('wordnet')
nltk.download('omw-1.4')

[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\premk\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package omw-1.4 to
[nltk_data]     C:\Users\premk\AppData\Roaming\nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!


True

In [67]:
lemma = WordNetLemmatizer()
for word in words:
    print(f"{word} --> {lemma.lemmatize(word)}")

generous --> generous
generously --> generously
generate --> generate
generation --> generation
general --> general
generalize --> generalize
eat --> eat
ate --> ate
eats --> eats
