In [79]:
import nltk
nltk.download('punkt_tab') 
nltk.download('stopwords')  # For stopwords
nltk.download('wordnet')  # For lemmatization
nltk.download('averaged_perceptron_tagger_eng')

[nltk_data] Downloading package punkt_tab to
[nltk_data]     C:\Users\LENOVO/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\LENOVO/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\LENOVO/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger_eng to
[nltk_data]     C:\Users\LENOVO/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger_eng is already up-to-
[nltk_data]       date!


True

In [81]:
text = "Tokenization is the first step in text analytics.The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."

In [83]:
from nltk.tokenize import sent_tokenize
tokenized_text = sent_tokenize(text)
print(tokenized_text)

['Tokenization is the first step in text analytics.The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']


In [85]:
from nltk.tokenize import word_tokenize
tokenized_word = word_tokenize(text)
print(tokenized_word)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics.The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


In [87]:
from nltk.corpus import stopwords
import re

stop_words = set(stopwords.words("english"))
text = "How to remove stop words with NLTK library in Python?"
text = re.sub('[^a-zA-Z]', ' ', text)  # Remove punctuation
tokens = word_tokenize(text.lower())
filtered_text = [w for w in tokens if w not in stop_words]

print("Tokenized Sentence:", tokens)
print("Filtered Sentence:", filtered_text)

Tokenized Sentence: ['how', 'to', 'remove', 'stop', 'words', 'with', 'nltk', 'library', 'in', 'python']
Filtered Sentence: ['remove', 'stop', 'words', 'nltk', 'library', 'python']


In [89]:
from nltk.stem import PorterStemmer

e_words = ["wait", "waiting", "waited", "waits"]
ps = PorterStemmer()

for w in e_words:
    rootWord = ps.stem(w)
    print(rootWord)

wait
wait
wait
wait


In [91]:
from nltk.stem import WordNetLemmatizer

wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)

for w in tokenization:
    print("Lemma for {}: {}".format(w, wordnet_lemmatizer.lemmatize(w)))

Lemma for studies: study
Lemma for studying: studying
Lemma for cries: cry
Lemma for cry: cry


In [93]:
from nltk.tokenize import word_tokenize
data = "The pink sweater fit her perfectly"
words = word_tokenize(data)

for word in words:
    print(nltk.pos_tag([word]))

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


In [95]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import math


In [97]:
documentA = 'Jupiter is the largest Planet'
documentB = 'Mars is the fourth planet from the Sun'

In [99]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

In [101]:
bagOfWordsA

['Jupiter', 'is', 'the', 'largest', 'Planet']

In [103]:
bagOfWordsB

['Mars', 'is', 'the', 'fourth', 'planet', 'from', 'the', 'Sun']

In [105]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))
uniqueWords

{'Jupiter',
 'Mars',
 'Planet',
 'Sun',
 'fourth',
 'from',
 'is',
 'largest',
 'planet',
 'the'}

In [107]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1

numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

In [109]:
numOfWordsA

{'fourth': 0,
 'largest': 1,
 'Planet': 1,
 'Mars': 0,
 'Sun': 0,
 'planet': 0,
 'from': 0,
 'Jupiter': 1,
 'the': 1,
 'is': 1}

In [111]:
numOfWordsB

{'fourth': 1,
 'largest': 0,
 'Planet': 0,
 'Mars': 1,
 'Sun': 1,
 'planet': 1,
 'from': 1,
 'Jupiter': 0,
 'the': 2,
 'is': 1}

In [113]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)
 

In [115]:
tfA

{'fourth': 0.0,
 'largest': 0.2,
 'Planet': 0.2,
 'Mars': 0.0,
 'Sun': 0.0,
 'planet': 0.0,
 'from': 0.0,
 'Jupiter': 0.2,
 'the': 0.2,
 'is': 0.2}

In [117]:
tfB

{'fourth': 0.125,
 'largest': 0.0,
 'Planet': 0.0,
 'Mars': 0.125,
 'Sun': 0.125,
 'planet': 0.125,
 'from': 0.125,
 'Jupiter': 0.0,
 'the': 0.25,
 'is': 0.125}

In [119]:
def computeIDF(documents):
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idfs = computeIDF([numOfWordsA, numOfWordsB])

In [69]:
idfs 

{'fourth': 0.6931471805599453,
 'largest': 0.6931471805599453,
 'Planet': 0.6931471805599453,
 'Mars': 0.6931471805599453,
 'Sun': 0.6931471805599453,
 'planet': 0.6931471805599453,
 'from': 0.6931471805599453,
 'Jupiter': 0.6931471805599453,
 'the': 0.0,
 'is': 0.0}

In [71]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)

# Create a DataFrame for visualization
df = pd.DataFrame([tfidfA, tfidfB])
print(df)

     fourth   largest    Planet      Mars       Sun    planet      from  \
0  0.000000  0.138629  0.138629  0.000000  0.000000  0.000000  0.000000   
1  0.086643  0.000000  0.000000  0.086643  0.086643  0.086643  0.086643   

    Jupiter  the   is  
0  0.138629  0.0  0.0  
1  0.000000  0.0  0.0  
