In [1]:
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('averaged_perceptron_tagger')

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping tokenizers\punkt.zip.
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping corpora\stopwords.zip.
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\Admin\AppData\Roaming\nltk_data...
[nltk_data]   Unzipping taggers\averaged_perceptron_tagger.zip.


True

In [2]:
text= "Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."


In [3]:
from nltk.tokenize import sent_tokenize, word_tokenize
tokenized_text = sent_tokenize(text)
print(tokenized_text)

['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']


In [4]:
tokenized_word = word_tokenize(text)
print(tokenized_word)

['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


In [5]:
from nltk.corpus import stopwords
import re

In [6]:
stop_words = set(stopwords.words("english"))
print(stop_words)

{'if', "you'll", 'was', 'these', 'we', 'during', 'same', 'against', 'yours', "don't", 'ourselves', "mightn't", "wasn't", "doesn't", 'themselves', 'each', 'out', 'both', 'yourself', 'some', 'hers', 'more', 'above', 'they', 'few', 'its', 'what', 'there', 'until', 'doesn', 'over', 'hasn', 'theirs', 'on', 'd', 'so', 're', 'very', 'which', 'won', 'and', 'them', 'down', 'that', 'she', 'couldn', "couldn't", 'or', 'once', 'don', 'their', 'can', 'needn', 'o', 'is', 'been', 'myself', 'herself', 'didn', 'll', 'through', 'i', 'of', "mustn't", "aren't", 'ain', 'shouldn', 'aren', 'weren', 'mustn', 'all', 'most', "she's", "needn't", 'just', 'are', 'at', 'off', 'this', 'for', 'were', 'nor', 'will', 't', 'itself', "should've", 'my', "didn't", 'should', 'ours', 'has', 's', 'when', 'your', 'wouldn', 'his', 'no', 'he', 'to', 'himself', 'wasn', 'but', "it's", 'now', 'because', 'between', 'who', 'by', 'the', 'those', "shouldn't", 'in', 'hadn', 'her', "weren't", 'haven', 'while', 'isn', 'yourselves', 'then',

In [7]:
text = "How to remove stop words with NLTK library in Python?"
text = re.sub('[^a-zA-Z]', ' ', text)
tokens = word_tokenize(text.lower())
filtered_text = [w for w in tokens if w not in stop_words]
print("Tokenized Sentence:", tokens)
print("Filtered Sentence:", filtered_text)

Tokenized Sentence: ['how', 'to', 'remove', 'stop', 'words', 'with', 'nltk', 'library', 'in', 'python']
Filtered Sentence: ['remove', 'stop', 'words', 'nltk', 'library', 'python']


In [8]:
from nltk.stem import PorterStemmer


In [9]:
e_words = ["wait", "waiting", "waited", "waits"]
ps = PorterStemmer()
for w in e_words:
    rootWord = ps.stem(w)
    print(rootWord)

wait
wait
wait
wait


In [10]:
from nltk.stem import WordNetLemmatizer

In [11]:
wordnet_lemmatizer = WordNetLemmatizer()
text = "studies studying cries cry"
tokenization = nltk.word_tokenize(text)
for w in tokenization:
    print("Lemma for {} is {}".format(w, wordnet_lemmatizer.lemmatize(w)))

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


In [12]:
data = "The pink sweater fit her perfectly"
words = word_tokenize(data)
for word in words:
    print(nltk.pos_tag([word]))

[('The', 'DT')]
[('pink', 'NN')]
[('sweater', 'NN')]
[('fit', 'NN')]
[('her', 'PRP$')]
[('perfectly', 'RB')]


In [13]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer

In [14]:
documentA = 'Jupiter is the largest Planet'
documentB = 'Mars is the fourth planet from the Sun'

In [15]:
bagOfWordsA = documentA.split(' ')
bagOfWordsB = documentB.split(' ')

In [16]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [17]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1

numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1


In [18]:
def computeTF(wordDict, bagOfWords):
    tfDict = {}
    bagOfWordsCount = len(bagOfWords)
    for word, count in wordDict.items():
        tfDict[word] = count / float(bagOfWordsCount)
    return tfDict

tfA = computeTF(numOfWordsA, bagOfWordsA)
tfB = computeTF(numOfWordsB, bagOfWordsB)


In [19]:
def computeIDF(documents):
    import math
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(), 0)
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
    for word, val in idfDict.items():
        idfDict[word] = math.log(N / float(val))
    return idfDict

idfs = computeIDF([numOfWordsA, numOfWordsB])
print(idfs)


{'the': 0.0, 'Jupiter': 0.6931471805599453, 'Sun': 0.6931471805599453, 'fourth': 0.6931471805599453, 'from': 0.6931471805599453, 'largest': 0.6931471805599453, 'planet': 0.6931471805599453, 'Planet': 0.6931471805599453, 'is': 0.0, 'Mars': 0.6931471805599453}


In [20]:
def computeTFIDF(tfBagOfWords, idfs):
    tfidf = {}
    for word, val in tfBagOfWords.items():
        tfidf[word] = val * idfs[word]
    return tfidf

tfidfA = computeTFIDF(tfA, idfs)
tfidfB = computeTFIDF(tfB, idfs)
df = pd.DataFrame([tfidfA, tfidfB])
print(df)


   the   Jupiter       Sun    fourth      from   largest    planet    Planet  \
0  0.0  0.138629  0.000000  0.000000  0.000000  0.138629  0.000000  0.138629   
1  0.0  0.000000  0.086643  0.086643  0.086643  0.000000  0.086643  0.000000   

    is      Mars  
0  0.0  0.000000  
1  0.0  0.086643  


In [21]:
tfidfA = computeTFIDF(tfA, idfs)
tfidfA


{'the': 0.0,
 'Jupiter': 0.13862943611198905,
 'Sun': 0.0,
 'fourth': 0.0,
 'from': 0.0,
 'largest': 0.13862943611198905,
 'planet': 0.0,
 'Planet': 0.13862943611198905,
 'is': 0.0,
 'Mars': 0.0}

In [22]:
 tfidfB = computeTFIDF(tfB, idfs)
 tfidfB

{'the': 0.0,
 'Jupiter': 0.0,
 'Sun': 0.08664339756999316,
 'fourth': 0.08664339756999316,
 'from': 0.08664339756999316,
 'largest': 0.0,
 'planet': 0.08664339756999316,
 'Planet': 0.0,
 'is': 0.0,
 'Mars': 0.08664339756999316}

In [23]:
 import pandas as pd
 df = pd.DataFrame([tfidfA, tfidfB])
 df

Unnamed: 0,the,Jupiter,Sun,fourth,from,largest,planet,Planet,is,Mars
0,0.0,0.138629,0.0,0.0,0.0,0.138629,0.0,0.138629,0.0,0.0
1,0.0,0.0,0.086643,0.086643,0.086643,0.0,0.086643,0.0,0.0,0.086643
