In [4]:
import pandas as pd
import nltk

In [5]:
def computeTF(wordDict, bagOfWords): 
    tfDict = {}
    bagOfWordsCount = len(bagOfWords) 
    for word, count in wordDict.items():
         tfDict[word] = count / float(bagOfWordsCount) 
    return tfDict

In [6]:
def computeIDF(documents):
    import math
    N = len(documents)
    idfDict = dict.fromkeys(documents[0].keys(), 0) 
    for document in documents:
        for word, val in document.items():
            if val > 0:
                idfDict[word] += 1
                for word, val in idfDict.items():
                    if val > 0: idfDict[word] = math.log(N / float(val))
                    else: idfDict[word] = 0
    return idfDict

In [8]:
def computeTFIDF(tfBagOfWords, idfs): 
    tfidf = {}
    for word, val in tfBagOfWords.items(): 
        tfidf[word] = val * idfs[word]
    return tfidf
text= "Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization."
print('The given sentences are: \n', text)

The given sentences are: 
 Tokenization is the first step in text analytics. The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.


In [9]:
from nltk.tokenize import sent_tokenize 
tokenized_text= sent_tokenize(text) 
print("\n Sentence Tokenization: \n", tokenized_text)


 Sentence Tokenization: 
 ['Tokenization is the first step in text analytics.', 'The process of breaking down a text paragraph into smaller chunks such as words or sentences is called Tokenization.']


In [10]:
from nltk.tokenize import word_tokenize 
tokenized_word=word_tokenize(text) 
print('\nWord Tokeniztion: \n', tokenized_word)


Word Tokeniztion: 
 ['Tokenization', 'is', 'the', 'first', 'step', 'in', 'text', 'analytics', '.', 'The', 'process', 'of', 'breaking', 'down', 'a', 'text', 'paragraph', 'into', 'smaller', 'chunks', 'such', 'as', 'words', 'or', 'sentences', 'is', 'called', 'Tokenization', '.']


In [12]:
import re

In [13]:
from nltk.corpus import stopwords 
stop_words=set(stopwords.words("english")) 
# Removing stop words
text= "How to remove stop words with NLTK library in Python?" 
text= re.sub('[^a-zA-Z]', ' ',text) 
tokens = word_tokenize(text.lower()) 
filtered_text=[] 
for w in tokens: 
    if w not in stop_words: 
        filtered_text.append(w) 
print ("Tokenized Sentence:",tokens) 
print ("Filterd Sentence:",filtered_text)

Tokenized Sentence: ['how', 'to', 'remove', 'stop', 'words', 'with', 'nltk', 'library', 'in', 'python']
Filterd Sentence: ['remove', 'stop', 'words', 'nltk', 'library', 'python']


In [14]:
from nltk.stem import PorterStemmer 
e_words= ["wait", "waiting", "waited", "waits"] 
ps =PorterStemmer() 
for w in e_words: 
    rootWord=ps.stem(w) 
    print('Stemming for ',w,': ',rootWord)

Stemming for  wait :  wait
Stemming for  waiting :  wait
Stemming for  waited :  wait
Stemming for  waits :  wait


In [15]:
from nltk.stem import WordNetLemmatizer 
wordnet_lemmatizer = WordNetLemmatizer() 
text = "studies studying cries cry" 
tokenization = nltk.word_tokenize(text) 
for w in tokenization: 
    print("Lemma for {} is {}".format(w,wordnet_lemmatizer.lemmatize(w))) 

Lemma for studies is study
Lemma for studying is studying
Lemma for cries is cry
Lemma for cry is cry


# TFIDF

In [16]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [17]:
documentA = 'Jupiter is the largest planet' 
documentB = 'Mars is the fourth planet from the Sun'

In [18]:
bagOfWordsA = documentA.split(' ') 
bagOfWordsB = documentB.split(' ')

In [19]:
uniqueWords = set(bagOfWordsA).union(set(bagOfWordsB))

In [27]:
numOfWordsA = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsA:
    numOfWordsA[word] += 1
    numOfWordsB = dict.fromkeys(uniqueWords, 0)
for word in bagOfWordsB:
    numOfWordsB[word] += 1

In [28]:
tfA = computeTF(numOfWordsA, bagOfWordsA) 
tfB = computeTF(numOfWordsB, bagOfWordsB)

In [29]:
print('----------------Term Frequency----------------------')
df = pd.DataFrame([tfA, tfB]) 
print(df)

----------------Term Frequency----------------------
   planet     is  Jupiter   Mars    Sun   the  largest  fourth   from
0   0.200  0.200      0.2  0.000  0.000  0.20      0.2   0.000  0.000
1   0.125  0.125      0.0  0.125  0.125  0.25      0.0   0.125  0.125


In [31]:
idfs = computeIDF([numOfWordsA, numOfWordsB]) 
print('----------------Inverse Document Frequency----------------------')
print(idfs)
tfidfA = computeTFIDF(tfA, idfs) 
tfidfB = computeTFIDF(tfB, idfs)
print('------------------- TF-IDF--------------------------------------')
df = pd.DataFrame([tfidfA, tfidfB]) 
print(df)

----------------Inverse Document Frequency----------------------
{'planet': 0, 'is': 0, 'Jupiter': 1.9392430739608337, 'Mars': 0.5560285979740467, 'Sun': 1.1469641504314008, 'the': 0, 'largest': 1.5000848184268372, 'fourth': 1.0596601011416096, 'from': 0.6931471805599453}
------------------- TF-IDF--------------------------------------
   planet   is   Jupiter      Mars       Sun  the   largest    fourth  \
0     0.0  0.0  0.387849  0.000000  0.000000  0.0  0.300017  0.000000   
1     0.0  0.0  0.000000  0.069504  0.143371  0.0  0.000000  0.132458   

       from  
0  0.000000  
1  0.086643  
