In [1]:
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk import pos_tag

[nltk_data] Downloading package stopwords to /home/parth/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [2]:
sentence1 = "Stemming and lemmatization are different techniques used to reduce words to their root form, but they produce varying results. Lemmatization is better than stemming"

In [3]:
import string
def tokenize(sentence : str):
    punctuation = string.punctuation + '[]{}()<>'
    for char in punctuation:
        sentence = sentence.replace(char, " ")
    sentence = sentence.lower()
    token = sentence.split()
    return token

tokens = tokenize(sentence1)

In [4]:
def RemoveStopWord(token):
    stop_words = set(stopwords.words('english'))
    filtered_sentence = [word for word in token if not word in stop_words]
    return filtered_sentence

tokens = RemoveStopWord(tokens)
tokens

['stemming',
 'lemmatization',
 'different',
 'techniques',
 'used',
 'reduce',
 'words',
 'root',
 'form',
 'produce',
 'varying',
 'results',
 'lemmatization',
 'better',
 'stemming']

In [5]:
pos_tag_list = nltk.pos_tag(tokens)
pos_tag_list

[('stemming', 'VBG'),
 ('lemmatization', 'NN'),
 ('different', 'JJ'),
 ('techniques', 'NNS'),
 ('used', 'VBN'),
 ('reduce', 'VB'),
 ('words', 'NNS'),
 ('root', 'VBP'),
 ('form', 'NN'),
 ('produce', 'VBP'),
 ('varying', 'VBG'),
 ('results', 'NNS'),
 ('lemmatization', 'NN'),
 ('better', 'RBR'),
 ('stemming', 'NN')]

In [6]:
stemmer = PorterStemmer()
for w in tokens:
    print(w, stemmer.stem(w))

stemming stem
lemmatization lemmat
different differ
techniques techniqu
used use
reduce reduc
words word
root root
form form
produce produc
varying vari
results result
lemmatization lemmat
better better
stemming stem


In [7]:
lemmatizer = WordNetLemmatizer()
for w in tokens:
    print(w, lemmatizer.lemmatize(w))

stemming stemming
lemmatization lemmatization
different different
techniques technique
used used
reduce reduce
words word
root root
form form
produce produce
varying varying
results result
lemmatization lemmatization
better better
stemming stemming


In [8]:
def calculateTF(token):
    token_freq = {}
    for w in token:
        if w not in token_freq:
            token_freq[w] = token.count(w)/len(token)
    return token_freq

calculateTF(tokens)

{'stemming': 0.13333333333333333,
 'lemmatization': 0.13333333333333333,
 'different': 0.06666666666666667,
 'techniques': 0.06666666666666667,
 'used': 0.06666666666666667,
 'reduce': 0.06666666666666667,
 'words': 0.06666666666666667,
 'root': 0.06666666666666667,
 'form': 0.06666666666666667,
 'produce': 0.06666666666666667,
 'varying': 0.06666666666666667,
 'results': 0.06666666666666667,
 'better': 0.06666666666666667}

In [10]:
def calculateTF_IDF(documents):
    documents = sent_tokenize(documents)
    document_map ={}
    document_tf = {}
    unique_word = set()
    word_idf = {}
    for i, document in enumerate(documents):
        tokeniseWords = tokenize(document)
        document_map[i] = tokeniseWords
        document_tf[i] = calculateTF(tokeniseWords)
        for words in tokeniseWords:
            unique_word.add(words)
    for word in unique_word:
        count = 0
        for _, tokeniseWords in document_map.items():
            if word in tokeniseWords:
                count+=1
        word_idf[word] = count
    return word_idf, document_tf
    
word_idf, document_tf = calculateTF_IDF(sentence1)
print(word_idf)

{'than': 1, 'results': 1, 'different': 1, 'used': 1, 'their': 1, 'produce': 1, 'stemming': 2, 'varying': 1, 'root': 1, 'lemmatization': 2, 'better': 1, 'is': 1, 'and': 1, 'to': 1, 'words': 1, 'techniques': 1, 'they': 1, 'are': 1, 'but': 1, 'reduce': 1, 'form': 1}
