In [1]:
import nltk
from nltk.tokenize import word_tokenize
from nltk import pos_tag
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.stem import WordNetLemmatizer
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Sample input text: the cells below tokenize and POS-tag this sentence.
sentence = "The quick brown fox jumps over the lazy dog."

In [5]:
# Tokenization: split the sentence into word and punctuation tokens
# (the trailing '.' comes back as its own token, as the output shows).
tokens = word_tokenize(sentence)
print(tokens)

['The', 'quick', 'brown', 'fox', 'jumps', 'over', 'the', 'lazy', 'dog', '.']


In [11]:
# Part-of-speech tagging: re-tokenize the sentence, then label every token.
# pos_tag returns (token, tag) pairs with tags such as 'DT', 'JJ', 'NN', 'VBZ'.
tokens = word_tokenize(sentence)
pos_tags = pos_tag(tokens)
print(pos_tags)

[('The', 'DT'), ('quick', 'JJ'), ('brown', 'NN'), ('fox', 'NN'), ('jumps', 'VBZ'), ('over', 'IN'), ('the', 'DT'), ('lazy', 'JJ'), ('dog', 'NN'), ('.', '.')]


In [14]:
# Stop-word removal: drop every token whose lowercase form is an English stop word.
# The stop-word list is turned into a set so each membership test is a hash lookup.
stop_words = set(stopwords.words('english'))
filtered_tokens = list(filter(lambda tok: tok.lower() not in stop_words, tokens))
print(filtered_tokens)

['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog', '.']


In [15]:
# Stemming: reduce each token to its Porter stem (e.g. 'jumps' -> 'jump', 'lazy' -> 'lazi').
stemmer = PorterStemmer()
stemmed_tokens = list(map(stemmer.stem, tokens))
print(stemmed_tokens)

['the', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazi', 'dog', '.']


In [18]:
#Lemmatization
lemmatizer = WordNetLemmatizer()
lemmatized_tokens = [lemmatizer.lemmatize(token) for token in tokens]
print(lemmatized_tokens)

['The', 'quick', 'brown', 'fox', 'jump', 'over', 'the', 'lazy', 'dog', '.']


In [19]:
# TF-IDF: vectorize a one-document corpus and print the sparse
# (document, term index) -> weight matrix.
# The corpus reuses `sentence` so this section analyses exactly the same text as
# the NLTK cells instead of a second hand-typed copy that could drift.
# With a single document every term has the same IDF, so the weights are just
# L2-normalised term counts ("the" occurs twice, hence the doubled weight).
corpus = [sentence]
vectorizer = TfidfVectorizer()
tfidf = vectorizer.fit_transform(corpus)
print(tfidf)

  (0, 1)	0.30151134457776363
  (0, 4)	0.30151134457776363
  (0, 5)	0.30151134457776363
  (0, 3)	0.30151134457776363
  (0, 2)	0.30151134457776363
  (0, 0)	0.30151134457776363
  (0, 6)	0.30151134457776363
  (0, 7)	0.6030226891555273


In [20]:
# List every vocabulary term next to its TF-IDF weight in the (only) document.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2, so
# prefer get_feature_names_out() and fall back only on older releases.
if hasattr(vectorizer, "get_feature_names_out"):
    terms = vectorizer.get_feature_names_out()
else:
    terms = vectorizer.get_feature_names()

# Row 0 of the dense matrix holds the weights for the single document.
tfidf_scores = tfidf.toarray()[0]

for term, score in zip(terms, tfidf_scores):
    print(term, ":", score)


brown : 0.30151134457776363
dog : 0.30151134457776363
fox : 0.30151134457776363
jumps : 0.30151134457776363
lazy : 0.30151134457776363
over : 0.30151134457776363
quick : 0.30151134457776363
the : 0.6030226891555273
