## **Import relevant libraries**


In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer
from pprint import pprint
import pandas as pd
import numpy as np

## **Define text corpus**


In [None]:
# Toy corpus of seven sentences. The five product-announcement sentences share
# one template, so they are generated from (company, product) pairs; the two
# "eating" sentences are written out in full.
corpus = [
    "Thor eating pizza, Loki is eating pizza, Ironman ate pizza already",
    *(
        f"{company} is announcing new {product} tomorrow"
        for company, product in (
            ("Apple", "iphone"),
            ("Tesla", "model-3"),
            ("Google", "pixel-6"),
            ("Microsoft", "surface"),
            ("Amazon", "eco-dot"),
        )
    ),
    "I am eating an orange you are eating grapes",
]



{'already': 0,
 'am': 1,
 'amazon': 2,
 'an': 3,
 'announcing': 4,
 'apple': 5,
 'are': 6,
 'ate': 7,
 'dot': 8,
 'eating': 9,
 'eco': 10,
 'google': 11,
 'grapes': 12,
 'iphone': 13,
 'ironman': 14,
 'is': 15,
 'loki': 16,
 'microsoft': 17,
 'model': 18,
 'new': 19,
 'orange': 20,
 'pixel': 21,
 'pizza': 22,
 'surface': 23,
 'tesla': 24,
 'thor': 25,
 'tomorrow': 26,
 'you': 27}


## **Tokenization and vector generation**


In [None]:
# Learn the vocabulary and IDF weights from the corpus and build the TF-IDF
# document-term matrix in one step. fit_transform() is equivalent to calling
# fit(corpus) followed by transform(corpus), but analyses the corpus only once.
v = TfidfVectorizer()
transform_output = v.fit_transform(corpus)



## **Get vocabulary**


In [None]:
# Pretty-print the fitted vocabulary: a dict mapping each token to its integer
# index (the same index is used further down to look up values in v.idf_).
pprint(v.vocabulary_)

## **Get the vocabulary index and IDF score of a word**


In [None]:
# Look up the vocabulary index of one word, then use that index to read the
# word's IDF weight.
# Index with [] rather than .get(): for a word that is not in the vocabulary,
# .get() returns None, and indexing a NumPy array with None does not fail --
# it inserts a new axis and returns the whole idf_ array. [] raises KeyError
# instead, so a typo in the word is caught immediately.
i = v.vocabulary_['thor']
print(i)
print(v.idf_[i])

25
2.386294361119891


## **Print the idf of each word**


In [None]:

# Feature names are returned ordered by vocabulary index, so a word's position
# in this array is also its index into v.idf_.
all_feature_names = v.get_feature_names_out()

for indx, word in enumerate(all_feature_names):
  # IDF weight learned for this word during fitting
  idf_score = v.idf_[indx]
  print(f"{word}: {idf_score}")


already: 2.386294361119891
am: 2.386294361119891
amazon: 2.386294361119891
an: 2.386294361119891
announcing: 1.2876820724517808
apple: 2.386294361119891
are: 2.386294361119891
ate: 2.386294361119891
dot: 2.386294361119891
eating: 1.9808292530117262
eco: 2.386294361119891
google: 2.386294361119891
grapes: 2.386294361119891
iphone: 2.386294361119891
ironman: 2.386294361119891
is: 1.1335313926245225
loki: 2.386294361119891
microsoft: 2.386294361119891
model: 2.386294361119891
new: 1.2876820724517808
orange: 2.386294361119891
pixel: 2.386294361119891
pizza: 2.386294361119891
surface: 2.386294361119891
tesla: 2.386294361119891
thor: 2.386294361119891
tomorrow: 1.2876820724517808
you: 2.386294361119891


## **Print the transformed output from tf-idf**


In [None]:
# Convert the transformed matrix to a dense array and print it in full.
# Each row corresponds to one sentence of the corpus (in order) and each column
# to one vocabulary index, so entry [d, j] is the TF-IDF weight of word j in
# sentence d.
print(transform_output.toarray())

[[0.24266547 0.         0.         0.         0.         0.
  0.         0.24266547 0.         0.40286636 0.         0.
  0.         0.         0.24266547 0.11527033 0.24266547 0.
  0.         0.         0.         0.         0.72799642 0.
  0.         0.24266547 0.         0.        ]
 [0.         0.         0.         0.         0.30652086 0.5680354
  0.         0.         0.         0.         0.         0.
  0.         0.5680354  0.         0.26982671 0.         0.
  0.         0.30652086 0.         0.         0.         0.
  0.         0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.26982671 0.         0.
  0.5680354  0.30652086 0.         0.         0.         0.
  0.5680354  0.         0.30652086 0.        ]
 [0.         0.         0.         0.         0.30652086 0.
  0.         0.         0.         0.         0.         0.5680354
  0. 