### TF-IDF Vectorizer

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Create the TF-IDF vectorizer. lowercase=False keeps each token's original
# casing (scikit-learn's default is lowercase=True).
tfidf = TfidfVectorizer(lowercase=False)

In [3]:
# Toy corpus: five short, all-lowercase sentences used to fit the vectorizers.
docs = [
    'the house had a tiny little mouse',
    'the cat saw the mouse',
    'the mouse ran away from the house',
    'the cat finally ate the mouse',
    'the end of the mouse story',
]

In [4]:
# Learn the vocabulary and IDF weights from docs and, in the same call,
# return the sparse TF-IDF matrix for those documents.

new_docs = tfidf.fit_transform(docs)

In [5]:
# Printing the sparse matrix lists only the non-zero entries, each as
# (row, column) followed by its TF-IDF weight.
print(new_docs)

  (0, 9)	0.23518497814732853
  (0, 8)	0.4935620852501245
  (0, 15)	0.4935620852501245
  (0, 6)	0.4935620852501245
  (0, 7)	0.39820278266020165
  (0, 14)	0.23518497814732853
  (1, 12)	0.5990921556092994
  (1, 2)	0.4833437789546282
  (1, 9)	0.2854706221078668
  (1, 14)	0.5709412442157336
  (2, 5)	0.457092872112502
  (2, 1)	0.457092872112502
  (2, 11)	0.457092872112502
  (2, 9)	0.21780720268373022
  (2, 7)	0.36877965112960637
  (2, 14)	0.43561440536746043
  (3, 0)	0.5139230069660121
  (3, 4)	0.5139230069660121
  (3, 2)	0.4146298460977916
  (3, 9)	0.2448870664395922
  (3, 14)	0.4897741328791844
  (4, 13)	0.49175318723159633
  (4, 10)	0.49175318723159633
  (4, 3)	0.49175318723159633
  (4, 9)	0.23432302854935097
  (4, 14)	0.46864605709870194


In [6]:
# Vocabulary terms in column order: the term at position i names column i
# of the TF-IDF matrix.
tfidf.get_feature_names_out()

array(['ate', 'away', 'cat', 'end', 'finally', 'from', 'had', 'house',
       'little', 'mouse', 'of', 'ran', 'saw', 'story', 'the', 'tiny'],
      dtype=object)

In [7]:
import pandas as pd

In [8]:
# Densify the sparse TF-IDF matrix and label every column with its term.
df = pd.DataFrame(
    data=new_docs.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [9]:
# One row per document in docs, one column per vocabulary term.
df

Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
0,0.0,0.0,0.0,0.0,0.0,0.0,0.493562,0.398203,0.493562,0.235185,0.0,0.0,0.0,0.0,0.235185,0.493562
1,0.0,0.0,0.483344,0.0,0.0,0.0,0.0,0.0,0.0,0.285471,0.0,0.0,0.599092,0.0,0.570941,0.0
2,0.0,0.457093,0.0,0.0,0.0,0.457093,0.0,0.36878,0.0,0.217807,0.0,0.457093,0.0,0.0,0.435614,0.0
3,0.513923,0.0,0.41463,0.0,0.513923,0.0,0.0,0.0,0.0,0.244887,0.0,0.0,0.0,0.0,0.489774,0.0
4,0.0,0.0,0.0,0.491753,0.0,0.0,0.0,0.0,0.0,0.234323,0.491753,0.0,0.0,0.491753,0.468646,0.0


In [10]:
# Second toy corpus: two sentences that differ only in vehicle and road type.
sents = [
    'The car is driven on the road',
    'The truck is driven on the highway',
]

# Separate vectorizer for this corpus.
# NOTE(review): with lowercase=False the tokens 'The' and 'the' are kept as
# two distinct features; drop the flag if case-insensitive terms are wanted.
tf_idf = TfidfVectorizer(lowercase=False)

In [11]:
# Fit the second vectorizer on sents and compute their sparse TF-IDF matrix.

new_sent = tf_idf.fit_transform(sents)

In [12]:
# Show the non-zero TF-IDF entries for the two sentences.
print(new_sent)

  (0, 6)	0.46977773849858007
  (0, 7)	0.33425073008815387
  (0, 5)	0.33425073008815387
  (0, 2)	0.33425073008815387
  (0, 4)	0.33425073008815387
  (0, 1)	0.46977773849858007
  (0, 0)	0.33425073008815387
  (1, 3)	0.46977773849858007
  (1, 8)	0.46977773849858007
  (1, 7)	0.33425073008815387
  (1, 5)	0.33425073008815387
  (1, 2)	0.33425073008815387
  (1, 4)	0.33425073008815387
  (1, 0)	0.33425073008815387


In [13]:
# Dense, labelled view of the sentence TF-IDF matrix (one column per term).
df_sent = pd.DataFrame(
    data=new_sent.toarray(),
    columns=tf_idf.get_feature_names_out(),
)

In [14]:
# 'The' and 'the' appear as separate columns because the vectorizer was
# created with lowercase=False.
df_sent

Unnamed: 0,The,car,driven,highway,is,on,road,the,truck
0,0.334251,0.469778,0.334251,0.0,0.334251,0.334251,0.469778,0.334251,0.0
1,0.334251,0.0,0.334251,0.469778,0.334251,0.334251,0.0,0.334251,0.469778


In [15]:
# Re-create the vectorizer (same settings as the first one) before refitting
# it on docs.
tfidf = TfidfVectorizer(lowercase=False)

In [16]:
# Fit on the original docs corpus; the vocabulary learned here is what the
# vectorizer will use when transforming unseen text.

new_docs = tfidf.fit_transform(docs)

In [17]:
# A sentence that was not part of the corpus the vectorizer was fitted on.
new = ['the car won the race']

# transform() (no refit) scores the sentence against the already-fitted
# vocabulary and IDF weights.
new_tf = tfidf.transform(new)

In [18]:
# Only column 14 ('the') is non-zero: 'car', 'won' and 'race' are not in the
# fitted vocabulary, so they contribute nothing to the vector.
print(new_tf)

  (0, 14)	1.0


In [19]:
# Dense, labelled view of the new sentence's TF-IDF vector.
# NOTE: this rebinds df, replacing the frame built earlier from new_docs.
df = pd.DataFrame(
    data=new_tf.toarray(),
    columns=tfidf.get_feature_names_out(),
)

In [20]:
# Single row for the new sentence; every column except 'the' is 0.0.
df

Unnamed: 0,ate,away,cat,end,finally,from,had,house,little,mouse,of,ran,saw,story,the,tiny
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0


### TF-IDF Transformer

In [21]:
from sklearn.feature_extraction.text import TfidfTransformer

from sklearn.feature_extraction.text import CountVectorizer

In [22]:
# Transformer that converts a matrix of raw term counts into TF-IDF weights.
ttrans = TfidfTransformer()

#### 1. Create the count vectors

In [23]:
# Instantiate CountVectorizer() with its default settings.
cv = CountVectorizer()

# Learn the vocabulary from docs and convert the sentences into a sparse
# matrix of token counts.
word_count_vector = cv.fit_transform(docs)

#### 2. Fit TfidfTransformer() on the count vectors

In [24]:
# Fit the TF-IDF transformer on the count matrix so it learns the IDF weights.
# NOTE: fit() returns the fitted estimator itself, not transformed data, so
# docs_trans refers to the transformer (use transform() to get TF-IDF values).
docs_trans = ttrans.fit(word_count_vector)

In [25]:
# Prints the estimator's repr, because docs_trans holds the fitted transformer
# rather than a matrix.
print(docs_trans)

TfidfTransformer()


In [26]:
# Table of the learned IDF weight for each vocabulary term (terms as the index).
# NOTE: this rebinds df once more.
df = pd.DataFrame(
    {'idf_weights': ttrans.idf_},
    index=cv.get_feature_names_out(),
)

In [27]:
# Terms that occur in every document (e.g. 'mouse') get weight 1.0; terms that
# occur in fewer documents get larger weights.
df

Unnamed: 0,idf_weights
ate,2.098612
away,2.098612
cat,1.693147
end,2.098612
finally,2.098612
from,2.098612
had,2.098612
house,1.693147
little,2.098612
mouse,1.0
