Implementation of **TF-IDF** from scratch

In [30]:
import numpy as np
from collections import Counter
import math
from sklearn.preprocessing import normalize
from scipy.sparse import csr_matrix
import warnings

In [7]:
def IDF_fun(corpus,word_vector):
  """Compute the smoothed inverse document frequency of every vocabulary word.

  For each word the value is ``1 + ln((1 + N) / (1 + df))`` where ``N`` is
  ``len(corpus)`` and ``df`` is the number of documents whose tokens
  (obtained with ``split(" ")``) contain the word exactly.

  Parameters
  ----------
  corpus : sequence of str
      The documents; must support ``len()`` and iteration.
  word_vector : iterable of str
      Vocabulary words. Iterating a dict yields its keys, so a
      word -> index mapping is accepted as well as a list.

  Returns
  -------
  numpy.ndarray
      One IDF value per word, in the iteration order of ``word_vector``.
  """
  # Tokenise each document exactly once; a set gives O(1) membership tests
  # instead of re-splitting and scanning the document for every word.
  doc_tokens=[set(sentance.split(" ")) for sentance in corpus]
  n_docs=len(corpus)
  val=[]
  for word in word_vector:
    doc_freq=sum(1 for tokens in doc_tokens if word in tokens)
    # The +1 terms keep the ratio finite when a word occurs in no document.
    val.append(1+math.log((1+n_docs)/(1+doc_freq)))
  return np.array(val)

In [8]:
def Fit(corpus):
  """Learn the vocabulary and the IDF weights from ``corpus``.

  Documents are split on single spaces; tokens shorter than two characters
  are ignored. The remaining unique tokens are sorted and numbered from 0.

  Returns a tuple ``(word_vec, idf)``: ``word_vec`` maps each word to its
  index, and ``idf`` is whatever ``IDF_fun(corpus, word_vec)`` returns.
  """
  vocabulary={
    token
    for document in corpus
    for token in document.split(" ")
    if len(token)>=2
  }
  # Sorted order fixes the index assigned to each word.
  word_vec={}
  for position,token in enumerate(sorted(vocabulary)):
    word_vec[token]=position
  return word_vec,IDF_fun(corpus,word_vec)


In [9]:
def Transform(corpus,word_vector,IDF):
  """Build the normalised TF-IDF matrix of ``corpus``.

  Parameters
  ----------
  corpus : sequence of str
      Documents; each is tokenised with ``split(" ")``.
  word_vector : sized iterable of str
      Vocabulary. The position of a word in iteration order is its column
      index (a list of feature names, or a dict iterated by key).
  IDF : indexable of float
      ``IDF[idx]`` is the weight of the word at position ``idx``.

  Returns
  -------
  Sparse matrix of shape ``(len(corpus), len(word_vector))``: the raw
  TF-IDF values passed through ``sklearn.preprocessing.normalize`` with its
  default settings (L2 norm per row, per the scikit-learn documentation).
  """
  col=[]
  row=[]
  values=[]
  for r,sentance in enumerate(corpus):
    lst=sentance.split(" ")
    # Count every token once per document instead of scanning the token
    # list twice (membership + count) for each vocabulary word.
    counts=Counter(lst)
    n_tokens=len(lst)
    for idx,word in enumerate(word_vector):
      if word in counts:
        # TF = occurrences / total tokens in the document (the total
        # includes tokens that are not part of the vocabulary).
        tf=counts[word]/n_tokens
        values.append(tf*IDF[idx])
        col.append(idx)
        row.append(r)
  M=csr_matrix((values,(row,col)),shape=(len(corpus),len(word_vector)))
  return normalize(M)

In [10]:
# Toy corpus of four short documents used by both implementations below.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [11]:
# Fit: learn the vocabulary (word -> column index) and the IDF weight of each word.
Vocab_corpus,IDF_val=Fit(corpus)
Feature_names=list(Vocab_corpus)

# end="\n\n" leaves one blank line between the printed sections.
print("Vocab corpus =\n{}".format(Vocab_corpus),end="\n\n")
print("Feature names=\n{}".format(Feature_names),end="\n\n")
print("IDF Values =\n{}".format(IDF_val))

Vocab corpus =
{'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}

Feature names=
['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']

IDF Values =
[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [12]:
# Transform: turn the corpus into the TF-IDF matrix using the fitted vocabulary and IDF values.
Array=Transform(corpus,Feature_names,IDF_val)

print()
print("Shape = {}".format(Array.shape),end="\n\n")
# First document: sparse representation, then the same row as a dense array.
print(Array[0],end="\n\n")
print(Array[0].toarray())


Shape = (4, 9)

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


**scikit-learn `TfidfVectorizer` (for comparison)**

In [13]:
from sklearn.feature_extraction.text import TfidfVectorizer

In [16]:
# Fit scikit-learn's vectorizer (default settings) on the same corpus and transform it in one step.
vec=TfidfVectorizer()
x=vec.fit_transform(corpus)

In [27]:
# Shape of the scikit-learn matrix, for comparison with the from-scratch matrix shape.
x.shape

(4, 9)

In [26]:
# IDF weights learned by scikit-learn, for comparison with the values returned by Fit.
vec.idf_

array([1.91629073, 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.91629073, 1.        , 1.91629073, 1.        ])

In [34]:
# Vocabulary in column order, for comparison with Feature_names.
vec.get_feature_names_out()

array(['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third',
       'this'], dtype=object)

In [23]:
# First row of the scikit-learn matrix, for comparison with the first row of the from-scratch matrix.
print(x[0])

  (0, 1)	0.46979138557992045
  (0, 2)	0.5802858236844359
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 8)	0.38408524091481483
