In [None]:
from scipy.sparse import csr_matrix

import math

from sklearn.preprocessing import normalize
from sklearn.utils.validation import check_is_fitted
from sklearn.feature_extraction.text import TfidfVectorizer

import numpy as np

In [None]:
class CustomTfidfVectorizer:
  '''Minimal TF-IDF vectorizer for whitespace-tokenised documents.

  Documents are tokenised with ``str.split()``; tokens shorter than two
  characters are ignored. No lowercasing or punctuation handling is done.

  Parameters
  ----------
  max_features : int or None, default=None
    If not None, keep only the ``max_features`` terms with the highest idf
    (ties are resolved alphabetically). The kept terms are stored in
    descending-idf order. Note this differs from scikit-learn, which ranks
    terms by corpus term frequency.

  Attributes set by ``fit``
  -------------------------
  vocabulary_ : list of str
    Terms, one per output column.
  idf_ : numpy.ndarray of shape (n_features,)
    Smoothed idf, ``1 + ln((1 + n_docs) / (1 + df))``, aligned with
    ``vocabulary_``.
  '''

  def __init__(self,max_features=None):
    self.max_features=max_features

  @staticmethod
  def _as_document_list(raw_documents):
    '''Materialise raw_documents as a list, rejecting a single bare string.'''
    # A bare string is iterable, but iterating it yields characters, which is
    # never what the caller meant.
    if isinstance(raw_documents, str):
      raise TypeError(
        "raw_documents must be a list (or other iterable) of strings, "
        "not a single string"
      )
    return list(raw_documents)

  @staticmethod
  def _tokenize(document):
    '''Split on whitespace and drop tokens shorter than two characters.'''
    return [token for token in document.split() if len(token) > 1]

  def fit(self,raw_documents):
    '''Learn vocabulary and idf from training set.

    Parameters
    ----------
    raw_documents : iterable of str
      Training documents.

    Returns
    -------
    self : CustomTfidfVectorizer
      The fitted vectorizer, so calls can be chained.

    Raises
    ------
    TypeError
      If ``raw_documents`` is a single string or is not iterable.
    '''
    documents = self._as_document_list(raw_documents)

    # One token *set* per document: document frequency counts each document
    # at most once and must match whole tokens, not substrings.
    token_sets = [set(self._tokenize(document)) for document in documents]

    doc_freq = {}
    for tokens in token_sets:
      for token in tokens:
        doc_freq[token] = doc_freq.get(token, 0) + 1

    vocab = sorted(doc_freq)
    n_docs = len(documents)
    idf = np.array(
      [1 + math.log((1 + n_docs) / (1 + doc_freq[term])) for term in vocab],
      dtype=np.float64,
    )

    if self.max_features is not None:
      # sorted() is stable, so equal idf values keep their alphabetical order.
      keep = sorted(range(len(vocab)), key=lambda i: idf[i], reverse=True)
      keep = keep[:self.max_features]
      vocab = [vocab[i] for i in keep]
      idf = np.array([idf[i] for i in keep], dtype=np.float64)

    self.vocabulary_ = vocab
    self.idf_ = idf
    return self

  def transform(self,raw_documents):
    '''Transform documents to l2 normalize sparse tf-idf matrix.
       Uses the vocabulary and idf values learned by fit.

    Term frequency is the count of a term divided by the number of tokens
    (of length >= 2) in the document. Documents without any vocabulary
    term produce an all-zero row.

    Parameters
    ----------
    raw_documents : iterable of str
      Documents to vectorise.

    Returns
    -------
    scipy.sparse.csr_matrix of shape (n_documents, n_features)
      Row-wise L2-normalised tf-idf weights (float64).

    Raises
    ------
    sklearn.exceptions.NotFittedError
      If ``fit`` has not been called.
    TypeError
      If ``raw_documents`` is a single string or is not iterable.
    '''
    # Explicit attribute check: this class is not a scikit-learn
    # BaseEstimator, so it does not rely on check_is_fitted.
    if not (hasattr(self, "vocabulary_") and hasattr(self, "idf_")):
      from sklearn.exceptions import NotFittedError
      raise NotFittedError("fit raw_documents before transform")

    documents = self._as_document_list(raw_documents)
    column_of = {term: col for col, term in enumerate(self.vocabulary_)}
    idf = np.asarray(self.idf_, dtype=np.float64)

    # Build the CSR arrays directly instead of going through a dense matrix.
    data, indices, indptr = [], [], [0]
    for document in documents:
      tokens = self._tokenize(document)
      counts = {}
      for token in tokens:
        col = column_of.get(token)
        if col is not None:
          counts[col] = counts.get(col, 0) + 1

      # counts is non-empty only when tokens is non-empty, so the division
      # below can never be by zero.
      if counts:
        cols = sorted(counts)
        term_freq = np.array([counts[col] for col in cols], dtype=np.float64) / len(tokens)
        weights = term_freq * idf[cols]
        norm = np.sqrt(np.dot(weights, weights))
        if norm > 0:
          weights = weights / norm
        data.extend(weights.tolist())
        indices.extend(cols)
      indptr.append(len(indices))

    return csr_matrix(
      (
        np.asarray(data, dtype=np.float64),
        np.asarray(indices, dtype=np.int32),
        np.asarray(indptr, dtype=np.int32),
      ),
      shape=(len(documents), len(self.vocabulary_)),
    )


# Comparing sklearn's TfidfVectorizer & Custom TfidfVectorizer:

In [None]:
# Small toy corpus used to compare both vectorizers.
corpus1 = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

In [None]:
# Reference implementation: scikit-learn's TfidfVectorizer with default settings.
vectorizer = TfidfVectorizer().fit(corpus1)
skl_output = vectorizer.transform(corpus1)

# Implementation under test: fitted and applied on the same corpus.
vectorizer_custom = CustomTfidfVectorizer()
vectorizer_custom.fit(corpus1)
custom_output = vectorizer_custom.transform(corpus1)

In [None]:
# Show both vocabularies so they can be compared term by term.
print("sklearn's vocab:", vectorizer.get_feature_names_out())
print("custom tfidf vocab:", vectorizer_custom.vocabulary_)

sklearn's vocab: ['and' 'document' 'first' 'is' 'one' 'second' 'the' 'third' 'this']
custom tfidf vocab: ['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# Show the idf vector learned by each vectorizer (one value per vocabulary term).
print("sklearn's idf:\n", vectorizer.idf_)
print("custom idf\n:", vectorizer_custom.idf_)

sklearn's idf:
 [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]
custom idf
: [1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# First document only: sparse representation of each result ...
print("sklearn's sparse output:\n", skl_output[0])
print('custom tfidf sparse output:\n', custom_output[0])

# ... followed by the same row as a dense array.
print("\nsklearn's dense output:\n", skl_output[0].toarray())
print("\ncustom tfidf dense output:\n", custom_output[0].toarray())

sklearn's sparse output:
   (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045
custom tfidf sparse output:
   (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149

sklearn's dense output:
 [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]

custom tfidf dense output:
 [[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]
