### tf-idf scratch vs sklearn implementation

In [81]:

# Toy corpus: four short documents, one string per document.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

### SkLearn Implementation

In [None]:
from sklearn.feature_extraction.text import TfidfVectorizer

# `fit` returns the fitted estimator itself, so construction and fitting can be chained.
vectorizer = TfidfVectorizer().fit(corpus)
# Encode the same corpus with the vocabulary / idf values learned above.
skl_output = vectorizer.transform(corpus)

In [None]:
# sklearn feature names; they are sorted in alphabetical order by default.
# `get_feature_names()` was deprecated in scikit-learn 1.0 and removed in 1.2;
# `get_feature_names_out()` replaces it and returns a NumPy array, so convert it
# to a plain list to keep the printed output in list form.

print(vectorizer.get_feature_names_out().tolist())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [None]:
# Print the idf values learned by sklearn's TfidfVectorizer during fit.
# After fitting on the corpus, the vocabulary contains 9 words, each with its own idf value.

print(vectorizer.idf_)

[1.91629073 1.22314355 1.51082562 1.         1.91629073 1.91629073
 1.         1.91629073 1.        ]


In [None]:
# shape of sklearn tfidf vectorizer output after applying transform method.

skl_output.shape

(4, 9)

In [None]:
# sklearn tf-idf values for the first document of the above corpus.
# The output is a sparse matrix, so only the non-zero entries are printed as (row, column) value.

print(skl_output[0])

  (0, 8)	0.38408524091481483
  (0, 6)	0.38408524091481483
  (0, 3)	0.38408524091481483
  (0, 2)	0.5802858236844359
  (0, 1)	0.46979138557992045


In [None]:
# sklearn tfidf values for first line of the above corpus.
# To understand the output better, here we are converting the sparse output matrix to dense matrix and printing it.
# Notice that this output is normalized using L2 normalization. sklearn does this by default.

print(skl_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


### Scratch Implementation

In [10]:
from collections import Counter
from tqdm import tqdm
from scipy.sparse import csr_matrix
import math
from sklearn.preprocessing import normalize
import numpy

#### fitting the scratch model

In [58]:

def fit(dataset):
    """Learn the vocabulary and smoothed idf weights from a list of documents.

    Each document is tokenised on whitespace with ``str.split()``; tokens shorter
    than two characters are ignored. The same tokenisation is used both for
    building the vocabulary and for counting document frequencies, so every
    vocabulary word is guaranteed to occur in at least one document.

    Parameters
    ----------
    dataset : list of str
        The corpus, one string per document.

    Returns
    -------
    (vocab, idf) : tuple of dict
        ``vocab`` maps each distinct word to its column index (words are sorted,
        indices start at 0). ``idf`` maps each word to
        ``1 + ln((1 + n_documents) / (1 + document_frequency))``.
        If ``dataset`` is not a list, a message is printed and ``None`` is
        returned instead.
    """
    if not isinstance(dataset, list):
        print("oops! not a list :/ ")
        return None

    # Document frequency: number of documents containing each word.
    # A set per document ensures repeated words are counted once per document.
    doc_freq = Counter()
    for row in dataset:
        doc_freq.update({word for word in row.split() if len(word) >= 2})

    distinct_words = sorted(doc_freq)
    vocab = {word: index for index, word in enumerate(distinct_words)}

    n_docs = len(dataset)
    # The "+1" terms in numerator and denominator smooth the ratio; the leading
    # "1 +" keeps words that occur in every document from getting a zero weight.
    idf = {
        word: 1 + math.log((1 + n_docs) / (doc_freq[word] + 1))
        for word in distinct_words
    }
    return vocab, idf

#### Comparing with the feature names and idf 

In [59]:
# Fit the scratch model on the corpus and print the learned vocabulary
# (word -> column index) followed by the idf weight of every word.
vocab,idf = fit(corpus)
print(f"feature names are: {vocab}")
print("-"*25+"IDF"+"-"*25)
print(idf)

feature names are: {'and': 0, 'document': 1, 'first': 2, 'is': 3, 'one': 4, 'second': 5, 'the': 6, 'third': 7, 'this': 8}
-------------------------IDF-------------------------
{'and': 1.916290731874155, 'document': 1.2231435513142097, 'first': 1.5108256237659907, 'is': 1.0, 'one': 1.916290731874155, 'second': 1.916290731874155, 'the': 1.0, 'third': 1.916290731874155, 'this': 1.0}


#### transform module

In [92]:
def transform(dataset,vocab,idf):
    """Encode documents as a row-normalised sparse tf-idf matrix.

    Parameters
    ----------
    dataset : sequence of str
        Documents to encode; each is tokenised on whitespace with ``str.split()``.
    vocab : dict
        Maps word -> column index. Words not present in ``vocab`` are ignored.
    idf : dict
        Maps word -> idf weight; must contain every word of ``vocab`` that
        occurs in ``dataset``.

    Returns
    -------
    The result of ``normalize`` applied to a CSR matrix of shape
    ``(len(dataset), len(vocab))`` whose entry ``(i, j)`` is
    ``tf(word_j, document_i) * idf[word_j]``, where tf is the word's count
    divided by the number of tokens in the document.
    """
    row_idx = []
    column_idx = []
    values = []

    for idx, row in enumerate(dataset):
        tokens = row.split()
        # Term frequency is relative to the number of tokens in the document,
        # not the number of characters in the string.
        n_tokens = len(tokens)
        # Walk only the words that occur in this document rather than the
        # whole vocabulary.
        for word, count in Counter(tokens).items():
            column = vocab.get(word)
            if column is None:  # out-of-vocabulary word; note column 0 is valid
                continue
            values.append((count / n_tokens) * idf[word])
            row_idx.append(idx)
            column_idx.append(column)

    m = csr_matrix((values, (row_idx, column_idx)), shape=(len(dataset), len(vocab)))
    # Row-wise normalisation (L2 is sklearn's default for `normalize`).
    return normalize(m)

In [95]:
tfidf = transform(corpus,vocab,idf)

In [98]:
tfidf.toarray()[0] ##same with sklearn outcome

array([0.        , 0.46979139, 0.58028582, 0.38408524, 0.        ,
       0.        , 0.38408524, 0.        , 0.38408524])