In [14]:
import numpy as np
import re
from collections import Counter
import math
from scipy.sparse import csr_matrix
from sklearn.preprocessing import normalize
from tqdm import tqdm

In [15]:
class tfidfvec:
    """Minimal TF-IDF vectoriser in the style of scikit-learn's vectorisers.

    Documents are tokenised with ``str.split()`` (whitespace only; no
    lower-casing and no punctuation stripping). The same tokenisation is used
    for building the vocabulary, for document frequencies and for term
    frequencies, so the three can never disagree.

    Attributes
    ----------
    features : numpy.ndarray
        Vocabulary words in sorted order; position == matrix column.
    idf_ : numpy.ndarray
        Smoothed inverse document frequency for each entry of ``features``.
    feature_counts : dict
        Number of fitted documents that contain each word.
    corpus : list
        The documents passed to the last ``fit`` call.
    numberofdocuments : int
        ``len(corpus)``.
    vocabulary_ : dict
        Word -> column index.
    tf : list
        Row-major flat list of term frequencies from the last ``transform``.
    tfidf : scipy.sparse.csr_matrix
        Result of the last ``transform`` call.
    """

    def __init__(self):
        # State is created per instance. Keeping these as class-level mutable
        # containers would make every instance (and every re-run of a fitting
        # cell) share and accumulate the same vocabulary / IDF / TF lists.
        self.features = []
        self.idf_ = []
        self.feature_counts = {}
        self.corpus = []
        self.numberofdocuments = 0
        self.tfidf = []
        self.tf = []
        self.vocabulary_ = {}

    def fit(self, cor):
        """Learn the vocabulary and IDF weights from ``cor``.

        Parameters
        ----------
        cor : iterable of str
            The training documents.

        Returns
        -------
        tfidfvec
            ``self``, so that ``fit(...).transform(...)`` can be chained.
            All learned state is rebuilt from scratch on every call.
        """
        documents = list(cor)
        self.corpus = documents
        self.numberofdocuments = len(documents)

        # Document frequency: each document contributes at most once per word,
        # hence the set() around its tokens.
        doc_freq = Counter()
        for document in documents:
            doc_freq.update(set(document.split()))
        self.feature_counts = dict(doc_freq)

        vocabulary = sorted(doc_freq)

        # Smoothed IDF: 1 + ln((1 + N) / (1 + df)).
        weights = []
        for word in tqdm(vocabulary):
            ratio = (1 + self.numberofdocuments) / (1 + doc_freq[word])
            weights.append(1 + math.log(ratio))

        self.features = np.array(vocabulary)
        self.idf_ = np.array(weights)
        self.vocabulary_ = {word: column for column, word in enumerate(vocabulary)}
        return self

    def transform(self, cor):
        """Convert ``cor`` into an L2-normalised TF-IDF matrix.

        Parameters
        ----------
        cor : iterable of str
            Documents to vectorise; may differ from the fitted corpus.
            Words that are not in the fitted vocabulary are ignored.

        Returns
        -------
        scipy.sparse.csr_matrix
            Shape ``(number of documents in cor, len(features))``. A document
            with no tokens, or with no known words, yields an all-zero row.

        Raises
        ------
        ValueError
            If no vocabulary has been learned yet.
        """
        if not self.vocabulary_:
            raise ValueError(
                "vocabulary is empty; call fit() on a non-empty corpus first"
            )

        documents = list(cor)
        term_freq = np.zeros((len(documents), len(self.features)), dtype=float)

        for row, document in enumerate(tqdm(documents)):
            tokens = document.split()
            if not tokens:
                # Nothing to count; also avoids dividing by a zero length.
                continue
            for word, occurrences in Counter(tokens).items():
                column = self.vocabulary_.get(word)
                if column is not None:
                    term_freq[row, column] = occurrences / len(tokens)

        # Kept as a flat row-major list, rebuilt (not appended to) per call.
        self.tf = term_freq.ravel().tolist()

        weighted = term_freq * np.asarray(self.idf_, dtype=float)
        self.tfidf = normalize(csr_matrix(weighted), norm="l2", axis=1)
        return self.tfidf

    def get_feature_names(self):
        """Return the vocabulary as a list of plain ``str`` in column order."""
        return [str(name) for name in self.features]

In [16]:
# Toy corpus: four short sentences with heavily overlapping vocabulary.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]

# Learn the vocabulary / IDF weights, then vectorise the same sentences.
vectorizer = tfidfvec()
vectorizer.fit(corpus)
skl_output = vectorizer.transform(corpus)

100%|████████████████████████████████████████████████████████████████████████████████████████████| 9/9 [00:00<?, ?it/s]
100%|████████████████████████████████████████████████████████████████████████████████████████████| 4/4 [00:00<?, ?it/s]


In [17]:
# Vocabulary learned by fit(), in the sorted order that also indexes the matrix columns.
print(vectorizer.get_feature_names())

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']


In [18]:
# IDF weight per vocabulary word: 1 + log((1 + number of documents) / (1 + feature count)).
vectorizer.idf_

array([1.91629073, 1.22314355, 1.51082562, 1.        , 1.91629073,
       1.91629073, 1.        , 1.91629073, 1.        ])

In [19]:
# For this corpus: 4 documents (rows) by 9 vocabulary words (columns).
skl_output.shape

(4, 9)

In [20]:
# transform() returns a SciPy CSR sparse matrix, so the repr is a summary rather than the values.
skl_output

<4x9 sparse matrix of type '<class 'numpy.float64'>'
	with 21 stored elements in Compressed Sparse Row format>

In [21]:
# Stored (non-zero) TF-IDF entries of the first document, shown as (row, column) value.
print(skl_output[0])

  (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149


In [22]:
# Dense view of the third document's L2-normalised TF-IDF row, zeros included.
print(skl_output[2].toarray())

[[0.51184851 0.         0.         0.26710379 0.51184851 0.
  0.26710379 0.51184851 0.26710379]]


In [23]:
# Mapping from each vocabulary word to its index in the sorted feature list.
vectorizer.vocabulary_

{'and': 0,
 'document': 1,
 'first': 2,
 'is': 3,
 'one': 4,
 'second': 5,
 'the': 6,
 'third': 7,
 'this': 8}