In [1]:
from collections import Counter #counting no. of elements in a list and return a dictionary
from scipy.sparse import lil_matrix
import math
from sklearn.preprocessing import normalize #normalize the vectorized 'sentence'/'word'
import numpy as np 

# Toy corpus: four short documents used to exercise the TF-IDF code below.
corpus = [
    "this is the first document",
    "this document is the second document",
    "and this is the third one",
    "is this the first document",
]


def IDF(corpus, unique_words):
    """Compute the smoothed inverse document frequency of each word.

    For every word ``w`` in ``unique_words`` the value is
    ``ln((1 + N) / (1 + df(w))) + 1``, where ``N`` is ``len(corpus)`` and
    ``df(w)`` is the number of documents that contain ``w`` as a
    whitespace-separated token.

    Parameters
    ----------
    corpus : sequence of str
        Documents; each one is tokenized with ``str.split()``.
    unique_words : iterable of str
        Words to score. The returned dict follows this iteration order.

    Returns
    -------
    dict
        Maps each word to its IDF value (float). With an empty corpus every
        word gets ``ln(1/1) + 1 == 1.0``.
    """
    N = len(corpus)
    # Tokenize each document once; a set makes the membership test O(1)
    # instead of re-splitting the sentence for every vocabulary word.
    doc_tokens = [set(sen.split()) for sen in corpus]

    idf_dict = {}
    for word in unique_words:
        count = sum(1 for tokens in doc_tokens if word in tokens)
        # Assigned once per word, outside the document scan, so the entry is
        # also produced when the corpus has no documents.
        idf_dict[word] = (math.log((1 + N) / (count + 1))) + 1
    return idf_dict

def fit(whole_data):
    """Build the vocabulary index and IDF table for a corpus.

    Each document is tokenized with ``str.split()``; tokens shorter than two
    characters are ignored. The remaining distinct tokens are sorted and
    numbered from 0, giving each word its column index.

    Parameters
    ----------
    whole_data : list of str
        The documents to learn from.

    Returns
    -------
    tuple
        ``(vocab, idf_values)`` where ``vocab`` maps word -> column index and
        ``idf_values`` is the dict returned by ``IDF`` for the sorted words.

    Raises
    ------
    TypeError
        If ``whole_data`` is not a list. (Previously this path fell through
        to the ``return`` and failed with an unrelated UnboundLocalError.)
    """
    if not isinstance(whole_data, (list,)):
        raise TypeError(
            "fit() expects a list of strings, got %s" % type(whole_data).__name__
        )

    # Distinct tokens of length >= 2, in sorted order so indices are stable.
    unique_words = sorted(
        {token for document in whole_data for token in document.split() if len(token) >= 2}
    )
    vocab = {word: index for index, word in enumerate(unique_words)}
    Idf_values_of_all_unique_words = IDF(whole_data, unique_words)
    return vocab, Idf_values_of_all_unique_words
# Learn the word -> column index and the per-word IDF table from the sample corpus.
Vocabulary, idf_of_vocabulary = fit(whole_data=corpus)

In [2]:
# Show the vocabulary words, then the IDF values stored for them.
print(list(Vocabulary))
print([*idf_of_vocabulary.values()])

['and', 'document', 'first', 'is', 'one', 'second', 'the', 'third', 'this']
[1.916290731874155, 1.2231435513142097, 1.5108256237659907, 1.0, 1.916290731874155, 1.916290731874155, 1.0, 1.916290731874155, 1.0]


In [3]:
# Collect every whitespace-separated token of the corpus (no length filter
# here) and compute the IDF table for that raw word set.
unique_words = set()
if isinstance(corpus, list):
    for sentence in corpus:
        unique_words.update(sentence.split())
print(unique_words)
IDF(corpus, unique_words)

{'the', 'document', 'second', 'one', 'and', 'first', 'this', 'third', 'is'}


{'the': 1.0,
 'document': 1.2231435513142097,
 'second': 1.916290731874155,
 'one': 1.916290731874155,
 'and': 1.916290731874155,
 'first': 1.5108256237659907,
 'this': 1.0,
 'third': 1.916290731874155,
 'is': 1.0}

In [4]:
def transform(dataset,vocabulary,idf_values):
    """Turn documents into an L2-normalized TF-IDF matrix.

    For each document, the term frequency of a word is its occurrence count
    divided by the total number of whitespace tokens in that document
    (tokens absent from ``vocabulary`` still count toward the total). Each
    entry is ``tf * idf_values[word]``; every row is then L2-normalized.

    Parameters
    ----------
    dataset : sequence of str
        Documents to vectorize; one matrix row per document.
    vocabulary : dict
        Maps word -> column index. Words not present are skipped.
    idf_values : dict
        Maps word -> IDF weight; must cover every word in ``vocabulary``
        that occurs in ``dataset``.

    Returns
    -------
    Sparse matrix of shape ``(len(dataset), len(vocabulary))`` as returned by
    ``sklearn.preprocessing.normalize``.

    Side effects
    ------------
    Prints the normalized matrix, prefixed by ``"NORM FORM"``.
    """
    sparse_matrix = lil_matrix((len(dataset), len(vocabulary)), dtype=np.float64)
    for row, document in enumerate(dataset):
        # Split once per document and visit each distinct word once; the
        # original re-split the text and re-assigned the cell per occurrence.
        tokens = document.split()
        total_tokens = len(tokens)
        for word, occurrences in Counter(tokens).items():
            # Direct dict membership: no temporary key list per lookup.
            if word in vocabulary:
                tf_idf_value = (occurrences / total_tokens) * (idf_values[word])
                sparse_matrix[row, vocabulary[word]] = tf_idf_value
    # Normalize once and reuse the result for both the printout and the return.
    output = normalize(sparse_matrix, norm='l2', axis=1, copy=True, return_norm=False)
    print("NORM FORM\n", output)
    return output
# Vectorize the sample corpus and report the (documents, vocabulary size) shape.
final_output = transform(dataset=corpus, vocabulary=Vocabulary, idf_values=idf_of_vocabulary)
print(final_output.shape)

NORM FORM
   (0, 1)	0.4697913855799205
  (0, 2)	0.580285823684436
  (0, 3)	0.3840852409148149
  (0, 6)	0.3840852409148149
  (0, 8)	0.3840852409148149
  (1, 1)	0.6876235979836937
  (1, 3)	0.2810886740337529
  (1, 5)	0.5386476208856762
  (1, 6)	0.2810886740337529
  (1, 8)	0.2810886740337529
  (2, 0)	0.511848512707169
  (2, 3)	0.267103787642168
  (2, 4)	0.511848512707169
  (2, 6)	0.267103787642168
  (2, 7)	0.511848512707169
  (2, 8)	0.267103787642168
  (3, 1)	0.4697913855799205
  (3, 2)	0.580285823684436
  (3, 3)	0.3840852409148149
  (3, 6)	0.3840852409148149
  (3, 8)	0.3840852409148149
(4, 9)


In [5]:
# Dense view of row 0 (the first document) of the normalized TF-IDF matrix.
print(final_output[0].toarray())

[[0.         0.46979139 0.58028582 0.38408524 0.         0.
  0.38408524 0.         0.38408524]]


Related topics: LIL matrix, CSR matrix, vector space model

https://analyticsindiamag.com/hands-on-implementation-of-tf-idf-from-scratch-in-python/

https://scikit-learn.org/stable/modules/preprocessing.html#preprocessing-normalization

