<a href="https://colab.research.google.com/github/otvc/PythonText/blob/main/TF_IDF.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [16]:
import numpy as np
import string
from sklearn.feature_extraction.text import TfidfVectorizer

In [15]:
vectorizer.get_feature_names_out()[vectorizer.idf_ > 1]

array(['car', 'highway', 'road', 'truck'], dtype=object)

In [108]:
class TfIdf:
  def __init__(self):
    self.words = np.array([])
    self.__tf = np.array([])
    self.__idf = np.array([])
    self.__tf_idf = np.array([])
    self.punct = string.punctuation + "…"+"»"+"«" + "—"
    self.letter_trans = str.maketrans(self.punct, " "*len(self.punct))

  def __split(self, corpus):
    if type(corpus) is not list:
      data = corpus.translate(self.letter_trans).lower().split()
    else:
      data = np.array([])
      for text in corpus:
        data = np.append(data, text.translate(self.letter_trans).lower().split())
      data = np.array(list(set(data)))
    return data

  def __str_tf(self, sub_str, text):
    return text.count(sub_str)

  def __get_tf(self, corpus):
    words_count = []
    for word in self.words:
      text_stats = []
      for text in corpus:
        text_stats.append(self.__str_tf(word, text))
      words_count.append(np.array(text_stats))
    words_count = np.array(words_count)
    tf = 1.0 * words_count / len(self.words)
    return [tf, words_count]

  def __get_idf(self, corpus):
    N = len(corpus)
    idf = []
    for word in self.words:
      wtc = 0
      for text in corpus:
        wtc += self.__str_tf(word, text) > 0
      idf.append(np.log10(1.0 * N / wtc))
    idf = np.array([idf]).T
    return idf

  def get_tf(self):
    return self.__tf

  def get_idf(self):
    return self.__idf

  def get_tf_idf(self):
    return self.__tf_idf

  def fit_transform(self, corpus):
    if type(corpus) is not list:
      corpus = np.array([corpus])
    self.words = self.__split(corpus)
    self.__tf, words_count = self.__get_tf(corpus)
    self.__idf = self.__get_idf(corpus)
    self.__tf_idf = self.__tf * np.array([self.__idf]).T
    return self


In [109]:
corpus = ["The car is driven on the road", 
          "The truck is driven on the highway"]

In [110]:
test_object = TfIdf()
test_object.fit_transform(corpus)

<__main__.TfIdf at 0x7fe9118426d0>

In [113]:
print(f"TF\n{test_object.get_tf()}\n---")
print(f"IDF\n{test_object.get_idf()}\n---")
print(f"TF-IDF\n{test_object.get_tf_idf()}\n---")

TF
[[0.    0.125]
 [0.125 0.   ]
 [0.125 0.125]
 [0.125 0.125]
 [0.    0.125]
 [0.125 0.   ]
 [0.125 0.125]
 [0.125 0.125]]
---
IDF
[[0.30103]
 [0.30103]
 [0.     ]
 [0.     ]
 [0.30103]
 [0.30103]
 [0.     ]
 [0.     ]]
---
TF-IDF
[[[0.         0.03762875]
  [0.03762875 0.        ]
  [0.         0.        ]
  [0.         0.        ]
  [0.         0.03762875]
  [0.03762875 0.        ]
  [0.         0.        ]
  [0.         0.        ]]]
---
