# CALCULATING THE TF-IDF OF A CORPUS
This file contains Python code that calculates the Term Frequency-Inverse Document Frequency (TF-IDF) score of a given term for each document in a corpus. The implementation is written from scratch, without using any specialized TF-IDF library, which provides a deeper understanding of how TF-IDF is computed.

**Source**: https://medium.com/@er.iit.pradeep09/understanding-tf-idf-in-nlp-a-comprehensive-guide-26707db0cec5

In [18]:
import math

# Example corpus: five short sentences, each treated as one document.
corpus = [
    "The quick brown fox jumps over the lazy dog",
    "The lazy dog likes to sleep all day",
    "The brown fox prefers to eat cheese",
    "The red fox jumps over the brown fox",
    "The brown dog chases the fox",
]

In [19]:
def get_all_words_count_per_row(corpus):
  """Count the whitespace-separated words in each document.

  Args:
    corpus: list of document strings.

  Returns:
    A list with one integer per document, in corpus order.
  """
  return [len(document.split()) for document in corpus]

In [20]:
# Total number of words in every document of the example corpus.
all_words_count_per_row = get_all_words_count_per_row(corpus)
print(all_words_count_per_row)

[9, 8, 7, 8, 6]


In [21]:
def get_num_chosen_term_per_row(corpus, term):
  """Count the occurrences of ``term`` in each document.

  Each document is split on whitespace and every token is compared to
  ``term`` with exact (case-sensitive) equality.

  Args:
    corpus: list of document strings.
    term: the word to count.

  Returns:
    A list with one occurrence count per document, in corpus order.
  """
  return [document.split().count(term) for document in corpus]

In [22]:
# Occurrences of the word 'fox' in every document of the example corpus.
n_chosen_term_per_row = get_num_chosen_term_per_row(corpus, 'fox')
print(n_chosen_term_per_row)

[1, 0, 1, 2, 1]


In [23]:
def get_frequency_scores(corpus, all_words_count_per_row, n_chosen_term_per_row):
  """Compute the term frequency (TF) of the chosen term in each document.

  Args:
    corpus: list of documents; only its length is used, to decide how many
      scores to produce.
    all_words_count_per_row: total number of words in each document.
    n_chosen_term_per_row: occurrences of the chosen term in each document.

  Returns:
    A list with one float per document: occurrences / total words.
    A document with zero words gets a frequency of 0.0.
  """
  frequencies = []
  for i in range(len(corpus)):
    total_words = all_words_count_per_row[i]
    if total_words == 0:
      # An empty document cannot contain the term; avoid dividing by zero.
      frequencies.append(0.0)
    else:
      frequencies.append(n_chosen_term_per_row[i] / total_words)

  return frequencies

In [24]:
# Term frequency of 'fox' per document, printed one score per line.
frequencies = get_frequency_scores(
    corpus=corpus,
    all_words_count_per_row=all_words_count_per_row,
    n_chosen_term_per_row=n_chosen_term_per_row,
)
for frequency in frequencies:
  print(frequency)

0.1111111111111111
0.0
0.14285714285714285
0.25
0.16666666666666666


In [25]:
def get_doc_frequency(corpus, term):
  """Count the documents that contain ``term`` as a whole word.

  Each document is split on whitespace and ``term`` must equal one of the
  resulting tokens (case-sensitive). A plain substring test would also
  count documents where the term is only part of a longer word (for
  example "cat" inside "catalog"), which disagrees with how occurrences
  are counted for the term frequency.

  Args:
    corpus: list of document strings.
    term: the word to look for.

  Returns:
    The number of documents containing the term (0 if none do).
  """
  return sum(1 for document in corpus if term in document.split())

In [26]:
# Number of example documents that mention 'fox'.
df = get_doc_frequency(corpus, 'fox')
print(df)

4


In [27]:
def get_inverse_doc_frequency(corpus, df):
  """Compute the inverse document frequency: ln(number of documents / df).

  Args:
    corpus: list of documents; only its length is used.
    df: number of documents that contain the term.

  Returns:
    The natural logarithm of len(corpus) / df, or 0.0 when ``df`` is 0
    (the term occurs in no document, so the ratio is undefined).
  """
  if df == 0:
    # Avoid ZeroDivisionError for a term that appears nowhere in the corpus.
    return 0.0

  return math.log(len(corpus) / df)

In [28]:
# Inverse document frequency of 'fox' over the example corpus.
idf = get_inverse_doc_frequency(corpus, df)
print(idf)

0.22314355131420976


In [29]:
def get_tf_idf_calculation(corpus, frequencies, idf):
  """Multiply each document's term frequency by the IDF.

  Args:
    corpus: list of documents; only its length is used, to decide how many
      scores to produce.
    frequencies: term frequency of the chosen term in each document.
    idf: inverse document frequency of the chosen term.

  Returns:
    A list with one TF-IDF score per document, in corpus order.
  """
  return [frequencies[position] * idf for position in range(len(corpus))]

In [30]:
# TF-IDF score of 'fox' for every document of the example corpus.
tf_idf = get_tf_idf_calculation(corpus, frequencies, idf)
print(tf_idf)

[0.024793727923801082, 0.0, 0.03187765018774425, 0.05578588782855244, 0.037190591885701625]


In [31]:
def compute_tf_idf(corpus, term):
  """Run the whole TF-IDF pipeline for one term.

  Chains the helper functions defined in this file: per-document word
  totals and term counts give the term frequencies, the document frequency
  gives the IDF, and the two are multiplied per document.

  Args:
    corpus: list of document strings.
    term: the word to score.

  Returns:
    A list with one TF-IDF score per document, in corpus order.
  """
  # Term frequency: occurrences of the term relative to document length.
  word_totals = get_all_words_count_per_row(corpus=corpus)
  term_counts = get_num_chosen_term_per_row(corpus=corpus, term=term)
  term_frequencies = get_frequency_scores(
      corpus=corpus,
      all_words_count_per_row=word_totals,
      n_chosen_term_per_row=term_counts,
  )

  # Inverse document frequency, derived from the document frequency.
  inverse_doc_frequency = get_inverse_doc_frequency(
      corpus=corpus,
      df=get_doc_frequency(corpus=corpus, term=term),
  )

  return get_tf_idf_calculation(
      corpus=corpus,
      frequencies=term_frequencies,
      idf=inverse_doc_frequency,
  )

In [32]:
# End-to-end check: the one-call pipeline reproduces the step-by-step result.
print("Tf-Idf of fox:", compute_tf_idf(corpus, 'fox'))

Tf-Idf of fox: [0.024793727923801082, 0.0, 0.03187765018774425, 0.05578588782855244, 0.037190591885701625]


In [33]:
#TESTS

def test_get_all_words_count_per_row():
  """Check the per-document word totals on a three-sentence corpus."""
  sentences = [
      "The lazy dog likes to sleep all day",
      "The brown fox prefers to eat cheese",
      "The red fox jumps over the brown fox",
  ]
  assert get_all_words_count_per_row(corpus=sentences) == [8, 7, 8]
  print("test passed")

def test_get_num_chosen_term_per_row():
  """Check the per-document counts of 'fox' on a three-sentence corpus."""
  sentences = [
      "The lazy dog likes to sleep all day",
      "The brown fox prefers to eat cheese",
      "The red fox jumps over the brown fox",
  ]
  assert get_num_chosen_term_per_row(corpus=sentences, term='fox') == [0, 1, 2]
  print("test passed")

def test_get_frequency_scores():
  """Check the term frequencies computed from fixed word and term counts."""
  sentences = [
      "The lazy dog likes to sleep all day",
      "The brown fox prefers to eat cheese",
      "The red fox jumps over the brown fox",
  ]
  scores = get_frequency_scores(
      corpus=sentences,
      all_words_count_per_row=[8, 7, 8],
      n_chosen_term_per_row=[0, 1, 2],
  )
  # Expected values are 0/8, 1/7 and 2/8.
  expected_scores = [0.0, 0.14285714285714, 0.25]
  for index, expected in enumerate(expected_scores):
    assert math.isclose(scores[index], expected)
  print("test passed")

def test_get_doc_frequency():
  """Check that 'fox' is found in two of the three sentences."""
  sentences = [
      "The lazy dog likes to sleep all day",
      "The brown fox prefers to eat cheese",
      "The red fox jumps over the brown fox",
  ]
  assert get_doc_frequency(corpus=sentences, term='fox') == 2
  print("test passed")

def test_get_inverse_doc_frequency():
  """Check the IDF for a term found in 2 of 3 documents (ln(3/2))."""
  sentences = [
      "The lazy dog likes to sleep all day",
      "The brown fox prefers to eat cheese",
      "The red fox jumps over the brown fox",
  ]
  result = get_inverse_doc_frequency(corpus=sentences, df=2)
  assert math.isclose(result, 0.4055, abs_tol=0.01)
  print("test passed")

def test_get_tf_idf_calculation():
  """Check the TF * IDF products for fixed frequencies and a fixed IDF."""
  sentences = [
      "The lazy dog likes to sleep all day",
      "The brown fox prefers to eat cheese",
      "The red fox jumps over the brown fox",
  ]
  scores = get_tf_idf_calculation(
      corpus=sentences,
      frequencies=[0.0, 0.14, 0.25],
      idf=0.405,
  )
  expected_scores = [0.0, 0.0567, 0.10125]
  for index, expected in enumerate(expected_scores):
    assert math.isclose(scores[index], expected, abs_tol=0.01)
  print('test passed')

# NOTE: the three end-to-end tests below need distinct names. When they all
# shared the name test_compute_tf_idf, each later definition replaced the
# earlier one, so only the last ('cat') case ever ran.

def test_compute_tf_idf():
  """End-to-end TF-IDF check for the term 'fox'."""
  corpus = ["The lazy dog likes to sleep all day",
             "The brown fox prefers to eat cheese",
             "The red fox jumps over the brown fox"]
  tf_idf = compute_tf_idf(corpus = corpus, term = 'fox')
  assert math.isclose(tf_idf[0], 0.0, abs_tol = 0.01)
  assert math.isclose(tf_idf[1], 0.0567, abs_tol = 0.01)
  assert math.isclose(tf_idf[2], 0.10125, abs_tol = 0.01)
  print('test passed')

def test_compute_tf_idf_pen():
  """End-to-end TF-IDF check for the term 'pen'."""
  corpus = ["The pen is on the table",
             "The table is in the garden",
             "The pen is red and the pen is broken"]
  tf_idf = compute_tf_idf(corpus = corpus, term = 'pen')
  assert math.isclose(tf_idf[0], 0.06758333, abs_tol = 0.01)
  assert math.isclose(tf_idf[1], 0.0, abs_tol = 0.01)
  assert math.isclose(tf_idf[2], 0.09011111, abs_tol = 0.01)
  print('test passed')

def test_compute_tf_idf_cat():
  """End-to-end TF-IDF check for the term 'cat'."""
  corpus = ["The cat is chasing the mouse",
             "The mouse is running fast",
             "The cat got distracted and the mouse ran away"]
  tf_idf = compute_tf_idf(corpus = corpus, term = 'cat')
  assert math.isclose(tf_idf[0], 0.06758333, abs_tol = 0.01)
  assert math.isclose(tf_idf[1], 0.0, abs_tol = 0.01)
  assert math.isclose(tf_idf[2], 0.04505556, abs_tol = 0.01)
  print('test passed')

def run_tests():
  """Run every test in this file; each passing test prints one line."""
  test_get_all_words_count_per_row()
  test_get_num_chosen_term_per_row()
  test_get_frequency_scores()
  test_get_doc_frequency()
  test_get_inverse_doc_frequency()
  test_get_tf_idf_calculation()
  test_compute_tf_idf()
  test_compute_tf_idf_pen()
  test_compute_tf_idf_cat()

# Execute the test suite; every test that passes prints "test passed".
run_tests()

test passed
test passed
test passed
test passed
test passed
test passed
test passed
test passed
test passed
