# CALCULATING THE TF-IDF OF A CORPUS

**Source**: https://medium.com/@er.iit.pradeep09/understanding-tf-idf-in-nlp-a-comprehensive-guide-26707db0cec5

In [50]:
import math

# Demo corpus: five short sentences, each string being one document.
corpus = [
  "The quick brown fox jumps over the lazy dog",
  "The lazy dog likes to sleep all day",
  "The brown fox prefers to eat cheese",
  "The red fox jumps over the brown fox",
  "The brown dog chases the fox",
]


In [51]:
def get_all_words_count_per_row(corpus):
  """Return the number of whitespace-separated words in each document.

  Args:
    corpus: Sequence of document strings.

  Returns:
    A list with one integer per document, in corpus order.
  """
  return [len(document.split()) for document in corpus]


In [52]:
# Total number of words in every document of the demo corpus.
all_words_count_per_row = get_all_words_count_per_row(corpus)
print(all_words_count_per_row)

[9, 8, 7, 8, 6]


In [53]:
def get_num_chosen_term_per_row(corpus, term):
  """Count how many times `term` occurs in each document.

  Each document is split on whitespace and tokens are compared to `term`
  for exact equality, so matching is case-sensitive ("The" is not "the").

  Args:
    corpus: Sequence of document strings.
    term: The word to count.

  Returns:
    A list with one occurrence count per document, in corpus order.
  """
  return [document.split().count(term) for document in corpus]






In [54]:
# Occurrences of the word 'fox' in every document of the demo corpus.
n_chosen_term_per_row = get_num_chosen_term_per_row(corpus, 'fox')
print(n_chosen_term_per_row)

[1, 0, 1, 2, 1]


In [55]:

def get_frequency_scores(corpus, all_words_count_per_row, n_chosen_term_per_row):
  """Compute the term frequency (TF) of a term for each document.

  TF is the number of occurrences of the term in a document divided by the
  total number of words in that document.

  Args:
    corpus: Sequence of documents; only its length is used, to decide how
      many entries of the two count lists are processed.
    all_words_count_per_row: Total word count of each document.
    n_chosen_term_per_row: Occurrences of the term in each document.

  Returns:
    A list of floats, one term frequency per document.
  """
  frequencies = []
  for i in range(len(corpus)):
    total_words = all_words_count_per_row[i]
    if total_words == 0:
      # An empty document contains no words at all, so the term frequency is
      # 0.0 rather than a ZeroDivisionError.
      frequencies.append(0.0)
    else:
      frequencies.append(n_chosen_term_per_row[i] / total_words)

  return frequencies


In [56]:
# Term frequency of 'fox' per document, printed one value per line.
frequencies = get_frequency_scores(corpus, all_words_count_per_row, n_chosen_term_per_row)
for frequency in frequencies:
  print(frequency)

0.1111111111111111
0.0
0.14285714285714285
0.25
0.16666666666666666


In [57]:
def get_doc_frequency(corpus, term):
  """Count the documents that contain `term` as a whole word.

  Args:
    corpus: Sequence of document strings.
    term: The word to look for (case-sensitive).

  Returns:
    The number of documents in which `term` appears at least once.
  """
  # Test membership against the whitespace-separated tokens, not the raw
  # string: a substring test would count "cat" inside "category" and would
  # disagree with the exact-token matching used when counting occurrences.
  return sum(1 for document in corpus if term in document.split())

In [58]:
# Number of demo-corpus documents that mention 'fox'.
df = get_doc_frequency(corpus, 'fox')
print(df)

4


In [59]:
import math

def get_inverse_doc_frequency(corpus, df, base=None):
  """Compute the inverse document frequency, log(N / df).

  Args:
    corpus: Sequence of documents; only its length N is used.
    df: Number of documents that contain the term.
    base: Optional logarithm base (for example 10). When None (the default)
      the natural logarithm is used.

  Returns:
    log(N / df) as a float, or 0.0 when `df` is 0.
  """
  if df == 0:
    # The term occurs in no document, so N / df is undefined. Its term
    # frequency is 0 everywhere, so an IDF of 0.0 keeps every TF-IDF score
    # at 0 instead of raising ZeroDivisionError.
    return 0.0

  ratio = len(corpus) / df
  if base is None:
    return math.log(ratio)
  return math.log(ratio, base)


In [60]:
# Inverse document frequency of 'fox' over the demo corpus.
idf = get_inverse_doc_frequency(corpus, df)
print(idf)


0.22314355131420976


In [61]:
def get_tf_idf_calculation(corpus, frequencies, idf):
  """Multiply each document's term frequency by the term's IDF.

  Args:
    corpus: Sequence of documents; only its length is used, to decide how
      many entries of `frequencies` are processed.
    frequencies: Term frequency of the term in each document.
    idf: Inverse document frequency of the term.

  Returns:
    A list of TF-IDF scores, one per document.
  """
  return [frequencies[position] * idf for position in range(len(corpus))]



In [62]:
# TF-IDF score of 'fox' for every document of the demo corpus.
tf_idf = get_tf_idf_calculation(corpus, frequencies, idf)
print(tf_idf)

[0.024793727923801082, 0.0, 0.03187765018774425, 0.05578588782855244, 0.037190591885701625]


In [63]:
def compute_tf_idf(corpus, term):
  """Run the full TF-IDF pipeline for a single term.

  Chains the helper functions: per-document word counts and term counts
  give the term frequencies, the document frequency gives the IDF, and the
  two are multiplied together.

  Args:
    corpus: Sequence of document strings.
    term: The word to score.

  Returns:
    A list of TF-IDF scores, one per document.
  """
  words_per_doc = get_all_words_count_per_row(corpus)
  term_hits_per_doc = get_num_chosen_term_per_row(corpus, term)
  term_frequencies = get_frequency_scores(corpus, words_per_doc, term_hits_per_doc)

  doc_frequency = get_doc_frequency(corpus, term)
  inverse_doc_frequency = get_inverse_doc_frequency(corpus, doc_frequency)

  return get_tf_idf_calculation(corpus, term_frequencies, inverse_doc_frequency)




In [64]:
# End-to-end run: the scores match the step-by-step result computed above.
print("Tf-Idf of fox:", compute_tf_idf(corpus, 'fox'))

Tf-Idf of fox: [0.024793727923801082, 0.0, 0.03187765018774425, 0.05578588782855244, 0.037190591885701625]


In [65]:
#TESTS

def test_get_all_words_count_per_row():
  """Check the per-document word counts on a three-document sample."""
  sample_docs = [
    "The lazy dog likes to sleep all day",
    "The brown fox prefers to eat cheese",
    "The red fox jumps over the brown fox",
  ]
  assert get_all_words_count_per_row(sample_docs) == [8, 7, 8]
  print("test passed")

test_get_all_words_count_per_row()



def test_get_num_chosen_term_per_row():
  """Check the per-document counts of 'fox' on a three-document sample."""
  sample_docs = [
    "The lazy dog likes to sleep all day",
    "The brown fox prefers to eat cheese",
    "The red fox jumps over the brown fox",
  ]
  fox_counts = get_num_chosen_term_per_row(sample_docs, 'fox')
  print(fox_counts)
  assert fox_counts == [0, 1, 2]
  print("test passed")

test_get_num_chosen_term_per_row()



def test_get_frequency_scores():
  """Check term frequencies computed from fixed word and term counts."""
  sample_docs = [
    "The lazy dog likes to sleep all day",
    "The brown fox prefers to eat cheese",
    "The red fox jumps over the brown fox",
  ]
  scores = get_frequency_scores(sample_docs, [8, 7, 8], [0, 1, 2])
  print(scores)

  expected_scores = [0.0, 0.14285714285714, 0.25]
  for position, expected in enumerate(expected_scores):
    assert math.isclose(scores[position], expected)

  print("test passed")

test_get_frequency_scores()


def test_get_doc_frequency():
  """Check that 'fox' is found in two of the three sample documents."""
  sample_docs = [
    "The lazy dog likes to sleep all day",
    "The brown fox prefers to eat cheese",
    "The red fox jumps over the brown fox",
  ]
  assert get_doc_frequency(sample_docs, 'fox') == 2
  print("test passed")

test_get_doc_frequency()


def test_get_inverse_doc_frequency():
  """Check the IDF for a term present in two of three documents."""
  sample_docs = [
    "The lazy dog likes to sleep all day",
    "The brown fox prefers to eat cheese",
    "The red fox jumps over the brown fox",
  ]
  result = get_inverse_doc_frequency(sample_docs, 2)
  print(result)
  assert math.isclose(result, 0.4055, abs_tol = 0.01)
  print("test passed")

test_get_inverse_doc_frequency()


def test_get_tf_idf_calculation():
  """Check TF * IDF on fixed (rounded) frequencies and a fixed IDF."""
  sample_docs = [
    "The lazy dog likes to sleep all day",
    "The brown fox prefers to eat cheese",
    "The red fox jumps over the brown fox",
  ]
  scores = get_tf_idf_calculation(sample_docs, [0.0, 0.14, 0.25], 0.405)
  print(scores)

  expected_scores = [0.0, 0.0567, 0.10125]
  for position, expected in enumerate(expected_scores):
    assert math.isclose(scores[position], expected, abs_tol = 0.01)

  print('test passed')

test_get_tf_idf_calculation()



def test_compute_tf_idf():
  """End-to-end check of compute_tf_idf on three small corpora.

  Replaces three copy-pasted functions that all shared this name (each
  definition shadowed the previous one) with a single table-driven test.
  Expected scores are written as TF * IDF so they can be compared tightly;
  the previous abs_tol of 0.01 accepted errors of 10-20% on these values.
  """
  # In every case the corpus has 3 documents and the term occurs in 2 of them.
  idf = math.log(3 / 2)

  # (documents, term, expected TF-IDF per document)
  cases = [
    (["The lazy dog likes to sleep all day",
      "The brown fox prefers to eat cheese",
      "The red fox jumps over the brown fox"],
     'fox',
     [0.0, (1 / 7) * idf, (2 / 8) * idf]),
    (["The pen is on the table",
      "The table is in the garden",
      "The pen is red and the pen is broken"],
     'pen',
     [(1 / 6) * idf, 0.0, (2 / 9) * idf]),
    (["The cat is chasing the mouse",
      "The mouse is running fast",
      "The cat got distracted and the mouse ran away"],
     'cat',
     [(1 / 6) * idf, 0.0, (1 / 9) * idf]),
  ]

  for documents, term, expected_scores in cases:
    tf_idf = compute_tf_idf(corpus = documents, term = term)
    print(tf_idf)
    assert len(tf_idf) == len(expected_scores)
    for actual, expected in zip(tf_idf, expected_scores):
      # abs_tol only matters for the exact-zero entries.
      assert math.isclose(actual, expected, rel_tol = 1e-9, abs_tol = 1e-12)
    print('test passed')

test_compute_tf_idf()

test passed
[0, 1, 2]
test passed
[0.0, 0.14285714285714285, 0.25]
test passed
test passed
0.4054651081081644
test passed
[0.0, 0.05670000000000001, 0.10125]
test passed
[0.0, 0.05792358687259491, 0.1013662770270411]
test passed
[0.06757751801802739, 0.0, 0.09010335735736986]
test passed
[0.06757751801802739, 0.0, 0.04505167867868493]
test passed
