# Term Frequency - Inverse Document Frequency

<b>NOTE</b>: This project is taken from "How to process textual data using TF-IDF in Python" by user DroidHead. Check out the blog post in [medium.freecodecamp.org](https://medium.freecodecamp.org/how-to-process-textual-data-using-tf-idf-in-python-cd2bbc0a94a3) for better explanations.

Create a function that computes the term frequency (TF) of each word in a document.

In [None]:
def compute_tf(word_dict, bow):
    """Compute the term frequency of every word in a document.

    Args:
        word_dict: Mapping of word -> number of occurrences in the document.
        bow: The document's bag of words. Only ``len(bow)`` is used, as the
            normalising total, so pass a sequence of tokens; a raw string
            would be measured in characters rather than words.

    Returns:
        A dict mapping each word in ``word_dict`` to ``count / len(bow)``.
        When ``bow`` is empty every frequency is ``0.0``.
    """
    bow_count = len(bow)
    if bow_count == 0:
        # An empty document contains no terms; report zero frequencies
        # instead of failing with ZeroDivisionError.
        return dict.fromkeys(word_dict, 0.0)

    total = float(bow_count)
    return {word: count / total for word, count in word_dict.items()}

Create a function that computes the inverse document frequency (IDF), i.e. the rarity of each word across a list of documents.

In [None]:
def compute_idf(doc_list):
    """Compute the inverse document frequency of every word in a corpus.

    Args:
        doc_list: List of dicts, one per document, mapping word -> number of
            occurrences in that document.

    Returns:
        A dict mapping each word seen in any document to
        ``log10(N / df)``, where ``N`` is the number of documents and ``df``
        is the number of documents in which the word's count is positive.
        Words that are listed but never have a positive count map to
        ``None``. An empty ``doc_list`` yields an empty dict.
    """
    import math

    n_docs = len(doc_list)

    # Document frequency: in how many documents does each word occur?
    doc_freq = {}
    for doc in doc_list:
        for word, count in doc.items():
            # Register every word (even zero-count ones) so all documents
            # are treated alike, but only positive counts add to the tally.
            # The first sighting of a word therefore counts as 1, not 0.
            doc_freq[word] = doc_freq.get(word, 0) + (1 if count > 0 else 0)

    return {
        word: math.log10(n_docs / float(freq)) if freq else None
        for word, freq in doc_freq.items()
    }

Create a function that computes the TF-IDF score of each word.

In [None]:
def compute_tf_idf(tf_bow, idfs):
    """Combine term frequencies with IDF weights into TF-IDF scores.

    Args:
        tf_bow: Mapping of word -> term frequency for one document.
        idfs: Mapping of word -> inverse document frequency (or ``None``).

    Returns:
        A dict with the same keys as ``tf_bow``. Each value is
        ``tf * idf``; words with no IDF entry, or an IDF of ``None``,
        score ``0``.
    """
    scores = {}
    for term, frequency in tf_bow.items():
        weight = idfs.get(term)
        # A missing entry and an explicit None are handled the same way.
        scores[term] = 0 if weight is None else frequency * weight
    return scores

Prepare the data.

In [None]:
class str(str):
    """String subclass that can tokenise itself into a bag of words.

    NOTE: the class keeps the name ``str`` so that existing ``str(...)``
    call sites keep working, but be aware that it shadows the builtin for
    all code executed after this definition.
    """

    def preprocess(self, accumulator=None):
        """Strip punctuation, lowercase and tokenise the document.

        Args:
            accumulator: Optional dict whose ``'data'`` entry collects the
                vocabulary across documents. After the call it holds the
                set union of its previous contents and this document's
                tokens. When omitted, no vocabulary is recorded.

        Returns:
            A ``(tokens, counts)`` tuple: the list of tokens in document
            order, and a dict mapping each token to its occurrence count.
        """
        import re
        from collections import Counter

        # Remove everything that is neither a word character nor whitespace.
        cleaned = re.sub(r'[^\w\s]', '', self)
        # split() without an argument collapses runs of whitespace, so
        # removed punctuation or repeated spaces never produce '' tokens.
        parsed_doc = cleaned.lower().split()

        if accumulator is not None:
            accumulator['data'] = set(accumulator.get('data', ())) | set(parsed_doc)

        # Counter keeps first-occurrence order, like the tokens themselves.
        doc_dict = dict(Counter(parsed_doc))
        return (parsed_doc, doc_dict)

# Vocabulary accumulator shared by all three documents: every preprocess()
# call merges that document's tokens into word_set['data'].
word_set = dict(data=[])

# preprocess() returns (token list, word -> count dict). Despite its name,
# each doc_*_set variable holds the ordered token list, not a set.
doc_a = str("The unicorns and cats are majestic animals that should be taken care of and loved.")
doc_a_set, doc_a_dict = doc_a.preprocess(word_set)

doc_b = str("Gorgeous are those lovely unicorns that makes sure we are safe when we sleep.")
doc_b_set, doc_b_dict = doc_b.preprocess(word_set)

doc_c = str("Hoy! Bantay! Anong kinakagat mo nanaman diyan!?")
doc_c_set, doc_c_dict = doc_c.preprocess(word_set)

Peek into the documents.

In [None]:
# Print each document next to its word-count dict. Joining the three
# reports with a newline separator produces the same output as three
# separate print() calls.
print(
    *(
        "Document: {}\n\n Dict: {}\n\n".format(text, counts)
        for text, counts in (
            (doc_a, doc_a_dict),
            (doc_b, doc_b_dict),
            (doc_c, doc_c_dict),
        )
    ),
    sep="\n"
)
# Vocabulary accumulated across all preprocessed documents.
print(word_set['data'])

Organize data into a dataframe.

In [None]:
import pandas as pd
# Temporarily lift pandas' row/column display limits so the whole table
# (one row per document, one column per word) is rendered.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
):
    display(pd.DataFrame(data=[doc_a_dict, doc_b_dict, doc_c_dict]))

Compute the term frequencies (TF).

In [None]:
# compute_tf() normalises by len(bow), so pass the token list returned by
# preprocess(). Passing the raw document string would divide the counts by
# the number of characters instead of the number of words.
doc_a_tf = compute_tf(doc_a_dict, doc_a_set)
doc_b_tf = compute_tf(doc_b_dict, doc_b_set)
doc_c_tf = compute_tf(doc_c_dict, doc_c_set)

Peek into the term frequencies.

In [None]:
# Show the term frequencies computed for document A only.
print("Document A: {}".format(doc_a_tf))

Compute the inverse document frequencies (IDF).

In [None]:
# One IDF value per word, computed over the word-count dicts of all three
# documents.
idfs = compute_idf([doc_a_dict, doc_b_dict, doc_c_dict])

Peek into the IDF values.

In [None]:
# Show the word -> IDF mapping shared by the three documents.
print("IDF for Document A, B, and C: {}".format(idfs))

Compute the TF-IDF score of each word in each document.

In [None]:
# Weight each document's term frequencies by the shared IDF values.
doc_a_tfidf, doc_b_tfidf, doc_c_tfidf = (
    compute_tf_idf(term_freqs, idfs)
    for term_freqs in (doc_a_tf, doc_b_tf, doc_c_tf)
)

# Temporarily lift pandas' row/column display limits so the whole table
# (one row per document, one column per word) is rendered.
with pd.option_context(
    'display.max_rows', None,
    'display.max_columns', None,
):
    display(pd.DataFrame(data=[doc_a_tfidf, doc_b_tfidf, doc_c_tfidf]))