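## Term Frequency - Inverse Document Frequency (TF-IDF)

This notebook implements TF-IDF from scratch and compares the result with scikit-learn's `TfidfVectorizer` on a small toy corpus. The from-scratch version uses $\mathrm{tf}(t, d) = \dfrac{\text{count of } t \text{ in } d}{\text{number of tokens in } d}$, the smoothed $\mathrm{idf}(t) = \ln\dfrac{n + 1}{\mathrm{df}(t) + 1} + 1$ (where $n$ is the number of documents and $\mathrm{df}(t)$ the number of documents containing $t$), and scales every document vector to unit L2 norm.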

In [1]:
import pandas as pd
import numpy as np
from typing import List
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: five short documents, one per line; "the" occurs in every one.
corpus = """\
the cat sat on the mat
the dog chased the cat
the cat and dog played together
the cat is sleeping
the dog barked loudly""".splitlines()

### 1. From Scratch Implementation

In [3]:
class TFIDFVectorizer:
    """Minimal TF-IDF vectorizer built on pandas and numpy.

    Documents are tokenized by stripping, lower-casing and splitting on
    whitespace; punctuation is not removed. For a term ``t`` in a document
    ``d`` of a fitted corpus with ``n`` documents:

    * ``tf(t, d) = count(t in d) / number of tokens in d``
    * ``idf(t) = ln((n + 1) / (df(t) + 1)) + 1`` when ``smooth_idf`` is true,
      otherwise ``ln(n / df(t))``

    where ``df(t)`` is the number of documents that contain ``t``. Every
    document vector ``tf * idf`` is then scaled to unit L2 norm.

    Note:
        The unsmoothed branch is the plain textbook IDF, so a term present in
        every document gets weight 0. scikit-learn's
        ``TfidfVectorizer(smooth_idf=False)`` adds 1 to that value, so the two
        only agree numerically in the smoothed (default) mode.

    Attributes:
        vocabulary: Set of terms learned by the last ``fit_transform`` call
            (empty before fitting).
    """

    def __init__(self, smooth_idf: bool = True):
        """Create an unfitted vectorizer.

        Args:
            smooth_idf: If true, add one to the document count and to every
                document frequency before taking the logarithm, then add one
                to the result. If false, use ``ln(n / df)``.
        """
        self.__smooth_idf = smooth_idf
        # IDF weights as a Series indexed by the sorted terms; None until fitted.
        self.__idf = None
        self.vocabulary = set()

    @staticmethod
    def _tokenize(doc: str) -> List[str]:
        """Strip, lower-case and whitespace-split a single document."""
        return doc.strip().lower().split()

    @staticmethod
    def _weigh_and_normalize(
        tokenized: List[List[str]], idf: pd.Series
    ) -> pd.DataFrame:
        """Build the L2-normalized TF-IDF table for already tokenized documents.

        Args:
            tokenized: One token list per document.
            idf: IDF weight per term; its index fixes the column set and order.

        Returns:
            DataFrame with one row per document and one column per term.
        """
        terms = list(idf.index)
        column_of = {term: col for col, term in enumerate(terms)}

        tf = np.zeros((len(tokenized), len(terms)))
        for row, tokens in enumerate(tokenized):
            counts = {}
            for token in tokens:
                counts[token] = counts.get(token, 0) + 1
            # An empty document has no counts, so its row stays all zeros and
            # the division by len(tokens) below is never reached for it.
            for token, count in counts.items():
                col = column_of.get(token)
                if col is not None:  # tokens outside the vocabulary are ignored
                    tf[row, col] = count / len(tokens)

        weighted = tf * idf.to_numpy()  # broadcast the IDF vector over the rows

        norms = np.linalg.norm(weighted, axis=1, keepdims=True)
        # Leave all-zero rows as zeros instead of dividing by zero (-> NaN).
        norms[norms == 0] = 1.0

        return pd.DataFrame(weighted / norms, columns=terms).sort_index(axis=1)

    def fit_transform(self, corpus: List[str]) -> pd.DataFrame:
        """Learn vocabulary and IDF weights from ``corpus`` and vectorize it.

        Args:
            corpus: Documents to fit on.

        Returns:
            DataFrame of L2-normalized TF-IDF weights, one row per document and
            one column per vocabulary term, columns sorted alphabetically.
        """
        tokenized = [self._tokenize(doc) for doc in corpus]
        self.vocabulary = {token for tokens in tokenized for token in tokens}

        # A fixed, sorted term order keeps TF and IDF columns aligned without
        # relying on set iteration order.
        terms = sorted(self.vocabulary)
        n_docs = len(tokenized)

        token_sets = [set(tokens) for tokens in tokenized]
        doc_freq = np.array(
            [sum(term in tokens for tokens in token_sets) for term in terms],
            dtype=float,
        )

        if self.__smooth_idf:
            idf_values = np.log((n_docs + 1) / (doc_freq + 1)) + 1
        else:
            # Every vocabulary term occurs in at least one document, so
            # doc_freq is never zero here.
            idf_values = np.log(n_docs / doc_freq)

        self.__idf = pd.Series(idf_values, index=terms, dtype=float)

        return self._weigh_and_normalize(tokenized, self.__idf)

    def transform(self, corpus: List[str]) -> pd.DataFrame:
        """Vectorize ``corpus`` with the vocabulary and IDF learned earlier.

        Tokens that were not seen during fitting are ignored. A document with
        no known tokens (including an empty one) yields an all-zero row.

        Args:
            corpus: Documents to vectorize.

        Returns:
            DataFrame of L2-normalized TF-IDF weights with the same columns as
            the table returned by ``fit_transform``.

        Raises:
            RuntimeError: If called before ``fit_transform``.
        """
        if self.__idf is None:
            raise RuntimeError(
                "TFIDFVectorizer is not fitted yet; call fit_transform first."
            )

        tokenized = [self._tokenize(doc) for doc in corpus]
        return self._weigh_and_normalize(tokenized, self.__idf)

In [4]:
# Fit the from-scratch vectorizer (smoothed IDF) on the toy corpus; the bare
# last line displays the resulting TF-IDF table.
my_vectorizer = TFIDFVectorizer(smooth_idf=True)
my_tfidf_matrix = my_vectorizer.fit_transform(corpus)
my_tfidf_matrix

Unnamed: 0,and,barked,cat,chased,dog,is,loudly,mat,on,played,sat,sleeping,the,together
0,0.0,0.0,0.274068,0.0,0.0,0.0,0.0,0.486468,0.486468,0.0,0.486468,0.0,0.463609,0.0
1,0.0,0.0,0.344517,0.611516,0.409539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.582781,0.0
2,0.50044,0.0,0.281939,0.0,0.33515,0.0,0.0,0.0,0.0,0.50044,0.0,0.0,0.238462,0.50044
3,0.0,0.0,0.353188,0.0,0.0,0.626906,0.0,0.0,0.0,0.0,0.0,0.626906,0.298724,0.0
4,0.0,0.611353,0.0,0.0,0.40943,0.0,0.611353,0.0,0.0,0.0,0.0,0.0,0.291313,0.0


### 2. Sklearn Implementation

In [5]:
# Reference implementation: scikit-learn's TfidfVectorizer with its default
# settings, fitted on the same corpus.
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# Densify the result and label the columns with the learned feature names so
# it can be shown as a DataFrame.
tfidf_df = pd.DataFrame(
    data=tfidf_matrix.toarray(), columns=vectorizer.get_feature_names_out()
)

# Sort the columns by name to line up with the from-scratch table above.
tfidf_df.sort_index(axis=1)

Unnamed: 0,and,barked,cat,chased,dog,is,loudly,mat,on,played,sat,sleeping,the,together
0,0.0,0.0,0.274068,0.0,0.0,0.0,0.0,0.486468,0.486468,0.0,0.486468,0.0,0.463609,0.0
1,0.0,0.0,0.344517,0.611516,0.409539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.582781,0.0
2,0.50044,0.0,0.281939,0.0,0.33515,0.0,0.0,0.0,0.0,0.50044,0.0,0.0,0.238462,0.50044
3,0.0,0.0,0.353188,0.0,0.0,0.626906,0.0,0.0,0.0,0.0,0.0,0.626906,0.298724,0.0
4,0.0,0.611353,0.0,0.0,0.40943,0.0,0.611353,0.0,0.0,0.0,0.0,0.0,0.291313,0.0


### 3. Applying Transform to New Documents

In [6]:
# New document to vectorize; "what", "doing" and "??" do not occur in the
# corpus, so only "is", "the" and "cat" can receive a weight.
query = "what is the cat doing ??"

In [7]:
# From-scratch vectorizer: reuses the vocabulary and IDF weights learned by
# fit_transform above.
query_vector = my_vectorizer.transform([query])
query_vector

Unnamed: 0,and,barked,cat,chased,dog,is,loudly,mat,on,played,sat,sleeping,the,together
0,0.0,0.0,0.453331,0.0,0.0,0.804659,0.0,0.0,0.0,0.0,0.0,0.0,0.383424,0.0


In [8]:
# sklearn vectorizer: transform the same query with the already fitted model
# and show it as a DataFrame labelled with the learned feature names.
query_vec = vectorizer.transform([query])
pd.DataFrame(
    data=query_vec.toarray(), columns=vectorizer.get_feature_names_out()
)

Unnamed: 0,and,barked,cat,chased,dog,is,loudly,mat,on,played,sat,sleeping,the,together
0,0.0,0.0,0.453331,0.0,0.0,0.804659,0.0,0.0,0.0,0.0,0.0,0.0,0.383424,0.0
