## Term Frequency - Inverse Document Frequency (TF-IDF)

### 1. From Scratch Implementation

In [1]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
# Toy corpus: each string is one document. Every entry is already lower-case
# and free of punctuation, so plain whitespace splitting tokenises it cleanly.
corpus = [
    "the cat sat on the mat",
    "the dog chased the cat",
    "the cat and dog played together",
    "the cat is sleeping",
    "the dog barked loudly",
]

In [3]:
# Vocabulary: every distinct lower-cased, whitespace-delimited token that
# occurs anywhere in the corpus.
uni_terms = {token for doc in corpus for token in doc.lower().split()}
uni_terms

{'and',
 'barked',
 'cat',
 'chased',
 'dog',
 'is',
 'loudly',
 'mat',
 'on',
 'played',
 'sat',
 'sleeping',
 'the',
 'together'}

In [4]:
n = len(corpus)

# Tokenise every document exactly once. Lower-casing here mirrors how
# `uni_terms` was built, so a vocabulary lookup can never miss on case alone.
tokenized_docs = [doc.lower().split() for doc in corpus]

# term -> list with one TF value per document, in corpus order.
tf_dict = {term: [] for term in uni_terms}
# term -> single-element list holding that term's IDF, so that the dict turns
# into a one-row DataFrame. Both dicts are keyed by iterating the same set, so
# their key order matches.
idf_dict = {term: [] for term in uni_terms}

# calculate TF: occurrences of the term divided by the document length
for doc_terms in tokenized_docs:
    doc_len = len(doc_terms)
    for term in uni_terms:
        # A document without tokens contributes 0.0 instead of dividing by zero.
        tf_value = doc_terms.count(term) / doc_len if doc_len else 0.0
        tf_dict[term].append(tf_value)

# calculate IDF with add-one smoothing: ln((n + 1) / (df + 1)) + 1, the same
# smoothed form scikit-learn documents for `smooth_idf=True`.
doc_term_sets = [set(doc_terms) for doc_terms in tokenized_docs]
for term in uni_terms:
    # Document frequency: number of documents containing the term at least once.
    docs_with_term = sum(term in terms for terms in doc_term_sets)
    idf_value = np.log((n + 1) / (docs_with_term + 1)) + 1
    idf_dict[term].append(idf_value)

In [5]:
# One row per document, one column per term.
tf = pd.DataFrame(tf_dict)
# A single row holding the IDF weight of every term.
idf = pd.DataFrame(idf_dict)

# Multiply by the IDF row as a Series so pandas matches weights to columns by
# term label rather than by position.
tfidf = tf.mul(idf.squeeze(axis="index"), axis="columns")

# L2-normalise each document vector. A row of all zeros has norm 0; dividing
# it by 1 keeps it at zero instead of turning it into NaN.
row_norms = np.sqrt((tfidf ** 2).sum(axis="columns"))
safe_norms = row_norms.where(row_norms > 0, 1.0)
normalized_tfidf = tfidf.div(safe_norms, axis="index")

In [6]:
# Show the from-scratch result with the term columns in alphabetical order.
normalized_tfidf.sort_index(axis="columns")

Unnamed: 0,and,barked,cat,chased,dog,is,loudly,mat,on,played,sat,sleeping,the,together
0,0.0,0.0,0.274068,0.0,0.0,0.0,0.0,0.486468,0.486468,0.0,0.486468,0.0,0.463609,0.0
1,0.0,0.0,0.344517,0.611516,0.409539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.582781,0.0
2,0.50044,0.0,0.281939,0.0,0.33515,0.0,0.0,0.0,0.0,0.50044,0.0,0.0,0.238462,0.50044
3,0.0,0.0,0.353188,0.0,0.0,0.626906,0.0,0.0,0.0,0.0,0.0,0.626906,0.298724,0.0
4,0.0,0.611353,0.0,0.0,0.40943,0.0,0.611353,0.0,0.0,0.0,0.0,0.0,0.291313,0.0


### 2. Sklearn Implementation

In [7]:
# Build a TfidfVectorizer with all-default settings, then learn the vocabulary
# and IDF weights from `corpus` and encode the corpus in the same call.
vectorizer = TfidfVectorizer()
# One row per document; later cells call `.toarray()` on it to build a DataFrame.
tfidf_matrix = vectorizer.fit_transform(corpus)

In [8]:
# Densify the TF-IDF matrix and label each column with its vocabulary term.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=vectorizer.get_feature_names_out(),
)
# Alphabetical column order, matching the layout of the from-scratch table.
tfidf_df.sort_index(axis="columns")

Unnamed: 0,and,barked,cat,chased,dog,is,loudly,mat,on,played,sat,sleeping,the,together
0,0.0,0.0,0.274068,0.0,0.0,0.0,0.0,0.486468,0.486468,0.0,0.486468,0.0,0.463609,0.0
1,0.0,0.0,0.344517,0.611516,0.409539,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.582781,0.0
2,0.50044,0.0,0.281939,0.0,0.33515,0.0,0.0,0.0,0.0,0.50044,0.0,0.0,0.238462,0.50044
3,0.0,0.0,0.353188,0.0,0.0,0.626906,0.0,0.0,0.0,0.0,0.0,0.626906,0.298724,0.0
4,0.0,0.611353,0.0,0.0,0.40943,0.0,0.611353,0.0,0.0,0.0,0.0,0.0,0.291313,0.0


### 3. Query Vector

In [9]:
# Query to encode with the already-fitted vectorizer. It contains tokens
# ("what", "doing", "?") that do not occur in the corpus vocabulary.
query = "what is the cat doing ?"

In [14]:
# Encode the query with the vectorizer fitted on `corpus`. `transform` (not
# `fit_transform`) is called, so nothing is re-fitted on the query.
# NOTE(review): `aa` is an uninformative name; consider renaming it (e.g.
# `query_vector`) together with its uses in the following cells.
aa = vectorizer.transform([query])

In [17]:
# (number of queries, number of vocabulary terms). The matrix returned by
# `transform` exposes `.shape` directly, so no dense copy is needed to read it.
aa.shape

(1, 14)


In [18]:
# Show the query's TF-IDF weights as a single row labelled by vocabulary term.
pd.DataFrame(
    aa.toarray(),
    columns=vectorizer.get_feature_names_out(),
)

Unnamed: 0,and,barked,cat,chased,dog,is,loudly,mat,on,played,sat,sleeping,the,together
0,0.0,0.0,0.453331,0.0,0.0,0.804659,0.0,0.0,0.0,0.0,0.0,0.0,0.383424,0.0
