# TF-IDF

In [4]:
import pandas as pd
from collections import Counter
import math

# Toy corpus of three short sentences consumed by the cells below.
documents = [
    "I love watching movies",
    "I enjoy reading books",
    "I love reading and watching movies",
]

# Normalize case first, then tokenize on runs of whitespace
# (str.split() with no separator); punctuation is not handled.
tokenized_documents = [sentence.lower().split() for sentence in documents]
print(tokenized_documents)

[['i', 'love', 'watching', 'movies'], ['i', 'enjoy', 'reading', 'books'], ['i', 'love', 'reading', 'and', 'watching', 'movies']]


In [5]:
# Calculate the term frequency
def term_frequency(doc):
    """Return the relative frequency of each term in one tokenized document.

    Args:
        doc: Sized iterable of hashable tokens belonging to a single document.

    Returns:
        dict mapping every distinct term to ``count / len(doc)``, with keys in
        order of first occurrence. An empty document yields an empty dict
        (no division is ever performed in that case).
    """
    doc_length = len(doc)
    frequencies = {}
    # Counter(['i', 'love', 'watching', 'movies'])
    #   -> Counter({'i': 1, 'love': 1, 'watching': 1, 'movies': 1})
    for term, occurrences in Counter(doc).items():
        frequencies[term] = occurrences / doc_length
    return frequencies

In [14]:
# One TF dictionary per document, in corpus order.
tf_values = list(map(term_frequency, tokenized_documents))

In [15]:
# Bare expression: the notebook displays the list of per-document TF dicts.
tf_values

[{'i': 0.25, 'love': 0.25, 'watching': 0.25, 'movies': 0.25},
 {'i': 0.25, 'enjoy': 0.25, 'reading': 0.25, 'books': 0.25},
 {'i': 0.16666666666666666,
  'love': 0.16666666666666666,
  'reading': 0.16666666666666666,
  'and': 0.16666666666666666,
  'watching': 0.16666666666666666,
  'movies': 0.16666666666666666}]

In [16]:
# Rows are documents, columns are terms; a term absent from a document is
# missing from that row's dict and becomes NaN, which fillna turns into 0.
tf_df = pd.DataFrame(data=tf_values, index=['Doc1', 'Doc2', 'Doc3']).fillna(value=0)

In [17]:
# Bare expression: the notebook renders the document-by-term TF table.
tf_df

Unnamed: 0,i,love,watching,movies,enjoy,reading,books,and
Doc1,0.25,0.25,0.25,0.25,0.0,0.0,0.0,0.0
Doc2,0.25,0.0,0.0,0.0,0.25,0.25,0.25,0.0
Doc3,0.166667,0.166667,0.166667,0.166667,0.0,0.166667,0.0,0.166667


In [6]:
# Calculate the inverse document frequency
# Run this once over the whole corpus (not once per document).
def inverse_document_frequency(docs):
    """Compute the inverse document frequency (IDF) of every term in a corpus.

    The score is ``math.log(N / df)`` (natural logarithm), where ``N`` is the
    number of documents and ``df`` is the number of documents that contain the
    term. No smoothing is applied, so a term present in every document gets
    an IDF of exactly 0.0.

    Args:
        docs: Sized collection of tokenized documents; each document is an
            iterable of hashable terms.

    Returns:
        dict mapping each distinct term to its IDF. Keys appear in the order
        the terms are first seen while scanning the corpus, so the result
        (and any DataFrame built from it) is reproducible across runs instead
        of depending on set iteration order. An empty corpus yields ``{}``.
    """
    total_docs = len(docs)

    # Document frequency: each document contributes at most 1 per term.
    # dict.fromkeys(doc, 1) de-duplicates the document's terms while keeping
    # their first-seen order, and Counter.update adds those 1s to the totals.
    # For the sample corpus in this notebook this gives:
    # {'i': 3, 'love': 2, 'watching': 2, 'movies': 2,
    #  'enjoy': 1, 'reading': 2, 'books': 1, 'and': 1}
    term_doc_counts = Counter()
    for doc in docs:
        term_doc_counts.update(dict.fromkeys(doc, 1))

    # Every counted term occurs in at least one document, so count >= 1 and
    # the division is always defined.
    return {term: math.log(total_docs / count) for term, count in term_doc_counts.items()}

In [7]:
# IDF is a corpus-level quantity, so it is computed once for all documents.
idf_values = inverse_document_frequency(docs=tokenized_documents)

In [8]:
# Bare expression: the notebook displays the term -> IDF dictionary.
idf_values

{'watching': 0.4054651081081644,
 'enjoy': 1.0986122886681098,
 'i': 0.0,
 'love': 0.4054651081081644,
 'books': 1.0986122886681098,
 'reading': 0.4054651081081644,
 'movies': 0.4054651081081644,
 'and': 1.0986122886681098}

In [9]:
# Wrap the single IDF dict in a list so it becomes one row labelled 'IDF'.
idf_df = pd.DataFrame(data=[idf_values], index=['IDF']).fillna(value=0)

In [10]:
# Bare expression: the notebook renders the one-row IDF table.
idf_df

Unnamed: 0,watching,enjoy,i,love,books,reading,movies,and
IDF,0.405465,1.098612,0.0,0.405465,1.098612,0.405465,0.405465,1.098612


In [18]:
# Calculate the TF-IDF values
def tf_idf(tf, idf):
    """Weight one document's term frequencies by the corpus IDF scores.

    Args:
        tf: Mapping of term -> term frequency for a single document.
        idf: Mapping of term -> inverse document frequency; it must contain
            every term present in ``tf``.

    Returns:
        dict mapping each term of ``tf`` to ``tf[term] * idf[term]``, in the
        iteration order of ``tf``. Terms that appear only in ``idf`` are
        omitted.

    Raises:
        KeyError: If a term in ``tf`` has no entry in ``idf``.
    """
    weighted = {}
    for term, tf_val in tf.items():
        weighted[term] = tf_val * idf[term]
    return weighted

In [19]:
# Combine each document's TF dict with the shared corpus-level IDF scores.
tf_idf_values = [tf_idf(doc_tf, idf_values) for doc_tf in tf_values]

In [20]:
# Rows are documents, columns are terms; terms a document does not contain
# are NaN after construction and are filled with 0.
tf_idf_df = pd.DataFrame(data=tf_idf_values, index=['Doc1', 'Doc2', 'Doc3']).fillna(value=0)

In [21]:
# Bare expression: the notebook renders the final document-by-term TF-IDF table.
tf_idf_df

Unnamed: 0,i,love,watching,movies,enjoy,reading,books,and
Doc1,0.0,0.101366,0.101366,0.101366,0.0,0.0,0.0,0.0
Doc2,0.0,0.0,0.0,0.0,0.274653,0.101366,0.274653,0.0
Doc3,0.0,0.067578,0.067578,0.067578,0.0,0.067578,0.0,0.183102
