In [1]:
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
import numpy as np

In [2]:
# Toy corpus: five short sentences with overlapping vocabulary, so that some
# terms appear in several documents and others in only one.
corpus = [
    "The quick brown fox jumps over the lazy dog.",
    "A dog is lazy, but a fox is quick.",
    "The new movie features a quick-witted fox.",
    "Jumping is fun, but a lazy dog likes to sleep.",
    "The quick brown dog is not lazy.",
]

# Report how many documents the corpus holds (the message ends with an extra
# blank line because of the trailing "\n").
print(f"Total documents (corpus size): {len(corpus)}\n")

Total documents (corpus size): 5



In [5]:
# Create the TF-IDF vectorizer, configured to use scikit-learn's built-in
# 'english' stop-word list.
vectorizer = TfidfVectorizer(stop_words="english")

# Learn the vocabulary and IDF weights from the corpus and, in the same call,
# encode every document as a row of the resulting TF-IDF matrix.
tfidf_matrix = vectorizer.fit_transform(corpus)

In [6]:
# Get the feature names (words/tokens) learned by the fitted vectorizer;
# they label the columns of the TF-IDF matrix.
feature_names = vectorizer.get_feature_names_out()

# Convert the sparse TF-IDF matrix to a dense array and then to a DataFrame:
# one row per document ("Document 1", "Document 2", ...), one column per term.
tfidf_df = pd.DataFrame(
    tfidf_matrix.toarray(),
    index=[f'Document {i+1}' for i in range(len(corpus))],
    columns=feature_names
)

# Display the IDF values calculated for each unique term, highest IDF first.
print("--- Inverse Document Frequency (IDF) per Term ---")
idf_values = pd.DataFrame({
    'Term': feature_names,
    'IDF': vectorizer.idf_
}).sort_values(by='IDF', ascending=False).reset_index(drop=True)
print(idf_values.head(10))
print("\n" + "="*80 + "\n")

# Display the final TF-IDF feature matrix
print("--- TF-IDF Feature Matrix (Document-Term Matrix) ---")
print("Each cell is the TF-IDF weight for the word (column) in the document (row).")
print(tfidf_df.round(3))
print("\n" + "="*80 + "\n")

# Example: finding the most important words in a single document.
doc_index = 0  # zero-based row position of the document to inspect
top_n = 3      # how many of the highest-weighted terms to report

# Compute the top-N selection once and derive both views from it, instead of
# running nlargest() twice on the same row.
top_terms = tfidf_df.iloc[doc_index].nlargest(top_n)
top_indices = top_terms.index.tolist()
top_scores = top_terms.values

# Build the header from doc_index/top_n so it stays correct when either is
# changed; for doc_index = 0 and top_n = 3 the text is unchanged.
print(f"--- Top {top_n} Key Features in {tfidf_df.index[doc_index]} ---")
for word, score in zip(top_indices, top_scores):
    print(f"  - '{word}': {score:.3f}")

--- Inverse Document Frequency (IDF) per Term ---
       Term       IDF
0       fun  2.098612
1  features  2.098612
2   jumping  2.098612
3     movie  2.098612
4     likes  2.098612
5     jumps  2.098612
6    witted  2.098612
7     sleep  2.098612
8       new  2.098612
9     brown  1.693147


--- TF-IDF Feature Matrix (Document-Term Matrix) ---
Each cell is the TF-IDF weight for the word (column) in the document (row).
            brown    dog  features    fox    fun  jumping  jumps   lazy  \
Document 1  0.462  0.323     0.000  0.383  0.000    0.000  0.572  0.323   
Document 2  0.000  0.476     0.000  0.566  0.000    0.000  0.000  0.476   
Document 3  0.000  0.000     0.458  0.307  0.000    0.000  0.000  0.000   
Document 4  0.000  0.262     0.000  0.000  0.464    0.464  0.000  0.262   
Document 5  0.637  0.445     0.000  0.000  0.000    0.000  0.000  0.445   

            likes  movie    new  quick  sleep  witted  
Document 1  0.000  0.000  0.000  0.323  0.000   0.000  
Document 2  0.