# Exercises: Hands-on Practice

1. **Visualize Top TF-IDF Terms**:
   Modify the DataFrame to display the top 3 terms with the highest TF-IDF values for each document.

2. **Add New Documents**:
   Add new documents to the corpus and observe how the IDF values change.

3. **Explain Multiple Terms**:
   Extend the `explain_tfidf` function to display calculations for multiple terms at once.

4. **Custom Preprocessing**:
   Modify the `TfidfVectorizer` to include custom preprocessing, such as lowercasing, removing stopwords, or stemming.

5. **Analyze the Impact of Parameters**:
   Experiment with the `TfidfVectorizer` parameters, such as `max_df`, `min_df`, and `ngram_range`. Observe their effect on the TF-IDF matrix.

6. **Compare with Raw Term Frequencies**:
   Use `CountVectorizer` to calculate raw term frequencies and compare them with the TF-IDF values.

7. **Real Dataset**:
   Apply TF-IDF to a real-world dataset, such as movie reviews or news articles. Visualize the most important terms in each document.


# Ex 1

In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd

# Sample documents
documents = [
    "The cat sat on the mat.",
    "The dog chased the cat.",
    "The mat was on the floor."
]

# Compute TF-IDF
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(documents)
feature_names = vectorizer.get_feature_names_out()

# Convert to DataFrame for visualization
df = pd.DataFrame(
    tfidf_matrix.toarray(),
    columns=feature_names,
    index=[f'Doc {i+1}' for i in range(len(documents))]
)

# Extract top 3 terms for each document
for doc_id in range(len(documents)):
    print(f"\nTop 3 terms for Document {doc_id + 1}:")
    top_terms = df.iloc[doc_id].sort_values(ascending=False).head(3)
    for term, score in top_terms.items():
        print(f"  {term}: {score:.4f}")



Top 3 terms for Document 1:
  the: 0.5812
  sat: 0.4920
  cat: 0.3742

Top 3 terms for Document 2:
  the: 0.5926
  chased: 0.5017
  dog: 0.5017

Top 3 terms for Document 3:
  the: 0.5536
  floor: 0.4687
  was: 0.4687


# Ex 2

In [2]:
# Extend the corpus
new_documents = documents + [
    "The quick brown fox jumps over the lazy dog.",
    "The mat was in the living room.",
    "The floor was clean and shiny."
]

# Recompute TF-IDF
new_vectorizer = TfidfVectorizer()
new_tfidf_matrix = new_vectorizer.fit_transform(new_documents)
new_feature_names = new_vectorizer.get_feature_names_out()

# Display updated IDF values
idf_values = new_vectorizer.idf_
idf_df = pd.DataFrame(
    {'Term': new_feature_names, 'IDF': idf_values}
).sort_values(by='IDF', ascending=False)

print("\nUpdated IDF Values:")
print(idf_df.head(10))



Updated IDF Values:
      Term       IDF
0      and  2.252763
9    jumps  2.252763
18   shiny  2.252763
17     sat  2.252763
16    room  2.252763
15   quick  2.252763
14    over  2.252763
11  living  2.252763
1    brown  2.252763
10    lazy  2.252763


# Ex 3

explian for multiple terms in a document

In [3]:
# def explain_multiple_tfidf(terms, doc_index):
#     for term in terms:
#         if term in feature_names:
#             term_index = list(feature_names).index(term)
#             tf = tfidf_matrix[doc_index, term_index]
#             idf = vectorizer.idf_[term_index]
#             tfidf = tf * idf
#             print(f"\nExplanation for term '{term}' in Document {doc_index + 1}:")
#             print(f"  TF (Term Frequency): {tf}")
#             print(f"  IDF (Inverse Document Frequency): {idf:.4f}")
#             print(f"  TF-IDF: {tfidf:.4f}")
#         else:
#             print(f"\nTerm '{term}' not found in the vocabulary.")

# # Example explanation for multiple terms
# explain_multiple_tfidf(["cat", "mat", "dog"], 0)


def explain_tfidf_multiple(terms, doc_index):

    for term in terms:

        # Find the index of the term

        term_index = list(feature_names).index(term)

        # Calculate TF (Term Frequency)

        tf = vectorizer.transform([documents[doc_index]]).toarray()[0][term_index]

        # Get IDF (Inverse Document Frequency)

        idf = vectorizer.idf_[term_index]

        # Calculate TF-IDF

        tfidf = tf * idf

        # Display results

        print(f"\nExplanation for term '{term}' in Document {doc_index + 1}:")

        print(f"TF (Term Frequency): {tf}")

        print(f"IDF (Inverse Document Frequency): {idf:.4f}")

        print(f"TF-IDF: {tfidf:.4f}")

explain_tfidf_multiple(["cat", "dog", "mat"], 0)



Explanation for term 'cat' in Document 1:
TF (Term Frequency): 0.37420725915942793
IDF (Inverse Document Frequency): 1.2877
TF-IDF: 0.4819

Explanation for term 'dog' in Document 1:
TF (Term Frequency): 0.0
IDF (Inverse Document Frequency): 1.6931
TF-IDF: 0.0000

Explanation for term 'mat' in Document 1:
TF (Term Frequency): 0.37420725915942793
IDF (Inverse Document Frequency): 1.2877
TF-IDF: 0.4819


# Ex 4

In [4]:
import nltk
nltk.download('punkt_tab')

from nltk.stem import PorterStemmer
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize


# Preprocess using PorterStemmer
stemmer = PorterStemmer()

def new_preprocessor(text):
    tokens = word_tokenize(text.lower())
    stemmed_tokens = [stemmer.stem(token) for token in tokens if token.isalpha()]
    return ' '.join(stemmed_tokens)

# Apply custom preprocessing to TF-IDF
custom_vectorizer = TfidfVectorizer(preprocessor=new_preprocessor)
custom_tfidf_matrix = custom_vectorizer.fit_transform(documents)
custom_feature_names = custom_vectorizer.get_feature_names_out()

# Convert to DataFrame for visualization
custom_df = pd.DataFrame(
    custom_tfidf_matrix.toarray(),
    columns=custom_feature_names,
    index=[f'Doc {i+1}' for i in range(len(documents))]
)

print("\nTF-IDF Matrix:")
print(custom_df)



TF-IDF Matrix:
            cat     chase       dog     floor       mat        on       sat  \
Doc 1  0.374207  0.000000  0.000000  0.000000  0.374207  0.374207  0.492038   
Doc 2  0.381519  0.501651  0.501651  0.000000  0.000000  0.000000  0.000000   
Doc 3  0.000000  0.000000  0.000000  0.468699  0.356457  0.356457  0.000000   

            the        wa  
Doc 1  0.581211  0.000000  
Doc 2  0.592567  0.000000  
Doc 3  0.553642  0.468699  


[nltk_data] Downloading package punkt_tab to
[nltk_data]     /Users/oysterable/nltk_data...
[nltk_data]   Package punkt_tab is already up-to-date!


# Ex 5

In [5]:
# Experiment with some new parameters
param_vectorizer = TfidfVectorizer(max_df=0.8, min_df=2, ngram_range=(1, 2))
param_tfidf_matrix = param_vectorizer.fit_transform(documents)
param_feature_names = param_vectorizer.get_feature_names_out()

# Convert to DataFrame
param_df = pd.DataFrame(
    param_tfidf_matrix.toarray(),
    columns=param_feature_names,
    index=[f'Doc {i+1}' for i in range(len(documents))]
)

print("\nTF-IDF Matrix:")
print(param_df)



TF-IDF Matrix:
            cat       mat        on    on the   the cat   the mat
Doc 1  0.408248  0.408248  0.408248  0.408248  0.408248  0.408248
Doc 2  0.707107  0.000000  0.000000  0.000000  0.707107  0.000000
Doc 3  0.000000  0.500000  0.500000  0.500000  0.000000  0.500000


# Ex 6

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

# Compute raw term frequencies
count_vectorizer = CountVectorizer()
count_matrix = count_vectorizer.fit_transform(documents)
count_feature_names = count_vectorizer.get_feature_names_out()

# Convert raw counts to DataFrame
count_df = pd.DataFrame(
    count_matrix.toarray(),
    columns=count_feature_names,
    index=[f'Doc {i+1}' for i in range(len(documents))]
)

print("\nRaw Term Frequencies:")
print(count_df)

print("\nTF-IDF Values:")
print(df)



Raw Term Frequencies:
       cat  chased  dog  floor  mat  on  sat  the  was
Doc 1    1       0    0      0    1   1    1    2    0
Doc 2    1       1    1      0    0   0    0    2    0
Doc 3    0       0    0      1    1   1    0    2    1

TF-IDF Values:
            cat    chased       dog     floor       mat        on       sat  \
Doc 1  0.374207  0.000000  0.000000  0.000000  0.374207  0.374207  0.492038   
Doc 2  0.381519  0.501651  0.501651  0.000000  0.000000  0.000000  0.000000   
Doc 3  0.000000  0.000000  0.000000  0.468699  0.356457  0.356457  0.000000   

            the       was  
Doc 1  0.581211  0.000000  
Doc 2  0.592567  0.000000  
Doc 3  0.553642  0.468699  


# Ex 7

References:

https://scikit-learn.org/1.5/datasets/real_world.html#the-20-newsgroups-text-dataset 

In [3]:
from sklearn.datasets import fetch_20newsgroups

# Load a small subset of the 20 Newsgroups dataset from scikit-learn
newsgroups = fetch_20newsgroups(subset='train', categories=['sci.space', 'rec.sport.baseball'], remove=('headers', 'footers', 'quotes'))

list(newsgroups.target_names)

['rec.sport.baseball', 'sci.space']

In [None]:

real_documents = newsgroups.data[:10]  # Limit to 10 documents for simplicity
real_documents

In [None]:

# Compute TF-IDF
real_vectorizer = TfidfVectorizer()
real_tfidf_matrix = real_vectorizer.fit_transform(real_documents)
real_feature_names = real_vectorizer.get_feature_names_out()

# Load into dataframe
real_df = pd.DataFrame(
    real_tfidf_matrix.toarray(),
    columns=real_feature_names,
    index=[f'Doc {i+1}' for i in range(len(real_documents))]
)

print("\nTF-IDF Matrix for Real Dataset:")
print(real_df.head())
