In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import numpy as np

# Customer feedback corpus. A message's position in this list is the document
# index shown in the printed similarity results.
corpus = [
    "The delivery was fast and the product was excellent.",  # Document 0
    "Very poor packaging, but the delivery was quick.",  # Document 1
    "I love the quality of this product!",  # Document 2
    "Product was damaged, very disappointing experience.",  # Document 3
    "Excellent service and quick response from the team.",  # Document 4
]

# (a) Fit a TF-IDF model on the corpus and build the document-term matrix
print("=" * 70, "PART (a): TF-IDF Vectorization", "=" * 70, sep="\n")

vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform(corpus)

# The first axis counts documents and the second counts vocabulary terms.
n_docs, n_terms = tfidf_matrix.shape
print(f"\nDocument-Term Matrix Shape: {tfidf_matrix.shape}")
print(f"Number of documents: {n_docs}")
print(f"Number of unique terms: {n_terms}")

# Column order of the matrix follows this vocabulary listing.
print("\nFeature names (terms):", vectorizer.get_feature_names_out(), sep="\n")

# The matrix is sparse; densify it only for display.
print("\nTF-IDF Matrix (dense format):", tfidf_matrix.toarray(), sep="\n")

# (b) Measure cosine similarity for every pair of documents
print("\n" + "=" * 70, "PART (b): Cosine Similarity Between All Document Pairs", "=" * 70, sep="\n")

# similarity_matrix[a, b] is read below as the score between documents a and b.
similarity_matrix = cosine_similarity(tfidf_matrix)

print("\nCosine Similarity Matrix:", similarity_matrix, sep="\n")

print("\nPairwise similarities:")
# List each unordered pair once (first index < second index), in the same
# order a nested loop over the documents would visit them.
doc_count = len(corpus)
doc_pairs = [
    (doc_a, doc_b)
    for doc_a in range(doc_count)
    for doc_b in range(doc_a + 1, doc_count)
]
for doc_a, doc_b in doc_pairs:
    print(f"Document {doc_a} vs Document {doc_b}: {similarity_matrix[doc_a, doc_b]:.4f}")

# (c) Identify and print the two most similar documents
print("\n" + "=" * 70)
print("PART (c): Two Most Similar Documents")
print("=" * 70)

if len(corpus) < 2:
    # With fewer than two documents there is no pair to compare; masking the
    # diagonal would otherwise report a document paired with itself at -1.
    print("\nAt least two documents are required to find a most similar pair.")
else:
    # Work on a copy so similarity_matrix itself is left untouched, and set the
    # diagonal to -1 so a document's similarity with itself is ignored.
    similarity_copy = similarity_matrix.copy()
    np.fill_diagonal(similarity_copy, -1)

    # argmax finds the largest entry in one pass (first occurrence in row-major
    # order, so ties resolve to the lowest-indexed pair); unravel_index turns
    # that flat position back into (row, column) document indices.
    doc1_idx, doc2_idx = np.unravel_index(
        np.argmax(similarity_copy), similarity_copy.shape
    )
    max_similarity = similarity_copy[doc1_idx, doc2_idx]

    print("\nThe two most similar documents are:")
    print(f"\nDocument {doc1_idx}: \"{corpus[doc1_idx]}\"")
    print(f"Document {doc2_idx}: \"{corpus[doc2_idx]}\"")
    print(f"\nSimilarity Score: {max_similarity:.4f}")

print("\n" + "=" * 70)

PART (a): TF-IDF Vectorization

Document-Term Matrix Shape: (5, 23)
Number of documents: 5
Number of unique terms: 23

Feature names (terms):
['and' 'but' 'damaged' 'delivery' 'disappointing' 'excellent' 'experience'
 'fast' 'from' 'love' 'of' 'packaging' 'poor' 'product' 'quality' 'quick'
 'response' 'service' 'team' 'the' 'this' 'very' 'was']

TF-IDF Matrix (dense format):
[[0.31730802 0.         0.         0.31730802 0.         0.31730802
  0.         0.39329511 0.         0.         0.         0.
  0.         0.26339432 0.         0.         0.         0.
  0.         0.44315121 0.         0.         0.52678864]
 [0.         0.41816992 0.         0.33737686 0.         0.
  0.         0.         0.         0.         0.         0.41816992
  0.41816992 0.         0.         0.33737686 0.         0.
  0.         0.23558964 0.         0.33737686 0.28005327]
 [0.         0.         0.         0.         0.         0.
  0.         0.         0.         0.4580648  0.4580648  0.
  0.      