In [1]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
from transformers import pipeline
import numpy as np

# Sample paragraphs
para1 = "The quick brown cat falls over the lazy dog."
para2 = "The slow red fox jump over a lazy rat."

# 1. Preprocessing and Cosine Similarity (TF-IDF)
print("=== TF-IDF Cosine Similarity ===")
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([para1, para2])
cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
print(f"Cosine Similarity (TF-IDF): {cosine_sim:.4f}")

print("\n=== Levenshtein Distance ===")
levenshtein_sim = SequenceMatcher(None, para1, para2).ratio()
print(f"Levenshtein Similarity: {levenshtein_sim:.4f}")

# 3. Semantic Matching with BERT
print("\n=== Semantic Similarity with BERT ===")
bert_model = pipeline('feature-extraction', model='bert-large-uncased')
# Get embeddings for both paragraphs
bert_emb1 = np.mean(bert_model(para1), axis=1)
bert_emb2 = np.mean(bert_model(para2), axis=1)
# Compute Cosine Similarity on BERT embeddings
bert_cosine_sim = cosine_similarity(bert_emb1, bert_emb2)[0][0]
print(f"Cosine Similarity (BERT): {bert_cosine_sim:.4f}")

  from .autonotebook import tqdm as notebook_tqdm


=== TF-IDF Cosine Similarity ===
Cosine Similarity (TF-IDF): 0.2798

=== Levenshtein Distance ===
Levenshtein Similarity: 0.5366

=== Semantic Similarity with BERT ===


Device set to use cpu


TypeError: unsupported format string passed to numpy.ndarray.__format__

In [2]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from difflib import SequenceMatcher
from sentence_transformers import SentenceTransformer
import numpy as np

# Sample paragraphs
para1 = "The quick brown cat falls over the lazy dog."
para2 = "The brown cat falls over the dog."

# 1. Preprocessing and Cosine Similarity (TF-IDF)
print("=== TF-IDF Cosine Similarity ===")
tfidf = TfidfVectorizer()
tfidf_matrix = tfidf.fit_transform([para1, para2])
cosine_sim = cosine_similarity(tfidf_matrix[0], tfidf_matrix[1])[0][0]
print(f"Cosine Similarity (TF-IDF): {cosine_sim:.4f}")

# 2. Levenshtein Distance
print("\n=== Levenshtein Distance ===")
levenshtein_sim = SequenceMatcher(None, para1, para2).ratio()
print(f"Levenshtein Similarity: {levenshtein_sim:.4f}")

# 3. Harmonic Mean of TF-IDF and Levenshtein Similarity
hm = 2 * (cosine_sim * levenshtein_sim) / (cosine_sim + levenshtein_sim)
print(f"\n=== Harmonic Mean (TF-IDF & Levenshtein) ===")
print(f"Harmonic Mean: {hm:.4f}")

# 4. Semantic Matching with Sentence Transformers
print("\n=== Semantic Similarity with Sentence Transformers ===")
model = SentenceTransformer('all-MiniLM-L6-v2')  # Lightweight model
emb1 = model.encode(para1, normalize_embeddings=True)
emb2 = model.encode(para2, normalize_embeddings=True)
bert_cosine_sim = cosine_similarity([emb1], [emb2])[0][0]
print(f"Cosine Similarity (Sentence Transformers): {bert_cosine_sim:.4f}")


=== TF-IDF Cosine Similarity ===
Cosine Similarity (TF-IDF): 0.8336

=== Levenshtein Distance ===
Levenshtein Similarity: 0.8571

=== Harmonic Mean (TF-IDF & Levenshtein) ===
Harmonic Mean: 0.8452

=== Semantic Similarity with Sentence Transformers ===
Cosine Similarity (Sentence Transformers): 0.8475
