# **Experiment-11**

### Objective: 
Write a program to compute the similarity between any two sentences of text using different metrics for analyzing textual similarity.

In [1]:
%pip install nltk pandas scikit-learn




[notice] A new release of pip is available: 24.2 -> 24.3.1
[notice] To update, run: python.exe -m pip install --upgrade pip



Collecting scikit-learn
  Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl.metadata (13 kB)
Collecting scipy>=1.6.0 (from scikit-learn)
  Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl.metadata (60 kB)
Collecting threadpoolctl>=3.1.0 (from scikit-learn)
  Using cached threadpoolctl-3.5.0-py3-none-any.whl.metadata (13 kB)
Downloading scikit_learn-1.5.2-cp312-cp312-win_amd64.whl (11.0 MB)
   ---------------------------------------- 0.0/11.0 MB ? eta -:--:--
   ---------------- ----------------------- 4.5/11.0 MB 30.0 MB/s eta 0:00:01
   ------------------------------------- -- 10.2/11.0 MB 27.7 MB/s eta 0:00:01
   ---------------------------------------- 11.0/11.0 MB 26.4 MB/s eta 0:00:00
Downloading scipy-1.14.1-cp312-cp312-win_amd64.whl (44.5 MB)
   ---------------------------------------- 0.0/44.5 MB ? eta -:--:--
   ---- ----------------------------------- 4.5/44.5 MB 20.7 MB/s eta 0:00:02
   -------- ------------------------------- 9.7/44.5 MB 23.2 MB/s eta 0:00:02
   -

In [2]:
# Import necessary libraries
import nltk
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from nltk.corpus import wordnet as wn
import numpy as np
import re

# Download required NLTK data files
nltk.download('punkt', download_dir="C:/nltk_data")
nltk.download('stopwords', download_dir="C:/nltk_data")
nltk.download('wordnet', download_dir="C:/nltk_data")

[nltk_data] Downloading package punkt to C:/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to C:/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to C:/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [3]:
# Define two sentences for similarity analysis
sentence1 = "The quick brown fox jumps over the lazy dog."
sentence2 = "A fast brown animal leaps over a sleeping canine."

print("Sentence 1:", sentence1)
print("Sentence 2:", sentence2)

Sentence 1: The quick brown fox jumps over the lazy dog.
Sentence 2: A fast brown animal leaps over a sleeping canine.


In [4]:
# Define a function to preprocess the text
def preprocess_text(text):
    # Convert to lowercase
    text = text.lower()
    # Remove punctuation and special characters
    text = re.sub(r'[^\w\s]', '', text)
    # Tokenize the text
    tokens = word_tokenize(text)
    # Remove stopwords
    tokens = [word for word in tokens if word not in stopwords.words('english')]
    return tokens

# Preprocess both sentences
tokens1 = preprocess_text(sentence1)
tokens2 = preprocess_text(sentence2)

print("Processed Tokens for Sentence 1:", tokens1)
print("Processed Tokens for Sentence 2:", tokens2)

Processed Tokens for Sentence 1: ['quick', 'brown', 'fox', 'jumps', 'lazy', 'dog']
Processed Tokens for Sentence 2: ['fast', 'brown', 'animal', 'leaps', 'sleeping', 'canine']


In [5]:
# Combine the sentences into a corpus
corpus = [sentence1, sentence2]

# Initialize the TF-IDF Vectorizer
vectorizer = TfidfVectorizer()

# Fit and transform the corpus
tfidf_matrix = vectorizer.fit_transform(corpus)

# Compute cosine similarity
cosine_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])[0][0]
print("Cosine Similarity (TF-IDF):", cosine_sim)

Cosine Similarity (TF-IDF): 0.13049436152984825


In [6]:
# Define a function to compute Jaccard Similarity
def jaccard_similarity(tokens1, tokens2):
    set1 = set(tokens1)
    set2 = set(tokens2)
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

# Compute Jaccard Similarity
jaccard_sim = jaccard_similarity(tokens1, tokens2)
print("Jaccard Similarity:", jaccard_sim)

Jaccard Similarity: 0.09090909090909091


In [7]:
# Define a function to compute Levenshtein Distance
def levenshtein_distance(s1, s2):
    if len(s1) < len(s2):
        return levenshtein_distance(s2, s1)

    if len(s2) == 0:
        return len(s1)

    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]

# Compute Levenshtein Distance
levenshtein_dist = levenshtein_distance(sentence1, sentence2)
print("Levenshtein Distance:", levenshtein_dist)

Levenshtein Distance: 32


In [8]:
# Define a function to compute WordNet similarity
def wordnet_similarity(tokens1, tokens2):
    synsets1 = [wn.synsets(word)[0] for word in tokens1 if wn.synsets(word)]
    synsets2 = [wn.synsets(word)[0] for word in tokens2 if wn.synsets(word)]

    score = 0.0
    count = 0

    for synset1 in synsets1:
        for synset2 in synsets2:
            sim = synset1.path_similarity(synset2)
            if sim is not None:
                score += sim
                count += 1

    if count == 0:
        return 0.0
    return score / count

# Compute WordNet Similarity
wordnet_sim = wordnet_similarity(tokens1, tokens2)
print("WordNet-Based Similarity:", wordnet_sim)

WordNet-Based Similarity: 0.10795859026357747


In [9]:
print("\nSummary of Sentence Similarity Metrics:")
print(f"Cosine Similarity (TF-IDF): {cosine_sim:.4f}")
print(f"Jaccard Similarity: {jaccard_sim:.4f}")
print(f"Levenshtein Distance: {levenshtein_dist}")
print(f"WordNet-Based Similarity: {wordnet_sim:.4f}")


Summary of Sentence Similarity Metrics:
Cosine Similarity (TF-IDF): 0.1305
Jaccard Similarity: 0.0909
Levenshtein Distance: 32
WordNet-Based Similarity: 0.1080
