In [1]:
# Install libraries
!pip install scikit-learn python-Levenshtein nltk

Collecting python-Levenshtein
  Downloading python_Levenshtein-0.25.1-py3-none-any.whl.metadata (3.7 kB)
Collecting Levenshtein==0.25.1 (from python-Levenshtein)
  Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (3.3 kB)
Collecting rapidfuzz<4.0.0,>=3.8.0 (from Levenshtein==0.25.1->python-Levenshtein)
  Downloading rapidfuzz-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Downloading python_Levenshtein-0.25.1-py3-none-any.whl (9.4 kB)
Downloading Levenshtein-0.25.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (177 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m177.4/177.4 kB[0m [31m6.3 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading rapidfuzz-3.9.4-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl (3.4 MB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.4/3.4 MB[0m [31m20.7 MB/s[0m eta [36m0:00:00[0m
[?25hInstalling collected packages: r

In [2]:
# Import libraries
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer
import Levenshtein
from nltk.translate.bleu_score import sentence_bleu

In [3]:
# Define the strings to compare
string1 = "This is a sample string."
string2 = "This is another example of a string."

### Cosine Similarity

Cosine similarity measures the cosine of the angle between two non-zero vectors in a multi-dimensional space, indicating how similar the vectors are to each other. To evaluate its results, a cosine similarity score close to 1 indicates high similarity, 0 indicates orthogonality (no similarity), and -1 indicates complete dissimilarity. In text similarity, cosine similarity is relevant because it quantifies the similarity between two strings by comparing their vectorized representations.

In [4]:
# Cosine Similarity
vectorizer = TfidfVectorizer()
tfidf_matrix = vectorizer.fit_transform([string1, string2])
cos_sim = cosine_similarity(tfidf_matrix[0:1], tfidf_matrix[1:2])
print(f"Cosine Similarity: {cos_sim[0][0]}")

Cosine Similarity: 0.4501755023269898


### Jaccard Similarity

Jaccard similarity measures the size of the intersection divided by the size of the union of two sets, indicating the proportion of shared elements. To evaluate its results, a Jaccard similarity score of 1 means the sets are identical, 0 means they have no elements in common, and values in between indicate the degree of overlap. In text similarity, Jaccard similarity compares the commonality between two sets of words from text documents, highlighting how much of the content is shared.

In [5]:
# Jaccard Similarity
def jaccard_similarity(str1, str2):
    set1 = set(str1.split())
    set2 = set(str2.split())
    intersection = set1.intersection(set2)
    union = set1.union(set2)
    return len(intersection) / len(union)

jaccard_sim = jaccard_similarity(string1, string2)
print(f"Jaccard Similarity: {jaccard_sim}")

Jaccard Similarity: 0.5


### Levenshtein Distance (Edit Distance)

Levenshtein distance measures the minimum number of single-character edits (insertions, deletions, or substitutions) needed to change one string into another. To evaluate its results, a lower Levenshtein distance indicates greater similarity between the two strings, with a distance of 0 meaning the strings are identical. In text similarity, Levenshtein distance is relevant because it quantifies the effort required to transform one text into another, capturing the direct differences between them.








In [6]:
# Levenshtein Distance (Edit Distance)
lev_distance = Levenshtein.distance(string1, string2)
print(f"Levenshtein Distance: {lev_distance}")

Levenshtein Distance: 13


### BLEU Score

BLEU score (Bilingual Evaluation Understudy) measures the similarity between a candidate text and one or more reference texts based on the precision of n-grams. To evaluate its results, a BLEU score close to 1 indicates high similarity, meaning the candidate text closely matches the reference text(s), while a score closer to 0 indicates low similarity. In text similarity, BLEU score is relevant because it evaluates the quality of text generation or translation by comparing the overlap of n-grams, capturing both precision and the fluency of the candidate text.

In [7]:
# BLEU Score
reference = string1.split()
candidate = string2.split()
bleu_score = sentence_bleu([reference], candidate)
print(f"BLEU Score: {bleu_score}")

BLEU Score: 8.286571670851008e-155


The hypothesis contains 0 counts of 3-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
The hypothesis contains 0 counts of 4-gram overlaps.
Therefore the BLEU score evaluates to 0, independently of
how many N-gram overlaps of lower order it contains.
Consider using lower n-gram order or use SmoothingFunction()
