# Compare two strings to each other v1

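This notebook combines several common ways of measuring how similar two strings are (normalized Levenshtein distance, Jaccard similarity, TF-IDF cosine similarity, `SequenceMatcher` ratio, normalized Hamming distance, and the FuzzyWuzzy ratio) into a single averaged score.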

In [3]:
from difflib import SequenceMatcher
from fuzzywuzzy import fuzz
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
import Levenshtein

def jaccard_similarity(str1, str2):
    """Return the Jaccard similarity of the character sets of two strings.

    Each string is reduced to the set of its distinct characters, so
    character order and repetition are ignored.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The size of the intersection of the two character sets divided by
        the size of their union. Two empty strings are treated as identical
        and score 1.0.
    """
    set1 = set(str1)
    set2 = set(str2)
    union = set1 | set2
    if not union:
        # Both inputs are empty: the ratio would be 0/0, so report a
        # perfect match instead of raising ZeroDivisionError.
        return 1.0
    return len(set1 & set2) / len(union)

def cosine_sim(str1, str2):
    """Return the cosine similarity of the TF-IDF vectors of two strings.

    Both strings are vectorized together with a default ``TfidfVectorizer``
    and the similarity between the two resulting rows is returned.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The cosine similarity between the two TF-IDF vectors. If neither
        string produces any token, 1.0 is returned when the strings are
        equal and 0.0 otherwise.

    Raises:
        ValueError: Propagated from the vectorizer for any failure other
            than an empty vocabulary.
    """
    try:
        tfidf = TfidfVectorizer().fit_transform([str1, str2])
    except ValueError as err:
        # scikit-learn reports "empty vocabulary" when no token is found in
        # either document (for example two empty strings). Without this
        # guard such inputs would crash the caller instead of yielding a score.
        if "empty vocabulary" not in str(err):
            raise
        return 1.0 if str1 == str2 else 0.0
    vectors = tfidf.toarray()
    # Entry [0][1] of the pairwise matrix is document 0 versus document 1.
    return cosine_similarity(vectors)[0][1]

def hamming_distance(str1, str2):
    """Count the positions at which two equal-length strings differ.

    Args:
        str1: First string.
        str2: Second string; must have the same length as ``str1``.

    Returns:
        The number of indices where the two strings hold different
        characters.

    Raises:
        ValueError: If the two strings differ in length.
    """
    if len(str1) == len(str2):
        mismatches = 0
        for left, right in zip(str1, str2):
            if left != right:
                mismatches += 1
        return mismatches
    raise ValueError("Strings must be of the same length")

def similarity_score(str1, str2):
    """Return the average of six similarity measures between two strings.

    The measures are: normalized Levenshtein distance, Jaccard similarity
    of the character sets, TF-IDF cosine similarity, ``SequenceMatcher``
    ratio, normalized Hamming distance (shorter string right-padded with
    spaces), and the FuzzyWuzzy ratio scaled to the 0-1 range.

    Args:
        str1: First string.
        str2: Second string.

    Returns:
        The unweighted mean of the six scores. Two empty strings are
        treated as identical and score 1.0.
    """
    max_len = max(len(str1), len(str2))
    if max_len == 0:
        # Both strings are empty: the length normalizations below would
        # divide by zero, so report a perfect match directly.
        return 1.0

    # Levenshtein Distance normalized
    lev_distance = Levenshtein.distance(str1, str2)
    lev_score = 1 - (lev_distance / max_len)

    # Jaccard Similarity
    jaccard_score = jaccard_similarity(str1, str2)

    # Cosine Similarity
    cosine_score = cosine_sim(str1, str2)

    # SequenceMatcher Similarity
    seq_matcher_score = SequenceMatcher(None, str1, str2).ratio()

    # Hamming Distance normalized (pad so both strings have the same length)
    # NOTE(review): padding uses spaces, so trailing spaces in the longer
    # string compare equal to the padding of the shorter one.
    str1_padded = str1.ljust(max_len)
    str2_padded = str2.ljust(max_len)
    hamming_dist = hamming_distance(str1_padded, str2_padded)
    hamming_score = 1 - (hamming_dist / max_len)

    # FuzzyWuzzy Similarity (fuzz.ratio is on a 0-100 scale)
    fuzzy_score = fuzz.ratio(str1, str2) / 100

    # Combine all scores (average)
    combined_score = (lev_score + jaccard_score + cosine_score +
                      seq_matcher_score + hamming_score + fuzzy_score) / 6

    return combined_score

# Demo: two words that differ by two substitutions and one appended letter.
string1, string2 = "kitten", "sitting"

score = similarity_score(string1, string2)
print(f"Overall Similarity Score: {score}")

Overall Similarity Score: 0.46780219780219784


In [4]:
# Demo: the second string is the first with one character prepended.
string1, string2 = "here", "there"

score = similarity_score(string1, string2)
print(f"Overall Similarity Score: {score}")

Overall Similarity Score: 0.5548148148148149


In [5]:
# Demo: two identical strings.
string1, string2 = "abc123", "abc123"

score = similarity_score(string1, string2)
print(f"Overall Similarity Score: {score}")

Overall Similarity Score: 1.0


In [6]:
# Demo: same characters, with the letter and digit halves swapped.
string1, string2 = "abc123", "123abc"

score = similarity_score(string1, string2)
print(f"Overall Similarity Score: {score}")

Overall Similarity Score: 0.3333333333333333


In [7]:
# Demo: the second string is the first rotated right by one character.
string1, string2 = "abc123", "3abc12"

score = similarity_score(string1, string2)
print(f"Overall Similarity Score: {score}")

Overall Similarity Score: 0.555


In [8]:
# Demo: the second string has a hyphen inserted between letters and digits.
string1, string2 = "abc123", "abc-123"

score = similarity_score(string1, string2)
print(f"Overall Similarity Score: {score}")

Overall Similarity Score: 0.6643223443223444


In [12]:
# Demo: the second string adds one leading and two trailing spaces.
string1, string2 = "abc123", " abc123  "

score = similarity_score(string1, string2)
print(f"Overall Similarity Score: {score}")

Overall Similarity Score: 0.7243386243386243
