# Python Text Algorithms Assignment

This notebook contains solutions for:
1. Top 5 Most Frequent Words
2. Flexible Anagram Checker
3. Simple Text Similarity Score

The implementations focus on correctness, readability, and efficiency.


# Code Implementations

In [1]:
import re
from collections import Counter
from typing import List, Tuple


def _words_from_text(text: str) -> List[str]:
    """
    Split text into lowercase word tokens.

    The text is lowercased first; every maximal run of the letters a-z
    then becomes one word. Any other character (digits, punctuation,
    whitespace, letters outside a-z) acts as a separator and is dropped,
    so "It's" yields the two tokens "it" and "s".
    """
    lowered = text.lower()
    return [match.group(0) for match in re.finditer(r"[a-z]+", lowered)]


def top_5_words(text: str) -> List[Tuple[str, int]]:
    """
    Return up to five (word, count) pairs for the most frequent words.

    Words are extracted case-insensitively with punctuation ignored.
    Ordering:
      1) higher count first
      2) alphabetical order among words with the same count
    Fewer than five pairs are returned when the text has fewer than five
    distinct words.
    """
    counts = Counter(_words_from_text(text))

    # Pass 1: alphabetical. Words are unique keys, so the tuple sort is
    # decided by the word alone.
    ranked = sorted(counts.items())
    # Pass 2: count descending. The sort is stable (also with reverse=True),
    # so equal counts keep the alphabetical order from pass 1.
    ranked.sort(key=lambda pair: pair[1], reverse=True)

    return ranked[:5]


def flexible_anagram(str1: str, str2: str) -> str:
    """
    Decide whether two strings are "flexible anagrams".

    Only alphabetic characters are compared, case-insensitively; everything
    else is discarded before any length or count check.

    Rules (applied to the filtered strings):
    - Equal length: 'YES' when they are anagrams or exactly one character
      replacement apart.
    - Lengths differ by 1: 'YES' when the longer one is an anagram of the
      shorter plus exactly one extra character.
    - Lengths differ by more than 1: 'NO'.

    Returns the string 'YES' or 'NO'.
    """
    left = "".join(c.lower() for c in str1 if c.isalpha())
    right = "".join(c.lower() for c in str2 if c.isalpha())

    length_gap = abs(len(left) - len(right))
    if length_gap > 1:
        return "NO"

    left_counts = Counter(left)
    right_counts = Counter(right)

    # Counter subtraction keeps only positive remainders, so the two
    # one-sided surpluses together equal the total per-character difference.
    surplus_left = left_counts - right_counts
    surplus_right = right_counts - left_counts
    mismatch = sum(surplus_left.values()) + sum(surplus_right.values())

    if length_gap == 0:
        # 0 => exact anagram; 2 => one replacement (one letter down, one up)
        allowed = mismatch <= 2
    else:
        # a single inserted/deleted letter leaves exactly one unmatched char
        allowed = mismatch == 1

    return "YES" if allowed else "NO"


def text_similarity(sentence1: str, sentence2: str) -> float:
    """
    Score the word overlap of two sentences.

    Similarity = (2 * |A & B|) / (|A| + |B|)
    where A and B are the sets of UNIQUE words of sentence1 and sentence2
    (case and punctuation ignored).

    Returns the score rounded to 2 decimal places, or 0.0 when neither
    sentence contains any word.
    """
    vocab_a = {*_words_from_text(sentence1)}
    vocab_b = {*_words_from_text(sentence2)}

    total = len(vocab_a) + len(vocab_b)
    if total == 0:
        # Both vocabularies are empty; avoid dividing by zero.
        return 0.00

    shared = len(vocab_a.intersection(vocab_b))
    return round((2 * shared) / total, 2)


# Sample Runs

In [2]:
# -------------------------
# Sample run: Q1 top_5_words
# -------------------------
# Expected ranking for this text: "the" occurs three times; "fox", "quick"
# and "very" tie at two and are listed alphabetically; the fifth slot goes
# to "and", the alphabetically first of the words that occur only once.
text = "The quick brown fox jumps over the lazy dog. The fox was very quick and very smart."
top5 = top_5_words(text)
for w, c in top5:
    print(w, c)  # one "word count" pair per line

# Visual separator between the sample sections
print("\n-------------------------\n")

# ------------------------------
# Sample runs: Q2 flexible_anagram
# ------------------------------
# Each pair is (str1, str2); the trailing comment gives the expected answer
# and the rule that produces it.
tests_q2 = [
    ("abcd", "abce"),   # YES: same length, one replacement (d -> e)
    ("abc", "abcd"),    # YES: second string has one extra character
    ("abc", "abxyz"),   # NO: lengths differ by 2
    ("aabb", "abbb"),   # YES: same length, one replacement (a -> b)
    ("abc", "def"),     # NO: same length but no characters in common
    ("abcd", "abc"),    # YES: second string is missing one character
]
for s1, s2 in tests_q2:
    # !r shows each input with quotes so the exact strings are visible
    print(f"{s1!r}, {s2!r} -> {flexible_anagram(s1, s2)}")

# Visual separator between the sample sections
print("\n-------------------------\n")

# ------------------------------
# Sample runs: Q3 text_similarity
# ------------------------------
# Each pair is (sentence1, sentence2); the comment above it shows how the
# expected score follows from the unique-word sets.
tests_q3 = [
    # shared: is, the, world (3); unique words 6 + 5 -> 6/11 -> 0.55
    ("Artificial intelligence is transforming the world.", "AI is changing the world."),
    # shared: ai (1); unique words 5 + 8 -> 2/13 -> 0.15
    ("Generative AI creates new content.", "AI models can generate text, images, or music."),
    # shared: are (1); unique words 4 + 4 -> 2/8 -> 0.25
    ("Cats are lovely animals.", "Dogs are friendly pets."),
]
for s1, s2 in tests_q3:
    print(text_similarity(s1, s2))


the 3
fox 2
quick 2
very 2
and 1

-------------------------

'abcd', 'abce' -> YES
'abc', 'abcd' -> YES
'abc', 'abxyz' -> NO
'aabb', 'abbb' -> YES
'abc', 'def' -> NO
'abcd', 'abc' -> YES

-------------------------

0.55
0.15
0.25
