<a href="https://colab.research.google.com/github/navneetkrc/Deep_learning_experiments/blob/master/Damerau_Levenshtein_Distance_Korean.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install jellyfish



In [7]:
import unicodedata
import sys # For float('inf') - Although not strictly needed with int initialization

def calculate_dl_matrix_korean(text1, text2, normalize=True):
    """
    Build the edit-distance table between two (Korean) strings.

    Allowed edits are insertion, deletion, substitution and the swap of two
    adjacent characters, each with cost 1. The transposition step only looks
    back at the cell two rows and two columns away, i.e. this is the
    restricted variant commonly called "optimal string alignment".
    Comparison is done per Python character (code point).

    Args:
        text1 (str): Source string; one table row per character plus a
                     leading row for the empty prefix.
        text2 (str): Target string; one table column per character plus a
                     leading column for the empty prefix.
        normalize (bool): When True, both strings are converted to NFC first
                          so composed and decomposed Hangul compare equal.

    Returns:
        tuple: ``(matrix, distance)`` where ``matrix`` is a list of lists of
               ints and ``distance`` is its bottom-right cell.
               ``(None, -1)`` is returned (after printing an error message)
               when either input is not a string.
    """
    # Reject non-string input up front.
    if not (isinstance(text1, str) and isinstance(text2, str)):
        print("Error: Both inputs must be strings.")
        return None, -1

    source, target = text1, text2
    if normalize:
        source = unicodedata.normalize('NFC', source)
        target = unicodedata.normalize('NFC', target)

    row_count = len(source) + 1
    col_count = len(target) + 1

    # Row 0 holds the cost of inserting each prefix of the target; column 0
    # holds the cost of deleting each prefix of the source.
    table = [list(range(col_count))]
    for r in range(1, row_count):
        table.append([r] + [0] * (col_count - 1))

    for r in range(1, row_count):
        src_char = source[r - 1]
        for c in range(1, col_count):
            tgt_char = target[c - 1]
            mismatch = 0 if src_char == tgt_char else 1

            best = min(
                table[r - 1][c] + 1,             # delete src_char
                table[r][c - 1] + 1,             # insert tgt_char
                table[r - 1][c - 1] + mismatch,  # keep or substitute
            )

            # Adjacent swap: the last two characters of both prefixes are
            # the same pair in opposite order.
            can_swap = (
                r > 1
                and c > 1
                and src_char == target[c - 2]
                and source[r - 2] == tgt_char
            )
            if can_swap:
                best = min(best, table[r - 2][c - 2] + 1)

            table[r][c] = best

    # The full-string distance sits in the bottom-right corner.
    return table, table[-1][-1]

def _display_width(text):
    """Return the number of monospace columns `text` is expected to occupy."""
    # East Asian Wide/Fullwidth characters (e.g. Hangul syllables) take two
    # terminal columns; everything else is counted as one.
    return sum(
        2 if unicodedata.east_asian_width(ch) in ('W', 'F') else 1
        for ch in text
    )


def _right_align(text, width):
    """Left-pad `text` with spaces so it spans `width` display columns."""
    return ' ' * max(0, width - _display_width(text)) + text


def print_dl_matrix(matrix, text1, text2, normalize=True):
    """
    Prints the Damerau-Levenshtein distance matrix in a readable format.

    Layout: a header row with the characters of the target string, a dashed
    separator, then one row per source character (preceded by a row for the
    empty prefix, labelled ""). Each row starts with its label and " |".
    Headers and labels are padded by display width rather than by character
    count, so double-width Hangul glyphs line up with the numeric cells in a
    monospace font.

    Args:
        matrix (list[list[int]]): The distance matrix. It is indexed up to
                                  [len(s1)][len(s2)] of the (optionally
                                  normalized) strings.
        text1 (str): The original source string (for row headers).
        text2 (str): The original target string (for column headers).
        normalize (bool): If True, normalizes both strings to NFC so the
                          headers match a matrix built from NFC strings.

    Returns:
        int: The calculated cell width used for printing. Returns 0 if matrix is None.
    """
    if matrix is None:
        print("Matrix is None, cannot print.")
        return 0

    # Normalize for display consistency if needed
    if normalize:
        s1 = unicodedata.normalize('NFC', text1)
        s2 = unicodedata.normalize('NFC', text2)
    else:
        s1 = text1
        s2 = text2

    len_s1 = len(s1)
    len_s2 = len(s2)

    # Widest number that has to fit in a cell (non-int cells are ignored).
    max_val = 0
    for r in range(len_s1 + 1):
        for c in range(len_s2 + 1):
            cell = matrix[r][c]
            if isinstance(cell, int) and cell > max_val:
                max_val = cell

    # The encoded byte length of each header character is kept as the width
    # heuristic so the returned cell width stays the same as before; the
    # minimum of 2 leaves room for the '""' label.
    max_header_len = 2
    for char in s1:
        max_header_len = max(max_header_len, len(char.encode()))
    for char in s2:
        max_header_len = max(max_header_len, len(char.encode()))

    cell_width = max(2, len(str(max_val)), max_header_len)

    # --- Header row ---
    # Body rows begin with "<label> |" (cell_width + 2 columns) followed by
    # the value of the empty-prefix column (cell_width + 1 columns); reserve
    # exactly that much so the headers sit above their own columns.
    header_parts = [' ' * (cell_width + 2), ' ' * (cell_width + 1)]
    header_parts.extend(_right_align(char, cell_width) + ' ' for char in s2)
    print(''.join(header_parts))

    # --- Separator ---
    # Same length formula as before: callers recompute it from the returned
    # cell width to draw a matching closing line.
    separator_len = cell_width + 1 + (len_s2 + 1) * (cell_width + 1)
    print('-' * separator_len)

    # --- Matrix rows ---
    for i in range(len_s1 + 1):
        row_header = '""' if i == 0 else s1[i - 1]
        cells = ''.join(
            f"{matrix[i][j]:{cell_width}d} " for j in range(len_s2 + 1)
        )
        print(f"{_right_align(row_header, cell_width)} |{cells}")

    # Return the calculated cell width so it can be used outside
    return cell_width


# --- Example Usage ---
# Two spellings of the same loanword that differ in one syllable.
string1 = "플레그십"
string2 = "플래그십"

print(
    "Calculating Damerau-Levenshtein matrix for:",
    f"Source (s1): '{string1}'",
    f"Target (s2): '{string2}'\n",
    sep="\n",
)

dl_matrix, final_dist = calculate_dl_matrix_korean(string1, string2)

if dl_matrix:
    print("--- Damerau-Levenshtein Matrix ---")
    # print_dl_matrix reports the cell width it used (0 when nothing was
    # printed), which lets the closing rule match the table's own separator.
    cell_width = print_dl_matrix(dl_matrix, string1, string2)

    if cell_width > 0:
        # One row-header column plus one column per target prefix
        # (including the empty prefix), each cell_width + 1 characters wide.
        column_slots = 1 + (len(string2) + 1)
        separator_len = column_slots * (cell_width + 1)
        print("-" * separator_len)

    print(f"\nFinal Damerau-Levenshtein distance: {final_dist}")

Calculating Damerau-Levenshtein matrix for:
Source (s1): '플레그십'
Target (s2): '플래그십'

--- Damerau-Levenshtein Matrix ---
          플   래   그   십 
------------------------
 "" |  0   1   2   3   4 
  플 |  1   0   1   2   3 
  레 |  2   1   1   2   3 
  그 |  3   2   2   1   2 
  십 |  4   3   3   2   1 
------------------------

Final Damerau-Levenshtein distance: 1


In [2]:
import jellyfish
import unicodedata # Recommended for normalization

def calculate_dl_distance_korean(text1, text2, normalize=True):
    """
    Return the Damerau-Levenshtein distance between two Korean strings.

    The same Hangul text can be stored precomposed (NFC) or decomposed
    (NFD). Strings in different forms look identical but compare as
    different, which inflates the distance, so both inputs are brought to
    NFC by default before the comparison is delegated to
    ``jellyfish.damerau_levenshtein_distance``.

    Args:
        text1 (str): The first Korean string.
        text2 (str): The second Korean string.
        normalize (bool): If True (default), both strings are normalized to
                          NFC first. Pass False to compare the strings
                          exactly as given, e.g. when they are already known
                          to share one normalization form.

    Returns:
        int: The distance reported by jellyfish, or -1 (after printing an
             error message) when an input is not a string or the library
             call raises.
    """
    # Anything other than two strings is reported and rejected.
    if not (isinstance(text1, str) and isinstance(text2, str)):
        print("Error: Both inputs must be strings.")
        return -1

    # Optionally bring both sides to NFC so that different encodings of the
    # same syllable are treated as identical.
    left, right = text1, text2
    if normalize:
        left = unicodedata.normalize('NFC', left)
        right = unicodedata.normalize('NFC', right)

    # Any failure inside the library call is printed and mapped to -1.
    try:
        return jellyfish.damerau_levenshtein_distance(left, right)
    except Exception as e:
        print(f"An error occurred during distance calculation: {e}")
        return -1

# --- Examples ---
# Each example compares the greeting in string_a against a variant and prints
# the distance returned by calculate_dl_distance_korean, followed by a rule.

# Example 1: Identical strings
string_a = "안녕하세요"
string_b = "안녕하세요"
distance_ab = calculate_dl_distance_korean(string_a, string_b)
print(f"Comparing '{string_a}' and '{string_b}'")
print(f"Damerau-Levenshtein distance: {distance_ab}") # Expected: 0
print("-" * 20)

# Example 2: Single substitution typo
string_c = "안녕하셍요" # Mistyped '셍' instead of '세'
distance_ac = calculate_dl_distance_korean(string_a, string_c)
print(f"Comparing '{string_a}' and '{string_c}' (Substitution)")
print(f"Damerau-Levenshtein distance: {distance_ac}") # Expected: 1
print("-" * 20)

# Example 3: Single deletion
string_d = "안녕하요" # Missing '세'
distance_ad = calculate_dl_distance_korean(string_a, string_d)
print(f"Comparing '{string_a}' and '{string_d}' (Deletion)")
print(f"Damerau-Levenshtein distance: {distance_ad}") # Expected: 1
print("-" * 20)

# Example 4: Single insertion
string_e = "안녕하세용요" # Added extra '용'
distance_ae = calculate_dl_distance_korean(string_a, string_e)
print(f"Comparing '{string_a}' and '{string_e}' (Insertion)")
print(f"Damerau-Levenshtein distance: {distance_ae}") # Expected: 1
print("-" * 20)

# Example 5: Transposition typo
string_f = "안녕하요세" # Swapped '세' and '요'
distance_af = calculate_dl_distance_korean(string_a, string_f)
print(f"Comparing '{string_a}' and '{string_f}' (Transposition)")
print(f"Damerau-Levenshtein distance: {distance_af}") # Expected: 1 (an adjacent swap is one edit here; plain Levenshtein would count 2)
print("-" * 20)

# Example 6: Single whitespace insertion
string_g = "안녕 하세요" # Added space (insertion)
distance_ag = calculate_dl_distance_korean(string_a, string_g)
print(f"Comparing '{string_a}' and '{string_g}' (Space Insertion)")
print(f"Damerau-Levenshtein distance: {distance_ag}") # Expected: 1
print("-" * 20)

# Example 7: Completely different strings
string_h = "반갑습니다"
distance_ah = calculate_dl_distance_korean(string_a, string_h)
print(f"Comparing '{string_a}' and '{string_h}' (Different Words)")
print(f"Damerau-Levenshtein distance: {distance_ah}") # Expected: 5 (no character of the two five-syllable words matches)
print("-" * 20)

# Example 8: Normalization difference (if normalization is off)
# U+AC00: the precomposed Hangul syllable '가'
nfc_ga = '\uAC00'
# U+1100 + U+1161: the same syllable as two conjoining jamo (decomposed form)
nfd_ga = '\u1100\u1161'

# With normalization (default)
dist_norm = calculate_dl_distance_korean(nfc_ga, nfd_ga)
print(f"Comparing '{nfc_ga}' and '{nfd_ga}' (Normalization ON)")
print(f"Damerau-Levenshtein distance: {dist_norm}") # Expected: 0

# Without normalization
# Counted per code point the distance would be 2 (one substitution plus one
# insertion). The recorded output of this cell shows 1 instead.
# NOTE(review): presumably the installed jellyfish compares grapheme
# clusters, which would make the jamo pair a single unit - confirm.
dist_no_norm = calculate_dl_distance_korean(nfc_ga, nfd_ga, normalize=False)
print(f"\nComparing '{nfc_ga}' and '{nfd_ga}' (Normalization OFF)")
print(f"Damerau-Levenshtein distance: {dist_no_norm}") # Non-zero: the two forms no longer compare equal
print("-" * 20)

Comparing '안녕하세요' and '안녕하세요'
Damerau-Levenshtein distance: 0
--------------------
Comparing '안녕하세요' and '안녕하셍요' (Substitution)
Damerau-Levenshtein distance: 1
--------------------
Comparing '안녕하세요' and '안녕하요' (Deletion)
Damerau-Levenshtein distance: 1
--------------------
Comparing '안녕하세요' and '안녕하세용요' (Insertion)
Damerau-Levenshtein distance: 1
--------------------
Comparing '안녕하세요' and '안녕하요세' (Transposition)
Damerau-Levenshtein distance: 1
--------------------
Comparing '안녕하세요' and '안녕 하세요' (Space Insertion)
Damerau-Levenshtein distance: 1
--------------------
Comparing '안녕하세요' and '반갑습니다' (Different Words)
Damerau-Levenshtein distance: 5
--------------------
Comparing '가' and '가' (Normalization ON)
Damerau-Levenshtein distance: 0

Comparing '가' and '가' (Normalization OFF)
Damerau-Levenshtein distance: 1
--------------------


In [3]:
import jellyfish
import unicodedata

def calculate_dl_distance_korean(text1, text2, normalize=True):
    """
    Return the Damerau-Levenshtein distance between two Korean strings.

    Both inputs are converted to Unicode NFC first unless ``normalize`` is
    False, then compared with ``jellyfish.damerau_levenshtein_distance``.

    Args:
        text1 (str): The first Korean string.
        text2 (str): The second Korean string.
        normalize (bool): Normalize both strings to NFC before comparing
                          (default True).

    Returns:
        int: The distance, or -1 (after printing an error message) when an
             input is not a string or the library call raises.
    """
    # Guard clause: both arguments must be strings.
    if not all(isinstance(value, str) for value in (text1, text2)):
        print("Error: Both inputs must be strings.")
        return -1

    if normalize:
        first = unicodedata.normalize('NFC', text1)
        second = unicodedata.normalize('NFC', text2)
    else:
        first, second = text1, text2

    # Library failures are reported and mapped to the -1 sentinel.
    try:
        result = jellyfish.damerau_levenshtein_distance(first, second)
    except Exception as e:
        print(f"An error occurred during distance calculation: {e}")
        return -1
    return result

# --- Calculation ---
# Two spellings of the same loanword that differ only in the second syllable.
string1 = "플레그십"
string2 = "플래그십"

distance = calculate_dl_distance_korean(string1, string2)

# --- Output ---
print(
    f"Comparing '{string1}' and '{string2}'",
    f"Damerau-Levenshtein distance: {distance}",
    "-" * 20,
    sep="\n",
)

# --- Explanation in Korean ---
if distance == 0:
    # Identical strings.
    print(f"'{string1}'과 '{string2}'는 동일한 문자열이므로 거리는 0입니다.")
else:
    # Every non-zero result reports the distance itself ...
    print(f"'{string1}'과 '{string2}'의 데머라우-레벤슈타인 거리는 {distance}입니다.")
    if distance == 1:
        # ... and a distance of exactly 1 also gets the substitution note.
        print("두 단어는 두 번째 글자인 '레'와 '래'가 다르므로, 한 번의 '대치(substitution)' 연산으로 같게 만들 수 있습니다.")

Comparing '플레그십' and '플래그십'
Damerau-Levenshtein distance: 1
--------------------
'플레그십'과 '플래그십'의 데머라우-레벤슈타인 거리는 1입니다.
두 단어는 두 번째 글자인 '레'와 '래'가 다르므로, 한 번의 '대치(substitution)' 연산으로 같게 만들 수 있습니다.
