In [13]:
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from collections import Counter

First task

In [14]:
def wagner_fischer(word1, word2):
    len_1 = len(word1)
    len_2 = len(word2)

    dist = [[0] * (len_2 + 1) for _ in range(len_1 + 1)]

    for i in range(len_1 + 1):
        dist[i][0] = i
    for j in range(len_2 + 1):
        dist[0][j] = j

    for i in range(1, len_1 + 1):
        for j in range(1, len_2 + 1):
            if word1[i - 1] == word2[j - 1]:
                cost = 0
            else:
                cost = 1
            dist[i][j] = min(dist[i - 1][j] + 1,      
                             dist[i][j - 1] + 1,      
                             dist[i - 1][j - 1] + cost)  

    return dist[len_1][len_2]

word1 = "magic"
word2 = "people"
print(f"Length between '{word1}' and '{word2}' is {wagner_fischer(word1, word2)}")

Length between 'magic' and 'people' is 6


In [15]:
def qwerty_distance(char1, char2):
    qwerty_keyboard = {
        'q': {'w', 'a', 's'},
        'w': {'q', 'a', 's', 'd', 'e'},
        'e': {'w', 's', 'd', 'f', 'r'},
        'r': {'e', 'd', 'f', 'g', 't'},
        't': {'r', 'f', 'g', 'h', 'y'},
        'y': {'t', 'g', 'h', 'j', 'u'},
        'u': {'y', 'h', 'j', 'k', 'i'},
        'i': {'u', 'j', 'k', 'l', 'o'},
        'o': {'i', 'k', 'l', 'p'},
        'p': {'o', 'l'},

        'a': {'q', 'w', 's', 'z'},
        's': {'q', 'w', 'e', 'a', 'd', 'z', 'x'},
        'd': {'w', 'e', 'r', 's', 'f', 'x', 'c'},
        'f': {'e', 'r', 't', 'd', 'g', 'c', 'v'},
        'g': {'r', 't', 'y', 'f', 'h', 'v', 'b'},
        'h': {'t', 'y', 'u', 'g', 'j', 'b', 'n'},
        'j': {'y', 'u', 'i', 'h', 'k', 'n', 'm'},
        'k': {'u', 'i', 'o', 'j', 'l', 'm'},
        'l': {'i', 'o', 'p', 'k'},

        'z': {'a', 's', 'x'},
        'x': {'z', 's', 'd', 'c'},
        'c': {'x', 'd', 'f', 'v'},
        'v': {'c', 'f', 'g', 'b'},
        'b': {'v', 'g', 'h', 'n'},
        'n': {'b', 'h', 'j', 'm'},
        'm': {'n', 'j', 'k'}
    }
    
    if char1 in qwerty_keyboard and char2 in qwerty_keyboard[char1]:
        return 1 
    else:
        return float('inf') 

def modified_wagner_fischer(word1, word2):
    len_1 = len(word1)
    len_2 = len(word2)

    dist = [[0] * (len_2 + 1) for _ in range(len_1 + 1)]

    for i in range(len_1 + 1):
        dist[i][0] = i
    for j in range(len_2 + 1):
        dist[0][j] = j

    for i in range(1, len_1 + 1):
        for j in range(1, len_2 + 1):
            if word1[i - 1] == word2[j - 1]:
                cost = 0
            else:
                cost = qwerty_distance(word1[i - 1], word2[j - 1])
            dist[i][j] = min(dist[i - 1][j] + 1,      
                             dist[i][j - 1] + 1,     
                             dist[i - 1][j - 1] + cost)  

    return dist[len_1][len_2]

word1 = "magic"
word2 = "people"
print(f"Length between '{word1}' and '{word2}' is {modified_wagner_fischer(word1, word2)}")

Length between 'magic' and 'people' is 10


In [16]:
def damerau_levenshtein_distance(char1, char2):
    if char1 == char2:
        return 0
    else:
        return 1

def qwerty_distance(char1, char2):
    qwerty_keyboard = {
        'q': {'w', 'a', 's'},
        'w': {'q', 'a', 's', 'd', 'e'},
        'e': {'w', 's', 'd', 'f', 'r'},
        'r': {'e', 'd', 'f', 'g', 't'},
        't': {'r', 'f', 'g', 'h', 'y'},
        'y': {'t', 'g', 'h', 'j', 'u'},
        'u': {'y', 'h', 'j', 'k', 'i'},
        'i': {'u', 'j', 'k', 'l', 'o'},
        'o': {'i', 'k', 'l', 'p'},
        'p': {'o', 'l'},

        'a': {'q', 'w', 's', 'z'},
        's': {'q', 'w', 'e', 'a', 'd', 'z', 'x'},
        'd': {'w', 'e', 'r', 's', 'f', 'x', 'c'},
        'f': {'e', 'r', 't', 'd', 'g', 'c', 'v'},
        'g': {'r', 't', 'y', 'f', 'h', 'v', 'b'},
        'h': {'t', 'y', 'u', 'g', 'j', 'b', 'n'},
        'j': {'y', 'u', 'i', 'h', 'k', 'n', 'm'},
        'k': {'u', 'i', 'o', 'j', 'l', 'm'},
        'l': {'i', 'o', 'p', 'k'},

        'z': {'a', 's', 'x'},
        'x': {'z', 's', 'd', 'c'},
        'c': {'x', 'd', 'f', 'v'},
        'v': {'c', 'f', 'g', 'b'},
        'b': {'v', 'g', 'h', 'n'},
        'n': {'b', 'h', 'j', 'm'},
        'm': {'n', 'j', 'k'}
    }
    
    if char1 in qwerty_keyboard and char2 in qwerty_keyboard[char1]:
        return 1  
    else:
        return float('inf')  

def modified_wagner_fischer(word1, word2):
    len_1 = len(word1)
    len_2 = len(word2)

    dist = [[0] * (len_2 + 1) for _ in range(len_1 + 1)]

    for i in range(len_1 + 1):
        dist[i][0] = i
    for j in range(len_2 + 1):
        dist[0][j] = j

    for i in range(1, len_1 + 1):
        for j in range(1, len_2 + 1):
            if word1[i - 1] == word2[j - 1]:
                cost = 0
            else:
                cost = qwerty_distance(word1[i - 1], word2[j - 1])

            # Транспонування
            if i > 1 and j > 1 and word1[i - 1] == word2[j - 2] and word1[i - 2] == word2[j - 1]:
                transposition_cost = dist[i - 2][j - 2] + cost
                dist[i][j] = min(dist[i][j], transposition_cost)

            dist[i][j] = min(dist[i][j], dist[i - 1][j] + 1,     
                             dist[i][j - 1] + 1,                 
                             dist[i - 1][j - 1] + cost)         

    return dist[len_1][len_2]

word1 = "magic"
word2 = "people"
print("Length between '{}' and '{}' is {}".format(word1, word2, modified_wagner_fischer(word1, word2)))

Length between 'magic' and 'people' is 0


Second task

In [17]:
nltk.download('gutenberg')
text = gutenberg.raw('melville-moby_dick.txt')
words = word_tokenize(text)

bag_of_words = Counter(words)

print("Bag-of-Words:")
print(bag_of_words)

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


Bag-of-Words:


In [18]:
nltk.download('gutenberg')
text = gutenberg.raw('melville-moby_dick.txt')
words = word_tokenize(text)
word_counts = Counter(words)
N = len(word_counts)
print("There are", N, "words in vocabulary")

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\user\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


There are 20742 words in vocabulary


In [19]:
d = gutenberg.raw('melville-moby_dick.txt')
words = d.split()

word_counts = Counter(words)
for word, count in word_counts.items():
    print(f"The word '{word}' occurs {count} times.")

The word '[Moby' occurs 1 times.
The word 'Dick' occurs 33 times.
The word 'by' occurs 1093 times.
The word 'Herman' occurs 1 times.
The word 'Melville' occurs 1 times.
The word '1851]' occurs 1 times.
The word 'ETYMOLOGY.' occurs 1 times.
The word '(Supplied' occurs 2 times.
The word 'a' occurs 4472 times.
The word 'Late' occurs 1 times.
The word 'Consumptive' occurs 1 times.
The word 'Usher' occurs 1 times.
The word 'to' occurs 4439 times.
The word 'Grammar' occurs 1 times.
The word 'School)' occurs 1 times.
The word 'The' occurs 531 times.
The word 'pale' occurs 12 times.
The word 'Usher--threadbare' occurs 1 times.
The word 'in' occurs 3824 times.
The word 'coat,' occurs 8 times.
The word 'heart,' occurs 17 times.
The word 'body,' occurs 19 times.
The word 'and' occurs 5881 times.
The word 'brain;' occurs 6 times.
The word 'I' occurs 1724 times.
The word 'see' occurs 190 times.
The word 'him' occurs 554 times.
The word 'now.' occurs 19 times.
The word 'He' occurs 187 times.
The wor