# **Programming Assessment \#4**

Names: MAJARREIS, Isaiah Thane | RUELOS, Miguel Antonio

More information on the assessment is found in our Canvas course.

# **Load Data**

*While you don't have to separate your code into blocks, it might be easier if you separated loading your data from actually implementation of your code. Consider placing all loading of data into the code block below.*

In [4]:
from collections import defaultdict
import nltk
from nltk.corpus import gutenberg
from nltk.tokenize import word_tokenize
from nltk.metrics import edit_distance
from tabulate import tabulate

# Error Model
error_model = {}

with open('count_1edit.txt', 'r') as file:
    for line in file:
        parts = line.strip().split(' ')
        if len(parts) == 2:
            wrong_right, count = parts
            if '|' in wrong_right:
                wrong, right = wrong_right.split('|')
                if wrong and right:
                    if wrong not in error_model:
                        error_model[wrong] = {}
                    error_model[wrong][right] = int(count)
                else:
                    print(f"Issue with line: {line.strip()}")
            else:
                print(f"Issue with line: {line.strip()}")
        else:
            print(f"Issue with line: {line.strip()}")

# Language Model
nltk.download('gutenberg')
corpus = gutenberg.words()
vocab = set(corpus)
word_count = len(corpus)
word_frequency = nltk.FreqDist(corpus)

Issue with line: e|i	917
Issue with line: a|e	856
Issue with line: i|e	771
Issue with line: e|a	749
Issue with line: a|i	559
Issue with line: t|te	478
Issue with line: r|re	392
Issue with line: s|c	383
Issue with line: e|ea	354
Issue with line: a|o	353
Issue with line: o|a	352
Issue with line: a|al	352
Issue with line: i|a	313
Issue with line: re|r	299
Issue with line: e|o	295
Issue with line: ea|e	285
Issue with line: te|t	271
Issue with line: l|le	263
Issue with line: e|es	258
Issue with line: o|ou	235
Issue with line: n|m	230
Issue with line: o|e	216
Issue with line: a|ai	216
Issue with line: le|l	215
Issue with line: n|ne	213
Issue with line: e|er	211
Issue with line: c|s	209
Issue with line: e|ed	206
Issue with line: a|ac	199
Issue with line: r|ri	194
Issue with line: er|re	189
Issue with line: c|ch	188
Issue with line: a|an	184
Issue with line: ne|n	174
Issue with line: o|u	171
Issue with line: ar|a	171
Issue with line: u|e	162
Issue with line: er|e	162
Issue with line: e|u	160
I

[nltk_data] Downloading package gutenberg to
[nltk_data]     C:\Users\isaia\AppData\Roaming\nltk_data...
[nltk_data]   Package gutenberg is already up-to-date!


# **Noisy Channel Model Implementation**

*Again, you don't have to follow this directly, but consider placing your implementation of the model in the code block below.*

In [5]:
# Spell Check Function
def spell_check(word):
    if word in vocab:
        return f"No errors found in '{word}'"

    candidates = {}
    for w in vocab:
        if abs(len(word) - len(w)) > 1:
            continue

        if edit_distance(word, w) == 1:
            candidates[w] = edit_distance(word, w)

    if not candidates:
        return f"No suggestions found for '{word}'"

    output_data = []
    for candidate, edit_type in candidates.items():
        # Error Model Probability: P(c)
        p_c = 1  # Placeholder for P(c) probability

        # Probability of generating the user-input word from the candidate: P(w|c)
        p_w_given_c = 1.0 / (word_count + 1)  # Laplace smoothing; initialize to avoid zero probability

        if word_frequency[candidate] > 0:
            p_w_given_c = word_frequency[candidate] / word_count

        # Combining P(c) and P(w|c) for each candidate
        combined_probability = p_c * p_w_given_c

        output_data.append([word, candidate, edit_type, f'...', p_c, p_w_given_c, combined_probability])

    # Sort suggestions by combined probability in descending order
    output_data.sort(key=lambda x: x[6], reverse=True)

    # Return the suggestion with the highest combined probability
    best_suggestion = output_data[0]
    return tabulate(output_data, headers=["word", "candidate", "edit_type", "edit", "P(c)", "P(w|c)", "P(c) x P(w|c)"], tablefmt="pretty")

# Example
input_word = "mohter"
result = spell_check(input_word)
print(result)

+--------+-----------+-----------+------+------+-----------------------+-----------------------+
|  word  | candidate | edit_type | edit | P(c) |        P(w|c)         |     P(c) x P(w|c)     |
+--------+-----------+-----------+------+------+-----------------------+-----------------------+
| mohter |  morter   |     1     | ...  |  1   | 4.195890087514824e-06 | 4.195890087514824e-06 |
+--------+-----------+-----------+------+------+-----------------------+-----------------------+
