<a href="https://colab.research.google.com/github/mralamdari/NLP-Probabilistic-Models/blob/main/NLP_Probabilistic_Models_Beginner.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Auto Correct

In [None]:
import os
import re
import collections
import numpy as np
import pandas as pd

In [None]:
!wget https://raw.githubusercontent.com/amanjeetsahu/Natural-Language-Processing-Specialization/master/Natural%20Language%20Processing%20with%20Probabilistic%20Models/Week%201/shakespeare.txt

In [None]:
def process_data(path):
  """Read a text file and return its lower-cased word tokens.

  Args:
    path: Path of the text file to read.

  Returns:
    A list of strings, in file order, one for each maximal run of regex
    word characters (letters, digits and underscore) in the lower-cased text.
  """
  # Explicit encoding so tokenization does not depend on the platform default.
  with open(path, 'r', encoding='utf-8') as f:
    data = f.read().lower()
  # Raw string: in a plain literal '\w' is an invalid escape sequence, which
  # newer Python versions warn about.
  return re.findall(r'\w+', data)

In [None]:
# Relative path: the wget cell saves the corpus into the current working
# directory, so this works on Colab (cwd is /content) and on a local kernel.
path = 'shakespeare.txt'
word_list = process_data(path)
vocab = set(word_list)  # unique words of the corpus; used as the vocabulary
print(word_list[0:12])

['o', 'for', 'a', 'muse', 'of', 'fire', 'that', 'would', 'ascend', 'the', 'brightest', 'heaven']


In [None]:
print(len(vocab))

6116


In [None]:
def frequency_table(word_list):
  """Count occurrences of each item in ``word_list``.

  Args:
    word_list: Iterable of hashable items (here, word tokens).

  Returns:
    A ``collections.Counter`` mapping each distinct item to its count.
  """
  counts = collections.Counter()
  counts.update(word_list)
  return counts

In [None]:
freq_dict = frequency_table(word_list)

In [None]:
freq_dict

In [None]:
def probability_table(freq_dict):
  """Convert a word-count table into a table of relative frequencies.

  Args:
    freq_dict: Mapping from word to count (a dict or ``collections.Counter``).

  Returns:
    A new mapping of the same type as ``freq_dict`` (built via ``.copy()``)
    where every count is divided by the sum of all counts. The input mapping
    is left unmodified.
  """
  # Compute the denominator once, before any value changes. Summing inside
  # the loop while overwriting counts with probabilities skews every entry
  # after the first and makes the loop quadratic.
  total = sum(freq_dict.values())
  # Copy so the caller's counts survive and the mapping type is preserved
  # (a Counter result still yields 0 for unknown words).
  prob_dict = freq_dict.copy()
  for key, value in freq_dict.items():
    prob_dict[key] = value / total
  return prob_dict

In [None]:
prob_dict = probability_table(freq_dict)

In [None]:
prob_dict

# String Manipulations

In [None]:
def edit_delete(word):
  """Generate all strings obtained by deleting one character from ``word``.

  Args:
    word: The input string.

  Returns:
    A tuple ``(split_list, delete_list)``:
      * ``split_list`` holds every ``(left, right)`` split of ``word``,
        including the ones with an empty left or right part.
      * ``delete_list`` holds, for each position in ``word``, the word with
        the character at that position removed (``len(word)`` entries).
  """
  split_list = [(word[:i], word[i:]) for i in range(len(word) + 1)]
  # Only splits with a non-empty right part have a character to drop. The
  # final (word, '') split would otherwise contribute the unchanged word.
  delete_list = [left + right[1:] for left, right in split_list if right]
  return split_list, delete_list

In [None]:
split_list, delete_list = edit_delete('cans')

In [None]:
split_list

[('', 'cans'), ('c', 'ans'), ('ca', 'ns'), ('can', 's'), ('cans', '')]

In [None]:
delete_list

['ans', 'cns', 'cas', 'can', 'cans']

In [None]:
def edit_switch(word):
  """Generate all strings obtained by swapping two adjacent characters.

  Args:
    word: The input string.

  Returns:
    A tuple ``(splits, swapped)``: every ``(head, tail)`` split of ``word``
    (empty parts included), and, for each split whose tail has at least two
    characters, the word with the first two tail characters exchanged.
  """
  splits = [(word[:k], word[k:]) for k in range(len(word) + 1)]
  swapped = [
      head + tail[1] + tail[0] + tail[2:]
      for head, tail in splits
      if len(tail) >= 2
  ]
  return splits, swapped

In [None]:
split_list, switch_list = edit_switch('eta')

In [None]:
split_list

[('', 'eta'), ('e', 'ta'), ('et', 'a'), ('eta', '')]

In [None]:
switch_list

['tea', 'eat']

In [None]:
def edit_replace(word):
  """Generate all strings obtained by replacing one character of ``word``.

  Each position is replaced, one at a time, by every lowercase ASCII letter.
  Replacing a character by itself is included, so the original word appears
  in the result whenever it contains a lowercase ASCII letter.

  Args:
    word: The input string.

  Returns:
    A tuple ``(split_list, replace_list)``:
      * ``split_list`` holds the ``(left, right)`` splits of ``word`` with a
        non-empty right part (one per character position).
      * ``replace_list`` is the sorted list of distinct replacement results.
  """
  chars = 'abcdefghijklmnopqrstuvwxyz'
  split_list = [(word[:i], word[i:]) for i in range(len(word))]
  # Rebuild the word around the split so only the character at this position
  # changes; str.replace would rewrite every occurrence of that character.
  replaced = {
      left + c + right[1:]
      for left, right in split_list
      for c in chars
  }
  return split_list, sorted(replaced)

In [None]:
split_list, switch_list = edit_replace('can')

In [None]:
split_list

[('', 'can'), ('c', 'an'), ('ca', 'n')]

In [None]:
switch_list

In [None]:
def edit_insert(word):
  """Generate all strings obtained by inserting one lowercase letter.

  Args:
    word: The input string.

  Returns:
    A tuple ``(splits, inserted)``: every ``(head, tail)`` split of ``word``
    (empty parts included), and, for each split in order, the 26 strings
    formed by placing each letter a-z between head and tail. Duplicates are
    kept.
  """
  alphabet = 'abcdefghijklmnopqrstuvwxyz'
  splits = [(word[:pos], word[pos:]) for pos in range(len(word) + 1)]
  inserted = [
      head + letter + tail
      for head, tail in splits
      for letter in alphabet
  ]
  return splits, inserted

In [None]:
split_list, switch_list = edit_insert('at')

In [None]:
split_list

[('', 'at'), ('a', 't'), ('at', '')]

In [None]:
switch_list

In [None]:
def edit_letters(word,  allow_switches=True):
  """Collect every string produced by one edit operation on ``word``.

  Args:
    word: The input string.
    allow_switches: When true, results of ``edit_switch`` are included in
      addition to those of ``edit_delete``, ``edit_replace`` and
      ``edit_insert``.

  Returns:
    A set with the second element (the edited strings) returned by each of
    the enabled edit helpers.
  """
  _, deletes = edit_delete(word)
  _, replaces = edit_replace(word)
  _, inserts = edit_insert(word)
  switches = edit_switch(word)[1] if allow_switches else []
  return set(deletes).union(switches, replaces, inserts)

In [None]:
def edit_2_letters(word, allow_switches=True):
  """Collect every string produced by applying ``edit_letters`` twice.

  Args:
    word: The input string.
    allow_switches: Forwarded to ``edit_letters`` for both edit passes.

  Returns:
    A set with the union of ``edit_letters(w, allow_switches)`` over every
    non-empty string ``w`` in ``edit_letters(word, allow_switches)``.
  """
  two_edit_words = set()
  # Honour the caller's flag on the first pass too (it used to be forced on).
  for once_edited in edit_letters(word, allow_switches):
    # Empty intermediate strings are skipped rather than edited further.
    if once_edited:
      two_edit_words.update(edit_letters(once_edited, allow_switches))
  return two_edit_words

In [None]:
edit_letters = edit_2_letters('at')
len(edit_letters)

7154

In [None]:
edit_letters

In [None]:
def spelling_suggestions(word, vocab, allow_switches=True):
  """Suggest vocabulary words that are within two edits of ``word``.

  Probabilities are looked up in the notebook-level ``prob_dict`` table.

  Args:
    word: The (possibly misspelled) input string.
    vocab: Iterable of known words; only candidates contained in it are
      suggested.
    allow_switches: Forwarded to ``edit_2_letters``.

  Returns:
    A list of ``(candidate, probability)`` tuples for candidates with a
    non-zero probability, sorted by decreasing probability and then
    alphabetically so the order is stable across runs.
  """
  candidates = edit_2_letters(word, allow_switches).intersection(vocab)
  suggestion_list = []
  for candidate in candidates:
    # .get() so a plain dict works as well as a Counter for unknown words.
    candidate_p = prob_dict.get(candidate, 0)
    if candidate_p != 0:
      suggestion_list.append((candidate, candidate_p))
  # Set iteration order is arbitrary; sort for a deterministic, useful order.
  suggestion_list.sort(key=lambda pair: (-pair[1], pair[0]))
  return suggestion_list

In [None]:
spelling_suggestions('pree', vocab)

In [None]:
spelling_suggestions('przd', vocab)

In [None]:
spelling_suggestions('przdq', vocab)

[('pride', 0.0006648254919146472)]

In [None]:
def min_edit_distance(source, target, insert_cost=1, delete_cost=1, replace_cost=2):
  """Compute the minimum edit distance between two strings.

  Fills a dynamic-programming table where cell ``[i, j]`` is the cheapest
  cost of turning ``source[:i]`` into ``target[:j]``.

  Args:
    source: The starting string.
    target: The string to reach.
    insert_cost: Cost of inserting one character.
    delete_cost: Cost of deleting one character.
    replace_cost: Cost of replacing one character by a different one
      (matching characters cost 0).

  Returns:
    A tuple ``(mat, min_distance)``: the float matrix of shape
    ``(len(source) + 1, len(target) + 1)`` and its bottom-right entry.
  """
  width = len(target) + 1
  height = len(source) + 1

  mat = np.zeros((height, width))

  # Initialization: reaching target[:j] from '' takes j insertions, and
  # reducing source[:i] to '' takes i deletions, each at its own cost.
  mat[0, :] = np.arange(width) * insert_cost
  mat[:, 0] = np.arange(height) * delete_cost

  # Fill the rest of the table row by row.
  for i in range(1, height):
    for j in range(1, width):
      delete_distance = mat[i-1, j] + delete_cost
      insert_distance = mat[i, j-1] + insert_cost
      replace_param = replace_cost if source[i-1] != target[j-1] else 0
      replace_distance = mat[i-1, j-1] + replace_param
      mat[i, j] = min(insert_distance, delete_distance, replace_distance)

  # Read the answer from the table so it is defined even when one of the
  # strings is empty and the loops above never run.
  min_distance = mat[height-1, width-1]
  return mat, min_distance

In [None]:
distance_mat, min_distance = min_edit_distance('play', 'stay', insert_cost=1, delete_cost=1, replace_cost=2)

In [None]:
distance_mat

array([[0., 1., 2., 3., 4.],
       [1., 2., 3., 4., 5.],
       [2., 3., 4., 5., 6.],
       [3., 4., 5., 4., 5.],
       [4., 5., 6., 5., 4.]])

In [None]:
min_distance

4.0

In [None]:
distance_mat, min_distance = min_edit_distance('plzub', 'cloud', insert_cost=1, delete_cost=1, replace_cost=2)

In [None]:
distance_mat

array([[0., 1., 2., 3., 4., 5.],
       [1., 2., 3., 4., 5., 6.],
       [2., 3., 2., 3., 4., 5.],
       [3., 4., 3., 4., 5., 6.],
       [4., 5., 4., 5., 4., 5.],
       [5., 6., 5., 6., 5., 6.]])

In [None]:
min_distance

6.0

In [None]:
distance_mat, min_distance = min_edit_distance('eer', 'near', insert_cost=1, delete_cost=1, replace_cost=2)

In [None]:
distance_mat

array([[0., 1., 2., 3., 4.],
       [1., 2., 1., 2., 3.],
       [2., 3., 2., 3., 4.],
       [3., 4., 3., 4., 3.]])

In [None]:
min_distance

3.0