in this notebook i am trying to use this solution : https://medium.com/swlh/a-machine-learning-model-to-understand-fancy-abbreviations-trained-on-tolkien-36601b73ecbb  and https://stackoverflow.com/questions/43510778/python-how-to-intuit-word-from-abbreviated-text-using-nlp and code from : https://github.com/avidale/weirdMath/blob/master/nlp/abbreviation_spellchecker_english.ipynb on a romanic bangla spellchecker task

Dataset link : https://www.kaggle.com/mobassir/romanic-bangla-songs-lyrics or https://github.com/mobassir94/Romanic-Bangla-spell-checker/blob/main/RomanicBanglaSongsLyrics.txt 

In [1]:
from collections import defaultdict, Counter
import numpy as np
import pandas as pd

class LanguageNgramModel:
    """ Remember and predict which letters usually follows which. """
    def __init__(self, order=1, smoothing=1.0, recursive=0.001):
        self.order = order
        self.smoothing = smoothing
        self.recursive = recursive
    
    def fit(self, corpus):
        """ Estimate all counts on a text """
        self.counter_ = defaultdict(lambda: Counter())
        self.unigrams_ = Counter()
        self.vocabulary_ = set()
        for i, token in enumerate(corpus[self.order:]):
            context = corpus[i:(i+self.order)]
            self.counter_[context][token] += 1
            self.unigrams_[token] +=1
            self.vocabulary_.add(token)
        self.vocabulary_ = sorted(list(self.vocabulary_))
        if self.recursive > 0 and self.order > 0:
            self.child_ = LanguageNgramModel(self.order-1, self.smoothing, self.recursive)
            self.child_.fit(corpus)
            
    def get_counts(self, context):
        """ Get smoothed count of each letter appearing after context """
        if self.order:
            local = context[-self.order:]
        else:
            local = ''
        freq_dict = self.counter_[local]
        freq = pd.Series(index=self.vocabulary_)
        for i, token in enumerate(self.vocabulary_):
            freq[token] = freq_dict[token] + self.smoothing
        if self.recursive > 0 and self.order > 0:
            child_freq = self.child_.get_counts(context) * self.recursive
            freq += child_freq
        return freq
    
    def predict_proba(self, context):
        """ Get smoothed probability of each letter appearing after context """
        counts = self.get_counts(context)
        return counts / counts.sum()
    
    def single_log_proba(self, context, continuation):
        """ Estimate log-probability that context is followed by continuation """
        result = 0.0
        for token in continuation:
            result += np.log(self.predict_proba(context)[token])
            context += token
        return result
    
    def single_proba(self, context, continuation):
        """ Estimate probability that context is followed by continuation """
        return np.exp(self.single_log_proba(context, continuation))

In [2]:
class MissingLetterModel:
    """ Remember and predict which letters are usually missing. """
    def __init__(self, order=0, smoothing_missed=0.3, smoothing_total=1.0):
        self.order = order
        self.smoothing_missed = smoothing_missed
        self.smoothing_total = smoothing_total
    def fit(self, sentence_pairs):
        self.missed_counter_ = defaultdict(lambda: Counter())
        self.total_counter_ = defaultdict(lambda: Counter())
        for (original, observed) in sentence_pairs:
            for i, (original_letter, observed_letter) in enumerate(zip(original[self.order:], observed[self.order:])):
                context = original[i:(i+self.order)]
                if observed_letter == '-':
                    self.missed_counter_[context][original_letter] += 1
                self.total_counter_[context][original_letter] += 1 
    def predict_proba(self, context, last_letter):
        """ Estimate probability that last_letter after context is missed """
        if self.order:
            local = context[-self.order:]
        else:
            local = ''
        missed_freq = self.missed_counter_[local][last_letter] + self.smoothing_missed
        total_freq = self.total_counter_[local][last_letter] + self.smoothing_total
        return missed_freq / total_freq
    
    def single_log_proba(self, context, continuation, actual=None):
        """ Estimate log-probability of continuaton being distorted to actual after context. 
        If actual is None, assume no distortion
        """
        if not actual:
            actual = continuation
        result = 0.0
        for orig_token, act_token in zip(continuation, actual):
            pp = self.predict_proba(context, orig_token)
            if act_token == '-':
                pp = 1 - pp
            result += np.log(pp)
            context += orig_token
        return result
    
    def single_proba(self, context, continuation, actual=None):
        """ Estimate probability of continuaton being distorted to actual after context. 
        If actual is None, assume no distortion
        """
        return np.exp(self.single_log_proba(context, continuation, actual))

In [3]:
lang_model = LanguageNgramModel(1)
lang_model.fit(' abracadabra ')
lang_model.predict_proba(' bra')



     0.181777
a    0.091297
b    0.272529
c    0.181686
d    0.181686
r    0.091025
dtype: float64

In [4]:
missed_model = MissingLetterModel(0)
missed_model.fit([('abracadabra', 'abr-c-d-br-')]) 
missed_model.predict_proba('abr', 'a'), missed_model.predict_proba('abr', 'b')

(0.7166666666666667, 0.09999999999999999)

In [5]:
missed_model.single_proba('', 'abra', 'abr-')

0.0020305555555555532

In [6]:
from heapq import heappush, heappop

In [7]:
def generate_options(prefix_proba, prefix, suffix, lang_model, missed_model, optimism=0.5, cache=None):
    options = []
    for letter in lang_model.vocabulary_ + ['']:
        if letter:  # assume a missing letter
            next_letter = letter
            new_suffix = suffix
            new_prefix = prefix + next_letter
            proba_missing_state = - np.log(missed_model.predict_proba(prefix, letter))
        else:  # assume no missing letter
            next_letter = suffix[0]
            new_suffix = suffix[1:]
            new_prefix = prefix + next_letter
            proba_missing_state = - np.log((1 - missed_model.predict_proba(prefix, next_letter)))
        proba_next_letter = - np.log(lang_model.single_proba(prefix, next_letter))
        if cache:
            proba_suffix = cache[len(new_suffix)] * optimism
        else:
            proba_suffix = - np.log(lang_model.single_proba(new_prefix, new_suffix)) * optimism
        proba = prefix_proba + proba_next_letter + proba_missing_state + proba_suffix
        options.append((proba, new_prefix, new_suffix, letter, proba_suffix))
    return options
print(generate_options(0, ' ', 'brac ', lang_model, missed_model))

[(6.929663174828117, '  ', 'brac ', ' ', 3.7800651217336947), (5.042879645338754, ' a', 'brac ', 'a', 3.4572571306016755), (8.09487194753453, ' b', 'brac ', 'b', 3.846661605771999), (7.623807861705187, ' c', 'brac ', 'c', 3.7800651217336947), (7.623807861705187, ' d', 'brac ', 'd', 3.7800651217336947), (8.09487194753453, ' r', 'brac ', 'r', 3.846661605771999), (4.858238261775765, ' b', 'rac ', '', 2.8072524973494524)]




In [8]:
def noisy_channel(word, lang_model, missed_model, freedom=1.0, max_attempts=1000, optimism=0.1, verbose=True):
    query = word + ' '
    prefix = ' '
    prefix_proba = 0.0
    suffix = query
    full_origin_logprob = -lang_model.single_log_proba(prefix, query)
    no_missing_logprob = -missed_model.single_log_proba(prefix, query)
    best_logprob = full_origin_logprob + no_missing_logprob
    # add empty beginning to the heap
    heap = [(best_logprob * optimism, prefix, suffix, '', best_logprob * optimism)]
    # add the default option (no missing letters) to candidates
    candidates = [(best_logprob, prefix + query, '', None, 0.0)]
    if verbose:
        # todo: include distortion probability
        print('baseline score is', best_logprob)
    # prepare cache for suffixes (the slowest operation)
    cache = {}
    for i in range(len(query)+1):
        future_suffix = query[:i]
        cache[len(future_suffix)] = -lang_model.single_log_proba('', future_suffix) # rough approximation
        cache[len(future_suffix)] += -missed_model.single_log_proba('', future_suffix) # at least add missingness
    
    for i in range(max_attempts):
        if not heap:
            break
        next_best = heappop(heap)
        if verbose:
            print(next_best)
        if next_best[2] == '':  # it is a leaf
            # this is the best leaf as far, add it to candidates
            if next_best[0] <= best_logprob + freedom:
                candidates.append(next_best)
                # update the best likelihood
                if next_best[0] < best_logprob:
                    best_logprob = next_best[0]
        else: # it is not a leaf - generate more options
            prefix_proba = next_best[0] - next_best[4] # all proba estimate minus suffix
            prefix = next_best[1]
            suffix = next_best[2]
            new_options = generate_options(prefix_proba, prefix, suffix, lang_model, missed_model, optimism, cache)
            # add only the solution potentioally no worse than the best + freedom
            for new_option in new_options: 
                if new_option[0] < best_logprob + freedom:
                    heappush(heap, new_option)
    if verbose:
        print('heap size is', len(heap), 'after', i, 'iterations')
    result = {}
    for candidate in candidates:
        if candidate[0] <= best_logprob + freedom:
            result[candidate[1][1:-1]] = candidate[0]
    return result

In [9]:
result = noisy_channel('brc', lang_model, missed_model, freedom=2.0, optimism=0.5, verbose=True)
print(result)



baseline score is 14.659531132722798
(7.329765566361399, ' ', 'brc ', '', 7.329765566361399)
(7.729102491649175, ' b', 'rc ', '', 5.6781167272228625)
(6.82819709010665, ' br', 'c ', '', 3.689648873198813)
(7.4281382278577714, ' brc', ' ', '', 2.0472553582899407)
(7.68318306227505, ' brc ', '', '', -0.0)
(8.142544971129297, ' bra', 'c ', 'a', 3.689648873198813)
(8.36814476033081, ' brac', ' ', '', 2.0472553582899407)
(8.623189594748087, ' brac ', '', '', -0.0)
(8.838538268507152, ' a', 'brc ', 'a', 7.252915753770074)
(8.669109024122214, ' ab', 'rc ', '', 5.6781167272228625)
(7.768203622579689, ' abr', 'c ', '', 3.689648873198813)
(8.36814476033081, ' abrc', ' ', '', 2.0472553582899407)
(8.623189594748087, ' abrc ', '', '', -0.0)
(9.013760742594851, ' brca', ' ', 'a', 2.0472553582899407)
(9.028155327065601, ' brca ', '', '', -0.0)
(9.082551503602335, ' abra', 'c ', 'a', 3.689648873198813)
(9.30815129280385, ' abrac', ' ', '', 2.0472553582899407)
(9.563196127221126, ' abrac ', '', '', -0.

In [10]:
#https://stackoverflow.com/questions/42339876/error-unicodedecodeerror-utf-8-codec-cant-decode-byte-0xff-in-position-0-in

path = '/content/RomanicBanglaSongsLyrics.txt' #'/content/02 - The Two Towers.txt'

#with open(path , "rb") as f: # encoding = 'utf-8'
with open(path, encoding="utf8", errors='ignore') as f:
  text = f.read()
import re
text2 = re.sub(r'[^a-z ]+', '', text.lower().replace('\n', ' '))
print(text2[0:100])

indubala go o tumi kon akashe thako joshna kaare makho kaar uthone poro jhoriya dubiya morilam moriy


In [11]:
all_letters = ''.join(list(sorted(list(set(text2)))))
print(repr(all_letters))

' abcdefghijklmnoprstuvwxyz'


In [12]:
missing_set = [
] + [(all_letters, '-' * len(all_letters))] * 3 + [(all_letters, all_letters)] * 10 + [('aeiouy', '------')] * 30

In [13]:
for i in range(5):
    tmp = LanguageNgramModel(i, 1.0, 0.001)
    tmp.fit(text2[0:-5000])
    print(i, tmp.single_log_proba(' ', text2[-5000:]))



0 -13758.807954992411
1 -11061.652624343682
2 -10221.536405370183
3 -9850.448507206293
4 -10931.706338065407


In [14]:
big_lang_m = LanguageNgramModel(4, 0.001, 0.01)
big_lang_m.fit(text2)
big_err_m = MissingLetterModel(0, 0.1)
big_err_m.fit(missing_set)

In [15]:
noisy_channel('sm', big_lang_m, big_err_m, max_attempts=10000, optimism=0.9, freedom=3.0, verbose=False)



{'sm': 12.720902147980569,
 'smi': 13.159121250759666,
 'sms': 12.42163896407554,
 'somo': 11.903848945026619,
 'somoy': 10.248452872881996}

In [17]:
noisy_channel('rng', big_lang_m, big_err_m, max_attempts=10000, optimism=0.9, freedom=3.0, verbose=False)



{'rang': 11.646567206156382, 'ranga': 9.565287173166471}

In [19]:
noisy_channel('btl', big_lang_m, big_err_m, max_attempts=10000, optimism=0.9, freedom=3.0, verbose=False)



{'batal': 14.950405429262126,
 'btl': 16.183094115328966,
 'btla': 17.83495639320846,
 'btle': 16.726944475787985,
 'btli': 17.839844858833537,
 'btlo': 16.715150594367298}

In [24]:
noisy_channel('kala', big_lang_m, big_err_m, max_attempts=10000, optimism=0.9, freedom=3.0, verbose=False)




{'kala': 13.810543964342386}

In [29]:
noisy_channel('mntn', big_lang_m, big_err_m, max_attempts=10000, optimism=0.9, freedom=3.0, verbose=False)




{'montron': 14.023660631079446, 'montrona': 15.547536742377916}

In [25]:
!!apt install enchant
!pip install pyenchant==3.0.0

Collecting pyenchant==3.0.0
[?25l  Downloading https://files.pythonhosted.org/packages/4c/de/3c676e57059c8b958a6f7e00e99ac2c1587c0782d9e157beaf1497479504/pyenchant-3.0.0-py3-none-any.whl (56kB)
[K     |█████▉                          | 10kB 16.9MB/s eta 0:00:01[K     |███████████▋                    | 20kB 22.2MB/s eta 0:00:01[K     |█████████████████▌              | 30kB 13.3MB/s eta 0:00:01[K     |███████████████████████▎        | 40kB 10.2MB/s eta 0:00:01[K     |█████████████████████████████▏  | 51kB 5.6MB/s eta 0:00:01[K     |████████████████████████████████| 61kB 3.8MB/s 
[?25hInstalling collected packages: pyenchant
Successfully installed pyenchant-3.0.0


# Trying Banglish Spellchecker again using Levenshtein distance :  https://github.com/mobassir94/banglishChecker

In [30]:

ben = []
banglishsongs = "/content/RomanicBanglaSongsLyrics.txt"
text_file = open(banglishsongs, "r")
lines = text_file.readlines()
for i  in range(len(lines)):
  data = lines[i].rstrip("\n")
  data = data.split('\t')
  ben.append(data[0])
text_file.close()

In [32]:
len(ben)

6288

In [33]:
ben[0]

'Indubala Go O..'

In [37]:
ben = " ".join(ben)

In [38]:

!pip install python-Levenshtein
import Levenshtein as lev

def getMatch(string1,string2):
    min_sim = .80
    output = []
    res = [[lev.jaro_winkler(x,y) for x in string1.split()] for y in string2.split()]
    print(res)
    for x in res:
        if max(x) >= min_sim:
            output.append(string1.split()[x.index(max(x))])
    return output



In [41]:
getMatch(ben, "mntn")

[[0.4583333333333333, 0.0, 0.0, 0.5, 0.5277777777777778, 0.0, 0.48333333333333334, 0.47222222222222215, 0.0, 0.535, 0.0, 0.611111111111111, 0.0, 0.0, 0.0, 0.5125, 0.0, 0.0, 0.0, 0.5125, 0.0, 0.0, 0.47222222222222215, 0.0, 0.0, 0.4583333333333333, 0.0, 0.4583333333333333, 0.0, 0.48333333333333334, 0.0, 0.0, 0.47222222222222215, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.55, 0.5952380952380952, 0.0, 0.0, 0.0, 0.46428571428571425, 0.0, 0.47222222222222215, 0.6333333333333333, 0.48333333333333334, 0.48333333333333334, 0.47222222222222215, 0.48333333333333334, 0.48333333333333334, 0.0, 0.0, 0.5277777777777778, 0.0, 0.5277777777777778, 0.0, 0.5, 0.0, 0.0, 0.5, 0.48333333333333334, 0.5, 0.48333333333333334, 0.0, 0.0, 0.0, 0.0, 0.0, 0.48333333333333334, 0.0, 0.0, 0.48333333333333334, 0.45, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.0, 0.5, 0.48333333333333334, 0.5, 0.48333333333333334, 0.0, 0.0, 0.0, 0.5, 0.0, 0.47222222222222215, 0.0, 0.0, 0.5, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.611111111111111, 0.0, 0.464285714

['monta']

In [42]:
getMatch(ben, "kala")

[[0.4583333333333333, 0.0, 0.0, 0.0, 0.575, 0.638888888888889, 0.48333333333333334, 0.47222222222222215, 0.8266666666666667, 0.4666666666666666, 0.6666666666666666, 0.0, 0.0, 0.0, 0.47222222222222215, 0.5833333333333334, 0.47222222222222215, 0.5952380952380952, 0.47222222222222215, 0.5833333333333334, 0.47222222222222215, 0.5952380952380952, 0.47222222222222215, 0.0, 0.4583333333333333, 0.4583333333333333, 0.0, 0.4583333333333333, 0.0, 0.0, 0.638888888888889, 0.47222222222222215, 0.0, 0.0, 0.0, 0.7222222222222222, 0.0, 0.6944444444444443, 0.55, 0.55, 0.0, 0.0, 0.6722222222222222, 0.47222222222222215, 0.47222222222222215, 0.0, 0.48333333333333334, 0.611111111111111, 0.0, 0.8266666666666667, 0.535, 0.611111111111111, 0.6333333333333333, 0.6333333333333333, 0.8, 0.0, 0.0, 0.5, 0.0, 0.5277777777777778, 0.0, 0.7833333333333333, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5, 0.0, 0.48333333333333334, 0.48333333333333334, 0.0, 0.48333333333333334, 0.47222222222222215, 0.4666666666666666, 0.0, 0.0, 0.0, 0.4833

['kala']

In [43]:
getMatch(ben, "rng")

[[0.4861111111111111, 0.0, 0.0, 0.0, 0.5555555555555555, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5277777777777778, 0.0, 0.0, 0.4861111111111111, 0.5, 0.0, 0.0, 0.4861111111111111, 0.5, 0.0, 0.0, 0.5111111111111111, 0.4861111111111111, 0.4861111111111111, 0.0, 0.4861111111111111, 0.0, 0.5111111111111111, 0.0, 0.0, 0.5, 0.0, 0.5277777777777778, 0.0, 0.5111111111111111, 0.0, 0.0, 0.0, 0.5277777777777778, 0.4920634920634921, 0.0, 0.0, 0.0, 0.48412698412698413, 0.0, 0.5, 0.5111111111111111, 0.0, 0.0, 0.5, 0.6888888888888888, 0.6888888888888888, 0.0, 0.5277777777777778, 0.5555555555555555, 0.0, 0.5555555555555555, 0.0, 0.5555555555555555, 0.0, 0.0, 0.5277777777777778, 0.5111111111111111, 0.5555555555555555, 0.5111111111111111, 0.0, 0.5428571428571429, 0.5111111111111111, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.0, 0.5555555555555555, 0.0, 0.0, 0.0, 0.7222222222222222, 0.0, 0.0, 0.5277777777777778, 0.5111111111111111, 0.5555555555555555, 0.5111111111111111, 0.0, 0.5599999999999999, 0.5, 0.527777777777777

['rong']

In [26]:
#https://www.geeksforgeeks.org/python-spelling-checker-using-enchant/


# import the enchant module 
import enchant 
  
# create dictionary for the language 
# in use(en_US here) 
dict = enchant.Dict("en_US") 
  
# list of words 
words = ["cmputr", "watr", "study", "wrte"] 
  
# find those words that may be misspelled  
misspelled =[] 
for word in words: 
    if dict.check(word) == False: 
        misspelled.append(word) 
print("The misspelled words are : " + str(misspelled)) 
  
# suggest the correct spelling of 
# the missplelled words 
for word in misspelled: 
    print("Suggestion for " + word + " : " + str(dict.suggest(word))) 


The misspelled words are : ['cmputr', 'watr', 'wrte']
Suggestion for cmputr : ['computer']
Suggestion for watr : ['wart', 'watt', 'war', 'water']
Suggestion for wrte : ['rte', 'write', 'wrote', 'w rte']


In [27]:
!pip install autocorrect
from autocorrect import Speller

spell = Speller(lang='en')

print(spell('mntin'))
print(spell('mussage'))
print(spell('survice'))
print(spell('hte'))

Collecting autocorrect
[?25l  Downloading https://files.pythonhosted.org/packages/a0/71/eb8c1f83439dfe6cbe1edb03be1f1110b242503b61950e7c292dd557c23e/autocorrect-2.2.2.tar.gz (621kB)
[K     |████████████████████████████████| 624kB 5.4MB/s 
[?25hBuilding wheels for collected packages: autocorrect
  Building wheel for autocorrect (setup.py) ... [?25l[?25hdone
  Created wheel for autocorrect: filename=autocorrect-2.2.2-cp36-none-any.whl size=621491 sha256=66aa405d96aaa38cf99bbfc816a08d5c38943b76a7eaa0c0ce553b0ca4790db6
  Stored in directory: /root/.cache/pip/wheels/b4/0b/7d/98268d64c8697425f712c897265394486542141bbe4de319d6
Successfully built autocorrect
Installing collected packages: autocorrect
Successfully installed autocorrect-2.2.2
until
message
service
the


In [28]:
!pip install nltk
import nltk
from nltk.corpus import words
nltk.download('words')
"would" in words.words()

[nltk_data] Downloading package words to /root/nltk_data...
[nltk_data]   Unzipping corpora/words.zip.


True