## Create the dictionary file for labels (codes run first from A-Z and then from a-z)

In [50]:
# Build the label -> one-character code table from deps.txt (one dependency
# label per line). The context manager closes the file even if reading fails.
with open('deps.txt') as f:
    lines = f.read().splitlines()

deps_dict = {}
counter = ord('A')
for dep in lines:
    # Labels are normalised (spaces removed, lower-cased) before being keyed.
    deps_dict[dep.replace(' ', '').lower()] = chr(counter)
    # After 'Z' jump straight to 'a', skipping the punctuation in between.
    counter = ord('a') if counter == ord('Z') else counter + 1

# deprecated labels to be taken care of
deps_dict['npadvmod'] = 'b'
deps_dict['nmod'] = 'a'
print(deps_dict)

import numpy as np
np.save('deps_dict.npy', deps_dict)

{'nounmod': 'a', 'predet': 'l', 'dep': 'S', 'prep': 'm', 'nsubjpass': 'd', 'nsubj': 'c', 'acomp': 'B', 'appos': 'G', 'parataxis': 'g', 'root': 'r', 'ccomp': 'M', 'csubjpass': 'Q', 'relcl': 'q', 'case': 'K', 'pcomp': 'h', 'det': 'T', 'nmod': 'a', 'preconj': 'k', 'acl': 'A', 'advmod': 'D', 'auxpass': 'J', 'nummod': 'e', 'dobj': 'U', 'meta': 'Y', 'oprd': 'f', 'expl': 'V', 'csubj': 'P', 'agent': 'E', 'intj': 'W', 'aux': 'I', 'pobj': 'i', 'quantmod': 'p', 'advcl': 'C', 'amod': 'F', 'prt': 'n', 'conj': 'O', 'neg': 'Z', 'npmod': 'b', 'attr': 'H', 'poss': 'j', 'punct': 'o', 'compound': 'N', 'mark': 'X', 'cc': 'L', 'npadvmod': 'b', 'xcomp': 's', 'dative': 'R'}


## Sample: produce a sentence code

In [2]:
import spacy

# Load the small English pipeline and parse one short sentence.
nlp = spacy.load('en_core_web_sm')
doc = nlp("We are walking.")

# One lower-cased dependency label per token, in sentence order.
sent_struct = [token.dep_.lower() for token in doc]

# Concatenate the raw (not yet encoded) labels into a single string.
sentence_code = ''.join(sent_struct)
print(sentence_code)

nsubjauxrootpunct


In [20]:
def encode_sentence(english):
    """Encode an English sentence as a string of dependency-label codes.

    The sentence is parsed with the module-level ``nlp`` pipeline and each
    token's lower-cased dependency label is mapped to its one-character code
    through ``deps_dict``. A label missing from ``deps_dict`` raises
    ``KeyError``.
    """
    return ''.join(deps_dict[tok.dep_.lower()] for tok in nlp(english))

## First, create a dictionary of the sentence codes (with their occurrences)

In [4]:
import nltk
from nltk.corpus import brown
# NOTE(review): `nltk.tokenize.moses` appears to have been dropped from newer
# NLTK releases — confirm the installed NLTK version still provides it.
from nltk.tokenize.moses import MosesDetokenizer
# Detokenizer instance shared by the cells below.
mdetok = MosesDetokenizer()

In [5]:
# Maps each sentence code to the number of Brown sentences that produced it.
sentence_code_dict = {}

for book in brown.fileids():
    for sent in brown.sents(book):
        # Replace the `` / '' / ` quote tokens with plain quote characters.
        munged_sentence = ' '.join(sent).replace('``', '"').replace("''", '"').replace('`', "'")
        # Detokenize into a plain string, then parse it.
        payload = nlp(mdetok.detokenize(munged_sentence.split(), return_str=True))
        # One code character per token, taken from its dependency label.
        sent_struct = [deps_dict[token.dep_.lower()] for token in payload]
        sentence_code = ''.join(sent_struct)
        # Count the occurrence; an unseen code starts from 0.
        sentence_code_dict[sentence_code] = sentence_code_dict.get(sentence_code, 0) + 1

# print(sentence_code_dict)

In [102]:
custom_sentence = 'Ronaldo has been the most consistent performer today.'
payload = nlp(custom_sentence)
# Map every token's dependency label to its one-character code.
sent_struct = [deps_dict[token.dep_.lower()] for token in payload]
sentence_code = ''.join(sent_struct)
print(sentence_code)
# Record the occurrence; an unseen code starts from 0.
sentence_code_dict[sentence_code] = sentence_code_dict.get(sentence_code, 0) + 1

cIrTDFHbo


In [103]:
# Look up the stored occurrence count for one specific sentence code.
sentence_code_dict['TcIrmTio']

7

## Now this dictionary needs to be written to file (both as npy and text file)

In [104]:
# Write one "<code> <count>" line per sentence code. The context manager
# closes the file even if a write fails part-way through.
with open('sentence_codes.txt', 'w') as f:
    for key, value in sentence_code_dict.items():
        f.write(key + ' ' + str(value) + '\n')

import numpy as np
# Store the same mapping as an .npy file as well.
np.save('sentence_correction_dict.npy', sentence_code_dict)

## Watch sentence correction in action

In [26]:
import os

import numpy as np
from sympound import sympound

import platform
# Choose the string-distance function by operating system: the
# pyxdameraulevenshtein Damerau-Levenshtein distance everywhere except
# Windows, where the jellyfish Levenshtein distance is used instead.
distancefun = None
if platform.system() != "Windows":
    from pyxdameraulevenshtein import damerau_levenshtein_distance
    distancefun = damerau_levenshtein_distance
else:
    from jellyfish import levenshtein_distance
    distancefun = levenshtein_distance


# Shared corrector instance, built with the chosen distance function and a
# maximum dictionary edit distance of 3.
ssc = sympound(distancefun=distancefun, maxDictionaryEditDistance=3)

def test():
    """Check one hard-coded sample sentence against the known sentence codes.

    The sentence is encoded with ``encode_sentence``. If its code is already a
    key of the saved sentence-code dictionary, ``correct!`` is printed.
    Otherwise the module-level ``ssc`` corrector is asked for suggestions
    within edit distance 1, and the last suggestion is printed together with
    the ':'-separated fields of its string form. A ``TypeError`` raised by the
    lookup is reported as ``no matches!``.

    Returns ``None`` in every case. The corrector state is pickled to
    ``symspell.pickle`` only when a suggestion was printed.
    """
    # ssc.create_dictionary_entry("bonjour", 1) # optional, only if adding dictionary items is required

    # The dict was stored with np.save, i.e. as a pickled object array, which
    # current NumPy refuses to load unless allow_pickle=True is passed.
    # Only load files produced by this notebook: unpickling untrusted data
    # can execute arbitrary code.
    sc_dict = np.load('sentence_correction_dict.npy', allow_pickle=True).item()

    ssc.load_dictionary("sentence_codes.txt", term_index=0, count_index=1)
    payload = encode_sentence('Ronaldo has been most consistent performer on today.')

    if payload in sc_dict:
        print('correct!')
        return

    try:
        # Alternative: ssc.lookup_compound(input_string=payload, edit_distance_max=1)
        result = ssc.lookup(input_string=payload, verbosity=0, edit_distance_max=1)  # choose edit_distance carefully
    except TypeError:
        print('no matches!')
        return

    # Nothing to report when the lookup yields no suggestion (empty or None).
    if not result:
        return

    print(result[-1])
    result = str(result[-1]).split(':')
    print('wrong :      ' + payload)
    print('correction : ' + result[0])
    # NOTE(review): the third ':'-separated field is printed as the position;
    # confirm against sympound's suggestion string format that this field is
    # not actually the edit distance.
    print('position : ' + result[2])

    ssc.save_pickle("symspell.pickle")
    #ssc.load_pickle("symspell.pickle")

test()

cIrjFHmio:3:1
wrong :      cIrDFHmio
correction : cIrjFHmio
position : 1


## Decode the encoded sentence structure string

In [49]:
# Build the inverse table (one-character code -> label) from deps.txt, using
# the same code order as the forward table: 'A'-'Z' first, then 'a' onwards.
# The context manager closes the file even if reading fails.
with open('deps.txt') as f:
    lines = f.read().splitlines()

inv_deps_dict = {}
counter = ord('A')
for dep in lines:
    inv_deps_dict[chr(counter)] = dep.replace(' ', '').lower()
    # After 'Z' jump straight to 'a', skipping the punctuation in between.
    counter = ord('a') if counter == ord('Z') else counter + 1

# deprecated labels to be taken care of
inv_deps_dict['b'] = 'npadvmod'
inv_deps_dict['a'] = 'nmod'
print(inv_deps_dict)

import numpy as np
np.save('inv_deps_dict.npy', inv_deps_dict)

{'h': 'pcomp', 'i': 'pobj', 'Z': 'neg', 'Q': 'csubjpass', 'K': 'case', 'B': 'acomp', 'g': 'parataxis', 'o': 'punct', 'U': 'dobj', 'R': 'dative', 'A': 'acl', 'j': 'poss', 'S': 'dep', 'f': 'oprd', 'W': 'intj', 'm': 'prep', 'G': 'appos', 'F': 'amod', 'd': 'nsubjpass', 's': 'xcomp', 'C': 'advcl', 'Y': 'meta', 'I': 'aux', 'l': 'predet', 'n': 'prt', 'q': 'relcl', 'V': 'expl', 'k': 'preconj', 'D': 'advmod', 'r': 'root', 'T': 'det', 'N': 'compound', 'O': 'conj', 'a': 'nmod', 'E': 'agent', 'p': 'quantmod', 'c': 'nsubj', 'e': 'nummod', 'L': 'cc', 'H': 'attr', 'b': 'npadvmod', 'P': 'csubj', 'J': 'auxpass', 'X': 'mark', 'M': 'ccomp'}


In [24]:
def decode_coding(code):
    """Translate a sentence code back into its list of dependency labels.

    Spaces in ``code`` are ignored. Every remaining character is looked up in
    the module-level ``inv_deps_dict``; an unknown character raises
    ``KeyError``. The labels are returned in the order of the characters.
    """
    return [inv_deps_dict[symbol] for symbol in code.replace(' ', '')]

In [26]:
# Decode a sample sentence code back into its dependency labels.
mycode = 'TcIIrmTio'
decode_coding(mycode)

['det', 'nsubj', 'aux', 'aux', 'root', 'prep', 'det', 'pobj', 'punct']

In [27]:
def correct():
    """Suggest the closest known sentence code for a hard-coded sample sentence.

    The sentence is encoded with ``encode_sentence``. If its code is already a
    key of the saved sentence-code dictionary, ``correct!`` is printed.
    Otherwise the module-level ``ssc`` corrector is asked for suggestions
    within edit distance 1, and the last suggestion is printed next to the
    original code. A ``TypeError`` raised by the lookup is reported as
    ``no matches!``.

    Returns ``None`` in every case. The corrector state is pickled to
    ``symspell.pickle`` only when a suggestion was printed.
    """
    # ssc.create_dictionary_entry("bonjour", 1) # optional, only if adding dictionary items is required

    # The dict was stored with np.save, i.e. as a pickled object array, which
    # current NumPy refuses to load unless allow_pickle=True is passed.
    # Only load files produced by this notebook: unpickling untrusted data
    # can execute arbitrary code.
    sc_dict = np.load('sentence_correction_dict.npy', allow_pickle=True).item()

    ssc.load_dictionary("sentence_codes.txt", term_index=0, count_index=1)
    payload = encode_sentence('Ronaldo has been most consistent performer in today.')

    if payload in sc_dict:
        print('correct!')
        return

    try:
        # Alternative: ssc.lookup_compound(input_string=payload, edit_distance_max=1)
        result = ssc.lookup(input_string=payload, verbosity=0, edit_distance_max=1)  # choose edit_distance carefully
    except TypeError:
        print('no matches!')
        return

    # Nothing to report when the lookup yields no suggestion (empty or None).
    if not result:
        return

    print(result[-1])
    result = str(result[-1]).split(':')
    print('wrong :      ' + payload)
    print('correction : ' + result[0])
    #print('position : ' + result[2])

    # TODO: compare `payload` with `result[0]` character by character to
    # locate the exact error (the previously unused character lists built for
    # this were removed).

    ssc.save_pickle("symspell.pickle")
    #ssc.load_pickle("symspell.pickle")

correct()

cIrDTHmio:9223372036854775807:1
wrong :      cIrDFHmio
correction : cIrDTHmio


## Comparing strings and finding out the exact error

In [47]:
# NOTE(review): `correct` is also the name of a function defined earlier in
# this file; this assignment rebinds the name to a list.
correct = list('cIFrTDFHboT')
wrong = list('cIrDFHboT')
# Keep each character of `correct` that still has an unmatched occurrence in
# `wrong`. The `wrong.pop(...)` call removes the matched occurrence, so
# `wrong` is consumed as a side effect; `or True` keeps the filter truthy
# whatever value pop returns.
common = [ e for e in correct if e in wrong and (wrong.pop(wrong.index(e)) or True)]
common

['c', 'I', 'F', 'r', 'T', 'D', 'H', 'b', 'o']

In [37]:
x = ['1', '2', '3', '4']
# pop(3) removes and returns the element at index 3 (the last one here).
x.pop(3)

'4'

In [39]:
# Display the current contents of `wrong`.
wrong

['c', 'I', 'r', 'D', 'F', 'H', 'b', 'o', 'T']

In [40]:
def Diff(li1, li2):
    """Return the elements of ``li1`` that do not occur in ``li2``.

    Membership is tested by value only, so multiplicity is ignored: an element
    that occurs twice in ``li1`` and once in ``li2`` is dropped both times.
    Elements keep their order from ``li1``; elements that are only in ``li2``
    are never reported.

    Args:
        li1: sequence whose elements are filtered.
        li2: sequence of elements to exclude.

    Returns:
        A new list; empty when every element of ``li1`` occurs in ``li2``.
    """
    # The earlier version iterated over ``li1 + li2``, but an element taken
    # from ``li2`` can never pass ``not in li2``, so the concatenation only
    # cost time and failed for mixed sequence types (e.g. tuple + list).
    # NOTE(review): if a symmetric or count-aware difference was intended,
    # this condition needs to change — confirm with the author.
    return [i for i in li1 if i not in li2]
 
# Try Diff on the correct and the wrong sentence code, split into characters.
li1 = list('cIFrTDFHboT')
li2 = list('cIrDFHboT')
li3 = Diff(li1, li2)
print(li3)

[]


In [30]:
# Display the concatenation of the two lists.
# NOTE(review): the recorded output below holds integers, so it appears to
# come from a run with different li1/li2 values.
li1 + li2

[10, 15, 20, 25, 30, 35, 30, 25, 40, 35, 30]