# Modified Spelling Correction Algorithm for Detecting and Tracking Errors# 

In [4]:
import re, csv
from collections import Counter


def words(text):
    """Return every run of regex word characters in `text`, lowercased, in order."""
    lowered = text.lower()
    return re.findall(r'\w+', lowered)


# Word-frequency table built from the reference corpus; every probability and
# "is this a known word" check below reads from it.
# Alternative (larger) corpus used previously:
#   /home/niddal/Desktop/PhD_projects/Projects/Text_classification/Adversarial_attack/Wikicorpus.txt
# The file is read inside a `with` block so the handle is closed as soon as
# the counts have been built.
with open('/home/niddal/Desktop/PhD_projects/Projects/Text_classification/Adversarial_attack/blog_test.txt') as corpus_file:
    WORDS = Counter(words(corpus_file.read()))


def P(word, N=sum(WORDS.values())):
    """Relative frequency of `word` among the WORDS counts.

    `N` defaults to the total of all counts in WORDS, computed once when
    this function is defined (not on every call).
    """
    occurrences = WORDS[word]
    return occurrences / N


def correction(word):
    """Pick, among the candidate spellings of `word`, the one with the highest P."""
    options = candidates(word)
    best = max(options, key=P)
    return best



def decode(word):
    """Replace look-alike symbols/digits in `word` with the letters they imitate.

    The word is lowercased first. Returns a tuple of the decoded string and
    the number of characters that were substituted.
    """
    letters_map = {'@': 'a', '$': 's', '!': 'i', '1': 'l', '0': 'o', '(': 'c', '+': 't'}
    decoded_chars = []
    symbol_chars = 0
    for char in word.lower():
        replacement = letters_map.get(char)
        if replacement is None:
            # Not a known look-alike: keep the character untouched.
            decoded_chars.append(char)
        else:
            decoded_chars.append(replacement)
            symbol_chars += 1
    return ''.join(decoded_chars), symbol_chars


def deleteDuplicates(word):
    """Progressively strip adjacent repeated characters from `word`.

    Returns None when `word` has no pair of equal adjacent characters.
    Otherwise returns a dict, in insertion order, mapping the original word
    to 0 and each successively shortened form to the number of characters
    removed to reach it. The last key contains no adjacent repeats, e.g.
    'shooow' -> {'shooow': 0, 'shoow': 1, 'show': 2}.

    Each step removes the leftmost repeated character. (Choosing the next
    word through a set, as an earlier version did, made the result depend on
    string hash ordering and therefore vary from run to run.)
    """
    def single_deletes(text):
        # Every string obtained by dropping one character that equals its
        # left neighbour, ordered by position.
        return [text[:i] + text[i + 1:]
                for i in range(1, len(text))
                if text[i - 1] == text[i]]

    deletes = single_deletes(word)
    if not deletes:
        return None

    repeated_chars = 1
    dups_dictionary = {word: 0, deletes[0]: repeated_chars}
    # More than one possible deletion means the shortened word still holds a
    # repeat; keep trimming until exactly one deletion was possible.
    while len(deletes) > 1:
        deletes = single_deletes(deletes[0])
        repeated_chars += 1
        dups_dictionary[deletes[0]] = repeated_chars
    return dups_dictionary


def transpose(word):
    """List every string obtained by swapping one pair of adjacent characters.

    Results are ordered by the position of the swapped pair; words shorter
    than two characters yield an empty list.
    """
    swapped = []
    for pos in range(len(word) - 1):
        swapped.append(word[:pos] + word[pos + 1] + word[pos] + word[pos + 2:])
    return swapped


def candidates(word):
    """Return the first non-empty group of possible corrections for `word`.

    Groups are tried in order: the word itself if known, known words one
    edit away, known words two edits away, and finally the word unchanged.
    """
    exact = known([word])
    if exact:
        return exact
    one_edit = known(edits1(word))
    if one_edit:
        return one_edit
    two_edits = known(edits2(word))
    if two_edits:
        return two_edits
    return [word]


def known(words):
    """Return the set of entries from `words` that are present in WORDS."""
    found = set()
    for candidate in words:
        if candidate in WORDS:
            found.add(candidate)
    return found


def edits1(word):
    """Return the set of strings exactly one simple edit away from `word`.

    An edit is a single-character deletion, an adjacent transposition, a
    replacement by a lowercase ASCII letter, or an insertion of one.
    """
    alphabet = 'abcdefghijklmnopqrstuvwxyz'
    deletes, transposes, replaces, inserts = [], [], [], []
    for cut in range(len(word) + 1):
        head, tail = word[:cut], word[cut:]
        if tail:
            deletes.append(head + tail[1:])
            replaces.extend(head + letter + tail[1:] for letter in alphabet)
        if len(tail) > 1:
            transposes.append(head + tail[1] + tail[0] + tail[2:])
        inserts.extend(head + letter + tail for letter in alphabet)
    return set(deletes + transposes + replaces + inserts)


def edits2(word):
    """Return a generator over the strings reachable from `word` by two edits.

    Each result is one `edits1` step away from some string that is itself
    one `edits1` step away from `word`; duplicates are not removed.
    """
    first_pass = edits1(word)
    return (twice for once in first_pass for twice in edits1(once))


def correct_text(text):
    """Run `correct_word` over `text`, word by word when it contains spaces.

    Text without a space is handed to `correct_word` as a single word. For
    longer text, start/end banners and a '***' marker before each word are
    printed. Nothing is returned.
    """
    tokens = text.split(' ')
    if len(tokens) <= 1:
        correct_word(text)
        return

    print('Start of words in "{}"'.format(text))
    for token in tokens:
        print('***', end='')
        correct_word(token)
    print('End of words in "{}"'.format(text))



In [5]:
def correct_word(word):
    """Correct a single word and record which kind of distortion was undone.

    The word is lowercased and then tried, in order, against:
      1. an exact match in WORDS, or an all-digit token (kept as is);
      2. symbol decoding via `decode` (counted in 'symbol_chars');
      3. removal of repeated characters via `deleteDuplicates`
         (counted in 'repeated_chars');
      4. swapping one adjacent pair via `transpose` (counted in 'swap_chars');
      5. the general edit-distance `correction` of the original word, in
         which case the three counters are reset and 'OOV' is set to 1.

    Counters accumulate across steps 2-4: a word that needed decoding and
    then a swap reports both.

    Always returns a dict with the keys 'Original_word', 'correction',
    'symbol_chars', 'repeated_chars', 'swap_chars' and 'OOV'. An empty
    input yields an empty correction with all counters at zero. When no
    known candidate exists at all, the word is returned unchanged with
    'OOV' set to 1.
    """
    original_word = word.lower()
    correction_dictionary = {'Original_word': original_word, 'correction': '',
                             'symbol_chars': 0, 'repeated_chars': 0, 'swap_chars': 0, 'OOV': 0}

    # An empty token (e.g. from consecutive spaces) carries no word; without
    # this guard the edit-distance fallback would invent a one-letter word.
    if not original_word:
        return correction_dictionary

    # Known words and plain numbers need no correction.
    if original_word in WORDS or original_word.isdigit():
        correction_dictionary['correction'] = original_word
        return correction_dictionary

    word = original_word

    # Decode look-alike symbols and test the decoded form. A plain membership
    # test is enough: `correction` returns a known word unchanged, so running
    # the full candidate search here would only add cost on a miss.
    decoded_word, symbols_count = decode(word)
    if symbols_count > 0:
        correction_dictionary['symbol_chars'] += symbols_count
        if decoded_word in WORDS:
            correction_dictionary['correction'] = decoded_word
            return correction_dictionary
        # No match yet: continue the analysis on the decoded form.
        word = decoded_word

    # Strip repeated characters and test every intermediate form.
    duplicates_dictionary = deleteDuplicates(word)
    if duplicates_dictionary:
        for trimmed_word, removed in duplicates_dictionary.items():
            if trimmed_word in WORDS:
                correction_dictionary['repeated_chars'] += removed
                correction_dictionary['correction'] = trimmed_word
                return correction_dictionary
        # No match yet: continue with the most-trimmed form, which is the
        # last entry inserted into the dictionary.
        most_trimmed, removed = list(duplicates_dictionary.items())[-1]
        if removed > 0:
            correction_dictionary['repeated_chars'] += removed
            word = most_trimmed

    # Try swapping each adjacent pair of characters.
    for transposed_word in transpose(word):
        if transposed_word in WORDS:
            correction_dictionary['swap_chars'] += 1
            correction_dictionary['correction'] = transposed_word
            return correction_dictionary

    # Fall back to the general spelling correction of the original word. None
    # of the tracked distortions explained it, so their counters are cleared
    # and the word is flagged as out-of-vocabulary. `correction` hands back
    # the word itself when it finds no known candidate, so a record is
    # returned in every case.
    correction_dictionary['correction'] = correction(original_word)
    correction_dictionary['symbol_chars'] = 0
    correction_dictionary['repeated_chars'] = 0
    correction_dictionary['swap_chars'] = 0
    correction_dictionary['OOV'] += 1
    return correction_dictionary

In [6]:
# Example: a word obfuscated with one look-alike symbol. The recorded cell
# output that follows shows it resolved to 'viagra' with symbol_chars == 1.
correct_word('vi@gra')

{'Original_word': 'vi@gra',
 'correction': 'viagra',
 'symbol_chars': 1,
 'repeated_chars': 0,
 'swap_chars': 0,
 'OOV': 0}

# CSV File Correction

In [66]:
def _correct_word_or_unchanged(word):
    """Return `correct_word(word)`, substituting an unchanged record for None."""
    result = correct_word(word)
    if result is None:
        # Guard against a missing result so that a single uncorrectable word
        # cannot abort the whole export.
        unchanged = word.lower()
        result = {'Original_word': unchanged, 'correction': unchanged,
                  'symbol_chars': 0, 'repeated_chars': 0, 'swap_chars': 0, 'OOV': 1}
    return result


def csv_correction(csv_filename, output_filename=None):
    """Correct the first column of a CSV file and write one result row per input row.

    Args:
        csv_filename: Path of the input CSV, read as UTF-8 (a leading BOM is
            tolerated). Only the first field of each row is used; blank rows
            are skipped.
        output_filename: Path of the CSV to write. When omitted, the
            historical hard-coded results path is used.

    Each output row has the columns 'Original_word', 'correction',
    'symbol_chars', 'repeated_chars', 'swap_chars' and 'OOV'. A row that
    contains spaces is treated as a sentence: every space-separated word is
    corrected, the corrections are joined with single spaces and the
    counters are summed. Progress markers are printed for sentences.

    Raises:
        OSError, UnicodeDecodeError, csv.Error: if the input cannot be read.
            'Error opening input file' is printed before re-raising.
    """
    if output_filename is None:
        output_filename = '/home/niddal/Desktop/PhD_projects/Projects/Text_classification/Adversarial_attack/attacks/Offensive-2019/OCR-pipeline-Results/Manuplated/correction/res_15.csv'

    # Read the first field of every non-blank row.
    try:
        with open(csv_filename, 'r', encoding='utf-8-sig') as input_file:
            input_list = [row[0] for row in csv.reader(input_file) if row]
    except (OSError, UnicodeDecodeError, csv.Error):
        print('Error opening input file')
        raise

    fieldNames = ['Original_word', 'correction', 'symbol_chars', 'repeated_chars', 'swap_chars', 'OOV']
    counter_fields = ('symbol_chars', 'repeated_chars', 'swap_chars', 'OOV')

    # newline='' is what the csv module expects for files it writes; the
    # `with` block guarantees the file is closed even if a row fails.
    with open(output_filename, 'w', newline='', encoding='utf-8') as csvfile:
        writer = csv.DictWriter(csvfile, fieldnames=fieldNames, delimiter=',')
        writer.writeheader()

        for current_row in input_list:
            tokens = current_row.split(' ')

            # Single word: its correction record is the output row.
            if len(tokens) <= 1:
                writer.writerow(_correct_word_or_unchanged(current_row))
                continue

            # Sentence: aggregate the per-word records into one row.
            sentence_dictionary = {'Original_word': current_row, 'correction': '',
                                   'symbol_chars': 0, 'repeated_chars': 0, 'swap_chars': 0, 'OOV': 0}
            print('Start of words in "{}"'.format(current_row))
            for w in tokens:
                print('***', end='')
                word_correction = _correct_word_or_unchanged(w)
                if len(sentence_dictionary['correction']) == 0:
                    sentence_dictionary['correction'] = word_correction['correction']
                else:
                    sentence_dictionary['correction'] += ' ' + word_correction['correction']
                for field in counter_fields:
                    sentence_dictionary[field] += word_correction[field]
            writer.writerow(sentence_dictionary)
            print('End of words in "{}"'.format(current_row))