### This notebook builds a dictionary of typo corrections (its OUTPUT) to be used in the final cleaning of the data

In [2]:
import pickle  # to save the corrections dictionary
import re
import string
import time  # to calculate run time for each cell.
from collections import Counter

import contractions  # to expand contractions ex: I'll ==> I will
import pandas as pd
import pkg_resources
from bs4 import BeautifulSoup as bs  # to clear HTML
from symspellpy import SymSpell, Verbosity  # for spell checking.

In [3]:
# Questions data before & after cleaning it from HTML.
# NOTE: unpickling can execute arbitrary code, so only load a Que.pkl from a trusted source.
ques = pd.read_pickle("./Que.pkl")
ques.head()

Unnamed: 0,Title,Body,Text,Code
0,How can I find the full path to a font from it...,<p>I am using the Photoshop's javascript API t...,I am using the Photoshop's javascript API to f...,[]
1,Get a preview JPEG of a PDF on Windows?,<p>I have a cross-platform (Python) applicatio...,I have a cross-platform (Python) application w...,[]
2,Continuous Integration System for a Python Cod...,<p>I'm starting work on a hobby project with a...,I'm starting work on a hobby project with a py...,[]
3,cx_Oracle: How do I iterate over a result set?,<p>There are several ways to iterate over a re...,There are several ways to iterate over a resul...,[]
4,Using 'in' to match an attribute of Python obj...,<p>I don't remember whether I was dreaming or ...,I don't remember whether I was dreaming or not...,"[foo in iter_attr(array of python objects, att..."


In [4]:
# Expand contractions (e.g. "I'll" -> "I will") in both text columns.
for column in ('Text', 'Title'):
    ques[column] = ques[column].apply(contractions.fix)

In [5]:
# Replace the expected noise characters in our text with a single space each.
# The characters are mapped literally with str.translate: many of them
# ('.', '$', '(', '*', '[', '\\', ...) are regex metacharacters, and since pandas 2.0
# a one-character pattern passed to str.replace(..., regex=True) is compiled as a
# regex ('.' would blank out every character, '(' would raise re.error).
# One translate call per column also replaces one full pass per noise character.
noise = '!"$%&\'()*+,-./:;?@[\\]^_`{|}~\n<=>'
noise_to_space = str.maketrans(noise, ' ' * len(noise))
for column in ('Text', 'Title'):
    ques[column] = ques[column].str.translate(noise_to_space)

In [6]:
#this function clears the data from extra spaces, new lines, ...
def clean(text):
    """Collapse every run of whitespace in ``text`` into one space and trim the ends.

    For str patterns ``\\s`` already matches line feeds and non-breaking
    spaces (``\\xa0``), so no separate substitution is needed for them.
    Apostrophes are left untouched.

    Parameters
    ----------
    text : str
        The string to normalise.

    Returns
    -------
    str
        ``text`` with whitespace runs replaced by a single space and
        leading/trailing spaces removed.
    """
    # Raw string: '\s' inside a normal string literal is an invalid escape sequence.
    text = re.sub(r"\s+", " ", text)
    return text.strip(' ')

In [7]:
# Normalise whitespace in both text columns with clean().
for column in ('Text', 'Title'):
    ques[column] = ques[column].apply(clean)

In [8]:
ques['Text'][11]  # spot-check one question body (entry 11) after full cleaning

'I have got a menu in Python That part was easy I am using to get the selection from the user The problem is that and input require the user to press Enter after they make a selection Is there any way to make the program act immediately upon a keystroke here is what I have got so far It would be great to have something like'

In [9]:
# Corpus of every lower-cased token found in ques['Text'] and ques['Title'].
# Rows are walked positionally with zip(), so this does not rely on the frame having
# a 0..n-1 index, and extend() adds the tokens directly instead of using a list
# comprehension only for its append() side effect (which builds throwaway lists of None).
corpus = []
for text, title in zip(ques['Text'], ques['Title']):
    corpus.extend(word.lower() for word in text.split(' '))
    corpus.extend(word.lower() for word in title.split(' '))

In [10]:
unique_words = set(corpus)  # de-duplicate so each distinct word is typo-checked only once

In [11]:
# Report the corpus size: total tokens vs. distinct tokens.
print(f"all words in text = {len(corpus)}")
print(f"unique words in text = {len(unique_words)}")

all words in text = 63956411
unique words in text = 331706


In [12]:
# Count how often each word occurs in the corpus, to determine whether a word is a
# common word in Python questions. If it is, the typo-checking step leaves it alone.
# Counter does the tally in one pass; it is a dict subclass, so term_frequncy[word]
# lookups keep working as before.
term_frequncy = Counter(corpus)

In [13]:
# Sanity check of the term-frequency table: both lines should print the same number.
print(corpus.count('hello'))  # direct count over the whole corpus list
print(term_frequncy['hello'])  # count read from the table

6781
6781


In [14]:
# Load the typo-checking model and its English frequency dictionary.

sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")

# load_dictionary() signals a missing file by returning False instead of raising,
# so fail loudly here rather than carrying on with an empty dictionary.
if not sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1):
    raise FileNotFoundError(f"SymSpell dictionary not found: {dictionary_path}")

# Smoke test: print the best suggestion for a misspelt word
# (shown as: suggested term, edit distance, term frequency).
suggestions = sym_spell.lookup("crona", Verbosity.CLOSEST,
                               max_edit_distance=2, ignore_token=None)
print(suggestions[0])

corona, 1, 2806095


In [15]:
#checks whether a word contains a number or is itself a number;
#if so, the typo-checking step leaves the word alone.

def contain_digits(s):
    """Return True when at least one character of ``s`` is a digit, otherwise False."""
    return any(character.isdigit() for character in s)

In [16]:
# Words seen more often than this in the corpus are trusted as valid vocabulary.
TRUSTED_FREQUENCY = 50

corrected_words = {}  # maps every word in unique_words to its correction (or to itself)
start = time.process_time()

for word in unique_words:
    # Words containing digits and words seen more than TRUSTED_FREQUENCY times are
    # kept as they are; testing this first avoids a spell-checker lookup for them.
    if contain_digits(word) or term_frequncy[word] > TRUSTED_FREQUENCY:
        corrected_words[word] = word
        continue

    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=1)
    # Read the term from the suggestion object itself instead of parsing its
    # "term, distance, count" string form; keep the word when nothing is suggested.
    corrected_words[word] = suggestions[0].term if suggestions else word

dt = time.process_time() - start
n = len(corrected_words)  # number of words checked
bad = sum(1 for original, fixed in corrected_words.items() if fixed != original)  # words that were changed

if n:  # nothing to report (and nothing to divide by) when there are no words
    rate = n / dt if dt > 0 else float('inf')
    print('{:.0%} of {} correct ({:.0%} unknown) at {:.0f} words per second '
          .format((n - bad) / n, n, bad / n, rate))


82% of 331706 correct (18% unknown) at 42601 words per second 


In [23]:
corrected_words["ello"]  # spot-check: show the correction stored for the misspelling "ello"

'hello'

In [1]:
#saving the dictionary to use it in further preprocessing
# NOTE: corrected_words is built by the correction cell earlier in this notebook; running
# this cell on a fresh kernel without it fails with NameError.
# pickle is imported in the notebook's import cell together with the other dependencies.
with open('corrected_words.pkl', 'wb') as handle:
    pickle.dump(corrected_words, handle, protocol=pickle.HIGHEST_PROTOCOL)



NameError: name 'corrected_words' is not defined