This notebook extracts all spelling and punctuation features from the set of queries found in SWC and SQS.

# Load Libraries

The following block of code loads all libraries needed for this notebook.

In [1]:
import csv
import pickle
import string
import textstat

import pandas as pd
import numpy as np

from langdetect import detect
from spellchecker import SpellChecker
from tqdm import tqdm

# Load Data Sets

This block of code loads the data sets and extracts all unique queries from both.

In [2]:
# NOTE(security): pickle.load can execute arbitrary code while deserialising;
# only load these files if they come from a trusted source.
# Context managers make sure both file handles are closed after loading.
with open("../Data/DataSets/SWC/SWC.p", "rb") as swcFile:
    allSessions = pickle.load(swcFile)
with open("../Data/DataSets/SQS/SQS.p", "rb") as sqsFile:
    allSessionsSQS = pickle.load(sqsFile)

# Both objects are indexed by 'query' and converted with .tolist()
# (presumably pandas DataFrames -- confirm against the pickled data).
# The set keeps one copy of every distinct query across the two data sets.
allQueries = set(allSessions['query'].tolist() + allSessionsSQS['query'].tolist())

# Generate Misspelled List

Builds the set of words that children commonly misspell, taken from the KidSpell data set. The spelling-feature cell below uses this set to count child-specific errors.

In [3]:
def _read_misspellings(path, column):
    """Return the set of values found in one column of a KidSpell CSV file.

    Parameters
    ----------
    path : str
        Location of the CSV file.
    column : int
        Zero-based index of the column holding the misspelled word.

    Returns
    -------
    set of str
        Every value of that column, excluding the first (header) row.
    """
    words = set()
    with open(path, newline='') as csvfile:
        # NOTE(review): quotechar='|' is kept as in the original cell; confirm
        # that the KidSpell files do not rely on standard double-quote quoting.
        reader = csv.reader(csvfile, delimiter=',', quotechar='|')
        next(reader, None)  # skip the header row (no-op for an empty file)
        for row in reader:
            # csv.reader yields [] for blank lines; skip rows that are too
            # short instead of failing with IndexError.
            if len(row) > column:
                words.add(row[column])
    return words


# (file, index of the column that holds the misspelling)
# NOTE(review): these paths are relative to the working directory, whereas the
# session pickles are read from ../Data/DataSets -- confirm this is intended.
_KIDSPELL_SOURCES = [
    ('DataSets/KidSpell/Web_Search_Lab_Errors.csv', 0),
    ('DataSets/KidSpell/Web_Search_Informal_Errors.csv', 0),
    ('DataSets/KidSpell/Essay_Writing_Errors.csv', 1),
]

# Union of the misspellings from all three KidSpell files.
kidsMispelled = set()
for _path, _column in _KIDSPELL_SOURCES:
    kidsMispelled |= _read_misspellings(_path, _column)

# Extract Spelling Features

The following block of code extracts features related to spelling errors and stores them in a dataframe. 

In [4]:
spell = SpellChecker()

# Substrings that mark a query as a URL / website lookup; such queries are
# not spell-checked and get 0 for every spelling feature.
netModifiers = ['www', 'http', '.com', '.net', '.edu', '.org', '.gov', '.co', '.mil']

# Translation table that deletes every ASCII punctuation character.
# Built once instead of once per query.
_PUNCT_TABLE = str.maketrans('', '', string.punctuation)


def _spelling_counts(query):
    """Compute the spelling features of a single raw query.

    Parameters
    ----------
    query : str
        The query exactly as it appears in the data set.

    Returns
    -------
    tuple of int
        ``(numSpellingErrors, offByOne, kidsError)``:
        number of words unknown to the spell checker, number of those with a
        suggested correction within edit distance 1, and number of those found
        in ``kidsMispelled``. ``(0, 0, 0)`` for website-like queries and
        ``(-1, -1, -1)`` when the query could not be analysed.
    """
    # The website test must see the raw query: once punctuation is stripped,
    # markers such as '.com' can no longer match.
    if any(mod in query for mod in netModifiers):
        return 0, 0, 0

    cleaned = query.translate(_PUNCT_TABLE)
    try:
        # The detected language itself is unused. The call is kept because,
        # as in the original cell, a failure here marks the query as
        # un-analysable (-1) -- presumably text without usable letters.
        detect(cleaned)

        # split() without an argument drops the empty tokens that repeated
        # spaces would otherwise produce and hand to the spell checker.
        misspelled = spell.unknown(cleaned.split())

        oneOff = 0
        kidsMis = 0
        for word in misspelled:
            if word in kidsMispelled:
                kidsMis += 1
            # candidates() may return None when it has no suggestion
            # (newer pyspellchecker releases); treat that as "no candidates".
            candidates = spell.candidates(word) or ()
            edits = spell.edit_distance_1(word)
            if any(can in edits for can in candidates):
                oneOff += 1
        return len(misspelled), oneOff, kidsMis
    except Exception:
        # Best effort: keep going on per-query failures, but unlike a bare
        # `except:` this still lets KeyboardInterrupt stop the long loop.
        return -1, -1, -1


# A set has no defined order and recent pandas versions refuse it as column
# data, so freeze the iteration order in a list that is used both for the
# loop and for the 'query' column.
queries = list(allQueries)

spellingError = []
oneOffError = []
kidsError = []

for query in tqdm(queries):
    numErrors, oneOff, kidsMis = _spelling_counts(query)
    spellingError.append(numErrors)
    oneOffError.append(oneOff)
    kidsError.append(kidsMis)

# One row per unique query; column order matches the original cell.
spelling = pd.DataFrame({
    'numSpellingErrors': spellingError,
    'query': queries,
    'offByOne': oneOffError,
    'kidsError': kidsError,
})

100%|██████████| 70112/70112 [2:41:38<00:00,  7.23it/s]   


# Extract Punctuation And Casing Features

The following block of code extracts punctuation and casing features before adding them to the dataframe.

In [7]:
# Characters whose presence marks a query as containing punctuation.
invalidcharacters = set(['!', ',', '.', '?'])

punct = []   # 1 if the query contains punctuation and is not website-like
casing = []  # 1 if the query contains at least one non-lowercase letter

# Iterates allQueries in the same order as the spelling cell, so the new
# columns line up with the existing rows of `spelling`.
for query in tqdm(allQueries):
    has_punct = any(char in invalidcharacters for char in query)
    # Dots inside URLs are not counted as punctuation use.
    is_website = any(substring in query for substring in netModifiers)
    punct.append(int(has_punct and not is_website))

    # str.islower() is False for strings without any cased character
    # (e.g. "2048"), which would wrongly flag them as mixed-case. Comparing
    # with the lower-cased form only flags queries that really contain
    # upper-case letters.
    casing.append(int(query != query.lower()))

spelling['punct'] = punct
spelling['casing'] = casing

100%|██████████| 70112/70112 [00:00<00:00, 261486.90it/s]


# Return Feature Set

Saves the dataframe with the spelling, punctuation, and casing features to a pickle file.

In [8]:
# Persist the feature dataframe. The context manager guarantees the file is
# flushed and closed even if pickling fails part-way.
with open("Pickles/SPFeat.p", "wb") as featFile:
    pickle.dump(spelling, featFile)