In [1]:
from symspell import SymSpell
import time
import json
from multiprocessing import Pool
from itertools import islice
import multiprocessing


In [2]:
# Baseline instance: filled below by the library's own single-process loader
# (about 7 minutes in the run recorded in this notebook).
ss = SymSpell(max_dictionary_edit_distance=3)

# Second instance: filled below by the multiprocessing loader defined in this
# notebook (about 3 minutes with 4 worker processes in the recorded run).
ssMT = SymSpell(max_dictionary_edit_distance=3)

# Number of worker processes for the pool: one per CPU reported by multiprocessing.cpu_count().
NUM_PROCESSES=multiprocessing.cpu_count()



In [3]:
# Baseline: build the dictionary into `ss` with the library's single-process loader.
filename = 'SymSpell_Dctionary_Word_Full.json'
#filename = 'test.json'

# %time prints the CPU and wall-clock time of this one statement.
%time ss.load_words_with_freq_from_json_and_build_dictionary(filename,encoding="ISO-8859-1")


Loading dictionary...
Loaded dictionary...
CPU times: user 7min 8s, sys: 1.95 s, total: 7min 10s
Wall time: 7min 10s


In [4]:


def buildDictMT(wordGroup):
    """
    Pool worker: build the delete index for one group of words.

    Each call runs inside a pool worker process, so it creates a private
    SymSpell instead of touching the master instance. Every word obtained by
    iterating wordGroup is registered through create_dictionary_entry_MT, and
    the private instance's _deletes mapping is returned. The caller is
    responsible for merging the mappings returned by all workers.
    """
    worker = SymSpell(max_dictionary_edit_distance=3)
    add_entry = worker.create_dictionary_entry_MT
    # Per the original note, only a placeholder word count is recorded here;
    # the real counts are written into the master SymSpell by the caller.
    for entry in wordGroup:
        add_entry(entry)
    return worker._deletes

def chunks(data, SIZE=NUM_PROCESSES):
    """
    Lazily split the mapping `data` into consecutive sub-dicts of at most SIZE items.

    Keys are consumed in the mapping's iteration order, and each yielded dict
    maps those keys to their values in `data`. The final dict may hold fewer
    than SIZE items.
    """
    remaining_keys = iter(data)
    # One iteration per chunk start offset; the offset value itself is not needed.
    for _ in range(0, len(data), SIZE):
        piece = {}
        for key in islice(remaining_keys, SIZE):
            piece[key] = data[key]
        yield piece

In [5]:
def loadDataFromJSON(filename,ssL,encoding='ISO-8859-1'):
    """
    Build a SymSpell dictionary from a word/count JSON file using a process pool.

    The JSON file is expected to be an already processed _words dump, i.e. a
    mapping of unique word -> count. The words are split into chunks, each
    chunk is turned into a partial _deletes mapping by buildDictMT in a worker
    process, and the partial mappings are merged into the master instance.
    The same key can come back from several workers, so colliding entries are
    concatenated rather than overwritten.

    Parameters
    ----------
    filename : str
        Path of the JSON file to read.
    ssL : SymSpell
        Master instance. Its _deletes, _words and _max_length attributes are
        updated in place.
    encoding : str, optional
        Text encoding used to read the file. Defaults to 'ISO-8859-1'.

    Returns
    -------
    None
    """
    print(time.ctime()+': Loading words...')
    with open(filename, 'r',encoding=encoding) as fp:
        myData = json.load(fp)
    print(time.ctime()+': Loaded %i words...' % len(myData))

    # Aim for roughly 4 chunks per process so that a single worker is not
    # stuck with all of the slowest words.
    chunkSize = len(myData)//(4*NUM_PROCESSES)+1
    word_groups = chunks(myData,chunkSize)

    # The log text says "threads" (kept unchanged), but the workers are processes.
    print(time.ctime()+': Building of dictionary started with %i threads...' % NUM_PROCESSES)
    # The context manager terminates the workers even if map() raises, so a
    # failed build does not leave orphaned processes behind.
    with Pool(NUM_PROCESSES) as pool:
        # map() blocks until every chunk has been processed.
        results = pool.map(buildDictMT, word_groups)
        pool.close()
        pool.join()
    print(time.ctime()+': Building of dictionary over...')

    # Merge the per-worker results; extend the existing list when a key collides.
    master_deletes = ssL._deletes
    for result in results:
        for hs, suggestions in result.items():
            if hs in master_deletes:
                master_deletes[hs].extend(suggestions)
            else:
                master_deletes[hs] = suggestions
    print(time.ctime()+': Copied result to master dictionary...')

    # Push the words with their real counts to the master and track the longest word.
    for word, count in myData.items():
        ssL._words[word] = count
        if len(word) > ssL._max_length:
            ssL._max_length = len(word)
    print(time.ctime()+': Copied words to master dictionary...')

In [6]:
# Parallel loading: build the same dictionary into `ssMT` using worker processes.
# The JSON file is expected to be a previously created _words dump with unique words.

filename = 'SymSpell_Dctionary_Word_Full.json'
#filename = 'test.json'
%time loadDataFromJSON(filename,ssMT)


Tue Apr 17 21:36:20 2018: Loading words...
Tue Apr 17 21:36:21 2018: Loaded 500557 words...
Tue Apr 17 21:36:21 2018: Building of dictionary started with 4 threads...
Tue Apr 17 21:39:13 2018: Building of dictionary over...
Tue Apr 17 21:39:30 2018: Copied result to master dictionary...
Tue Apr 17 21:39:31 2018: Copied words to master dictionary...
CPU times: user 1min 26s, sys: 2.98 s, total: 1min 29s
Wall time: 3min 12s


In [7]:
suggestion_list = ss.lookup(phrase='infifity', verbosity=1, max_edit_distance=2)
for suggestion in suggestion_list:
    print(suggestion)

suggestion_list = ss.lookup(phrase='haevliy', verbosity=1, max_edit_distance=2)
for suggestion in suggestion_list:
    print(suggestion)

suggestion_list = ss.lookup(phrase='erroring', verbosity=1, max_edit_distance=3)
for suggestion in suggestion_list:
    print(suggestion)

%time suggestion_list = ss.lookup_compound(phrase='Link is haevliyy errorrinng', max_edit_distance=3)
for suggestion in suggestion_list:
    print(suggestion)


infinity:4601706:1
heavily:8139465:2
erroring:25:0
CPU times: user 5.32 s, sys: 20 ms, total: 5.34 s
Wall time: 5.34 s
link is heavily erroring:25:-1


In [8]:
def correctSentance(phrase, ssL):
    """
    Print every compound-correction suggestion for `phrase`.

    Calls ssL.lookup_compound with max_edit_distance=3 and prints each
    returned suggestion on its own line. Returns None.
    """
    for candidate in ssL.lookup_compound(phrase, max_edit_distance=3):
        print(candidate)

In [9]:
# Run each test phrase through both dictionaries (single-process `ss`, then
# multiprocessing-built `ssMT`) so the corrections and timings can be compared.
phrase='Link is haevliyy errorrinng'
%time correctSentance(phrase,ss)
%time correctSentance(phrase,ssMT)
# Longer sentence with run-together words
phrase="in te dhird qarter oflast jear he hadlearned ofca sekretplan y iran"
%time correctSentance(phrase,ss)
%time correctSentance(phrase,ssMT)
# Sentence mixing run-together and wrongly split words
phrase="whereis th elove hehad dated forImuch of thepast who couqdn'tread in sixthgrade and ins pired him"
%time correctSentance(phrase,ss)
%time correctSentance(phrase,ssMT)



link is heavily erroring:25:-1
CPU times: user 5.31 s, sys: 12 ms, total: 5.33 s
Wall time: 5.33 s
link is heavily erroring:25:-1
CPU times: user 5.38 s, sys: 0 ns, total: 5.38 s
Wall time: 5.38 s
in to third quarter oblast jear he had learned orca secret plan a iran:1:-1
CPU times: user 2.99 s, sys: 7 µs, total: 2.99 s
Wall time: 2.99 s
in to third quarter oblast jear he had learned orca secret plan a iran:1:-1
CPU times: user 2.99 s, sys: 7.94 ms, total: 3 s
Wall time: 3 s
whereis to love head dated for much of the past who couldn't read in sixth grade and ins tired him:1:-1
CPU times: user 4.09 s, sys: 8.03 ms, total: 4.1 s
Wall time: 4.1 s
whereis to love head dated for much of the past who couldn't read in sixth grade and ins tired him:1:-1
CPU times: user 4.09 s, sys: 7.97 ms, total: 4.09 s
Wall time: 4.09 s


In [10]:
# One more phrase, corrected with the multiprocessing-built dictionary only.
phrase="the bigjest playrs in te strogsommer film slatew ith plety of funn"
%time correctSentance(phrase,ssMT)

the biggest players in to str somme film slate ith plenty of fun:1:-1
CPU times: user 6.45 s, sys: 16 µs, total: 6.45 s
Wall time: 6.45 s


In [11]:
# Number of keys in the _deletes mapping built by the single-process loader.
len(ss._deletes)

2366064

In [12]:
# Number of keys in the _deletes mapping built by the multiprocessing loader.
len(ssMT._deletes)

2366064

In [13]:
# Check that both loaders produced equal _deletes mappings (keys and values).
ssMT._deletes == ss._deletes

True

In [14]:
# Save the single-process model through the library's JSON export, timed with %time.
%time ss.save_complete_model_as_json("SymSpell_Dictionary_Word_500K_Complete_Model.json",encoding="ISO-8859-1")

Saving dictionary...
Saved dictionary...
CPU times: user 1min 38s, sys: 2 s, total: 1min 40s
Wall time: 1min 40s
