In [356]:
import itertools
import numpy as np
import enchant
import numpy as np
import string
from string import ascii_lowercase
from random import choice

### First example, creating all permutations of obtained from a word made up of seven letters.
1. Create all possible combinations from the letters of the given word.
2. Create a list of all possible "words" that can be made up with those given words.
3. Compare the list of words to an existing US dictionary.
4. Obtain results, which are the number of words that are real & the list of words that were found to be real.

In [39]:
word = 'PARADOX'
combinations = list(itertools.permutations(word,len(word)))

In [40]:
# words_array = {range(len(combinations)), combinations}
word_dict = {}
for counter, item in enumerate(combinations):
    word_dict[counter] = ''.join(item)

In [41]:
len(word_dict)

5040

Example of using `enchant` library

In [14]:
# real_words = SpellChecker.known(word_dict, SpellChecker.known)
d = enchant.Dict("en_US")   # create dictionary for US English

In [15]:
d.check("enchant")

True

In [16]:
d.check("enchnt")

False

In [17]:
d.suggest("enchnt")

['enchant', 'entrench', 'tench']

The loop to check how many of these words are real.

In [69]:
real_words = 0
fake_words = 0
words_found = []

print("Checking the generated combination of words against an existing dictionary...")
for counter, item in enumerate(word_dict.values()):
    if d.check(item):
        words_found.append(item)
        real_words += 1
    elif d.check(item) is False:
        fake_words += 1

print(f"There are {real_words} real words and {fake_words} words that are not real.")
print(f"The real words found are: {words_found}")

Checking the generated combination of words against an existing dictionary...
There are 0 real words and 5040 words that are not real.
The real words found are: []


### Second example. Doing the same analysis on a randomly generated list of seven, non-repeated letters.

In [57]:
np.random.seed(2222)
randomly_chosen_letters = list(np.random.choice(list(string.ascii_lowercase), 7))

In [58]:
randomly_chosen_letters

['b', 'r', 'j', 'g', 'y', 'i', 'a']

In [59]:
combinations = tuple(itertools.permutations(randomly_chosen_letters,len(randomly_chosen_letters)))

In [60]:
# words_array = {range(len(combinations)), combinations}
word_dict = {}
for counter, item in enumerate(combinations):
    word_dict[counter] = ''.join(item)

In [61]:
len(word_dict)

5040

In [62]:
real_words = 0
fake_words = 0
words_found = []

print("Checking the generated combination of words against an existing dictionary...")
for counter, item in enumerate(word_dict.values()):
    if d.check(item):
        words_found.append(item)
        real_words += 1
    elif d.check(item) is False:
        fake_words += 1

print(f"There are {real_words} real words and {fake_words} words that are not real.")
print(f"The real words found are: {words_found}")

Checking the generated combination of words against an existing dictionary...
There are 0 real words and 5040 words that are not real.
The real words found are: []


### Third example. Checking 10 randomly chosen words & comparing results to 10 randomly generated sets of characters

In [65]:
d.suggest("random")

['ransom',
 'randoms',
 'fandom',
 'ran dom',
 'ran-dom',
 'rand om',
 'rand-om',
 'random']

In [183]:
# Generating our 10 words made up of randomly generated sets of characters, seven characters long
chars = string.ascii_lowercase
sets_of_random_characters = ["".join([choice(chars) for i in range(8)]) for j in range(10)]
sets_of_random_characters

['cjbqofiz',
 'hvqhdkla',
 'qmjoofuf',
 'cozoaqcm',
 'pjkpomua',
 'fmtoykgt',
 'rgjxgbom',
 'unudsoah',
 'pgwhulbq',
 'dyzwljll']

In [174]:
# 10-randomly generated real words, as determined by `enchant` library
generated_real_words = []

def generate_real_words(length_of_words, number_of_words):
    """
    Generate a list of real words by generating random string and checking against
    a pyenchant US dictionary.
    
    Parameters
    ----------
    length_of_words : integer specifying how long the words must be
    number_of_words : integer specifiying how many words are to be generated
    
    The code was accelerated by using the `suggest` function within pyechant,
    which allows us to not have to generate a word exactly-as-is, but only use
    and approximation and then take the length-appropiate choice suggested
    by pyenchant.
    
    """

    # A list to hold all the generated words
    generated = []
    
    # Dictionary and the list of all ascii lowercase characters
    us_dict = enchant.Dict("en_US")
    chars = string.ascii_lowercase
    
    while len(generated) <= number_of_words:
        
        # Generate a temp word, check against US dictionary,
        # and add it to the list if the word is real
        temp_word = "".join([choice(chars) for i in range(length_of_words)])          
        if us_dict.check(temp_word):
            generated.append(temp_word)
        else:
            
            # Try to find suggestions from pyenchant
            try:
                temp_list = us_dict.suggest(t)
                
                # Check if suggested words are correct length, and add to
                # the final list if they are
                for i in temp_list:
                    if len(i) == length_of_words:
                        generated.append(i)
                    else:
                        pass                
            except:
                pass 


            pass
    
    return generated    

# def test_generate_real_words():
#     assert isinstance(length_of_words, int)
#     assert isinstance(number_of_words, int)
    

Time of execution for optimized function

In [181]:
%%time
stuff = generate_real_words(length_of_words=6, number_of_words=6)
stuff

Wall time: 49.4 s


['aptest', 'raptly', 'napery', 'edgers', 'knolls', 'bocage', 'evener']

Time of execution of non-optimized function

In [179]:
def non_optimized(length_of_words, number_of_words):
    """
    Generate a list of real words by generating random string and checking against
    a pyenchant US dictionary.

    Parameters
    ----------
    length_of_words : integer specifying how long the words must be
    number_of_words : integer specifiying how many words are to be generated
    
    The code was accelerated by using the `suggest` function within pyechant,
    which allows us to not have to generate a word exactly-as-is, but only use
    and approximation and then take the length-appropiate choice suggested
    by pyenchant.
    
    """

    # A list to hold all the generated words
    generated = []
    
    # Dictionary and the list of all ascii lowercase characters
    us_dict = enchant.Dict("en_US")
    chars = string.ascii_lowercase
    
    while len(generated) <= number_of_words:
        # Generate a temp word, check against US dictionary,
        # and add it to the list if the word is real
        temp_word = "".join([choice(chars) for i in range(length_of_words)])          
        if us_dict.check(temp_word):
            generated.append(temp_word)
        else:
            pass
    
    return generated    

In [180]:
%%time 
stuff = non_optimized(length_of_words=6, number_of_words=6)
stuff

Wall time: 1min 4s


['enrobe', 'subtly', 'ozonic', 'sleepy', 'oppose', 'gender', 'purree']

### Results of optimization of word combinations function:
Optimized function is not always faster than non-optimized function, so it depends a lot on whether the `suggest` function in `pyenchant` is able to successfully create suggestions that are the right length.

## Comparing how many words are real in real-words vs randomly generated sets of characters

In [187]:
randomly_chosen_words = stuff
sets_of_random_characters
print(f"The real generated words are {randomly_chosen_words}, and the randomly generated sets of characters are {sets_of_random_characters}")

The real generated words are ['aptest', 'raptly', 'napery', 'edgers', 'knolls', 'bocage', 'evener'], and the randomly generated sets of characters are ['cjbqofiz', 'hvqhdkla', 'qmjoofuf', 'cozoaqcm', 'pjkpomua', 'fmtoykgt', 'rgjxgbom', 'unudsoah', 'pgwhulbq', 'dyzwljll']


In [289]:
def real_words_from_combinations(list_of_words, verbose = False):
    """
    A function that takes a list of words, and then finds the number of real words that
    can be formed from all the possible combinations of the characters of each word in the set.
    
    Parameters
    ----------
    list_of_words : a list of words
    verbose : Set to true to display progress messages
    
    Returns
    -------
    real_words_per_word : a tuple containing the number of words observed from each word in the set
    words_found : a list of lists that contains the real words found for each word in the set
    
    """
    if verbose == True:
        print("Checking the generated combination of words against an existing dictionary...")
    
    # Defining the variables to hold final results
    real_words_per_word = []
    words_found = []
    
    # Creating a US dictionary to check words against
    us_dict = enchant.Dict("en_US") 
    

    
    # Checking each word in the given list
    for temp_word in list_of_words:
        
        # Variables to hold per-word results 
        temp_real_words = 0
        temp_fake_words = 0
        temp_words_found = []
        
        # Creating all combinations that can be created with current word in given list of words
        temp_combination = list(itertools.permutations(temp_word,len(temp_word)))
        temp_word_dict = {}
        for counter, item in enumerate(temp_combination):
            temp_word_dict[counter] = ''.join(item)

        # Check if word exists, if word is not the starting word, and if the word has not been previously
        # stored in the final list
        for counter, item in enumerate(temp_word_dict.values()):
            if us_dict.check(item):
                if item == temp_word:
                    pass
                else:
                    if item in real_words_per_word:
                        pass
                    else:
                        temp_words_found.append(item)
                        temp_real_words += 1
            else:
                temp_fake_words += 1
        
        # Removing duplicates and re-assign real number of real words if needed
        temp_words_found = list(dict.fromkeys(temp_words_found))
        temp_real_words = len(temp_words_found)
        
        # Storing per-word results
        real_words_per_word.append(temp_real_words)
        words_found.append(temp_words_found)
        
    if verbose == True:
        print('Done.')
    
    return real_words_per_word, words_found
        
#         print(f"There are {real_words} real words and {fake_words} words that are not real.")
#         print(f"The real words found are: {words_found}")


In [290]:
test_list =['altruist', 'cat']
print(real_words_from_combinations(test_list, verbose = True))

Checking the generated combination of words against an existing dictionary...
Done.
([0, 1], [[], ['act']])


In [276]:
test_list_two = ['paradox', 'super']
print(real_words_from_combinations(test_list_two))

([0, 2], [[], ['sprue', 'purse']])


## Final experiment with combinations

Testing how many real words there are in the 10 sets of randomly generated characters vs the set 10 real words.

In [354]:
frequency, words_found = real_words_from_combinations(randomly_chosen_words)

print("-"*200)
print('Results for set of real words:')
print(f"Test set:                                                  {randomly_chosen_words}")
print(f"The number of real words per word in the test set is:      {frequency}")
print(f"The real words found are:                                  {words_found}")
print("-"*200)

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Results for set of real words:
Test set:                                                  ['aptest', 'raptly', 'napery', 'edgers', 'knolls', 'bocage', 'evener']
The number of real words per word in the test set is:      [0, 2, 0, 0, 0, 0, 1]
The real words found are:                                  [[], ['partly', 'paltry'], [], [], [], [], ['veneer']]
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------


In [355]:
frequency, words_found = real_words_from_combinations(sets_of_random_characters)

print("-"*200)
print('Results for set of real words:')
print(f"Test set:                                                  {sets_of_random_characters}")
print(f"The number of real words per word in the test set is:      {frequency}")
print(f"The real words found are:                                  {words_found}")
print("-"*200)

--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
Results for set of real words:
Test set:                                                  ['cjbqofiz', 'hvqhdkla', 'qmjoofuf', 'cozoaqcm', 'pjkpomua', 'fmtoykgt', 'rgjxgbom', 'unudsoah', 'pgwhulbq', 'dyzwljll']
The number of real words per word in the test set is:      [0, 0, 0, 0, 0, 0, 0, 0, 0, 0]
The real words found are:                                  [[], [], [], [], [], [], [], [], [], []]
--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------
