In [1]:
import numpy as np
import time
import itertools
import hashlib

file_name = 'wordlist.txt'

anagram = 'poultry outwits ants'
anagram = anagram.replace(' ', '')

solution_hashes = ['e4820b45d2277f3844eac66c903e84be',
                   '23170acc097c24edb98fc5488ab033fe',
                   '665e5bcb0c20062fe8abaaf4628bb154']

with open(file_name, 'r') as file:
    words_original = file.read().split('\n')
    if '' in words_original:
        words_original.remove('')

print('Number of words in list:', len(words_original))

Number of words in list: 99175


In [2]:
# helper functions
def check_hash(solutions, target_hashes):
    hashes = [hashlib.md5(solution.encode()).hexdigest() for solution in solutions]

    for target_hash in target_hashes:
        try:
            print(f'"{solutions[hashes.index(target_hash)]}" matches hash {target_hash}')
        except:
            pass
#             print(target_hash, 'not found in solution')

def words_to_phrases(word_lst):
    phrases = []
    for words in word_lst:
        n_words = len(words)
        for permutation in itertools.permutations(words, n_words):
            phrase = ' '.join(permutation)
            phrases.append(phrase)
    return phrases

words_to_phrases(word_lst=[['this', 'is', 'sparta']])

['this is sparta',
 'this sparta is',
 'is this sparta',
 'is sparta this',
 'sparta this is',
 'sparta is this']

# Word list filtering
1. Only keep unique words
2. The word is required to only contain characters that are in the anagram phrase
3. The word cannot have more instances of a given character than is within the anagram phrase

In [3]:
# Bullet 1 - filter out non-unique words
def remove_non_unique(words):
    return set(words)

print(f'Words in current list:', len(words_original))
words = remove_non_unique(words_original)
print('Words remaining:', len(words))

Words in current list: 99175
Words remaining: 96317


In [4]:
# Bullet 2 - Ignore words with characters not in anagram
def filter_invalid_char_words(words, anagram):
    anagram_set = set(anagram)
    words_subset = [word for word in words if set(word).issubset(anagram_set)]

    return words_subset

print(f'Words in current list:', len(words))
words = filter_invalid_char_words(words, anagram)
print('Words remaining:', len(words))

Words in current list: 96317
Words remaining: 2527


In [5]:
# Bullet 3 - ignore words with too many instances of given character
def get_character_count(word):
    char_cnt = {}
    for char in set(word):
        char_cnt[char] = word.count(char)
    
    return char_cnt
        
def filter_execive_char_words(words):
    # character count in anagram
    anagram_char_cnt = get_character_count(anagram)

    # keep only words with character count compatible with anagram
    words_cnt = []
    for word in words:
        keep_word = True
        for char in set(word):
            char_cnt = word.count(char)
            if char_cnt > anagram_char_cnt[char]:
                keep_word = False
                break

        if keep_word:
            words_cnt.append(word)
            
    return words_cnt
        
print(f'Words in current list:', len(words))
words = filter_execive_char_words(words)
print('Words remaining:', len(words))

Words in current list: 2527
Words remaining: 1659


In [6]:
# Only keep a single instance of a words characters
# Note that these need to be expanded into all valid forms later
# def filter_rendundant_character_sets(words):
#     words_unique_lst = np.unique(np.array([sorted(word) for word in words], dtype=object))
#     words_unique = [''.join(char_lst) for char_lst in words_unique_lst]
#     return words_unique

# print(f'Words in current list:', len(words))
# words_set = filter_rendundant_character_sets(words)
# print('Words remaining:', len(words_set))

## Word scoring
Each word is given a score based on their contained characters. Each character has a unique value, that is then summed up for the word. 

In [7]:
def is_anagram(anagram, phrase):
    ana_cnt = get_character_count(anagram)
    phrase_cnt = get_character_count(phrase)
    
    if ana_cnt == phrase_cnt:
        return True
    else:
        return False

# provide each word a numerical score
def get_value_map(chars):
    value_map = {}
    for idx, char in enumerate(chars):
        value_map[char] = 10**idx
        
    return value_map
        
def get_word_value(word, value_map):
    value = 0
    
    for char in word:
        value += value_map[char]
            
    return value


def assign_word_values(words, value_map):
    word_values = {}

    for word in words:
        word_value = get_word_value(word, value_map=value_map)
        word_values[word] = word_value
    
    sorted_value_list = sorted(word_values.items(), key=lambda t: t[::-1])

    return dict(sorted_value_list), sorted_value_list

value_map = get_value_map(set(anagram))
word_value_dct, sorted_value_list = assign_word_values(words, value_map)
anagram_value = get_word_value(word=anagram, value_map=value_map)

print('Anagram phrase:', anagram)
print('Anagram value:', anagram_value)

# checking uniqueness of word values
print('\nChecking uniqueness of word value assignments...')
print(f'Number of unique words:', len(np.unique(np.array([sorted(word) for word in words], dtype=object))))
print(f'Number of unique word values:', len(np.unique([val[1] for val in sorted_value_list])))

Anagram phrase: poultryoutwitsants
Anagram value: 112121111241

Checking uniqueness of word value assignments...
Number of unique words: 1179
Number of unique word values: 1179


# Solution for 2 words
Instead of looping through all words twice to find if their values sum up to the target value (O(n^2)), we can use a sorted list and only go through the list once (O(n)). We look at pairs of numbers by combining low and high values each time and moving the current index according to the target value.

In [8]:
# 2 element sums
def get_pair_sum(word_list, sum_target):
    solutions = []

    idx_low = 0
    idx_high = len(word_list) - 1
    
    while idx_low < idx_high:
        val_low = word_list[idx_low][1]
        val_high = word_list[idx_high][1]

        phrase_value = val_low + val_high
        
        if phrase_value == sum_target:
            word_low = word_list[idx_low][0]
            word_high = word_list[idx_high][0]
            
            solutions.append([word_low, word_high])
    
            # this logic keeps the loop running for mutiple words with same characters: ex: nip, pin
            next_value_low = word_list[idx_low + 1][1]
            next_value_high = word_list[idx_high - 1][1]
            
            if next_value_low == val_low:
                idx_low += 1
            elif next_value_high == val_high:
                idx_high -= 1
            else:
                idx_low += 1

        elif phrase_value < sum_target:
            idx_low += 1
        else:
            idx_high -= 1

    return solutions

# function test: expected ('a', 'f'), ('b', 'e') and ('c', 'e')
get_pair_sum(word_list=[('a', 3), ('b', 4), ('c', 4), ('d',8), ('e', 11), ('f', 12), ('g', 15)], sum_target=15)

[['a', 'f'], ['b', 'e'], ['c', 'e']]

# Recursive solution
This code makes looking for varying number of words in anagram easier using recursion instead of nested for loops.
The code falls back to the base case of 2 remaining words, using the function above.

In [9]:
def get_anagram_words(word_lst, target_value, n_words_anagram, words_current=[]):
    solutions_total = []
    
    if n_words_anagram == 1:
        for word, value in word_lst:
            if value == target_value:
                solutions_total.append([word])
            elif value > target_value:
                break
                
    elif n_words_anagram == 2:
        word_pairs = get_pair_sum(word_list=word_lst, sum_target=target_value)
        
        if len(word_pairs) > 0:
            for pair in word_pairs:
                solutions_total.append(words_current + pair)
    else:
        n_words_anagram -= 1
        idx_lst = 0
            
        for word, value in word_lst:
            solutions = get_anagram_words(word_lst[idx_lst:],
                                          target_value=target_value - value,
                                          n_words_anagram=n_words_anagram,
                                          words_current=words_current + [word]
                                         )
            
            if solutions is not None:
                for solution in solutions:
                    solutions_total.append(solution)
                
            idx_lst += 1
    
    return solutions_total


for n in range(1, 5):
    print(f'\nLooking for solutions with {n} words')
    t0 = time.time()
    solutions = get_anagram_words(word_lst=sorted_value_list,
                      target_value=anagram_value,
                      n_words_anagram=n,
                      words_current=[])


    anagram_phrases = words_to_phrases(solutions)
    print(f'Valid anagram phrases with {n} words:', len(anagram_phrases))
    check_hash(anagram_phrases, target_hashes=solution_hashes)
    print(f'Done in {time.time() - t0} seconds')


Looking for solutions with 1 words
Valid anagram phrases with 1 words: 0
Done in 0.0 seconds

Looking for solutions with 2 words
Valid anagram phrases with 2 words: 0
Done in 0.0 seconds

Looking for solutions with 3 words
Valid anagram phrases with 3 words: 4524
"printout stout yawls" matches hash e4820b45d2277f3844eac66c903e84be
"ty outlaws printouts" matches hash 23170acc097c24edb98fc5488ab033fe
Done in 0.1880359649658203 seconds

Looking for solutions with 4 words
Valid anagram phrases with 4 words: 7088928
"wu lisp not statutory" matches hash 665e5bcb0c20062fe8abaaf4628bb154
Done in 101.52310466766357 seconds
