In [1]:
from collections import Counter

In [55]:
# using the file from https://gist.github.com/cfreshman/dec102adb5e60a8299857cbf78f6cf57
with open('wordle-answers-alphabetical.txt') as f:
   answer_words = f.readlines()

with open('allowed.txt') as f:
   allowed_words = f.readlines()

In [56]:
answer_words = [word[:5] for word in answer_words]
allowed_words = [word[:5] for word in allowed_words]

In [57]:
print(answer_words[:10])

['aback', 'abase', 'abate', 'abbey', 'abbot', 'abhor', 'abide', 'abled', 'abode', 'abort']


In [58]:
# Create a score for each letter in each position, based on the frequency of that letter appearing in the answers
positions = [Counter(),Counter(),Counter(),Counter(),Counter()]
for word in answer_words:
    for index,letter in enumerate(word):
        positions[index][letter] += 1
        
print(f"First letter values {positions[0]}")
print(f"Second letter values {positions[1]}")
print(f"Third letter values {positions[2]}")
print(f"Fourth letter values {positions[3]}")
print(f"Fifth letter values {positions[4]}")

First letter values Counter({'s': 366, 'c': 198, 'b': 173, 't': 149, 'p': 142, 'a': 141, 'f': 136, 'g': 115, 'd': 111, 'm': 107, 'r': 105, 'l': 88, 'w': 83, 'e': 72, 'h': 69, 'v': 43, 'o': 41, 'n': 37, 'i': 34, 'u': 33, 'q': 23, 'j': 20, 'k': 20, 'y': 6, 'z': 3})
Second letter values Counter({'a': 304, 'o': 279, 'r': 267, 'e': 242, 'i': 202, 'l': 201, 'u': 186, 'h': 144, 'n': 87, 't': 77, 'p': 61, 'w': 44, 'c': 40, 'm': 38, 'y': 23, 'd': 20, 'b': 16, 's': 16, 'v': 15, 'x': 14, 'g': 12, 'k': 10, 'f': 8, 'q': 5, 'z': 2, 'j': 2})
Third letter values Counter({'a': 307, 'i': 266, 'o': 244, 'e': 177, 'u': 165, 'r': 163, 'n': 139, 'l': 112, 't': 111, 's': 80, 'd': 75, 'g': 67, 'm': 61, 'p': 58, 'b': 57, 'c': 56, 'v': 49, 'y': 29, 'w': 26, 'f': 25, 'k': 12, 'x': 12, 'z': 11, 'h': 9, 'j': 3, 'q': 1})
Fourth letter values Counter({'e': 318, 'n': 182, 's': 171, 'a': 163, 'l': 162, 'i': 158, 'c': 152, 'r': 152, 't': 139, 'o': 132, 'u': 82, 'g': 76, 'd': 69, 'm': 68, 'k': 55, 'p': 50, 'v': 46, 'f':

In [59]:
# now let's score the corpus, including the allowed words that will never be answers 
# (this doesn't actuallly change the answer: the best words are also possible answers)

green_scored_words = []
for word in answer_words + allowed_words:
    score = 0
    for index,letter in enumerate(word):
        score += positions[index][letter]
    green_scored_words.append((word, score))
green_scored_words.sort(key = lambda x:x[1], reverse = True)
print(green_scored_words[:100])

[('slate', 1437), ('sauce', 1411), ('slice', 1409), ('shale', 1403), ('saute', 1398), ('share', 1393), ('sooty', 1392), ('shine', 1382), ('suite', 1381), ('crane', 1378), ('saint', 1371), ('soapy', 1366), ('shone', 1360), ('shire', 1352), ('saucy', 1351), ('slave', 1344), ('cease', 1342), ('sense', 1342), ('saner', 1339), ('snare', 1336), ('stale', 1336), ('crate', 1335), ('spree', 1332), ('shore', 1330), ('suave', 1329), ('slide', 1326), ('stare', 1326), ('slime', 1325), ('sorry', 1324), ('brace', 1323), ('shiny', 1322), ('gooey', 1320), ('seize', 1318), ('crone', 1315), ('sleet', 1315), ('state', 1313), ('brine', 1312), ('scree', 1311), ('shade', 1310), ('space', 1310), ('spare', 1310), ('shame', 1309), ('slant', 1309), ('sally', 1308), ('scale', 1299), ('spine', 1299), ('trace', 1299), ('shake', 1296), ('stone', 1293), ('tease', 1293), ('shape', 1291), ('scare', 1289), ('shave', 1287), ('salty', 1285), ('sassy', 1285), ('slope', 1285), ('since', 1283), ('poise', 1282), ('swine', 128

In [60]:
best_word = green_scored_words[0][0]
print(f"Best first word: {best_word}")

Best first word: slate


In [61]:
#now let's find the best companion word. It's the highest-scoring word
#that doesn't share any letters with the first word

for scored_word in green_scored_words:
    if not any([letter in best_word for letter in scored_word[0]]):  
        break

print(f"Best second word: {scored_word}")

Best second word: ('crony', 1255)


In [62]:
# Ok, but is slate-crony really the best combo? It's possible there's a lower-scoring
# word than "slate" that has a higher-scoring companion than "crony", for a higher total
# score. Let's take a look at that. Since we know the answer has to beat slate-crony, we 
# can limit the counting pretty heavily

max_score = 0
best_combo = ()
for first_scored_word in green_scored_words[:100]:
    for second_scored_word in green_scored_words[:100]:
        if not any([letter in first_scored_word[0] for letter in second_scored_word[0]]):
            score = first_scored_word[1] + second_scored_word[1]
            if score > max_score:
                max_score = score
                best_combo = (first_scored_word, second_scored_word)
            break

print(f"Best combo: {best_combo}")

Best combo: (('sooty', 1392), ('crane', 1378))


In [63]:
# Ok, a sooty crane beats a slate of cronies! There's an obviously undesirable element there,
# though: "sooty" has a double "o", wasting the opportunity for a yellow hit. We may need to
# take this a bit further, and consider yellow-hit potential. First, let's check how likely
# we are to get a green hit from these two words:
letters_in_answer_corpus = len(answer_words)*5
hits_in_combo = best_combo[0][1] + best_combo[1][1]
average_hits = hits_in_combo/letters_in_answer_corpus
print(f"Average hits from best combo: {round(average_hits,2)}")

Average hits from best combo: 0.24


In [64]:
# 0.24 hits, in two words? That's actually not that good! Clearly, trying to generate green hits alone might
# be too much of an optimistic strategy. Let's add a level, and give yellow hits some weight

yellow_values = Counter()
for word in answer_words:
    for letter in word:
        yellow_values[letter] += 1
print(yellow_values)

Counter({'e': 1233, 'a': 979, 'r': 899, 'o': 754, 't': 729, 'l': 719, 'i': 671, 's': 669, 'n': 575, 'c': 477, 'u': 467, 'y': 425, 'd': 393, 'h': 389, 'p': 367, 'm': 316, 'g': 311, 'b': 281, 'f': 230, 'k': 210, 'w': 195, 'v': 153, 'z': 40, 'x': 37, 'q': 29, 'j': 27})


In [65]:
# Wow, "z" is more common than "j"! "S" comes in surprisingly low as well, 
# though we know it's the most common initial letter. Ok, let's score the words
# based on their yellow-letter potential

yellow_scored_words = []
for word in answer_words + allowed_words:
    score = sum(yellow_values[letter] for letter in word)
    yellow_scored_words.append((word, score))
yellow_scored_words.sort(key = lambda x:x[1], reverse = True)
print(yellow_scored_words[:100])

[('eerie', 5269), ('eater', 5073), ('erase', 5013), ('rarer', 4909), ('elate', 4893), ('tease', 4843), ('easel', 4833), ('lease', 4833), ('tepee', 4795), ('ester', 4763), ('reset', 4763), ('steer', 4763), ('terse', 4763), ('eaten', 4749), ('terra', 4739), ('melee', 4734), ('error', 4684), ('geese', 4679), ('enter', 4669), ('arena', 4665), ('agree', 4655), ('eager', 4655), ('sneer', 4609), ('cease', 4591), ('elite', 4585), ('sleet', 4583), ('steel', 4583), ('erect', 4571), ('treat', 4569), ('alert', 4559), ('alter', 4559), ('later', 4559), ('arose', 4534), ('retro', 4514), ('erode', 4512), ('irate', 4511), ('scree', 4511), ('leery', 4509), ('stare', 4509), ('reuse', 4501), ('tenet', 4499), ('freer', 4494), ('refer', 4494), ('emcee', 4492), ('deter', 4487), ('racer', 4487), ('ether', 4483), ('there', 4483), ('three', 4483), ('elder', 4477), ('eagle', 4475), ('arise', 4451), ('leper', 4451), ('raise', 4451), ('repel', 4451), ('tense', 4439), ('sheer', 4423), ('meter', 4410), ('egret', 440

In [66]:
# Uh, there's a problem here: we're getting words with high-scoring repeated letters. No good.
# Let's do that again

yellow_scored_words = []
for word in answer_words + allowed_words:
    if len(set(word)) == 5:
        score = sum(yellow_values[letter] for letter in word)
        yellow_scored_words.append((word, score))
yellow_scored_words.sort(key = lambda x:x[1], reverse = True)
print(yellow_scored_words[:100])

[('alert', 4559), ('alter', 4559), ('later', 4559), ('arose', 4534), ('irate', 4511), ('stare', 4509), ('arise', 4451), ('raise', 4451), ('learn', 4405), ('renal', 4405), ('saner', 4355), ('snare', 4355), ('least', 4329), ('slate', 4329), ('stale', 4329), ('steal', 4329), ('cater', 4317), ('crate', 4317), ('react', 4317), ('trace', 4317), ('clear', 4307), ('store', 4284), ('loser', 4274), ('aisle', 4271), ('atone', 4270), ('teary', 4265), ('alone', 4260), ('adore', 4258), ('scare', 4257), ('early', 4255), ('layer', 4255), ('relay', 4255), ('leant', 4235), ('trade', 4233), ('tread', 4233), ('opera', 4232), ('earth', 4229), ('hater', 4229), ('heart', 4229), ('taper', 4207), ('paler', 4197), ('pearl', 4197), ('tenor', 4190), ('alien', 4177), ('aider', 4175), ('share', 4169), ('shear', 4169), ('crane', 4163), ('tamer', 4156), ('grate', 4151), ('great', 4151), ('parse', 4147), ('spare', 4147), ('spear', 4147), ('realm', 4146), ('glare', 4141), ('lager', 4141), ('large', 4141), ('regal', 414

In [67]:
# Ok, interesting - this list is very different from the green-hit list, though "slate",
# "alert" and "alter" are obviously related words. Let's look for the best combo

max_score = 0
best_combo = ()
for first_scored_word in yellow_scored_words:
    for second_scored_word in yellow_scored_words:
        if not any([letter in first_scored_word[0] for letter in second_scored_word[0]]):
            score = first_scored_word[1] + second_scored_word[1]
            if score > max_score:
                max_score = score
                best_combo = (first_scored_word, second_scored_word)
            break
print(f"Best combo: {best_combo}")

Best combo: (('alert', 4559), ('scion', 3146))


In [68]:
# Cool words. How many yellow hits can we expect from this
words_in_answer_corpus = len(answer_words)
hits_in_combo = best_combo[0][1] + best_combo[1][1]
average_hits = hits_in_combo/words_in_answer_corpus
print(f"Average hits from best combo: {round(average_hits,2)}")

Average hits from best combo: 3.33


In [75]:
# First, let's reorganize as dictionaries, ranking the words by their score. Highest is best
# Since the yellow_scored_words is now a substantially shorter list, let's use that one

yellow_words = [word[0] for word in yellow_scored_words]
green_scored_words = [(word[0], word[1]) for word in green_scored_words if word[0] in yellow_words]
green_dict = {word[0]:len(green_scored_words)-rank for rank,word in enumerate(green_scored_words)}
yellow_dict = {word[0]:len(yellow_scored_words)-rank for rank,word in enumerate(yellow_scored_words)}

print(green_dict)

{'slate': 1774, 'sauce': 1773, 'slice': 1772, 'shale': 1771, 'saute': 1770, 'share': 1769, 'shine': 1768, 'suite': 1767, 'crane': 1766, 'saint': 1765, 'soapy': 1764, 'shone': 1763, 'shire': 1762, 'saucy': 1761, 'slave': 1760, 'saner': 1759, 'snare': 1758, 'stale': 1757, 'crate': 1756, 'shore': 1755, 'suave': 1754, 'slide': 1753, 'stare': 1752, 'slime': 1751, 'brace': 1750, 'shiny': 1749, 'crone': 1748, 'brine': 1747, 'shade': 1746, 'space': 1745, 'spare': 1744, 'shame': 1743, 'slant': 1742, 'scale': 1741, 'spine': 1740, 'trace': 1739, 'shake': 1738, 'stone': 1737, 'shape': 1736, 'scare': 1735, 'shave': 1734, 'salty': 1733, 'slope': 1732, 'since': 1731, 'poise': 1730, 'swine': 1729, 'boney': 1728, 'snore': 1727, 'stole': 1726, 'sadly': 1725, 'raise': 1724, 'arise': 1723, 'spice': 1722, 'spire': 1721, 'grace': 1720, 'slimy': 1719, 'store': 1718, 'cause': 1717, 'prone': 1716, 'trice': 1715, 'blare': 1714, 'plane': 1713, 'scone': 1712, 'smile': 1711, 'spite': 1710, 'crony': 1709, 'briny': 

In [81]:
# Now, let's get the combo that has the highest combined rank
max_rank = 0
for first_word in yellow_words:
    for second_word in yellow_words:
        if not any([letter in first_word for letter in second_word]):
            rank = (
                green_dict.get(first_word,0)**2
                + green_dict.get(second_word,0)**2
                + yellow_dict.get(first_word,0)**2
                + yellow_dict.get(second_word,0)**2
            )
            if rank > max_rank:
                max_rank = rank
                best_combo = (first_word, second_word)

print(best_combo)
first_word = best_combo[0]
second_word = best_combo[1]
print(f"{first_word} green rank {green_dict.get(first_word,0)}")
print(f"{second_word} green rank {green_dict.get(second_word,0)}")
print(f"{first_word} yellow rank {yellow_dict.get(first_word,0)}")
print(f"{second_word} yellow rank {yellow_dict.get(second_word,0)}")

('stole', 'rainy')
stole green rank 1726
rainy green rank 1669
stole yellow rank 1706
rainy yellow rank 1306


In [82]:
words_in_answer_corpus = len(answer_words)
first_word_yellow_score = [word for word in yellow_scored_words if word[0] == first_word][0][1]
second_word_yellow_score = [word for word in yellow_scored_words if word[0] == second_word][0][1]
hits_in_combo = first_word_yellow_score + second_word_yellow_score
average_hits = hits_in_combo/words_in_answer_corpus
print(f"Average yellow hits from best combo: {round(average_hits,2)}")

Average hits from best combo: 3.31


In [85]:
letters_in_answer_corpus = len(answer_words)*5
first_word_green_score = [word for word in green_scored_words if word[0] == first_word][0][1]
second_word_green_score = [word for word in green_scored_words if word[0] == second_word][0][1]
hits_in_combo = first_word_green_score + second_word_green_score
average_hits = hits_in_combo/letters_in_answer_corpus
print(f"Average green hits from best combo: {round(average_hits,2)}")

Average green hits from best combo: 0.22


"STOLE", "RAINY" looks like a good compromise! It yields 3.31 yellow hits, just under the 3.33 maximum, and 0.22 green hits, just under the 0.24 maximum. I think we have our answer!