In [1]:
import scipy
from sklearn.neighbors import KDTree
from scipy.spatial import distance 
import numpy as np
from numpy.random import default_rng
import pandas as pd
from tqdm import tqdm
from collections import defaultdict
import random
import copy
import functools
import itertools
import json

In [2]:
# Load the Google common-words list (no-swears variant), one word per line.
# Surrounding whitespace is stripped and blank lines are dropped.

with open('google-10000-english/google-10000-english-no-swears.txt') as f:
    stripped_lines = (raw.strip() for raw in f)
    words_list = [entry for entry in stripped_lines if entry]

len(words_list)

9894

In [3]:
# Spot-check a single entry near the end of the loaded list.
words_list[9888]

'swaziland'

In [4]:
# Substring search: every loaded word that contains 'new' anywhere in it.
[w for w in words_list if 'new' in w]

['new',
 'news',
 'newsletter',
 'newsletters',
 'knew',
 'newspaper',
 'newest',
 'newly',
 'newspapers',
 'renewal',
 'newton',
 'newport',
 'newcastle',
 'newbie',
 'newer',
 'renew',
 'renewable',
 'newfoundland',
 'newark',
 'newman']

In [5]:
# Strip down to common ones: only the head of the list is kept.
COMMON_WORD_LIMIT = 4000

# Print the position of 'eau' to see where a given word sits relative to the cut-off.
# NOTE: this cell is not safe to re-run. Once the list has been truncated,
# `.index` raises ValueError for any word that fell beyond the cut-off.
print(words_list.index('eau'))
words_list = words_list[:COMMON_WORD_LIMIT]

8479


In [3]:
# Load the Collins Scrabble word list, lower-cased, one word per line.
with open('Collins Scrabble Words (2019).txt') as f:
    next(f)  # Remove first line
    lowered_lines = (raw.strip().lower() for raw in f)
    scrabble_words_list = [entry for entry in lowered_lines if entry]

len(scrabble_words_list)

279496

In [4]:
# Find overlap: keep only the common words that are also valid Scrabble words.
# NOTE: from here on `words_list` is a set, despite its name.
words_list = set(words_list) & set(scrabble_words_list)
len(words_list)

8347

In [9]:
# Create guess list: every Scrabble word of at most six letters. This contains
# far more words than the answer list.
guesses_list = [candidate for candidate in scrabble_words_list if len(candidate) <= 6]
print(len(guesses_list))

# Emit the list as a JavaScript file that defines a global `GUESS_LIST` Set.
with open('crosswordle_page/guess_list.js', 'w') as f:
    f.write("var GUESS_LIST = new Set(" + json.dumps(guesses_list) + ");")

43117


In [21]:
# Sanity check: common words that are missing from the Scrabble list.
# (Empty once `words_list` has been replaced by the overlap.)
set(words_list).difference(scrabble_words_list)

{'lexmark',
 've',
 'url',
 'eden',
 'phil',
 'plugins',
 'gzip',
 'msie',
 'bhutan',
 'usgs',
 'l',
 'lithuania',
 'tgp',
 'cc',
 'dk',
 'wv',
 'zimbabwe',
 'tracy',
 'december',
 'seattle',
 'tampa',
 'anthony',
 'xl',
 'wifi',
 'ist',
 'diego',
 'september',
 'marcus',
 'harold',
 'henderson',
 'kb',
 's',
 'phillips',
 'lexington',
 'nottingham',
 'val',
 'trackbacks',
 'hispanic',
 'rp',
 'southampton',
 'rj',
 'pm',
 'japanese',
 'taiwan',
 'darwin',
 'struct',
 'rochester',
 'istanbul',
 'isaac',
 'mx',
 'belkin',
 'ts',
 'gr',
 'fp',
 'vancouver',
 'sb',
 'cruz',
 'cn',
 'bermuda',
 'wallace',
 'sk',
 'mj',
 'scottish',
 'lc',
 'jefferson',
 'inf',
 'fiji',
 'february',
 'a',
 'berkeley',
 'joyce',
 'aberdeen',
 'gary',
 'erp',
 'deutschland',
 'irc',
 'montreal',
 'irs',
 'qc',
 'ww',
 'mitchell',
 'cio',
 'x',
 'milton',
 'jr',
 'invision',
 'ambien',
 'ppc',
 'linda',
 'saturn',
 'fw',
 'americans',
 'jpg',
 'nba',
 'nec',
 'freebsd',
 'samsung',
 'gmc',
 'kenya',
 'cornell'

In [11]:
# import nltk
# # nltk.download('words')
# nltk.download('wordnet')

In [12]:
# from nltk.corpus import words
# from nltk.corpus import wordnet 

In [13]:
# words = list(wordnet.words())

# def is_word(word):
#     # NOTE: Removing hyphenated words too.
#     return (
#         not any(c in word for c in '-_. \'"0123456789,<>!@#$%^&*({[]})')
#     )
# print(len(words))

# words = [w for w in words if is_word(w)]
# print(len(words))
# # Also convert everything to lower case... Maybe I don't want place names though..??
# words = { w.lower() for w in words }
# print(len(words))

In [14]:
# Bucket the vocabulary by word length: {length: [words of that length]}.
words_by_length = defaultdict(list)

for candidate in words_list:
    same_length_bucket = words_by_length[len(candidate)]
    same_length_bucket.append(candidate)

In [15]:
@functools.lru_cache(maxsize=None)
def words_from_constraint(length, position, letter):
    """Return the set of `length`-letter words that have `letter` at index `position`.

    Reads the module-level `words_by_length` mapping. Results are memoised per
    (length, position, letter), so they go stale if `words_by_length` is
    rebuilt afterwards; re-run this cell or call
    `words_from_constraint.cache_clear()` in that case.

    The returned set is the cached object itself, so treat it as read-only.
    """
    # `.get` avoids inserting an empty bucket into the defaultdict on a miss.
    return {w for w in words_by_length.get(length, ()) if w[position] == letter}


def words_from_constraints(length, constraints):
    """Return the set of `length`-letter words that satisfy every constraint.

    Args:
        length: required word length.
        constraints: iterable of (position, letter) pairs. An empty iterable
            means "any word of this length".

    Returns:
        A new set of matching words. It is always a fresh set, never the
        underlying `words_by_length` bucket or a cached per-constraint set,
        so the caller may mutate it freely.
    """
    constraints = list(constraints)
    if not constraints:
        # Copy into a set so the return type matches the constrained path and
        # the module-level bucket is not handed out by reference.
        return set(words_by_length.get(length, ()))

    sets = (words_from_constraint(length, position, letter)
            for position, letter in constraints)

    # set.intersection builds a new set even when only one set is passed.
    return set.intersection(*sets)


# Example: five-letter words with 'a' at index 0 and 'b' at index 3.
words_from_constraints(5, [(0, 'a'), (3, 'b')])

{'adobe'}

In [16]:
# A word is just a list of positions that're covered by that word in the grid.
# Could store 'position', 'direction' but...

# Generate all words in a grid

# | 0,0 | 1,0 | ..
# | 0,1 | 1,1 | ..
#   ..    ..

def get_words_in_grid(size_x, size_y):
    """Return every full column and full row of a size_x-by-size_y grid.

    Each slot is a list of (x, y) coordinates. Columns come first, then rows,
    and the combined list is shuffled in place with `random.shuffle` before
    being returned.
    """
    column_slots = [[(x, y) for y in range(size_y)] for x in range(size_x)]
    row_slots = [[(x, y) for x in range(size_x)] for y in range(size_y)]

    slots = column_slots + row_slots
    random.shuffle(slots)
    return slots

# Example: all row and column slots of a fully open 5x5 grid, in shuffled order.
words = get_words_in_grid(5, 5)
words

[[(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)],
 [(2, 0), (2, 1), (2, 2), (2, 3), (2, 4)],
 [(0, 0), (1, 0), (2, 0), (3, 0), (4, 0)],
 [(3, 0), (3, 1), (3, 2), (3, 3), (3, 4)],
 [(0, 3), (1, 3), (2, 3), (3, 3), (4, 3)],
 [(0, 4), (1, 4), (2, 4), (3, 4), (4, 4)],
 [(1, 0), (1, 1), (1, 2), (1, 3), (1, 4)],
 [(4, 0), (4, 1), (4, 2), (4, 3), (4, 4)],
 [(0, 1), (1, 1), (2, 1), (3, 1), (4, 1)],
 [(0, 0), (0, 1), (0, 2), (0, 3), (0, 4)]]

In [28]:
# Shared NumPy random generator. It is seeded so that Restart & Run All
# reproduces the same random sequence; change RNG_SEED for a different one.
RNG_SEED = 0
rng = default_rng(RNG_SEED)

In [18]:
# Scratch list used to try out slicing around an index.
a = [*range(10)]

In [19]:
# The element at index 3, everything before it, and everything after it.
(a[3], a[:3], a[4:])

(3, [0, 1, 2], [4, 5, 6, 7, 8, 9])

In [27]:
# Generate some interesting grids..??

def generate_grid(size_x, size_y, min_word_len=3, spawn_attempts=200, random_state=None):
    """Build a crossword layout by randomly dropping '#' blocks onto an open grid.

    Starting from a grid of '_' (open) squares, `spawn_attempts` random cells
    are tried in turn. A cell becomes a '#' block only if it is still open and
    the squares next to it pass `is_valid_word` in all four directions.

    Args:
        size_x: number of columns.
        size_y: number of rows.
        min_word_len: minimum length of an open run left next to a new block.
        spawn_attempts: number of random cells to try. This is an upper bound
            on the number of blocks placed, not the exact number.
        random_state: optional seed or `numpy.random.Generator`. When None the
            module-level `rng` is used.

    Returns:
        A NumPy array of shape (size_y, size_x) holding '_' and '#', indexed
        as grid[y, x].
    """
    generator = rng if random_state is None else np.random.default_rng(random_state)

    def is_valid_word(squares):
        """Check the squares walking away from a candidate block, nearest first.

        A stretch of at most one square, or one that starts with a block, is
        accepted. Otherwise the first `min_word_len` squares must all exist
        and be open.
        """
        if len(squares) <= 1:
            return True

        if squares[0] == '#':
            return True

        if len(squares) < min_word_len:
            return False

        return all(square != '#' for square in squares[:min_word_len])

    grid = np.full((size_y, size_x), '_')
    if grid.size == 0:
        # Nothing to block, and `integers(0)` would raise.
        return grid

    for _ in range(spawn_attempts):
        # Randomly insert '#'s!
        x, y = generator.integers(size_x), generator.integers(size_y)

        if grid[y, x] == '#':
            continue

        # Will this break anything..? Look right, left, up and down. The
        # left/up slices are flipped so that index 0 is the adjacent square.
        neighbouring_runs = (
            grid[y, x+1:],
            np.flip(grid[y, :x]),
            np.flip(grid[:y, x]),
            grid[y+1:, x],
        )
        if all(is_valid_word(run) for run in neighbouring_runs):
            # Can insert!
            grid[y, x] = '#'

    return grid
    

# The grid dimensions are kept in module-level names because later cells
# (word extraction and the solver's letter grid) read `size_x` / `size_y`.
size_x, size_y = 5, 5
grid = generate_grid(size_x, size_y)

NameError: name 'rng' is not defined

In [21]:
# Convert grid to list of words
def _split_into_runs(cells):
    """Split one line of (i, j) cells into runs of consecutive '_' squares.

    Any square that is not '_' ends the current run. Empty runs are returned
    as well; the caller filters them out.
    """
    runs, current = [], []
    for i, j in cells:
        if grid[j, i] == '_':
            current.append((i, j))
        else:
            runs.append(current)
            current = []
    runs.append(current)
    return runs


# Runs of length 0 or 1 are not words, so only runs longer than one square are kept.
col_words = [
    run
    for i in range(size_x)
    for run in _split_into_runs([(i, j) for j in range(size_y)])
    if len(run) > 1
]
row_words = [
    run
    for j in range(size_y)
    for run in _split_into_runs([(i, j) for i in range(size_x)])
    if len(run) > 1
]

# Intercalate for best performance - most interlocking of words.
# zip_longest pads the shorter list with None, which is dropped again here.
words = [
    slot
    for pair in itertools.zip_longest(col_words, row_words)
    for slot in pair
    if slot is not None
]

grid, words

(array([['_', '_', '_', '#', '#'],
        ['_', '#', '_', '_', '_'],
        ['_', '_', '_', '_', '_'],
        ['#', '_', '_', '_', '_'],
        ['_', '_', '_', '#', '#']], dtype='<U1'),
 [[(0, 0), (0, 1), (0, 2)],
  [(0, 0), (1, 0), (2, 0)],
  [(1, 2), (1, 3), (1, 4)],
  [(2, 1), (3, 1), (4, 1)],
  [(2, 0), (2, 1), (2, 2), (2, 3), (2, 4)],
  [(0, 2), (1, 2), (2, 2), (3, 2), (4, 2)],
  [(3, 1), (3, 2), (3, 3)],
  [(1, 3), (2, 3), (3, 3), (4, 3)],
  [(4, 1), (4, 2), (4, 3)],
  [(0, 4), (1, 4), (2, 4)]])

In [22]:
def nice_print(grid):
    """Print `grid` (a sequence of rows of strings) as an ASCII table.

    A '+-+-...+' rule is printed above the first row (sized from the first
    row) and below every row (sized from that row), followed by a blank line.
    """
    def rule(width):
        return '+-' * width + '+'

    print(rule(len(grid[0])))
    for cells in grid:
        print(f"|{'|'.join(cells)}|")
        print(rule(len(cells)))

    print()

In [23]:
# Letter grid that the solver fills in place; ' ' marks a cell with no letter yet.
# This rebinds `grid`, replacing the '_' / '#' layout array of the same name.
# NOTE: Accessed with grid[y][x]
grid = [[' '] * size_x for _ in range(size_y)]

# Number of dfs() calls made at each depth.
node_tracker = defaultdict(int)

def dfs(depth):
    """Depth-first search that fills the slots in `words`, one slot per level.

    Works on module-level state: reads `words` (list of slots, each a list of
    (i, j) coordinates), mutates `grid` in place (accessed as grid[j][i]),
    counts visited nodes in `node_tracker`, and looks candidates up with
    `words_from_constraints`.

    Args:
        depth: index into `words` of the slot to fill at this level.

    Yields:
        A deep copy of `grid` for every complete assignment, i.e. each time
        `depth` runs past the last slot.

    The letters written at a level are removed again when that level is done,
    including when the generator is closed early, so `grid` returns to the
    state it had before the call.
    """
    node_tracker[depth] += 1

    if depth >= len(words):
        yield copy.deepcopy(grid)
        return

    slot = words[depth]

    # Build constraints for this word: cells that already hold a letter (from
    # crossing slots) constrain the candidates; empty cells are ours to fill.
    new_letters = []
    constraints = []
    for ix, (i, j) in enumerate(slot):
        c = grid[j][i]
        if c == ' ':
            new_letters.append((ix, i, j))
        else:
            constraints.append((ix, c))

    # Find words. The length comes from the slot itself rather than from the
    # loop variable above, which would be unbound for an empty slot.
    possible_words = words_from_constraints(len(slot), constraints)
    # NOTE: Candidates are deliberately not shuffled; the original author
    # found that converting to a list and shuffling made the search much slower.

    for w in possible_words:
        # Insert word into grid...
        for letter_ix, i, j in new_letters:
            grid[j][i] = w[letter_ix]

        try:
            # Recurse
            yield from dfs(depth + 1)
        finally:
            # Remove word from grid (also runs if the generator is closed early).
            for _, i, j in new_letters:
                grid[j][i] = ' '
      
# Print up to 100 solutions, each as an ASCII table and as a list of row
# strings. islice stops quietly if the search has fewer than 100 solutions,
# where calling next() 100 times would raise StopIteration.
x = dfs(0)
for b in itertools.islice(x, 100):
    nice_print(b)

    js = str([''.join(row) for row in b])
    print(js)

+-+-+-+-+-+
|t|i|p| | |
+-+-+-+-+-+
|e| |r|e|s|
+-+-+-+-+-+
|n|o|i|s|e|
+-+-+-+-+-+
| |n|o|t|e|
+-+-+-+-+-+
|p|e|r| | |
+-+-+-+-+-+

['tip  ', 'e res', 'noise', ' note', 'per  ']
+-+-+-+-+-+
|t|i|p| | |
+-+-+-+-+-+
|e| |r|e|s|
+-+-+-+-+-+
|n|o|i|s|e|
+-+-+-+-+-+
| |n|o|t|e|
+-+-+-+-+-+
|h|e|r| | |
+-+-+-+-+-+

['tip  ', 'e res', 'noise', ' note', 'her  ']
+-+-+-+-+-+
|t|i|p| | |
+-+-+-+-+-+
|e| |r|e|f|
+-+-+-+-+-+
|n|o|i|s|e|
+-+-+-+-+-+
| |n|o|t|e|
+-+-+-+-+-+
|p|e|r| | |
+-+-+-+-+-+

['tip  ', 'e ref', 'noise', ' note', 'per  ']
+-+-+-+-+-+
|t|i|p| | |
+-+-+-+-+-+
|e| |r|e|f|
+-+-+-+-+-+
|n|o|i|s|e|
+-+-+-+-+-+
| |n|o|t|e|
+-+-+-+-+-+
|h|e|r| | |
+-+-+-+-+-+

['tip  ', 'e ref', 'noise', ' note', 'her  ']
+-+-+-+-+-+
|t|o|p| | |
+-+-+-+-+-+
|e| |r|e|s|
+-+-+-+-+-+
|n|o|i|s|e|
+-+-+-+-+-+
| |n|o|t|e|
+-+-+-+-+-+
|p|e|r| | |
+-+-+-+-+-+

['top  ', 'e res', 'noise', ' note', 'per  ']
+-+-+-+-+-+
|t|o|p| | |
+-+-+-+-+-+
|e| |r|e|s|
+-+-+-+-+-+
|n|o|i|s|e|
+-+-+-+-+-+
| |n|o|t|e|
+-+-+-+-+

+-+-+-+-+-+
|f|l|y| | |
+-+-+-+-+-+
|l| |a|n|n|
+-+-+-+-+-+
|y|a|h|o|o|
+-+-+-+-+-+
| |d|o|w|n|
+-+-+-+-+-+
|i|s|o| | |
+-+-+-+-+-+

['fly  ', 'l ann', 'yahoo', ' down', 'iso  ']
+-+-+-+-+-+
|f|l|y| | |
+-+-+-+-+-+
|l| |a|n|d|
+-+-+-+-+-+
|y|a|h|o|o|
+-+-+-+-+-+
| |d|o|w|n|
+-+-+-+-+-+
|i|s|o| | |
+-+-+-+-+-+

['fly  ', 'l and', 'yahoo', ' down', 'iso  ']
+-+-+-+-+-+
|f|l|y| | |
+-+-+-+-+-+
|l| |a|l|l|
+-+-+-+-+-+
|y|a|h|o|o|
+-+-+-+-+-+
| |d|o|g|s|
+-+-+-+-+-+
|i|s|o| | |
+-+-+-+-+-+

['fly  ', 'l all', 'yahoo', ' dogs', 'iso  ']
+-+-+-+-+-+
|f|l|y| | |
+-+-+-+-+-+
|l| |a|r|m|
+-+-+-+-+-+
|y|a|h|o|o|
+-+-+-+-+-+
| |d|o|w|n|
+-+-+-+-+-+
|i|s|o| | |
+-+-+-+-+-+

['fly  ', 'l arm', 'yahoo', ' down', 'iso  ']
+-+-+-+-+-+
|y|e|s| | |
+-+-+-+-+-+
|o| |h|a|t|
+-+-+-+-+-+
|u|s|a|g|e|
+-+-+-+-+-+
| |i|r|o|n|
+-+-+-+-+-+
|a|r|e| | |
+-+-+-+-+-+

['yes  ', 'o hat', 'usage', ' iron', 'are  ']
+-+-+-+-+-+
|y|e|s| | |
+-+-+-+-+-+
|o| |h|a|t|
+-+-+-+-+-+
|u|s|a|g|e|
+-+-+-+-+-+
| |i|r|o|n|
+-+-+-+-+

+-+-+-+-+-+
|f|i|t| | |
+-+-+-+-+-+
|a| |w|a|r|
+-+-+-+-+-+
|t|w|i|c|e|
+-+-+-+-+-+
| |a|c|t|s|
+-+-+-+-+-+
|e|y|e| | |
+-+-+-+-+-+

['fit  ', 'a war', 'twice', ' acts', 'eye  ']
+-+-+-+-+-+
|f|i|t| | |
+-+-+-+-+-+
|a| |w|a|y|
+-+-+-+-+-+
|t|w|i|c|e|
+-+-+-+-+-+
| |a|c|t|s|
+-+-+-+-+-+
|e|y|e| | |
+-+-+-+-+-+

['fit  ', 'a way', 'twice', ' acts', 'eye  ']
+-+-+-+-+-+
|f|i|t| | |
+-+-+-+-+-+
|a| |a|r|e|
+-+-+-+-+-+
|t|a|k|e|s|
+-+-+-+-+-+
| |r|e|s|t|
+-+-+-+-+-+
|i|t|s| | |
+-+-+-+-+-+

['fit  ', 'a are', 'takes', ' rest', 'its  ']
+-+-+-+-+-+
|f|i|t| | |
+-+-+-+-+-+
|a| |a|r|e|
+-+-+-+-+-+
|t|a|x|e|s|
+-+-+-+-+-+
| |r|e|s|t|
+-+-+-+-+-+
|i|t|s| | |
+-+-+-+-+-+

['fit  ', 'a are', 'taxes', ' rest', 'its  ']
+-+-+-+-+-+
|f|i|t| | |
+-+-+-+-+-+
|a| |h|a|t|
+-+-+-+-+-+
|t|h|e|i|r|
+-+-+-+-+-+
| |a|r|m|y|
+-+-+-+-+-+
|u|s|e| | |
+-+-+-+-+-+

['fit  ', 'a hat', 'their', ' army', 'use  ']
+-+-+-+-+-+
|f|i|t| | |
+-+-+-+-+-+
|a| |h|a|t|
+-+-+-+-+-+
|t|h|e|r|e|
+-+-+-+-+-+
| |a|r|e|a|
+-+-+-+-+

In [24]:
# Show how many dfs() calls were made at each depth.
node_tracker

defaultdict(int,
            {0: 1,
             1: 4,
             2: 34,
             3: 7388,
             4: 1617856,
             5: 1434294,
             6: 54818,
             7: 46066,
             8: 4848,
             9: 233,
             10: 100})

In [25]:
# +-+-+-+-+-+
# |h|a|r|s|h|
# +-+-+-+-+-+
# |a|m|a|t|i|
# +-+-+-+-+-+
# |l|i|d|a|r|
# +-+-+-+-+-+
# |a|g|i|l|e|
# +-+-+-+-+-+
# |b|o|x|e|r|
# +-+-+-+-+-+