In [1]:
import nltk
from nltk.corpus import words
import codecs
from unidecode import unidecode
import re
import pandas as pd
import numpy as np

In [2]:
import solver
from solver.prob import *

def printLetterArray(arr):
    print('[', end="")
    for letter in arr:
        print(letter, end=" ")
    print(']')

## Words dataframe

Contains all words in the official solutions. Also contains a vector representing each character as a alphabet-indexed number.

### Columns:
* **Word** : the word in characters
* **0** : first letter`s index in the alphabet
* **1** : second letter`s index in the alphabet
* **2** : third letter`s index in the alphabet
* **3** : fourth letter`s index in the alphabet
* **4** : fifth letter`s index in the alphabet

Alphabet index is calculated as:
```python
index = ord(c.lower()) - ord('a')
```

In [5]:
words = wordVecDataframe()
words 

Unnamed: 0,word,0,1,2,3,4
0,ababa,0,1,0,1,0
1,ababe,0,1,0,1,4
2,abaci,0,1,0,2,8
3,abaca,0,1,0,2,0
4,abace,0,1,0,2,4
...,...,...,...,...,...,...
12581,uteis,20,19,4,8,18
12582,utero,20,19,4,17,14
12583,uvico,20,21,8,2,14
12584,uvido,20,21,8,3,14


## Coded words dataframe

Contains the same words as `words`, but coded in a different way.

Each row represents a word.

Each column corresponds to an alphabet index. i.e. column 1 corresponds to 'b', column 2 corresponds to 'c' and so fourth

Each value is a five-bit number representing the position(s) of the corresponding letter in the corresponding word.

See example

In [6]:
codes = wordCodes(words)
print(f"Codes vec: \n{codes}")
print(f"Shape: {codes.shape}")

Codes vec: 
[[21 10  0 ...  0  0  0]
 [ 5 10  0 ...  0  0  0]
 [ 5  2  8 ...  0  0  0]
 ...
 [ 0  0  8 ...  0  0  0]
 [ 0  0  0 ...  0  0  0]
 [16  0  0 ...  0  0  0]]
Shape: (12586, 26)


### Example

Change `i` to see different words as examples.

Important variables:
* words.word: the word in string format
* v: the 5-dimension vector of alphabet indices representing the word
* c: the 26-dimension vector of codes representing the same word

In [7]:
# to see the index of any word you want:
wanted = "traca"
queried = words.query("`word`==@wanted")
queried

Unnamed: 0,word,0,1,2,3,4
11187,traca,19,17,0,2,0


In [8]:
i = queried.index[0]

w = words.iloc[i]
v = words.iloc[i,1:].to_numpy()
c = codes[i]

print("Word:")
print(w.word)
print("Letters vector:")
print(v)

print("Coded vector:")
print(c)

print("Decoded vector:")
print(decodeWord(c))

Word:
traca
Letters vector:
[19 17 0 2 0]
Coded vector:
[20  0  8  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  1  0  0  0  0
  0  0]
Decoded vector:
traca


### Coded vector explanation

Each letter contained in the word generates a non-zero entry in the coded vector:

In [9]:
print(f"{w.word=}")

# Legend
printLetterArray(alphabet)

# Non-zero entries in C
print(1*(c!=0))

w.word='traca'
[a b c d e f g h i j k l m n o p q r s t u v w x y z ]
[1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 0 0]


Then, we code the positions in which each letter appears as a 5-bit number (little-endian)

In [10]:
# Get unique letters in the word
print("Coded vector:")
print(c)
idxs = setOfLetters(v)

print(f"{w.word=}")
for l in idxs:
    print("---")
    print(f"Coding for letter '{num2leter(l)}':")
    printLetterArray([letter+" " for letter in w.word])

    places = 1*(v==l)
    print(f"{str(places).replace( ' ', '  ')} \t=>\t Place vector")

    bits = 2**np.arange(len(places))
    code = np.sum(places*bits)
    print(f"[{str(bits)[2:-1]}] \t=>\t bits")

    print(f"code=sum(places*bits) \t=>\t {code=}")

Coded vector:
[20  0  8  0  0  0  0  0  0  0  0  0  0  0  0  0  0  2  0  1  0  0  0  0
  0  0]
w.word='traca'
---
Coding for letter 'a':
[t  r  a  c  a  ]
[0  0  1  0  1] 	=>	 Place vector
[1  2  4  8 16] 	=>	 bits
code=sum(places*bits) 	=>	 code=20
---
Coding for letter 'r':
[t  r  a  c  a  ]
[0  1  0  0  0] 	=>	 Place vector
[1  2  4  8 16] 	=>	 bits
code=sum(places*bits) 	=>	 code=2
---
Coding for letter 'c':
[t  r  a  c  a  ]
[0  0  0  1  0] 	=>	 Place vector
[1  2  4  8 16] 	=>	 bits
code=sum(places*bits) 	=>	 code=8
---
Coding for letter 't':
[t  r  a  c  a  ]
[1  0  0  0  0] 	=>	 Place vector
[1  2  4  8 16] 	=>	 bits
code=sum(places*bits) 	=>	 code=1


## Matches

In [152]:
matches = words.copy().drop(list(range(5)),axis=1)
matches

Unnamed: 0,word
0,ababa
1,ababe
2,abaci
3,abaca
4,abace
...,...
12581,uteis
12582,utero
12583,uvico
12584,uvido


In [104]:
def decodeToBits(c):
    return np.asarray([np.unpackbits(i, bitorder="little")[:5] for i in c])

In [163]:
_bitValues = 2**np.arange(5)[::-1]
def get_green_matches(codeword, codeset):
    greens = codeword&codeset
    n = len(codeset)
    bits = (np.unpackbits(greens.flatten(),bitorder="little")
        .reshape( (n, 26, 8) )
        [:,:,:5]
        )
    matches =  np.sum(bits, axis=1)
    return [str(m) for m in matches]
    # return np.sum(_bitValues*matches, axis=1)

    
# green = get_green_matches(c,codes)
matches["green"] = get_green_matches(c,codes)
print(f"{w.word=}")
print("Green Matches:")
matches

w.word='traca'
Green Matches:


Unnamed: 0,word,green
0,ababa,[0 0 1 0 1]
1,ababe,[0 0 1 0 0]
2,abaci,[0 0 1 1 0]
3,abaca,[0 0 1 1 1]
4,abace,[0 0 1 1 0]
...,...,...
12581,uteis,[0 0 0 0 0]
12582,utero,[0 0 0 0 0]
12583,uvico,[0 0 0 1 0]
12584,uvido,[0 0 0 0 0]


In [57]:
import gmpy2
popCountNp = np.vectorize(lambda x:gmpy2.popcount(int(x)))

def get_yellow_matches(codeword, codeset):
    # ~codeset: has a 5-bit mask for each letter, 
    #           with 1s where that letter is not located on the word
    #           (consequently the 5-bit mask=11111 for letters not in the word)
    # ~codeset * codeset!=0: eliminates the erroneous 5-bit masks for letters that are not in the word
    # yellow: has 1s where the letter in the codeword matches a letter in the codeset,
    #         but not in the same position
    yellow = ((~codeset)*(codeset!=0))&codeword

    # Yellow is not as expected
    # Example:
    #    codeword   = traca
    #    codeset[i] = pavos
    #    yellow =     --y-y
    #should be  =     --y--
    # Because the count of letters should be considered. 
    # E.g. --y-y implies there are two 'a's in the match

    # Correct for different count of matches

    # Popcount = number of nonzero bits in each letter
    # if there are more nonzero bits in the match than in the original word,
    # correct it
    wrong = popCountNp(yellow) > popCountNp(codeset)
    while np.any(wrong):
        # Corrections: where `wrong!=0`, create a mask to clear the highest nonzero-bit
        # (highest nonzero bit = "rightmost" position in the word that has that letter match)
        corrections = wrong * (2**np.floor(np.log2(yellow)))
        corrections = corrections.astype(np.uint8)

        # Clear the incorrect bits in `yellow`
        yellow = (yellow & (~corrections))
        # Recompute the `wrong` mask (corrections only clears one bit, there might be more)
        wrong = popCountNp(yellow) > popCountNp(codeset)
        yellow=yellow.astype(np.uint8)

    return yellow

yellow = get_yellow_matches(c, codes)

matches["yellow"] = [decodeWord(g) for g in yellow]

print(f"{w.word=}")
print("Yellow matches:")
matches[["word","yellow"]]

  corrections = wrong * (2**np.floor(np.log2(yellow)))


w.word='traca'
Yellow matches:


Unnamed: 0,word,yellow
0,ababa,-----
1,ababe,----a
2,abaci,----a
3,abaca,-----
4,abace,----a
...,...,...
12581,uteis,t----
12582,utero,tr---
12583,uvico,-----
12584,uvido,-----


In [58]:
print(f"matches for {w.word}:")
matches

matches for traca:


Unnamed: 0,word,green,yellow
0,ababa,--a-a,-----
1,ababe,--a--,----a
2,abaci,--ac-,----a
3,abaca,--aca,-----
4,abace,--ac-,----a
...,...,...,...
12581,uteis,-----,t----
12582,utero,-----,tr---
12583,uvico,---c-,-----
12584,uvido,-----,-----


In [59]:
groups = matches.groupby(["green","yellow"]).count()
groups

Unnamed: 0_level_0,Unnamed: 1_level_0,word
green,yellow,Unnamed: 2_level_1
-----,-----,1931
-----,---c-,598
-----,--a--,1366
-----,--a-a,263
-----,--ac-,368
...,...,...
tr-ca,--a--,3
tra--,----a,21
tra-a,-----,8
trac-,----a,2


In [71]:
def decodeToBits(c):
    return np.asarray([np.unpackbits(i, bitorder="little") for i in c])

In [75]:
words.iloc[0]

word    ababa
0           0
1           1
2           0
3           1
4           0
Name: 0, dtype: object

In [73]:
codes[0]

array([21, 10,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
        0,  0,  0,  0,  0,  0,  0,  0,  0], dtype=uint8)

In [96]:
cods = decodeToBits(codes[0])
let = np.array([num2leter(i) for i in range(26)])
nums = np.array([i+1 for i in range(26)])

print(cods.shape)
print(let.shape)
vec =np.sum(nums[:,np.newaxis]*cods,axis=0)[:5]

(26, 8)
(26,)


'ababa'

In [None]:
def entropy(labels, base=None):
  value,counts = np.unique(labels, return_counts=True)
  norm_counts = counts / counts.sum()
  base = np.e if base is None else base
  return -(norm_counts * np.log(norm_counts)/np.log(base)).sum()

entropies = matches.groupby(["green","yellow"]).apply(entropy)
np.mean(entropies)

In [None]:
def get_matches(c, codes):
  green = c&codes
  has_green =  np.sum(green,axis=1)!=0 
  greens = words.iloc[has_green].copy()
  greens["green"] = green[has_green] #[decodeWord(g) for g in green[has_green]]

  greens=greens.drop(list(range(5)),axis=1)

  cand = ( (~green)&c )
  yellow = ( (~cand)*(cand!=0) )&codes
  yellow = 1*(yellow!=0)
  yellow=yellow.astype(np.uint8)

  has_yellow =  np.sum(yellow,axis=1)!=0 

  yellows = words.iloc[has_yellow].copy()
  yellows["yellow"] = yellow[has_yellow] #[decodeWord(g).replace("-","") for g in yellow[has_yellow]]

  yellows=yellows.drop(list(range(5)),axis=1)

  return pd.merge(greens,yellows,how="outer", on="word").fillna(" ")

In [None]:
words

In [None]:
from tqdm import tqdm

i=1
w = words.iloc[i]
v = words.iloc[i,1:].to_numpy()
c = codes[i]


matches = get_matches(c, codes)
entropies = words[["word"]].copy()

for i in tqdm(entropies.index):
    c = codes[i]
    matches = get_matches(c, codes)
    ent = matches.groupby(["green","yellow"]).apply(entropy)
    entropies["ent"] = np.mean(ent)

entropies

In [None]:
print(entropy([1,2,3]))
print(entropy([1,2,3,4]))
print(entropy([1,2]))

# Try again...

words =

In [None]:
words = wordVecDataframe()
words 

In [None]:
sets = np.array([bagOfLettersVec(words.iloc[i,1:]) for i in range(len(words))])

In [None]:
words

In [None]:
sets