In [None]:
import nltk
from nltk.corpus import words
import codecs
from unidecode import unidecode
import re
import pandas as pd
import numpy as np

In [None]:
import solver
from solver.prob import *

def printLetterArray(arr):
    """Print the items of ``arr`` on a single line, wrapped in square brackets.

    Every item is followed by one space (the last one included), so
    ``['a', 'b']`` is rendered as ``[a b ]``.
    """
    rendered = "".join(str(item) + " " for item in arr)
    print("[" + rendered + "]")

## Words dataframe

Contains all words in the official solutions. Also contains a vector representing each character as an alphabet-indexed number.

### Columns:
* **Word** : the word in characters
* **0** : first letter's index in the alphabet
* **1** : second letter's index in the alphabet
* **2** : third letter's index in the alphabet
* **3** : fourth letter's index in the alphabet
* **4** : fifth letter's index in the alphabet

Alphabet index is calculated as:
```python
index = ord(c.lower()) - ord('a')
```

In [None]:
# Build the words dataframe described above.
# NOTE(review): this rebinds `words`, shadowing the `words` corpus imported
# from `nltk.corpus` in the first cell.
words = wordVecDataframe()
words 

## Coded words dataframe

Contains the same words as `words`, but coded in a different way.

Each row represents a word.

Each column corresponds to an alphabet index, i.e. column 1 corresponds to 'b', column 2 corresponds to 'c', and so forth.

Each value is a five-bit number representing the position(s) of the corresponding letter in the corresponding word.

See the example below.

In [None]:
# Encode every word of `words` as a coded vector (one row per word, see the
# description above), then show the result and its shape.
codes = wordCodes(words)
print(f"Codes vec: \n{codes}")
print(f"Shape: {codes.shape}")

### Example

Change `wanted` (which determines `i`) to see different words as examples.

Important variables:
* w.word: the word in string format
* v: the 5-dimensional vector of alphabet indices representing the word
* c: the 26-dimensional vector of codes representing the same word

In [None]:
# Look up the row (and therefore the index) of any word you want.
# `@wanted` makes `query` read the Python variable `wanted`.
wanted = "traca"
queried = words.query("`word`==@wanted")
queried

In [None]:
# Index label of the first row matching `wanted`.
# NOTE(review): the label is used as a row position below (`iloc`, `codes[i]`),
# which assumes `words` has a default 0..n-1 index — TODO confirm.
i = queried.index[0]

# w: the whole row; v: every column after the first as an array; c: the word's coded vector.
w = words.iloc[i]
v = words.iloc[i,1:].to_numpy()
c = codes[i]

print("Word:")
print(w.word)
print("Letters vector:")
print(v)

print("Coded vector:")
print(c)

# Round-trip check: decode the coded vector again.
print("Decoded vector:")
print(decodeWord(c))

### Coded vector explanation

Each letter contained in the word generates a non-zero entry in the coded vector:

In [None]:
print(f"{w.word=}")

# Legend: the `alphabet` sequence, printed on one line
printLetterArray(alphabet)

# 1 where the coded vector has a non-zero entry, 0 elsewhere
print(1*(c!=0))

Then we encode the positions at which each letter appears as a 5-bit number (little-endian: the first position of the word is the least-significant bit).

In [None]:
# Show the coded vector, then get the unique letters in the word
print("Coded vector:")
print(c)
idxs = setOfLetters(v)

# Rebuild, letter by letter, the code stored for that letter.
print(f"{w.word=}")
for l in idxs:
    print("---")
    print(f"Coding for letter '{num2leter(l)}':")
    printLetterArray([letter+" " for letter in w.word])

    # 0/1 vector: 1 at every position of `v` holding letter index `l`
    places = 1*(v==l)
    print(f"{str(places).replace( ' ', '  ')} \t=>\t Place vector")

    # Weight of each position: 1, 2, 4, ... (position k is worth 2**k)
    bits = 2**np.arange(len(places))
    code = np.sum(places*bits)
    # [2:-1] drops the first two characters and the last one of the array's text form
    print(f"[{str(bits)[2:-1]}] \t=>\t bits")

    print(f"code=sum(places*bits) \t=>\t {code=}")

## Matches

In [None]:
# Bitwise AND of the example word's code with every word's code: given the
# encoding described above, a bit survives only where both words have the
# same letter at the same position.
green = c&codes

# Rows with at least one surviving bit
has_green =  np.sum(green,axis=1)!=0 

greens = words.iloc[has_green].copy()
# Readable form of each row's green code
greens["green"] = [decodeWord(g) for g in green[has_green]]
print(f"{w.word=}")
print("Green Matches:")
# Drop the columns named 0..4 (the per-position letter indices)
greens=greens.drop(list(range(5)),axis=1)
greens.head(n=30)

In [None]:
# Row 0 of `green`: the green-match code against the first row of `codes`
green[0]

In [None]:

# NOTE: I'm not sure about the `green==0` term.
# Wordle does hint for yellow letters when there are repeated letters and a green letter was guessed.
# Letreco also does this.
# Termo... I'm not sure yet. TODO: Check.
# yellow = ((~c)*(c!=0)*(green==0))&codes

# For every letter a word contains (codes!=0), take the positions where that
# word does NOT have it (~codes) and keep those where the example word does (&c).
yellow = ((~codes)*(codes!=0))&c

# Alternative formulation, kept for reference:
# cand = ( (~green)&c )
# yellow = ( (~cand)*(cand!=0) )&codes
# yellow = 1*(yellow!=0)
yellow=yellow.astype(np.uint8)

# Rows with at least one yellow bit
has_yellow =  np.sum(yellow,axis=1)!=0 

yellows = words.iloc[has_yellow].copy()
# yellows["yellow"] = [decodeWord(g).replace("-","") for g in yellow[has_yellow]]
# Keep both the readable form and the raw code row of each yellow match
yellows["yellow"] = [decodeWord(g) for g in yellow[has_yellow]]
yellows["code"] = [g for g in yellow[has_yellow]]

# Drop the columns named 0..4 (the per-position letter indices)
yellows=yellows.drop(list(range(5)),axis=1)

print(f"{w.word=}")
print("Yellow matches:")
# yellows.head(n=40)
# Focus on a single word for the inspection in the following cells
y = yellows.query("`word`=='pecar'")
y

In [None]:
# Raw yellow code row for 'pecar', read straight from `yellows`.
# Deriving it from the previous value of `y` (a dataframe) made this cell fail
# when run a second time, because `y` is rebound to an array here.
y = yellows.query("`word`=='pecar'").code.to_numpy()[0]
y

In [None]:
# Alias the example word's coded vector (`c`, selected through `wanted` above)
# under the word's own name.
traca = c
traca

In [None]:
# Coded vector of 'pecar': find its index in `words`, then take that row of `codes`.
pecar = codes[words.query("`word`=='pecar'").index[0]]
pecar

In [None]:
# Two-column table of n and log2(n) for n = 0..254 (log2(0) evaluates to -inf).
np.vstack((np.arange(255),np.log2(np.arange(255)))).T

In [None]:
import gmpy
# Element-wise count of set bits, via gmpy.popcount on each entry cast to int
popCountNp = np.vectorize(lambda x:gmpy.popcount(int(x)))
# def popCount(x):
#     return np.array([gmpy.popcount(int(e)) for e in x])

# gmpy.popcount(20.0)
print("Non-zero bitcount:")
print(popCountNp(y))
print(popCountNp(pecar))
print("===")
print("log:")
# Value of the highest set bit of each entry; zero entries give log2(0) = -inf,
# and 2**(-inf) = 0.
print(2**np.floor(np.log2(y)))
print(y)
print("===")

# An entry is "wrong" when the yellow code has more bits set for a letter
# than pecar's own code has for that letter.
wrong = popCountNp(y) > popCountNp(pecar)
# Highest set bit of every wrong entry, 0 elsewhere
corrections = wrong * (2**np.floor(np.log2(y)))
corrections = corrections.astype(np.uint8)
print("Yellow is wrong here:")
print(wrong)
print("Subtract this to correct:")
print(corrections)
print("===")

# Clear the flagged bits from the yellow code
yn = (y & (~corrections))
print("Corrected:")
print(yn)
print("new Wrong-ness:")
print(popCountNp(yn) > popCountNp(pecar))
print("New match:")
print(decodeWord(yn.astype(np.uint8)))


In [None]:
print(f"matches for {w.word}:")
# Outer merge on `word` keeps words that appear in only one of the two tables;
# the missing side is filled with " ".
matches = pd.merge(greens,yellows,how="outer", on="word").fillna(" ")
matches

In [None]:
# Per-column counts for every (green, yellow) pattern
matches.groupby(["green","yellow"]).count()

In [None]:
def entropy(labels, base=None):
  """Return the Shannon entropy of the empirical distribution of ``labels``.

  Args:
    labels: array-like of values; each distinct value is one outcome.
    base: logarithm base; ``None`` means the natural logarithm.

  Returns:
    The entropy, in units determined by ``base``.
  """
  _, occurrences = np.unique(labels, return_counts=True)
  probabilities = occurrences / occurrences.sum()
  log_base = np.log(np.e if base is None else base)
  return -(probabilities * np.log(probabilities) / log_base).sum()

# Apply `entropy` to each (green, yellow) group of `matches`, then average the results.
# NOTE(review): `entropy` receives each group's rows, so it measures the values
# inside a group rather than the distribution of group sizes — confirm this is intended.
entropies = matches.groupby(["green","yellow"]).apply(entropy)
np.mean(entropies)

In [None]:
def get_matches(c, codes):
  """Build the table of green/yellow matches between one coded word and all coded words.

  Reads the notebook-level ``words`` dataframe (it is not a parameter) and
  selects its rows with boolean masks computed from ``codes``.

  Args:
    c: coded vector of the reference word.
    codes: array of coded vectors, one row per word.

  Returns:
    The outer merge, on ``word``, of the rows with a non-zero green code
    (column ``green``) and the rows with a non-zero yellow flag (column
    ``yellow``); missing values are filled with ``" "``.
  """
  letter_cols = list(range(5))  # columns named 0..4, dropped from both tables

  # Green: bits shared by the reference word and each word.
  green_bits = c & codes
  green_rows = np.sum(green_bits, axis=1) != 0
  green_df = words.iloc[green_rows].copy()
  green_df["green"] = green_bits[green_rows]
  green_df = green_df.drop(letter_cols, axis=1)

  # Yellow: start from the reference bits not already green, invert them for the
  # letters still present, and flag (0/1) where the other word has a bit there.
  leftover = (~green_bits) & c
  yellow_bits = ((~leftover) * (leftover != 0)) & codes
  yellow_flags = (yellow_bits != 0).astype(np.uint8)
  yellow_rows = np.sum(yellow_flags, axis=1) != 0
  yellow_df = words.iloc[yellow_rows].copy()
  yellow_df["yellow"] = yellow_flags[yellow_rows]
  yellow_df = yellow_df.drop(letter_cols, axis=1)

  return pd.merge(green_df, yellow_df, how="outer", on="word").fillna(" ")

In [None]:
# Display the `words` dataframe that `get_matches` reads
words

In [None]:
from tqdm import tqdm

# Smoke-test `get_matches` on a single word before the full sweep.
i=1
w = words.iloc[i]
v = words.iloc[i,1:].to_numpy()
c = codes[i]
matches = get_matches(c, codes)

# One row per word; the "ent" column is filled in after the sweep.
entropies = words[["word"]].copy()

# Collect one value per word and assign the column once. Assigning
# `entropies["ent"] = ...` inside the loop overwrote the whole column on every
# iteration, leaving every row equal to the last word's value.
word_entropies = []
for i in tqdm(entropies.index):
    # The index label is used as a row position, which relies on a default 0..n-1 index.
    c = codes[i]
    matches = get_matches(c, codes)
    ent = matches.groupby(["green","yellow"]).apply(entropy)
    word_entropies.append(np.mean(ent))

entropies["ent"] = word_entropies

entropies

In [None]:
# Sanity check: with all labels distinct the distribution is uniform, so these
# should print ln(3), ln(4) and ln(2).
print(entropy([1,2,3]))
print(entropy([1,2,3,4]))
print(entropy([1,2]))

# Try again...

words =

In [None]:
# Rebuild the words dataframe from scratch for the second attempt.
words = wordVecDataframe()
words 

In [None]:
# Apply `bagOfLettersVec` to every column after the first of each row and stack the results.
# NOTE(review): presumably a per-word bag-of-letters encoding — confirm against `solver.prob`.
sets = np.array([bagOfLettersVec(words.iloc[i,1:]) for i in range(len(words))])

In [None]:
# Display `words` next to the `sets` array built from it
words

In [None]:
# Display the stacked `bagOfLettersVec` results (one row per word)
sets