In [None]:
import nltk
from nltk.corpus import words
import codecs
from unidecode import unidecode
import re
import pandas as pd
import numpy as np

In [None]:
import solver
from solver.prob import *

def printLetterArray(arr):
    """Print the items of ``arr`` on a single bracketed line.

    Every item is rendered with ``str`` and followed by one space (the last
    item included), so ``['a', 'b']`` is printed as ``[a b ]``.
    """
    row = "".join(str(letter) + " " for letter in arr)
    print("[" + row + "]")

## Words dataframe

Contains all words in the official solutions. Also contains a vector representing each character as an alphabet-indexed number.

### Columns:
* **Word** : the word in characters
* **0** : first letter's index in the alphabet
* **1** : second letter's index in the alphabet
* **2** : third letter's index in the alphabet
* **3** : fourth letter's index in the alphabet
* **4** : fifth letter's index in the alphabet

Alphabet index is calculated as:
```python
index = ord(c.lower()) - ord('a')
```

In [None]:
# Build the words dataframe (see the column description above) for language "pt".
# NOTE(review): this rebinds `words`, shadowing the `nltk.corpus.words` name
# imported in the first cell.
words = wordVecDataframe(language="pt")
words 

## Coded words dataframe

Contains the same words as `words`, but coded in a different way.

Each row represents a word.

Each column corresponds to an alphabet index, i.e. column 1 corresponds to 'b', column 2 corresponds to 'c', and so forth.

Each value is a five-bit number representing the position(s) of the corresponding letter in the corresponding word.

See example

In [None]:
# Encode every word: all columns of `words` after the first (the per-position
# letter indices) are converted to an integer array and handed to wordCodes.
codes= wordCodes(words.iloc[:,1:].to_numpy().astype(int))
print(f"Codes vec: \n{codes}")
# NOTE(review): expected to be (number of words, 26) per the text above - confirm.
print(f"Shape: {codes.shape}")

### Example

Change `i` to see different words as examples.

Important variables:
* words.word: the word in string format
* v: the 5-dimension vector of alphabet indices representing the word
* c: the 26-dimension vector of codes representing the same word

In [None]:
# to see the index of any word you want:
wanted = "traca"
queried = words.query("`word`==@wanted")
queried

In [None]:
# Index of the first row that matched `wanted`.
# NOTE(review): this is an index *label* but it is used positionally below
# (`.iloc`, `codes[i]`); the two agree only if `words` has a default RangeIndex - confirm.
# Raises IndexError when `queried` is empty (word not in the list).
i = queried.index[0]

w = words.iloc[i]                   # full row for the example word
v = words.iloc[i,1:].to_numpy()     # per-position letter indices of the word
c = codes[i]                        # coded vector of the same word

print("Word:")
print(w.word)
print("Letters vector:")
print(v)

print("Coded vector:")
print(c)

# Round-trip the coded vector through decodeWord as a sanity check.
print("Decoded vector:")
print(decodeWord(c))

### Coded vector explanation

Each letter contained in the word generates a non-zero entry in the coded vector:

In [None]:
print(f"{w.word=}")

# Legend
printLetterArray(alphabet)

# Non-zero entries in C
# (multiplying the boolean mask by 1 prints it as 0/1 instead of False/True)
print(1*(c!=0))

Then, we code the positions in which each letter appears as a 5-bit number (little-endian)

In [None]:
# Get unique letters in the word
print("Coded vector:")
print(c)
idxs = setOfLetters(v)

print(f"{w.word=}")
# Recompute, letter by letter, the code stored in `c` so each step can be inspected.
for l in idxs:
    print("---")
    print(f"Coding for letter '{num2leter(l)}':")
    printLetterArray([letter+" " for letter in w.word])

    # 1 at every position of the word holding letter `l`, 0 elsewhere.
    places = 1*(v==l)
    print(f"{str(places).replace( ' ', '  ')} \t=>\t Place vector")

    # Place value of each position: position k contributes 2**k (little-endian).
    bits = 2**np.arange(len(places))
    code = np.sum(places*bits)
    print(f"[{str(bits)[2:-1]}] \t=>\t bits")

    print(f"code=sum(places*bits) \t=>\t {code=}")

## Matches

In [None]:
# Start the matches table from a copy of `words` with the columns labelled 0-4
# (the per-position letter indices) removed; match columns are added to it later.
matches = words.copy().drop(list(range(5)),axis=1)
matches

## Vectorized functions

In [None]:
def decodeToBits(c):
    """Expand 5-bit position codes into explicit 0/1 position vectors.

    Parameters
    ----------
    c : array_like of uint8-compatible integers
        Position codes; bit ``k`` (little-endian) marks board position ``k``.

    Returns
    -------
    numpy.ndarray of uint8, shape ``c.shape + (5,)``
        ``out[..., k]`` is 1 when bit ``k`` of the corresponding code is set.
        An empty input yields an empty array that still has a trailing axis
        of length 5.
    """
    codes_u8 = np.asarray(c, dtype=np.uint8)
    # Unpack along a new trailing axis so every code is expanded independently
    # in one vectorized call, then keep only the five position bits.
    bits = np.unpackbits(codes_u8[..., np.newaxis], axis=-1, bitorder="little")
    return bits[..., :5]

In [None]:
def match_code(arr):
    """Pack rows of per-position values into one little-endian integer code each.

    Parameters
    ----------
    arr : array_like, shape (n, width)
        Row ``r`` holds one value per board position (normally 0/1 flags).

    Returns
    -------
    numpy.ndarray, shape (n,)
        ``sum(arr[r, k] * 2**k for k in range(width))`` for every row.
    """
    arr = np.asarray(arr)
    # Derive the place values from the input width instead of relying on a
    # module-level constant, so the function works on its own.
    place_values = 2 ** np.arange(arr.shape[1])
    return np.sum(place_values * arr, axis=1)

In [None]:
_bitValues = 2**np.arange(5)
def get_green_matches(codeword, codeset):
    greens = codeword&codeset
    return greens

def codify_matches(codedMatches):
    """Collapse per-letter position masks into one position code per word.

    Parameters
    ----------
    codedMatches : numpy.ndarray of uint8, shape (n_words, n_letters)
        For each word, one 5-bit position mask per alphabet letter.

    Returns
    -------
    numpy.ndarray, shape (n_words,)
        For each word, the per-position hit counts (summed over all letters)
        packed by ``match_code``.
    """
    # Unpack along a new trailing axis so the number of letters is taken from
    # the input rather than hard-coded; keep only the five position bits.
    bits = np.unpackbits(
        codedMatches[..., np.newaxis], axis=-1, bitorder="little"
    )[..., :5]
    # Number of letters that hit each position, per word.
    per_position = np.sum(bits, axis=1)
    return match_code(per_position)

    
# green = get_green_matches(c,codes)
# Green matches of the example word `c` against every coded word.
greens = get_green_matches(c,codes)
# "green": packed numeric code per word; "greenP": decodeWord rendering of the same matches.
matches["green"] = codify_matches(greens)
matches["greenP"] = [decodeWord(g) for g in get_green_matches(c,codes)]
print(f"{w.word=}")
print("Green Matches:")
matches

In [None]:
import gmpy2
# Element-wise population count (number of set bits) built on gmpy2.popcount;
# each element is converted to a Python int first and results come back as uint32.
popCountNp = np.vectorize(lambda x:gmpy2.popcount(int(x)), otypes=[np.uint32])
# popCountNp= lambda x: [gmpy2.popcount(n) for n in x]

def log2impl(x):
    """Element-wise integer ``floor(log2(x))`` over the lowest eight bit positions.

    Parameters
    ----------
    x : array_like of numbers

    Returns
    -------
    numpy.ndarray with the shape and dtype of ``x``
        Index (0-7) of the highest power of two that is <= the element.
        Elements below 1 map to 0 and elements >= 128 map to 7.
    """
    x = np.asarray(x)
    output = np.zeros_like(x)
    # Compare directly instead of testing ``x - threshold >= 0``: with unsigned
    # dtypes the subtraction can wrap around and make every element pass.
    for exponent, threshold in enumerate(2 ** np.arange(8)):
        output[x >= threshold] = exponent
    return output

def bit_count(arr):
    """Element-wise population count (number of set bits) for integer arrays.

    Uses the classic parallel (SWAR) bit-counting scheme, so it works for any
    integer width up to 64 bits without a Python-level loop.

    Parameters
    ----------
    arr : array_like of integers

    Returns
    -------
    numpy.ndarray with the shape and dtype of ``arr``
        Number of 1-bits in each element.
    """
    arr = np.asarray(arr)
    # Make the values type-agnostic (as long as it's integers)
    t = arr.dtype.type
    # Build the width mask as a plain Python int. Converting -1 to an unsigned
    # NumPy scalar, or mixing a 64-bit Python int with a narrow NumPy scalar,
    # raises OverflowError on NumPy 2.
    mask = (1 << (8 * arr.dtype.itemsize)) - 1
    s55 = t(0x5555555555555555 & mask)  # Add more digits for 128bit support
    s33 = t(0x3333333333333333 & mask)
    s0F = t(0x0F0F0F0F0F0F0F0F & mask)
    s01 = t(0x0101010101010101 & mask)

    arr = arr - ((arr >> 1) & s55)               # counts per 2-bit group
    arr = (arr & s33) + ((arr >> 2) & s33)       # counts per 4-bit group
    arr = (arr + (arr >> 4)) & s0F               # counts per byte
    # The multiplication sums all byte counts into the top byte (wrapping is intended).
    return (arr * s01) >> (8 * (arr.itemsize - 1))

# def bit_count(arr):
#     return np.sum(np.unpackbits(arr.astype(np.uint8)).reshape(*arr.shape,8),axis=-1)
def get_yellow_matches(codeword, codeset, greens):
    """Compute the yellow (right letter, wrong position) bits of a guess.

    All arguments hold per-letter 5-bit position masks and are combined with
    normal NumPy broadcasting.

    Parameters
    ----------
    codeword : array_like of uint8-compatible integers
        Position masks of the query word.
    codeset : array_like of uint8-compatible integers
        Position masks of the word(s) the query is compared against.
    greens : array_like of uint8-compatible integers
        Green matches between ``codeword`` and ``codeset`` (their bitwise AND).

    Returns
    -------
    numpy.ndarray of uint8
        Masks with a 1 at every position of the query word that holds a letter
        present elsewhere in the compared word, limited to as many positions
        as that word has non-green occurrences of the letter.
    """
    codeword = np.asarray(codeword, dtype=np.uint8)
    codeset = np.asarray(codeset, dtype=np.uint8)
    greens = np.asarray(greens, dtype=np.uint8)

    # Occurrences of the letter in the compared word that are not already green.
    nongreen = codeset & ~greens

    # ~codeset has 1s wherever the letter is NOT located in the compared word,
    # which is 11111 for letters that do not occur at all; multiplying by
    # (codeset != 0) zeroes those masks. ANDing with the query's own positions
    # leaves "letter occurs in the other word, but not at this position".
    yellow = ((~codeset) * (codeset != 0)) & codeword
    yellow = yellow & ~greens

    # The letter count must be respected. Example:
    #    codeword   = traca
    #    codeset[i] = pavos
    #    raw yellow = --y-y
    #    should be  = --y--
    # because --y-y would imply two 'a's in the compared word. Whenever a mask
    # has more yellow bits than there are non-green occurrences, drop its
    # highest set bit and re-check (one bit is cleared per pass).
    available = bit_count(nongreen)  # loop-invariant
    wrong = bit_count(yellow) > available
    while np.any(wrong):
        # Isolate the highest set bit with integer operations only: smear it
        # into all lower bits, then remove everything below it. Zero stays
        # zero, so no log2(0) warnings and no float temporaries.
        smear = yellow | (yellow >> 1)
        smear = smear | (smear >> 2)
        smear = smear | (smear >> 4)
        highestbits = smear ^ (smear >> 1)

        yellow = yellow & ~(highestbits * wrong)
        wrong = bit_count(yellow) > available

    return yellow


# Yellow matches of the example word `c` against every coded word, given the
# green matches computed earlier.
yellows = get_yellow_matches(c, codes, greens)
# "yellow": packed numeric code per word; "yellowP": decodeWord rendering of the same matches.
matches["yellow"] = codify_matches(yellows)
matches["yellowP"] = [decodeWord(y) for y in yellows]

print(f"{w.word=}")
print("Yellow matches:")
matches[["word","yellow","yellowP"]]

In [None]:
# Side-by-side view of the rendered green and yellow patterns for the example word.
print(w.word)
matches[[ "word", "greenP","yellowP" ]]

In [None]:
# Spot-check a single row of the matches table.
# NOTE(review): the result is empty if 'crane' is not in the loaded word list - confirm.
print(w.word)
matches.query("word=='crane'")

### Results

In [None]:
print(f"matches for {w.word}:")
show= matches.loc[:,["word", "greenP","yellowP"]]
# Keep only the rows whose rendered yellow pattern is not the all-dash string.
show.query("yellowP!='-----'")

In [None]:
# Number of words that share each (green, yellow) code pair for the example word.
groups = matches.groupby(["green","yellow"])["word"].count()
groups

## Applying vectorized functions

You can use a new axis to do the full NxNx26 match codes, but you'll likely run out of memory (~32GB).

So here we do the vectorized ops one letter at a time, then coalesce all results into an NxN matrix of 5-bit 'binary matches' codes (conceptually NxNx5 bits), one for greens and one for yellows.

Likely you can do this 2 or 3 letters at a time without running out of memory, but my machine only has 6GB.

In [None]:
import tqdm

### Green matches

Green matches for ALL combinations of words. Rows = query, columns = matches

In [None]:
n, let = words.iloc[:,1:].shape

# Full matrix: rows = query word, columns = compared word, values = 5-bit green masks.
greens = np.zeros((n,n), dtype=np.uint8)

# For each letter.
# The loop variable is deliberately not `i`: `i` holds the index of the example
# word chosen earlier in the notebook and must not be overwritten here.
for letter_idx in tqdm.tqdm(range(codes.shape[1])):
    letter_codes = codes[:, letter_idx]

    # Check matches between all words and all other words for that letter
    out = get_green_matches(letter_codes[:, np.newaxis], letter_codes[np.newaxis, :])

    # Set bits in `greens` in positions that have matches. A position holds
    # exactly one letter, so per-letter masks never overlap and OR-ing them is
    # the same as adding them.
    greens |= out


### Yellow matches

Yellow matches for ALL combinations of words. Rows = query, columns = matches

In [None]:
n, let = words.iloc[:,1:].shape

# Full matrix: rows = query word, columns = compared word, values = 5-bit yellow masks.
yellows = np.zeros((n,n), dtype=np.uint8)

# The loop variable is deliberately not `i`: `i` holds the index of the example
# word chosen earlier in the notebook and must not be overwritten here.
for letter_idx in tqdm.tqdm(range(codes.shape[1])):
    letter_codes = codes[:, letter_idx]
    query = letter_codes[:, np.newaxis]
    candidates = letter_codes[np.newaxis, :]

    # get_yellow_matches needs the green matches of *this letter* for every
    # word pair. A column slice of the accumulated `greens` matrix is not that
    # (it mixes all letters and has the wrong shape), so recompute them here.
    letter_greens = get_green_matches(query, candidates)

    out = get_yellow_matches(query, candidates, letter_greens)
    # Per-letter masks never overlap, so OR-ing equals adding.
    yellows |= out


In [None]:

n, let = words.iloc[:,1:].shape

# Full matrices: rows = query word, columns = compared word.
greens = np.zeros((n,n), dtype=np.uint8)
yellows = np.zeros((n,n), dtype=np.uint8)

# For each letter.
# The loop variable is deliberately not `i`: `i` holds the index of the example
# word chosen earlier in the notebook and must not be overwritten here.
for letter_idx in tqdm.tqdm(range(codes.shape[1])):
    letter_codes = codes[:, letter_idx]
    query = letter_codes[:, np.newaxis]
    candidates = letter_codes[np.newaxis, :]

    # Check matches between all words and all other words for that letter
    partial_greens = get_green_matches(query, candidates)
    # Set bits in `greens` in positions that have matches (per-letter masks
    # never overlap, so OR-ing equals adding).
    greens |= partial_greens

    # Yellows for the same letter, using that letter's own green matches.
    partial_yellows = get_yellow_matches(query, candidates, partial_greens)
    yellows |= partial_yellows


In [None]:
# Display the yellow-match matrix built in the previous cell.
yellows

In [None]:
# Both matrices are allocated as (n, n), so the two shapes should be identical.
print(yellows.shape)
print(greens.shape)

### Check answers

In [None]:
# Show the current value of `i` before it is reset to the example word's index below.
i

In [None]:
# Re-select the example word, then compare its row in the full matrices with
# the single-word results stored in `matches`; each line prints True when they agree.
i = queried.index[0]
print(np.all(yellows[i]==matches.yellow))
print(np.all(greens[i]==matches.green))

In [None]:
# Coded vector of the example word.
codes[i]

In [None]:
# Positions where the matrix row disagrees with the single-word yellow codes.
yellowWrongs = np.where(yellows[i]!=matches.yellow)
dfYWrong = matches.iloc[yellowWrongs].copy()

# Attach the disagreeing values taken from the matrix row.
dfYWrong["yellowVec"] = yellows[i][yellowWrongs]

print(f"{w.word=}")
# Render both codes as binary strings so the differing bits are easy to spot.
dfYWrong[["yellow","yellowVec"]] = dfYWrong[["yellow","yellowVec"]].apply(lambda x: x.apply(lambda y: bin(int(y)) ))
dfYWrong[["word","yellow","yellowVec","greenP"]]

### Save matches

In [None]:
# Store both match matrices and the first column of `words` in one archive.
# np.savez appends ".npz" to the file name when it is missing.
file = "matches"
np.savez(file, greens=greens, yellows=yellows, words=words.iloc[:,0].to_numpy())

In [None]:
# Reload the archive and confirm it round-trips.
# allow_pickle=True is kept from the original call; the archive is one this
# notebook has just written. Do not load untrusted files this way.
read = np.load(file+".npz", allow_pickle=True)
print(np.all(read["greens"] == greens))
# This comparison used to be split across two cells, which left both cells
# syntactically invalid; it is now a single complete statement.
print(np.all(read["yellows"] == yellows))
read["words"]

## Test