In [146]:
import dataclasses
import pickle
import random
import numpy as np

from annoy import AnnoyIndex
from tqdm.notebook import tqdm
from scipy.spatial.distance import euclidean, pdist, squareform
import scipy
import scipy.stats as stats
from sklearn.decomposition import PCA

from matplotlib import pyplot as plt
from typing import *

In [2]:
def read_glove_file(
    glove_file: str = '/mnt/Spookley/datasets/glove/glove.6B.50d.txt',
    show_progress: bool = True,
) -> Dict[str, List[float]]:
    """
    Load word vectors from a GloVe text file into a dict.

    Each line is expected to look like ``<word> <float> <float> ...``.
    A line is skipped when:
      * it is blank or has no vector components,
      * the word is not purely alphanumeric (e.g. punctuation marks),
      * any component fails to parse as a float (this is how bigram and
        trigram entries, whose second token is another word, are dropped).

    Args:
        glove_file: path to the GloVe text file. Defaults to the path that
            was previously hard-coded in this function.
        show_progress: when True (default) a tqdm progress bar is shown.

    Returns:
        Mapping of word -> list of floats, in file order.
    """
    w_vecs = {}
    # The total only scales the progress bar; 400000 is the value that was
    # hard-coded originally (presumably the line count of glove.6B).
    pbar = tqdm(total=400000) if show_progress else None
    try:
        # Explicit encoding so the result does not depend on the platform's
        # locale default.
        with open(glove_file, encoding='utf-8') as fh:
            # Iterate lazily instead of materialising every line up front.
            for line in fh:
                if pbar is not None:
                    pbar.update(1)
                toks = line.split()
                if len(toks) < 2:
                    continue
                word = toks[0]
                # non-words like punctuation marks have entries, but we don't want those
                if not word.isalnum():
                    continue
                # Some bigrams and trigrams are in the dataset; their extra
                # word tokens are not floats, so parsing fails and we skip.
                try:
                    vec = [float(s) for s in toks[1:]]
                except ValueError:
                    continue
                w_vecs[word] = vec
    finally:
        if pbar is not None:
            pbar.close()
    return w_vecs

In [152]:
def build_index(w_vecs: Dict[str, List[float]], n_trees: int = 20) -> Tuple[Dict[int, str], 'AnnoyIndex']:
    """
    Build an Annoy nearest-neighbour index (euclidean metric) over word vectors.

    Items are added in the dict's iteration order, so item id ``i`` in the
    index corresponds to the i-th key of ``w_vecs``.

    Args:
        w_vecs: mapping of word -> vector. The vector length is taken from
            the first entry.
        n_trees: number of trees passed to ``AnnoyIndex.build`` (default 20,
            the value that was previously hard-coded).

    Returns:
        ``(idx_to_word, ann_index)`` where ``idx_to_word`` maps each item id
        back to its word.

    Raises:
        ValueError: if ``w_vecs`` is empty (there is no vector to size the
            index from).
    """
    if not w_vecs:
        raise ValueError('w_vecs is empty; cannot build an index')
    vec_size = len(next(iter(w_vecs.values())))
    idx_to_word = {}
    ann_index = AnnoyIndex(vec_size, 'euclidean')
    with tqdm(total=len(w_vecs)) as pbar:
        for i, (w, vec) in enumerate(w_vecs.items()):
            pbar.update(1)
            ann_index.add_item(i, vec)
            idx_to_word[i] = w
    ann_index.build(n_trees)
    return idx_to_word, ann_index

In [148]:
# Load the GloVe vectors into memory as a word -> vector dict.
# Later cells read (and overwrite) this module-level name.
w_vecs = read_glove_file()

  0%|          | 0/400000 [00:00<?, ?it/s]

In [153]:
# Project every word vector down to 10 principal components.
pca = PCA(n_components=10)
mat_full = np.array(list(w_vecs.values()))
mat = pca.fit_transform(mat_full)

print(mat.shape)

# Replace each word's vector with its reduced row. Rows of `mat` are in the
# same order as the dict's keys, so position i belongs to the i-th word.
# NOTE(review): this overwrites w_vecs in place, so re-running this cell fits
# the PCA on already-reduced vectors; reload w_vecs first if you re-run it.
for i, w in enumerate(w_vecs):
    w_vecs[w] = mat[i]

(336158, 10)


In [154]:

# Build the nearest-neighbour index over the current contents of w_vecs.
# idx_to_word maps the index's integer item ids back to words.
idx_to_word, ann_index = build_index(w_vecs)  # Takes about an hour and a GB of RAM.


  0%|          | 0/336158 [00:00<?, ?it/s]

In [155]:
def random_point_in_dist(point, dist):
    """
    Return a point exactly ``dist`` away from ``point`` in a random direction.

    Used when we know how far away the target is but have no idea which way
    to travel. The direction is drawn from an isotropic Gaussian and then
    normalised, which makes every direction equally likely. (Sampling each
    component from [0, 1) would only ever move in the all-positive direction.)

    Args:
        point: 1-D sequence/array giving the starting position.
        dist: distance to travel from ``point``.

    Returns:
        numpy array with the same length as ``point``.
    """
    dim = len(point)
    if dim == 0:
        # No axes to move along; nothing sensible to randomise.
        return np.asarray(point, dtype=float)
    direction = np.random.normal(size=dim)
    norm = np.linalg.norm(direction)
    # A zero-length draw is vanishingly unlikely, but it would divide by
    # zero below, so redraw until we have a usable direction.
    while norm == 0.0:
        direction = np.random.normal(size=dim)
        norm = np.linalg.norm(direction)
    return direction * (dist / norm) + point


def directed_point_in_dist(p1, p2, p1_dist, p2_dist):
    """
    Estimate the target's position from two points and their target distances.

    The estimate lies on the straight line through ``p1`` and ``p2``: start
    from whichever point is closer to the target and continue, away from the
    farther point, for exactly the closer point's distance.

    Args:
        p1, p2: numpy arrays of equal length.
        p1_dist, p2_dist: distance from ``p1`` / ``p2`` to the target.

    Returns:
        ``(estimate, confidence)``. ``confidence`` is the difference between
        the two distances divided by the separation of the points; it is 0
        when both points are equally far from the target. When the points
        are (nearly) identical no direction exists and ``(None, 0)`` is
        returned.
    """
    separation = p1 - p2
    gap = scipy.linalg.norm(separation)
    if gap < 0.00001:
        return None, 0

    # Unit vector pointing from p2 towards p1.
    direction = separation / gap
    if p1_dist < p2_dist:
        # p1 is the closer point: keep going past p1, away from p2.
        estimate = p1 + direction * p1_dist
        confidence = (p2_dist - p1_dist) / gap
    else:
        # p2 is the closer point (or they tie): keep going past p2, away from p1.
        estimate = p2 - direction * p2_dist
        confidence = (p1_dist - p2_dist) / gap
    assert confidence >= 0
    return estimate, confidence

In [156]:
# test case - target at [0,6], points on y axis
# The target is collinear with p1 and p2, so the estimate should land exactly
# on it with confidence 1, whichever order the two points are passed in.
p1 = np.array([0,0])
p2 = np.array([0,2])
target = np.array([0,6])
d1 = euclidean(p1, target)
d2 = euclidean(p2, target)
print('expect [0,6]', directed_point_in_dist(p1, p2, d1, d2))
print('expect [0,6]', directed_point_in_dist(p2, p1, d2, d1))

expect [0,6] (array([0., 6.]), 1.0)
expect [0,6] (array([0., 6.]), 1.0)


In [132]:
# test case - target at [1,3], points on y axis
# NOTE(review): directed_point_in_dist always places its estimate on the line
# through p1 and p2 (the y axis here), so it cannot return the off-axis target
# [1,3] exactly; expect a point on the y axis and a confidence below 1.
# The saved output for this cell is labelled 'expect [0,3]', so it appears to
# come from an earlier version of the print statements below.
p1 = np.array([0,0])
p2 = np.array([0,2])
target = np.array([1,3])
d1 = euclidean(p1, target)
d2 = euclidean(p2, target)
print('expect [1,3]', directed_point_in_dist(p1, p2, d1, d2))
print('expect [1,3]', directed_point_in_dist(p2, p1, d2, d1))

expect [0,3] (array([0.        , 3.41421356]), 0.8740320488976422)
expect [0,3] (array([0.        , 3.41421356]), 0.8740320488976422)


In [141]:
# test case - target at [3,1], points on y axis
# The target is equally far from p1 and p2, so the pair carries no directional
# information and the confidence should be 0. The returned point differs
# between the two calls: on a tie the function projects from whichever point
# is passed second, away from the one passed first.
p1 = np.array([0,0])
p2 = np.array([0,2])
target = np.array([3,1])
d1 = euclidean(p1, target)
d2 = euclidean(p2, target)
print(d1, d2)
print('expect zero confidence', directed_point_in_dist(p1, p2, d1, d2))
print('expect zero confidence', directed_point_in_dist(p2, p1, d2, d1))

3.1622776601683795 3.1622776601683795
expect zero confidence (array([0.        , 5.16227766]), 0.0)
expect zero confidence (array([ 0.        , -3.16227766]), 0.0)


In [164]:
class SemantleGame:
    """
    A single round of the guessing game: one hidden target word, and a
    ``guess`` method that reports how far a guessed vector is from it.
    """

    def __init__(self, w_vecs):
        """
        Pick a random target word from ``w_vecs`` (word -> vector).

        The target is drawn from entries 1000..9999 of the vocabulary (in
        dict order). If the vocabulary is too small for that slice to
        contain anything, the whole vocabulary is used instead.

        Raises:
            ValueError: if ``w_vecs`` is empty.
        """
        w_list = list(w_vecs.keys())
        if not w_list:
            raise ValueError('w_vecs must contain at least one word')
        candidates = w_list[1000:10000] or w_list
        self.target_word = random.choice(candidates)
        self.target_vec = w_vecs[self.target_word]
        # (word, dist) for every guess received; read by display_guesses().
        self.guesses = []

    def guess(self, word, vec) -> Tuple[bool, float]:
        """
        Score a guess.

        Args:
            word: the guessed word.
            vec: the vector for ``word``.

        Returns:
            ``(won, dist)`` where ``won`` is True when ``word`` is the
            target word and ``dist`` is the euclidean distance between
            ``vec`` and the target's vector.
        """
        dist = euclidean(vec, self.target_vec)
        self.guesses.append((word, dist))
        return word == self.target_word, dist

    def display_guesses(self):
        """Print every guess received so far, closest first, one per line."""
        ranked = sorted(self.guesses, key=lambda g: g[1])
        print('\n'.join('{}: {}'.format(w, d) for w, d in ranked))

    def __str__(self):
        # Dump every instance attribute as "name: value", one per line.
        return '\n'.join('{}: {}'.format(k, v) for k, v in self.__dict__.items())

In [227]:
@dataclasses.dataclass
class Guess:
    """One guess made by the solver and the distance the game reported."""
    word: str    # the guessed word
    num: int     # 1-based position of this guess in the solver's history
    dist: float  # distance returned by the game for this word


class SemantleSolver:
    """
    Plays a SemantleGame using only the distances the game reports.

    Strategy, chosen per guess by ``find_next_guess``:
      1. random words until ``n_random_guesses`` guesses have been made;
      2. while the best distance is above ``EXH_THRESH``: estimate where the
         target is from recent guesses and try the nearest unguessed word;
      3. otherwise: walk the nearest unguessed neighbours of the best guess.
    """

    def __init__(self, game: 'SemantleGame', n_random_guesses=10):
        """
        Args:
            game: object exposing ``guess(word, vec) -> (won, dist)``.
            n_random_guesses: how many purely random guesses to make before
                switching to the guided strategies.
        """
        self.game = game
        self.n_random_guesses = n_random_guesses
        self.closest_dist = float('inf')
        self.guesses = []  # List[Guess]
        self.guessed_words = set()  # for fast lookup
        self.best_guess = None

        self.EXH_THRESH = 0.5
        # Honour the constructor argument (this used to be pinned to 10).
        self.N_RANDOM = n_random_guesses
        self.CONF_THRESH = 0.33

        # Vectors last handed to find_next_guess; lets make_guess look up a
        # word's vector without reaching for a notebook global.
        self._w_vecs = None

        self.stats = {
            'grd_high_conf': 0,
            'grd_random_dist': 0,
            'times_gradient': 0,
            'times_exhaustive': 0,
            'times_random': 0,
        }

    def _gradient_method(self, w_vecs, ann_index):
        """
        Estimate the target's position from the latest guess and the few
        guesses before it. Requires at least one guess to have been made.

        Returns the estimate with the highest confidence, or, when no pair
        reaches ``CONF_THRESH``, a random point at ``closest_dist`` from the
        best guess so far.
        """
        p1 = np.array(w_vecs[self.guesses[-1].word])
        p1_dist = self.guesses[-1].dist

        # Consider the few most recent points.
        # Try and find one with a vector through p1 that points towards the target.
        best_point = None
        best_confidence = 0
        # Pairs the latest guess with guesses[-2] .. guesses[-9]; the +1
        # makes sure the oldest guess is included when the history is shorter.
        for i in range(2, min(10, len(self.guesses) + 1)):
            p2 = np.array(w_vecs[self.guesses[-i].word])
            p2_dist = self.guesses[-i].dist

            # where does p2->p1 point? and how well aligned is that spot with the target?
            target_point, confidence = directed_point_in_dist(p1, p2, p1_dist, p2_dist)
            if confidence > best_confidence:
                best_confidence = confidence
                best_point = target_point

        if best_confidence < self.CONF_THRESH:
            self.stats['grd_random_dist'] += 1
            vec = np.array(w_vecs[self.best_guess])
            best_point = random_point_in_dist(vec, self.closest_dist)
        else:
            self.stats['grd_high_conf'] += 1

        return best_point

    def _nearest_unguessed(self, vec, w_vecs, ann_index, idx_to_word, n_neighbors=1000):
        """
        Return the closest word to ``vec`` that has not been guessed yet.

        Looks at the ``n_neighbors`` nearest index entries first. If all of
        them were already guessed, falls back to a random unguessed word so
        the caller always gets something to try.

        Raises:
            RuntimeError: if every word in ``w_vecs`` has been guessed.
        """
        for idx in ann_index.get_nns_by_vector(vec, n_neighbors):
            candidate = idx_to_word[idx]
            if candidate not in self.guessed_words:
                return candidate
        remaining = [w for w in w_vecs if w not in self.guessed_words]
        if not remaining:
            raise RuntimeError('every word in the vocabulary has already been guessed')
        return random.choice(remaining)

    def find_next_guess(self, w_vecs, ann_index, idx_to_word) -> str:
        """
        Choose the next word to guess.

        Args:
            w_vecs: mapping of word -> vector.
            ann_index: index exposing ``get_nns_by_vector(vec, n)``.
            idx_to_word: mapping of index item id -> word.

        Returns:
            The word to pass to ``make_guess``.
        """
        self._w_vecs = w_vecs
        # The guided strategies need at least one previous guess, so the
        # first guess is always random even if N_RANDOM is 0.
        if len(self.guesses) < max(1, self.N_RANDOM):
            self.stats['times_random'] += 1
            next_word = random.choice(list(w_vecs.keys()))
        elif self.closest_dist > self.EXH_THRESH:
            self.stats['times_gradient'] += 1
            v = self._gradient_method(w_vecs, ann_index)
            next_word = self._nearest_unguessed(v, w_vecs, ann_index, idx_to_word)
        else:
            self.stats['times_exhaustive'] += 1
            # We're close enough to start exhaustive search
            v = w_vecs[self.best_guess]
            next_word = self._nearest_unguessed(v, w_vecs, ann_index, idx_to_word)

        return next_word

    def make_guess(self, word, vectors=None):
        """
        Submit ``word`` to the game and record the outcome.

        Args:
            word: the word to guess.
            vectors: optional word -> vector mapping used to look up
                ``word``. When omitted, the mapping last passed to
                ``find_next_guess`` is used; if there is none, the
                module-level ``w_vecs`` is used as before.

        Returns:
            True if the guess was the target word, otherwise False.
        """
        if vectors is None:
            # Legacy fallback: older callers relied on a global named w_vecs.
            vectors = self._w_vecs if self._w_vecs is not None else w_vecs

        # guess the word
        win, dist = self.game.guess(word, vectors[word])
        self.guessed_words.add(word)
        self.guesses.append(Guess(word=word, dist=dist, num=len(self.guesses)+1))

        # see if this one's better
        if self.best_guess is None or dist < self.closest_dist:
            print(word, round(dist, 3))
            self.closest_dist = dist
            self.best_guess = word

        if win:
            print("I win!")
            return True
        else:
            return False
        

In [229]:
# Play one full game: create a game with a random target, then let the solver
# guess until it wins or exceeds the guess budget.
game = SemantleGame(w_vecs)
player = SemantleSolver(game)
print(game.target_word)  # reveal the answer up front so progress can be judged

won = False
while True:
    word = player.find_next_guess(w_vecs, ann_index, idx_to_word)
    won = player.make_guess(word)
    # Give up once more than 5000 guesses have been made.
    if len(player.guesses) > 5000:
        print('stopped. ')
        print('Best guess:', player.best_guess, 'dist:', player.closest_dist)
        break
    if won:
        break

print(player.stats)

merged
elohim 3.272
extremity 2.887
southwest 2.825
kms 2.583
km 2.56
canton 2.315
quebec 2.224
metro 1.88
shootout 1.832
cycling 1.722
heysel 1.404
subsidiary 1.131
merged 0.0
I win!
{'grd_high_conf': 37, 'grd_random_dist': 11, 'times_gradient': 48, 'times_exhaustive': 0, 'times_random': 10}


In [202]:
# Number of guesses the solver has made so far.
# NOTE(review): the saved output for this cell was produced at execution count
# 202, before the game cell's recorded run (229), whose stats add up to 58
# guesses (10 random + 48 gradient). Re-run after the game cell to refresh it.
len(player.guesses)

35