In [23]:
import dataclasses
import pickle
import random
import numpy as np

from annoy import AnnoyIndex
from tqdm.notebook import tqdm
from scipy.spatial.distance import euclidean, pdist, squareform
import scipy
import scipy.stats as stats
from sklearn.decomposition import PCA

from matplotlib import pyplot as plt
from typing import *

from scipy.spatial.distance import cosine as cos_dist


In [2]:
def read_glove_file() -> Dict[str, List[float]]:
    """
    Yields the next 'size' vectors in a glove file.
    """
    glove_file = '/mnt/Spookley/datasets/glove/glove.6B.50d.txt'
    w_vecs = {}
    with tqdm(total=400000) as pbar:
        with open(glove_file) as fh:
            for line in fh.readlines():
                pbar.update(1)
                toks = line.strip().split()
                word = toks[0]
                # non-words like punctuation marks have entries, but we don't want those
                if not word.isalnum():  
                    continue
                # Some bigrams and trigrams are in the dataset. Skip those.
                try:
                    float(toks[1])
                except ValueError:
                    continue
                # No errors, parse the line.
                vec = [float(s) for s in toks[1:]]
                vec = np.array(vec)
                vec = vec / np.linalg.norm(vec)
                vec = vec.tolist()
                w_vecs[word] = vec
    return w_vecs

In [3]:
def build_index(w_vecs: Dict[str, List[float]]) -> Tuple[Dict[int, str], AnnoyIndex]:
    for v in w_vecs.values():
        GLOVE_VEC_SIZE = len(v)
        break
    idx_to_word = {}
    ann_index = AnnoyIndex(GLOVE_VEC_SIZE, 'euclidean')
    with tqdm(total=len(w_vecs)) as pbar:
        for i, w in enumerate(w_vecs.keys()):
            pbar.update(1)
            ann_index.add_item(i, w_vecs[w])
            idx_to_word[i] = w
    ann_index.build(20) # n trees
    return idx_to_word, ann_index

In [31]:
w_vecs = read_glove_file()
pca = PCA(n_components=20)
mat_full = np.array([w_vecs[w] for w in w_vecs])
mat = pca.fit_transform(mat_full)

print(mat.shape)
for i, w in enumerate(w_vecs.keys()):
    w_vecs[w] = mat[i, :]

  0%|          | 0/400000 [00:00<?, ?it/s]

(336158, 20)


In [32]:

idx_to_word, ann_index = build_index(w_vecs)  # fast


  0%|          | 0/336158 [00:00<?, ?it/s]

In [33]:
print(euclidean(w_vecs['strawberry'], w_vecs['peach']))
print(euclidean(w_vecs['strawberry'], w_vecs['banana']))
print(euclidean(w_vecs['strawberry'], w_vecs['envelope']))

0.5060883330250372
0.6487490476460706
0.9384484231299287


In [34]:
def random_point_in_dist(point, dist):
    # For when we know the dist but have no idea what direction to travel
    vec = np.random.random((len(point)))
    vec = vec / scipy.linalg.norm(vec)
    vec = vec * dist
    return vec+point


def directed_point_in_dist(p1, p2, p1_dist, p2_dist):
    # Generate a vector using p1 and p2.
    # Check if it will point in the general direction of our target.
    p1p2 = (p1-p2)
    p1p2mag = scipy.linalg.norm(p1p2)
    if p1p2mag < 0.00001:
        return None, 0
    p1p2_unit = p1p2 / p1p2mag
    if p1_dist < p2_dist:
        # p1 is closer to target
        mag = p1_dist
        target_point = p1 + p1p2_unit*mag
        confidence = (p2_dist-p1_dist) / p1p2mag
        assert confidence >= 0
    else:
        # j is closer to target
        # make a vector from j to a target that is dists[j] away
        mag = p2_dist
        target_point = p2 - p1p2_unit*mag
        confidence = (p1_dist-p2_dist) / p1p2mag
        assert confidence >= 0
    return target_point, confidence

In [35]:
# test case - target at [0,6], points on y axis
p1 = np.array([0,0])
p2 = np.array([0,2])
target = np.array([0,6])
d1 = euclidean(p1, target)
d2 = euclidean(p2, target)
print('expect [0,6]', directed_point_in_dist(p1, p2, d1, d2))
print('expect [0,6]', directed_point_in_dist(p2, p1, d2, d1))

expect [0,6] (array([0., 6.]), 1.0)
expect [0,6] (array([0., 6.]), 1.0)


In [36]:
# test case - target at [1,3], points on y axis
p1 = np.array([0,0])
p2 = np.array([0,2])
target = np.array([1,3])
d1 = euclidean(p1, target)
d2 = euclidean(p2, target)
print('expect [1,3]', directed_point_in_dist(p1, p2, d1, d2))
print('expect [1,3]', directed_point_in_dist(p2, p1, d2, d1))

expect [1,3] (array([0.        , 3.41421356]), 0.8740320488976422)
expect [1,3] (array([0.        , 3.41421356]), 0.8740320488976422)


In [37]:
# test case - target at [3,1], points on y axis
p1 = np.array([0,0])
p2 = np.array([0,2])
target = np.array([3,1])
d1 = euclidean(p1, target)
d2 = euclidean(p2, target)
print(d1, d2)
print('expect zero confidence', directed_point_in_dist(p1, p2, d1, d2))
print('expect zero confidence', directed_point_in_dist(p2, p1, d2, d1))

3.1622776601683795 3.1622776601683795
expect zero confidence (array([0.        , 5.16227766]), 0.0)
expect zero confidence (array([ 0.        , -3.16227766]), 0.0)


In [41]:
class SemantleGame():
    def __init__(self, w_vecs):
        w_list = list(w_vecs.keys())
        self.target_word = random.choice(w_list[1000:10000])
        self.target_vec = w_vecs[self.target_word]
        
    def guess(self, word, vec) -> Tuple[bool, float]:
        # construct guess
        # dist = euclidean(vec, self.target_vec) lol nope!
        sim_score = 1-cos_dist(vec, self.target_vec)
        dist = (0.04471114)*sim_score**3 + (0.0740919)*sim_score**2 + (-0.74640201)*sim_score + (0.95066707)
        # check if win
        if word == self.target_word:
            return True, dist
        else:
            return False, dist
    
    def display_guesses(self):
        s = []
        for g in sorted(self.guesses, key = lambda g: g.dist):
            s.append(str(g))
        print('\n'.join(s))
        
    def __str__(self):
        return '\n'.join('{}: {}'.format(k, v) for k, v in self.__dict__.items())

In [46]:
@dataclasses.dataclass
class Guess:
    word: str
    num: int
    dist: float
    
class SemantleSolver:
    
    def __init__(self, n_random_guesses=2, game=None):
        self.n_random_guesses = n_random_guesses
        self.closest_dist = float('inf')
        self.guesses = []  # List[Guess]
        self.guessed_words = set()  # for fast lookup
        self.best_guess = None
        self.game = game
        
        self.N_RANDOM = 2
        self.CONF_THRESH = 0.33
        
        self.stats = {
            'grd_high_conf': 0,
            'grd_random_dist': 0,
            'times_gradient': 0,
            'times_exhaustive': 0,
            'times_random': 0,
        }
        
    def _gradient_method(self, w_vecs, ann_index):
        # Use gradient method to get a closer guess.
        p1 = np.array(w_vecs[self.guesses[-1].word])
        p1_dist = self.guesses[-1].dist
        
        # Consider the few most recent points. 
        # Try and find one with a vector through p1 that points towards the target.
        best_point = None
        best_confidence = 0
        for i in range(2, min(10, len(self.guesses))):
            p2 = np.array(w_vecs[self.guesses[-i].word])
            p2_dist = self.guesses[-i].dist
            
            # where does p2->p1 point? and how well aligned is that spot with the target?
            target_point, confidence = directed_point_in_dist(p1, p2, p1_dist, p2_dist)
            if confidence > best_confidence:
                best_confidence = confidence
                best_point = target_point
                
        if best_confidence < self.CONF_THRESH:
            self.stats['grd_random_dist'] += 1
            vec = np.array(w_vecs[self.best_guess])
            best_point = random_point_in_dist(vec, self.closest_dist)
        else:
            self.stats['grd_high_conf'] += 1

        return best_point

    
    def find_next_guess(self, w_vecs, ann_index, idx_to_word) -> bool:
        if len(self.guesses) < self.N_RANDOM:
            self.stats['times_random'] += 1
            next_word = random.choice(list(w_vecs.keys()))
        else:
            self.stats['times_gradient'] += 1
            v = self._gradient_method(w_vecs, ann_index)
            idxs_near_best = ann_index.get_nns_by_vector(v, 1000)
            for idx in idxs_near_best:
                w = idx_to_word[idx]
                if w not in self.guessed_words:
                    next_word = w
                    break
            
        return next_word

    def make_guess(self, word):
        # guess the word
        win, dist = self.game.guess(word, w_vecs[word])
        self.guessed_words.add(word)
        self.guesses.append(Guess(word=word, dist=dist, num=len(self.guesses)+1))
        
        # see if this one's better
        if self.best_guess is None or dist < self.closest_dist:
            #print(word, round(dist, 3))
            self.closest_dist = dist
            self.best_guess = word
        
        if win:
            #print("I win!")
            return True
        else:
            return False
    
    def add_guess(self, guess, score):
        # Adds a guess from an external source. For playing Real Semantle.
        dist = score_to_dist(score)
        self.guessed_words.add(word)
        self.guesses.append(Guess(word=word, dist=dist, num=len(self.guesses)+1))
        if self.best_guess is None or dist < self.closest_dist:
            #print(word, round(dist, 3))
            self.closest_dist = dist
            self.best_guess = word
        

In [43]:
game = SemantleGame(w_vecs)
player = SemantleSolver(game=game)
print(game.target_word)
won = False
while not won:
    word = player.find_next_guess(w_vecs, ann_index, idx_to_word)
    won = player.make_guess(word)
    g = player.guesses[-1]
    print(g.word, round(g.dist, 3))
    if len(player.guesses) > 5000:
        print('stopped. ')
        print('Best guess:', player.best_guess, 'dist:', player.closest_dist)
        break

print(player.stats)

def
frase 0.786
björling 0.896
samp 0.716
sylvain 0.667
wiltord 0.691
francais 0.711
springbok 0.77
habana 0.664
makelele 0.636
zinedine 0.673
drogba 0.636
kalou 0.672
kluivert 0.668
bergkamp 0.667
juninho 0.636
geremi 0.611
essien 0.613
dindane 0.737
foudy 0.688
puyol 0.591
xavi 0.623
figo 0.639
carles 0.604
mendes 0.598
patrice 0.539
cofidis 0.644
reyna 0.603
desailly 0.709
evra 0.619
deschamps 0.654
malouda 0.582
yannick 0.694
flanker 0.668
didier 0.544
millar 0.507
nicky 0.598
winger 0.555
webber 0.581
lilian 0.609
robin 0.482
jamie 0.516
ashley 0.513
anthony 0.531
nick 0.522
shane 0.623
kerr 0.546
sally 0.526
kelly 0.519
cole 0.553
starring 0.531
jeremy 0.54
ryder 0.659
farrell 0.58
sean 0.524
salomon 0.544
shannon 0.484
lance 0.583
kate 0.541
davies 0.494
stars 0.57
carragher 0.593
fleming 0.598
craig 0.583
jessica 0.504
starred 0.532
star 0.534
alongside 0.516
nash 0.561
ashton 0.582
daly 0.55
slater 0.555
beckham 0.591
poulter 0.615
jacobs 0.571
corey 0.63
woosnam 0.644
justin 

In [16]:
# grid search time.
def run_trial(exh, n_rand, conf_thresh, w_vecs, idx_to_word, ann_index):
    # make a game
    
    game = SemantleGame(w_vecs)
    player = SemantleSolver(game=game)
    
    player.EXH_THRESH = exh
    player.N_RANDOM = n_rand
    player.CONF_THRESH = conf_thresh
    
    won = False
    while not won:
        word = player.find_next_guess(w_vecs, ann_index, idx_to_word)
        won = player.make_guess(word)
        if len(player.guesses) > 1000:
            break
    return len(player.guesses)

In [18]:

n_dims = [10, 15, 20]
exh_threshes = [0.001]
n_randoms = [3]
conf_threshes = [0.2, 0.3, 0.4, 0.6, 0.8]
n_trials = 20

params_results = {}

for n_dim in n_dims:
    
    # set up space
    w_vecs = read_glove_file()
    pca = PCA(n_components=n_dim)
    mat_full = np.array([w_vecs[w] for w in w_vecs])
    mat = pca.fit_transform(mat_full)

    for i, w in enumerate(w_vecs.keys()):
        w_vecs[w] = mat[i, :]

    idx_to_word, ann_index = build_index(w_vecs)

    
    for exh in exh_threshes:
        for n_rand in n_randoms:
            for conf_thresh in conf_threshes:
                params = (n_dim, exh, n_rand, conf_thresh)
                for trial in range(n_trials):
                    n_guesses = run_trial(exh, n_rand, conf_thresh, w_vecs, idx_to_word, ann_index)
                    if not params in params_results:
                        params_results[params] = 0
                    params_results[params] += n_guesses
                params_results[params] /= n_trials
                print(params, params_results[params])

print(params_results)

  0%|          | 0/400000 [00:00<?, ?it/s]

  0%|          | 0/336158 [00:00<?, ?it/s]

(10, 0.001, 3, 0.2) 215.85
(10, 0.001, 3, 0.3) 193.4
(10, 0.001, 3, 0.4) 59.8
(10, 0.001, 3, 0.6) 94.4
(10, 0.001, 3, 0.8) 324.15


  0%|          | 0/400000 [00:00<?, ?it/s]

  0%|          | 0/336158 [00:00<?, ?it/s]

(15, 0.001, 3, 0.2) 103.4
(15, 0.001, 3, 0.3) 121.1
(15, 0.001, 3, 0.4) 84.25
(15, 0.001, 3, 0.6) 190.4
(15, 0.001, 3, 0.8) 334.25


  0%|          | 0/400000 [00:00<?, ?it/s]

  0%|          | 0/336158 [00:00<?, ?it/s]

(20, 0.001, 3, 0.2) 151.35
(20, 0.001, 3, 0.3) 185.65
(20, 0.001, 3, 0.4) 124.6
(20, 0.001, 3, 0.6) 220.75
(20, 0.001, 3, 0.8) 322.0
{(10, 0.001, 3, 0.2): 215.85, (10, 0.001, 3, 0.3): 193.4, (10, 0.001, 3, 0.4): 59.8, (10, 0.001, 3, 0.6): 94.4, (10, 0.001, 3, 0.8): 324.15, (15, 0.001, 3, 0.2): 103.4, (15, 0.001, 3, 0.3): 121.1, (15, 0.001, 3, 0.4): 84.25, (15, 0.001, 3, 0.6): 190.4, (15, 0.001, 3, 0.8): 334.25, (20, 0.001, 3, 0.2): 151.35, (20, 0.001, 3, 0.3): 185.65, (20, 0.001, 3, 0.4): 124.6, (20, 0.001, 3, 0.6): 220.75, (20, 0.001, 3, 0.8): 322.0}


In [None]:
print(list(sorted(zip(params_results.items()), key=lambda x: x[1]))[:10])

In [None]:
ls = list(sorted(zip(params_results.items()), key = lambda x: x[0][1]))

In [None]:
for item in ls:
    print(item)

In [45]:
def score_to_dist(score):
    sim_score = score / 100
    return (0.04471114)*sim_score**3 + (0.0740919)*sim_score**2 + (-0.74640201)*sim_score + (0.95066707)

In [56]:
# Now make something that plays Real Semantle
s_play = SemantleSolver()
s_play.add_guess('liar', 16.14)
s_play.add_guess('godzilla', 0.03)
s_play.add_guess('andre', 4.95)
s_play.add_guess('robot', -2.04)
s_play.add_guess('bravo', 4.86)
s_play.add_guess('panic', 5.47)
s_play.add_guess('media', 10.97)
s_play.add_guess('gbr', -2.02)
s_play.add_guess('strange', -6.41)
s_play.add_guess('cast', -6.41)
s_play.find_next_guess(w_vecs, ann_index, idx_to_word)

'perpignan'