# UMAP solver

uses the models generated in youmap.ipynb


In [2]:
import umap
import math
import dataclasses
import pickle
import random
import numpy as np
import os
import time
import multiprocessing
from joblib import Parallel, delayed
import re
from functools import lru_cache
from annoy import AnnoyIndex
from tqdm.notebook import tqdm
from scipy.spatial.distance import euclidean, pdist, squareform
import scipy
import scipy.stats as stats
from sklearn.decomposition import PCA, TruncatedSVD
from scipy.spatial.distance import cosine as cos_dist
from sklearn.preprocessing import normalize, StandardScaler
from gensim import models
from scipy.optimize import curve_fit
from matplotlib import pyplot as plt
from scipy.stats import binned_statistic
from typing import *

In [4]:
DATA_DIR = '/home/theo/repos/semantle/'

In [5]:
def read_goog_file(size=None):
    vec_file = DATA_DIR + 'GoogleNews-vectors-negative300.bin'
    kv = models.KeyedVectors.load_word2vec_format(vec_file, binary=True, limit=size)
    words = kv.index_to_key
    w_vecs = {}
    good_word = re.compile(r'^[a-z]+$')
    for w in words:
        # filter out proper nouns, pictograms, emoji, multi-words, etc. Eliminates 95% of the dataset.   
        if not good_word.match(w):
            continue
        
        w_vecs[w] = kv[w] / np.linalg.norm(kv[w])
    mat_full = np.array([w_vecs[w] for w in w_vecs])
    w_list = list(w_vecs.keys())
    return w_list, normalize(mat_full, axis=1)

w_list, orig_mat = read_goog_file()

In [6]:
# get stuff we need from prior experiments
with open(DATA_DIR+'w_list_155060.pkl', 'rb') as fh:
    w_list = pickle.load(fh)

umap_files = []
for fn in os.listdir(DATA_DIR):
    if fn.startswith('umap_155060'):
        umap_files.append(fn)

@dataclasses.dataclass
class UMAPFile():
    name: str
    rows: int
    dims: int
    nns: int
    curve: list


with open(DATA_DIR+'umap_curves.pkl', 'rb') as fh:
    curves = pickle.load(fh)
for c in curves:
    c.name = DATA_DIR + c.name.split('/')[-1]


In [7]:
# core semantle-solver stuff
def build_index(w_vecs: Dict[str, List[float]]) -> Tuple[Dict[int, str], AnnoyIndex]:
    for v in w_vecs.values():
        GLOVE_VEC_SIZE = len(v)
        break
    idx_to_word = {}
    ann_index = AnnoyIndex(GLOVE_VEC_SIZE, 'euclidean')
    with tqdm(total=len(w_vecs)) as pbar:
        for i, w in enumerate(w_vecs.keys()):
            pbar.update(1)
            ann_index.add_item(i, w_vecs[w])
            idx_to_word[i] = w
    ann_index.build(20) # n trees
    return idx_to_word, ann_index

In [8]:
def random_point_in_dist(point, dist):
    # For when we know the dist but have no idea what direction to travel
    vec = np.random.random((len(point)))
    vec = vec / scipy.linalg.norm(vec)
    vec = vec * dist*0.5
    return vec+point


def directed_point_in_dist(p1, p2, p1_dist, p2_dist):
    # Generate a vector using p1 and p2.
    # Check if it will point in the general direction of our target.
    p1p2 = (p1-p2)
    p1p2mag = scipy.linalg.norm(p1p2)
    if p1p2mag < 0.00001:
        return None, 0
    p1p2_unit = p1p2 / p1p2mag
    if p1_dist < p2_dist:
        # p1 is closer to target
        mag = p1_dist
        target_point = p1 + p1p2_unit*mag
        confidence = (p2_dist-p1_dist) / p1p2mag
        assert confidence >= 0
    else:
        # j is closer to target
        # make a vector from j to a target that is dists[j] away
        mag = p2_dist
        target_point = p2 - p1p2_unit*mag
        confidence = (p1_dist-p2_dist) / p1p2mag
        assert confidence >= 0
    return target_point, confidence

In [9]:
def score_to_dist(score: float, curve: List[float]):
    # convert a [-100..100] semantle score (which is cos sim * 100) to a euclidean distance in reduced space
    c_dist = 1 - (score / 100)
    p3, p2, p1 = curve
    y = p3 / (1 + np.exp(-p2*(c_dist-p1)))
    return y


In [10]:
class SemantleGame():
    def __init__(self, w_list):
        self.orig_mat = orig_mat
        self.w_list = w_list
        self.target_word = random.choice(['banana', 'leaving', 'three', 'fortunate', 'electric', 'ended', 'swim', 
       'retch', 'flute', 'hands', 'trench', 'painful', 'airborne', 'safety', 'give',
       'never', 'ferocious', 'splitter', 'raccoon', 'pickle', 'microphone', 'love',
       'zombie', 'undulate', 'semicircle', 'book', 'doctor', 'arrow', 'fridge', 'merry',
       'crank', 'similar', 'flew', 'knock', 'neighbor', 'cell', 'cloud', 'moon', 'zebra',
       'therefore', 'abrupt', 'rend', 'knife', 'shill', 'dollar', 'spilled', 'carpet',
       'lunatic', 'beach', 'revert', 'future', 'nigh', 'drake', 'winded', 'play', 'freedom'])
        idx = w_list.index(self.target_word)
        self.target_vec = orig_mat[idx, :]
        
    def guess(self, word, orig_mat) -> Tuple[bool, float]:
        # construct guess
        idx = self.w_list.index(word)
        vec = orig_mat[idx, :]
        semantle_score = (1-cos_dist(vec, self.target_vec))*100
        if word == self.target_word:
            return True, semantle_score
        else:
            return False, semantle_score
    
    def display_guesses(self):
        s = []
        for g in sorted(self.guesses, key = lambda g: g.dist):
            s.append(str(g))
        print('\n'.join(s))
        
    def __str__(self):
        return '\n'.join('{}: {}'.format(k, v) for k, v in self.__dict__.items())

In [13]:
@dataclasses.dataclass
class Guess:
    word: str
    num: int
    dist: float
    score: float
    
class SemantleSolver:
    
    def __init__(self, curve=None, n_random_guesses=5, game=None, conf_thresh=0.1):
        self.n_random_guesses = n_random_guesses
        self.closest_dist = float('inf')
        self.guesses = []  # List[Guess]
        self.guessed_words = set()  # for fast lookup
        self.best_guess = None
        self.game = game
        self.curve = curve
        
        self.CONF_THRESH = conf_thresh
        
        self.stats = {
            'grd_high_conf': 0,
            'grd_random_dist': 0,
            'times_gradient': 0,
            'times_exhaustive': 0,
            'times_random': 0,
        }
        
    def _gradient_method(self, w_vecs, ann_index):
        # Use gradient method to get a closer guess.
        p1 = np.array(w_vecs[self.guesses[-1].word])
        p1_dist = self.guesses[-1].dist
        
        # Consider the few most recent points. 
        # Try and find one with a vector through p1 that points towards the target.
        best_point = None
        best_confidence = 0
        best_p2_dist = float('inf')
        for i in range(2, min(10, len(self.guesses))):
            p2 = np.array(w_vecs[self.guesses[-i].word])
            p2_dist = self.guesses[-i].dist
            
            # where does p2->p1 point? and how well aligned is that spot with the target?
            target_point, confidence = directed_point_in_dist(p1, p2, p1_dist, p2_dist)
            if confidence > best_confidence:
                best_confidence = confidence
                best_point = target_point
        if best_confidence < self.CONF_THRESH:
            self.stats['grd_random_dist'] += 1
            vec = np.array(w_vecs[self.best_guess])
            best_point = random_point_in_dist(vec, self.closest_dist)
            #print('grd_rand')
        else:
            self.stats['grd_high_conf'] += 1
            #print('grd_conf')

        return best_point

    
    def find_next_guess(self, w_vecs, ann_index, idx_to_word) -> bool:
        if len(self.guesses) < self.n_random_guesses:
            self.stats['times_random'] += 1
            next_word = random.choice(list(w_vecs.keys()))
        else:
            self.stats['times_gradient'] += 1
            v = self._gradient_method(w_vecs, ann_index)
            idxs_near_best = ann_index.get_nns_by_vector(v, 1000)
            for idx in idxs_near_best:
                w = idx_to_word[idx]
                if w not in self.guessed_words:
                    next_word = w
                    break
            
        return next_word

    def make_guess(self, word, orig_mat):
        # guess the word
        win, score = self.game.guess(word, orig_mat)
        dist = score_to_dist(score, self.curve)
        self.guessed_words.add(word)
        self.guesses.append(Guess(word=word, dist=dist, num=len(self.guesses)+1, score=score))
        
        # see if this one's better
        if self.best_guess is None or dist < self.closest_dist:
            #print(word, round(dist, 3))
            self.closest_dist = dist
            self.best_guess = word
        
        if win:
            #print("I win!")
            return True
        else:
            return False
    
    def add_guess(self, word, score):
        # Adds a guess from an external source. For playing Real Semantle.
        dist = score_to_dist(score, self.curve)
        self.guessed_words.add(word)
        self.guesses.append(Guess(word=word, dist=dist, num=len(self.guesses)+1, score=score))
        if self.best_guess is None or dist < self.closest_dist:
            #print(word, round(dist, 3))
            self.closest_dist = dist
            self.best_guess = word
        

In [14]:
def set_up(src, curves):
    curve = []
    for c in curves:
        if src in c.name:
            curve = c.curve
    assert len(curve)
    mat = np.load(src)

    n_words = mat.shape[0]
    wfile = DATA_DIR + 'w_list_{}.pkl'.format(n_words)
    with open(wfile, 'rb') as fh:
        w_list = pickle.load(fh)
    
    # ugh, fine.
    w_vecs = {}
    for i in range(mat.shape[0]):
        w_vecs[w_list[i]] = mat[i, :]
    idx_to_word, ann_index = build_index(w_vecs)  # fast
    return mat, w_vecs, curve, w_list, idx_to_word, ann_index

In [17]:
ngs = {}
for c in curves:
    if '155' not in c.name:
        # just solving the reduced dataset here.
        continue
    if not len(c.curve):
        continue
    mat, w_vecs, curve, w_list, idx_to_word, ann_index = set_up(c.name, curves)
    ngs[c.name] = []
    for i in range(10):
        game = SemantleGame(w_list)
        player = SemantleSolver(game=game, curve=curve, n_random_guesses=2, conf_thresh=0.6)
        print(game.target_word)
        #print(player.CONF_THRESH)
        won = False
        while not won:
            word = player.find_next_guess(w_vecs, ann_index, idx_to_word)
            won = player.make_guess(word, orig_mat)
            #g = player.guesses[-1]
            #local_dist = score_to_dist(g.score, curve)
            #print(g.word, round(g.score, 3), '->', round(local_dist, 3))
            if len(player.guesses) >= 1000:
                #print('stopped. ')
                #print('Best guess:', player.best_guess, 'dist:', player.closest_dist)
                break

        #print(len(player.guesses))
        ngs[c.name].append(len(player.guesses))
    v = ngs[c.name]
    print(c.name.split('/')[-1])
    print(v)
    print(sum(v)/len(v))
for k, v in ngs.items():
    print(sum(v)/len(v), k.split('/')[-1])

  0%|          | 0/155060 [00:00<?, ?it/s]

airborne
never
never
electric
drake
raccoon
dollar
give
dollar
retch
umap_155060rows_3dims_15nns.npy
[1000, 1000, 1000, 1000, 327, 732, 51, 1000, 182, 1000]
729.2


  0%|          | 0/155060 [00:00<?, ?it/s]

hands
crank
love
give
three
drake
cloud
splitter
zombie
semicircle
umap_155060rows_3dims_50nns.npy
[1000, 1000, 1000, 1000, 258, 1000, 1000, 1000, 1000, 1000]
925.8


  0%|          | 0/155060 [00:00<?, ?it/s]

moon
electric
swim
love
never
play
undulate
give
freedom
nigh
umap_155060rows_3dims_100nns.npy
[1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]
1000.0


  0%|          | 0/155060 [00:00<?, ?it/s]

microphone
safety
arrow
abrupt
undulate
therefore
revert
carpet
neighbor
abrupt
umap_155060rows_3dims_1000nns.npy
[1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]
1000.0


  0%|          | 0/155060 [00:00<?, ?it/s]

swim
ferocious
merry
fridge
fridge
neighbor
moon
lunatic
flew
undulate
umap_155060rows_10dims_15nns.npy
[114, 1000, 1000, 1000, 368, 1000, 1000, 243, 1000, 422]
714.7


  0%|          | 0/155060 [00:00<?, ?it/s]

cloud
zebra
semicircle
pickle
knife
give
ferocious
electric
flute
electric
umap_155060rows_10dims_50nns.npy
[564, 72, 1000, 1000, 1000, 1000, 1000, 381, 184, 533]
673.4


  0%|          | 0/155060 [00:00<?, ?it/s]

rend
future
swim
microphone
abrupt
fridge
cloud
flew
shill
shill
umap_155060rows_10dims_100nns.npy
[1000, 1000, 684, 83, 626, 428, 1000, 469, 1000, 1000]
729.0


  0%|          | 0/155060 [00:00<?, ?it/s]

ended
knife
flew
never
give
knock
zombie
zebra
drake
freedom
umap_155060rows_10dims_1000nns.npy
[1000, 972, 19, 633, 1000, 1000, 734, 368, 1000, 283]
700.9


  0%|          | 0/155060 [00:00<?, ?it/s]

knife
undulate
banana
leaving
shill
flew
retch
lunatic
love
dollar
umap_155060rows_30dims_15nns.npy
[571, 1000, 1000, 1000, 252, 164, 344, 66, 1000, 303]
570.0


  0%|          | 0/155060 [00:00<?, ?it/s]

cloud
airborne
flew
ferocious
spilled
three
flute
flew
freedom
play
umap_155060rows_30dims_50nns.npy
[249, 1000, 304, 747, 1000, 349, 192, 420, 311, 1000]
557.2


  0%|          | 0/155060 [00:00<?, ?it/s]

crank
undulate
winded
rend
doctor
microphone
zebra
zombie
merry
shill
umap_155060rows_30dims_100nns.npy
[1000, 662, 1000, 1000, 176, 277, 856, 133, 1000, 1000]
710.4


  0%|          | 0/155060 [00:00<?, ?it/s]

drake
painful
fortunate
airborne
cloud
fridge
raccoon
fortunate
retch
flew
umap_155060rows_30dims_1000nns.npy
[293, 1000, 50, 1000, 1000, 317, 111, 199, 640, 919]
552.9


  0%|          | 0/155060 [00:00<?, ?it/s]

therefore
crank
swim
electric
retch
lunatic
undulate
airborne
book
knife
umap_155060rows_100dims_15nns.npy
[1000, 1000, 457, 1000, 947, 75, 826, 1000, 445, 1000]
775.0


  0%|          | 0/155060 [00:00<?, ?it/s]

winded
zombie
splitter
lunatic
book
lunatic
shill
safety
raccoon
ferocious
umap_155060rows_100dims_50nns.npy
[1000, 499, 1000, 81, 250, 99, 1000, 1000, 717, 1000]
664.6


  0%|          | 0/155060 [00:00<?, ?it/s]

give
lunatic
raccoon
play
cloud
doctor
revert
spilled
swim
lunatic
umap_155060rows_100dims_100nns.npy
[1000, 159, 245, 1000, 1000, 157, 486, 295, 1000, 19]
536.1


  0%|          | 0/155060 [00:00<?, ?it/s]

moon
book
splitter
love
semicircle
moon
flute
similar
dollar
fridge
umap_155060rows_100dims_1000nns.npy
[1000, 262, 1000, 1000, 1000, 699, 1000, 1000, 127, 1000]
808.8
729.2 umap_155060rows_3dims_15nns.npy
925.8 umap_155060rows_3dims_50nns.npy
1000.0 umap_155060rows_3dims_100nns.npy
1000.0 umap_155060rows_3dims_1000nns.npy
714.7 umap_155060rows_10dims_15nns.npy
673.4 umap_155060rows_10dims_50nns.npy
729.0 umap_155060rows_10dims_100nns.npy
700.9 umap_155060rows_10dims_1000nns.npy
570.0 umap_155060rows_30dims_15nns.npy
557.2 umap_155060rows_30dims_50nns.npy
710.4 umap_155060rows_30dims_100nns.npy
552.9 umap_155060rows_30dims_1000nns.npy
775.0 umap_155060rows_100dims_15nns.npy
664.6 umap_155060rows_100dims_50nns.npy
536.1 umap_155060rows_100dims_100nns.npy
808.8 umap_155060rows_100dims_1000nns.npy


In [None]:
# 10 and 30 dims did fine really. Splitting hairs beyond that.
# todo check stats like grd_conf, does that still happen in 10d?
best_one = 'umap_155060rows_30dims_50nns'

In [38]:
cos_dist(mat[1,:], game.target_vec))*100

0.003477036952972412

In [39]:
game.target_vec

array([5.4185476, 7.583787 , 7.367848 , 2.170635 , 8.2373705, 1.9374956,
       5.6728373, 1.8657322, 3.5900362, 4.136304 , 6.288454 , 6.229613 ,
       3.5685968, 2.6561875, 5.006074 , 4.669414 , 7.9617863, 3.6898565,
       6.2364535, 3.4242167, 3.9359581, 8.128173 , 4.303084 , 7.1651316,
       5.43378  , 8.028357 , 3.6979685, 6.1893086, 3.534047 , 3.0106165],
      dtype=float32)

In [40]:
mat[1,:]

array([5.880084 , 6.6261044, 7.525349 , 3.9074562, 8.158941 , 2.6142683,
       5.774672 , 1.8681632, 3.1104794, 4.045796 , 6.430109 , 5.9629946,
       3.2894933, 3.0455675, 4.7026014, 4.4188223, 7.9808745, 3.7809465,
       6.2673936, 3.4806774, 4.094248 , 8.023301 , 4.4849553, 7.096601 ,
       5.2204165, 8.102913 , 3.6935813, 6.367431 , 3.5238874, 3.1885743],
      dtype=float32)