## Game: Just one

The idea is to play the guesser in the popular game Just One. Given many words associated with the word to guess (no synonyms), the guesser must find the most plausible word. We use word embeddings to find the average vector between all provided words and give as an answer the embedding most strongly correlated with this average vector.

In [1]:
import numpy as np
from scipy import spatial
from nltk.stem import PorterStemmer
from nltk.corpus import wordnet

In [2]:
embeddings_dict = {}
with open("glove.840B.300d.txt", 'r', encoding='utf8') as f:
    for i, line in enumerate(f):
        try:
            values = line.split()
            word = values[0]
            vector = np.asarray(values[1:], "float32")
            embeddings_dict[word] = vector
        except:
            print(i)

52343
128261
151102
200668
209833
220779
253461
365745
532048
717302
994818
1123331
1148409
1352110
1499727
1533809
1899841
1921152
2058966
2165246


In [3]:
# some words have embeddings of length 299, identify the bad keys here
bad_keys = []
for word in embeddings_dict.keys():
    if len(embeddings_dict[word])!=300:
        bad_keys.append(word)

In [4]:
# remove the bad keys
for key in bad_keys:
    del embeddings_dict[key]

In [5]:
def find_closest_embeddings(embedding):
    return sorted(embeddings_dict.keys(), key=lambda word: spatial.distance.cosine(embeddings_dict[word], embedding))
#alternative spatial.distance.euclidean

In [69]:
given_words = ['fire', 'amazon', 'brazil', 'wet']

In [70]:
given_embeddings = np.asarray([embeddings_dict[word] for word in given_words])

In [71]:
print(given_embeddings)

[[-0.1392    0.10877   0.42009  ...  0.030956 -0.24704   0.056905]
 [-0.73095   0.45252   0.1357   ... -0.24908  -0.091376  0.077178]
 [-0.067526  0.050342  0.59258  ... -0.59192  -0.15179   0.29805 ]
 [ 0.29371  -0.37684   0.014916 ...  0.12695   0.16432  -0.5478  ]]


In [72]:
given_embeddings_mean = np.mean(given_embeddings, axis=0)

In [73]:
# find the 200 embeddings closest to the given mean
closest_to_mean = find_closest_embeddings(given_embeddings_mean)[:200]

In [74]:
# remove words from result which are identical to the given words
closest_to_mean = [word for word in closest_to_mean if word not in given_words]

In [75]:
# remove words from result which if stemmed are identical to the given words
ps = PorterStemmer()
closest_to_mean = [word for word in closest_to_mean if ps.stem(word) not in given_words]

In [76]:
# remove words from result which are synonyms of the given words
synonyms = []
for gword in given_words:
    for syn in wordnet.synsets(gword):
        for l in syn.lemmas():
            synonyms.append(l.name())

closest_to_mean = [word for word in closest_to_mean if word not in synonyms]

In [77]:
print(closest_to_mean[:200])

['brazilian', 'hot', 'america', 'ass', 'asian', 'rain', 'africa', 'african', 'naked', 'butt', 'pussy', 'blow', 'asia', 'american', 'bush', 'dirty', 'australia', 'fuck', 'latina', 'butts', 'ebony', 'babes', 'fucking', 'japan', 'big', 'burning', 'japanese', 'europe', 'indian', 'suck', 'dick', 'porn', 'australian', 'european', 'bbw', 'forest', 'water', 'latin', 'virgin', 'tits', 'booty', 'uk', 'licking', 'lick', 'british', 'dry', 'girl', 'anal', 'dvd', 'chicago', 'asses', 'babe', 'canada', 'nude', 'cock', 'argentina', 'sucking', 'jungle', 'usa', 'black', 'anderson', 'walmart', 'myspace', 'smoke', 'bare', 'dildo', 'earth', 'lesbian', 'mexican', 'hairy', 'teen', 'shemale', 'wild', 'bikini', 'piss', 'england', 'pissing', 'germany', 'fucked', 'sun', 'sex', 'india', 'sexy', 'thunder', 'lightning', 'xxx', 'brown', 'blonde', 'kiss', 'busty', 'boobs', 'dicks', 'cold', 'blowing', 'london', 'chick', 'sand', 'snow', 'kelly', 'britney', 'bang', 'tgp', 'nasty', 'sky', 'mexico', 'huge', 'hell', 'russia

In [92]:
# result for ['keys', 'music', 'instrument', 'mozart']
print(closest_to_mean[:20])

['piano', 'instruments', 'violin', 'guitar', 'melody', 'keyboard', 'orchestra', 'musical', 'flute', 'sound', 'instrumental', 'tunes', 'cello', 'symphony', 'midi', 'songs', 'saxophone', 'clarinet', 'orchestral', 'melodies']


In [85]:
# result for ['furniture', 'store', 'swedish', 'assemble']
print(closest_to_mean[:20])

['shop', 'furnishings', 'stores', 'decor', 'kitchen', 'wood', 'antique', 'chairs', 'wooden', 'cabinets', 'clothing', 'clothes', 'warehouse', 'decorating', 'furnishing', 'sofa', 'shelves', 'ikea', 'danish', 'wicker']
