In [1]:
import random
import os
import re
from gensim import models
from typing import *

  return torch._C._cuda_getDeviceCount() > 0


In [2]:
def read_goog_file(data_dir: str='.', size: Optional[int]=None, clean=True) -> Set[str]:
    """
    Read the vocabulary of the Google News word2vec dataset (3M vectors of length 300).

    Only the words are returned. The vectors are loaded by gensim as part of
    reading the file, but they are not used or returned here.

    data_dir: dir that contains 'GoogleNews-vectors-negative300.bin'.
    'size' and 'clean' will produce fewer rows.
    size: forwarded to gensim as `limit`, so reading stops after that many rows.
        None reads the whole file.
    clean: when True, keep only entries made up entirely of ASCII letters, spaces
        and underscores (drops anything containing digits, punctuation or
        non-ASCII characters). Multi-word entries and capitalised entries are kept.

    Returns the set of kept words, lower-cased. Entries that differ only by case
    collapse into one element.
    """
    # Join directory and file name properly, so data_dir works with or without a
    # trailing separator (plain string concatenation would turn '.' into
    # '.GoogleNews-vectors-negative300.bin').
    vec_file = os.path.join(data_dir, 'GoogleNews-vectors-negative300.bin')
    kv = models.KeyedVectors.load_word2vec_format(vec_file, binary=True, limit=size)

    # Only build the filter when it is actually needed.
    keep = re.compile(r'^[a-zA-Z _]+$').match if clean else None

    # Lower-case after filtering: the pattern accepts both cases.
    return {w.lower() for w in kv.index_to_key if keep is None or keep(w)}



In [3]:
# Directory holding the Google News vectors. The default is the original
# machine-specific location; set the SEMANTLE_DATA_DIR environment variable to
# run the notebook elsewhere without editing this cell.
# Joining with '' guarantees the value ends with a path separator.
goog_data_dir = os.path.join(
    os.environ.get('SEMANTLE_DATA_DIR', '/mnt/Spookley/datasets/semantle/'), ''
)
w_set = read_goog_file(goog_data_dir)

In [4]:
# Letters-only pattern; not used in this cell.
ok = re.compile(r'^[a-zA-Z]+$')

# Walk the vocabulary in sorted order, turn underscores into spaces, trim the
# ends, and keep whatever is still non-empty.
ls = [
    phrase
    for phrase in (raw.replace('_', ' ').strip() for raw in sorted(w_set))
    if phrase
]


In [15]:
# Spot-check: ten entries drawn uniformly at random, with replacement.
random.choices(ls, k=10)

['spokesman coy knobel',
 'kcp mc cadres',
 'phenyx',
 'jac def',
 'graffiti scrawls',
 'lady anna komorowska',
 'madcap analyzer',
 'jawahar tunnel',
 'william rapuzzi',
 'pittsford ny buffalo bills']

In [17]:
# Write one entry per line. The encoding is pinned so the file's bytes do not
# depend on the platform's locale default if non-ASCII words are present.
with open('all_words.txt', 'w', encoding='utf-8') as fh:
    fh.writelines(w + '\n' for w in ls)
