In [None]:
import os, requests, shutil

# Directory holding the reduced, ready-to-load vector files (the `expected_file` outputs)
download_dir = './data/RNN/'
# Directory holding the full downloaded archives and the vector files unpacked from them
data_cache = './data/cache'

def _save_response_to_file(response, target_path, require_ok=False, chunk_size=1024*1024):
    """Stream the body of a `requests` response into `target_path`.

    The body is written to `target_path + '.part'` first and only moved into
    place once the transfer has finished, so an interrupted download can never
    be mistaken for a complete file on the next run.

    Args:
        response: a `requests.Response` obtained with `stream=True`.
        target_path: where the complete body should end up.
        require_ok: when True, raise `requests.HTTPError` for 4xx/5xx replies
            instead of saving the error page.
        chunk_size: number of bytes requested per read.
    """
    partial_path = target_path + '.part'
    try:
        if require_ok:
            response.raise_for_status()
        with open(partial_path, 'wb') as out_file:
            # iter_content() (unlike response.raw) undoes any transfer-level
            # Content-Encoding the server applied
            for chunk in response.iter_content(chunk_size=chunk_size):
                out_file.write(chunk)
        os.replace(partial_path, target_path)
    finally:
        response.close()
        if os.path.exists(partial_path):
            os.unlink(partial_path)


def _unpack_vectors(archive_name, cache_dir):
    """Unpack the vector file from a `.zip` / `.gz` archive in `cache_dir`.

    The vector file is expected to be named like the archive minus its
    `.zip` / `.gz` suffix.  Nothing is unpacked if that file already exists.

    Returns:
        Path of the (unpacked) vector file inside `cache_dir`.
    """
    import gzip
    import zipfile

    archive_path = os.path.join(cache_dir, archive_name)
    if archive_name.endswith('.zip'):
        suffix = '.zip'
    elif archive_name.endswith('.gz'):
        suffix = '.gz'
    else:
        suffix = ''  # Not an archive we know : use the downloaded file as-is
    vec_name = archive_name[:len(archive_name)-len(suffix)]
    vec_path = os.path.join(cache_dir, vec_name)

    if suffix and not os.path.isfile(vec_path):
        print('Unpacking "%s" from %s' % (vec_name, suffix))
        partial_path = vec_path + '.part'
        try:
            # Both formats are copied in chunks, so the (multi-GB) vector file
            # is never held in memory in one piece
            if suffix == '.zip':
                with zipfile.ZipFile(archive_path, 'r') as archive:
                    with archive.open(vec_name) as f_in, open(partial_path, 'wb') as f_out:
                        shutil.copyfileobj(f_in, f_out)
            else:
                with gzip.open(archive_path, 'rb') as f_in, open(partial_path, 'wb') as f_out:
                    shutil.copyfileobj(f_in, f_out)
            os.replace(partial_path, vec_path)
        finally:
            if os.path.exists(partial_path):
                os.unlink(partial_path)
        print("Finished unpacking")

    return vec_path


def _reduce_vectors(vec_path, final_path, vocab_size, embedding_dim):
    """Write the first `vocab_size` rows / `embedding_dim` columns of `vec_path` to `final_path`.

    The first line of `vec_path` is treated as a header and dropped; a new
    "<rows> <columns>" header is written in its place.
    """
    print("Reducing vec file to first %d words, %d columns" % (vocab_size, embedding_dim))
    rows = []
    with open(vec_path, 'rt', encoding='utf-8') as in_file:
        print('  First line : "%s"' % (in_file.readline().strip(),))
        # Iterate lazily : only the rows that are kept are ever held in memory
        for i, line in enumerate(in_file):
            if i>=vocab_size: break
            arr = line.strip().split(' ')
            word = arr[0]
            nums = arr[1:embedding_dim+1]
            rows.append("%s %s\n" % (word, ' '.join(nums),))

    partial_path = final_path + '.part'
    try:
        with open(partial_path, 'wt', encoding='utf-8') as out_file:
            # The header carries the number of rows actually written, which is
            # smaller than vocab_size when the source has fewer words.
            # NOTE(review): assumes every source row has at least
            # embedding_dim columns - confirm for new sources
            out_file.write("%d %d\n" % (len(rows), embedding_dim))
            out_file.writelines(rows)
        os.replace(partial_path, final_path)
    finally:
        if os.path.exists(partial_path):
            os.unlink(partial_path)


def ensure_downloaded_and_prepared(expected_file, original_url, zipsize_check_IGNORED, 
                                   vocab_size=100000, embedding_dim=50):
    """Make sure the reduced vector file `expected_file` exists in `download_dir`.

    If the file is missing, a pre-prepared copy is requested from the
    RedCatLabs mirror.  If that fails, the full archive at `original_url` is
    downloaded into `data_cache` (unless already cached), unpacked, and cut
    down to its first `vocab_size` words and `embedding_dim` columns.

    Args:
        expected_file: file name of the reduced vector file, relative to `download_dir`.
        original_url: URL of the full `.zip` / `.gz` archive (the slow fallback source).
        zipsize_check_IGNORED: size of the archive in bytes; accepted for
            documentation purposes but not used.
        vocab_size: maximum number of words to keep.
        embedding_dim: number of leading vector components to keep per word.

    Raises:
        requests.HTTPError: if the fallback download returns an error status.
    """
    final_path = os.path.join(download_dir, expected_file)

    # The mirror lays its files out under the same relative path as the local copy
    download_url= 'http://redcatlabs.com/downloads/deep-learning-workshop/notebooks/'+final_path

    if not os.path.isfile( final_path ):
        os.makedirs(download_dir, exist_ok=True)

        # First, try to download a pre-prepared file directly...
        try:
            response = requests.get(download_url, stream=True)
        except requests.exceptions.RequestException:
            response = None  # Mirror unreachable : fall back to the original source

        if response is not None and response.status_code == requests.codes.ok:
            print("Downloading pre-prepared file from RedCatLabs")
            _save_response_to_file(response, final_path)
        else:
            # But, for some reason, RedCatLabs didn't give us the file directly
            if response is not None:
                response.close()
            os.makedirs(data_cache, exist_ok=True)

            archive_name = original_url[ original_url.rfind('/')+1:]
            archive_path = os.path.join(data_cache, archive_name)

            if not os.path.isfile( archive_path ):
                print("Downloading large file from %s" % (original_url,))
                response = requests.get(original_url, stream=True)
                # require_ok : never cache an HTTP error page as if it were the archive
                _save_response_to_file(response, archive_path, require_ok=True)
                print("Finished Download")

            vec_path = _unpack_vectors(archive_name, data_cache)
            _reduce_vectors(vec_path, final_path, vocab_size, embedding_dim)

            # The archive and the unpacked vector file are left in data_cache
            # so that a re-run does not have to download them again

    print('"%s" available locally' % (expected_file, ))

download_base = 'https://s3-us-west-1.amazonaws.com/fasttext-vectors'

# One entry per language :
#   (reduced file name, archive location below download_base, archive size in bytes)
fasttext_sources = [
    ('wiki-news-300d-1M.vec.50d-100k.txt',  # English
     '/wiki-news-300d-1M.vec.zip',
     682631666),   # 683MB download
    ('cc.zh.300.vec.50d-100k.txt',          # Chinese (Mandarin)
     '/word-vectors-v2/cc.zh.300.vec.gz',
     1358778961),  # 1.36GB download
    ('cc.ms.300.vec.50d-100k.txt',          # Malay
     '/word-vectors-v2/cc.ms.300.vec.gz',
     710958603),   # 711MB download
]

for reduced_name, archive_location, archive_bytes in fasttext_sources:
    ensure_downloaded_and_prepared(reduced_name, download_base+archive_location, archive_bytes)


In [None]:
#! ls -l ./data/cache/
#! rm ./data/cache/wiki-news-300d-1M.vec.zip

In [None]:
# pip install gensim==3.4.0
# NOTE(review): a later cell reads `en_model.vocab`; presumably that attribute is
# tied to the gensim 3.x API, so confirm before moving to a newer gensim release
import gensim
gensim.__version__  # '3.4.0'

In [None]:
# Reduced vector files produced by ensure_downloaded_and_prepared() in ./data/RNN/
en_vecfile = './data/RNN/wiki-news-300d-1M.vec.50d-100k.txt'  # English
xx_vecfile = './data/RNN/cc.zh.300.vec.50d-100k.txt'          # Second language : Chinese (Mandarin)

In [None]:
from gensim.models import KeyedVectors

# Creating the english language model from the vectors stored on disk
# (the file starts with a "<number of words> <number of columns>" header line,
#  as written by the preparation step)
en_model = KeyedVectors.load_word2vec_format(en_vecfile)

# Displayed as the cell output, as a quick check that the load worked
len(en_model.vocab), en_model.vector_size # Vocab size and dim (expect 100k x 50)

In [None]:
# Comma-separated list of the public (non-underscore) attributes and methods of the model
', '.join(name for name in dir(en_model) if name[:1] != '_')

In [None]:
# The query word
find_similar_to = 'dog'

# Print its 10 nearest neighbours; each result is a (word, similarity) pair
for neighbour, similarity in en_model.similar_by_word(find_similar_to, topn=10):
    print("Similarity: %.2f, Word: %s" % (similarity, neighbour))