In [None]:
!pip install annoy
!pip install apache_beam
!pip install tensorflow_hub
!pip install --upgrade --force-reinstall scikit-learn

In [None]:
import annoy
from collections import namedtuple
import numpy as np
import os
import pickle
from sklearn.random_projection import GaussianRandomProjection
import tempfile
import tensorflow as tf
import tensorflow_hub as hub
import tqdm
import sklearn

# Only the last bare expression of a cell is displayed, so both versions are
# combined into one tuple instead of silently dropping the TensorFlow one.
tf.__version__, sklearn.__version__

In [None]:
# Encoder shared by every call to generate_embeddings; loaded on first use.
embed_fn = None
model_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'

def generate_embeddings(text, model_url, random_projection_matrix=None):
    '''Embeds `text` with the TF Hub model and optionally projects the result.

    NOTE: a later cell of this notebook defines `generate_embeddings` again
    with a different return value (the embedding only); once that cell has
    run, it is the later definition that callers such as `build_index` use.

    Args:
        text: input handed unchanged to the loaded model.
        model_url: TF Hub handle; only used when the model is not loaded yet.
        random_projection_matrix: optional object exposing `fit_transform`;
            when given, it is fitted on and applied to the embedding.

    Returns:
        The tuple `(text, embedding)`.
    '''
    # The model is cached in a module-level global so that it is loaded at
    # most once per process, however many times this function is called.
    global embed_fn
    if embed_fn is None:
        embed_fn = hub.load(model_url)
    # Must run on every call, not only on the call that loads the model;
    # otherwise `embedding` is unbound as soon as the model is already cached.
    embedding = embed_fn(text).numpy()
    if random_projection_matrix is not None:
        embedding = random_projection_matrix.fit_transform(embedding)
        print(embedding.size)
    return text, embedding

In [None]:
# Input word list and the file the reduced word list is written to.
vocabulary = 'lowercase_words.txt'
reduced_voc = 'lowercase_reduced_words.txt'
# Suffixes that mark a word as a mere variant of the word preceding it.
extension = ['es','s','d','ed','ment', 'ement','ive','ing','ion','ions','ted',
             'red','ded','ence','rence', 'ly', 'y']
def preprocess_vocabulary():
    '''Writes `reduced_voc` from `vocabulary`, dropping suffixed variants.

    The words are read in file order. A word is kept as a new root unless it
    equals the current root (or the current root minus its last character)
    followed by one of the suffixes in `extension`; such words are skipped.
    Kept words are written one per line.
    '''
    def is_extension(root, word):
        # Both the full root and the root without its final character may
        # carry the suffix.
        stems = (root, root[:-1])
        return any(stem + suffix == word
                   for suffix in extension
                   for stem in stems)

    with open(vocabulary, 'r') as source:
        words = [raw.strip() for raw in source.readlines()]

    with open(reduced_voc, 'w') as target:
        current_root = None
        for word in words:
            if current_root is not None and is_extension(current_root, word):
                continue
            target.write(word + '\n')
            current_root = word

In [None]:
# Build the reduced vocabulary file: reads `vocabulary`, writes `reduced_voc`.
preprocess_vocabulary()

In [None]:
# Sanity check: report how many lines the reduced vocabulary file contains.
with open(reduced_voc, 'r') as voc:
    lines = voc.readlines()
print(f"Number of lines for {reduced_voc}: {len(lines)}")

In [None]:
def generate_random_projection_matrix(projected_dim, random_state=None):
    '''Creates the random projection used to shrink the embeddings.

    Despite the name, the returned object is an unfitted
    `GaussianRandomProjection` transformer, not a matrix.

    Args:
        projected_dim: number of output components of the projection.
        random_state: optional seed forwarded to the transformer so that the
            projection can be made reproducible; `None` keeps the unseeded
            behaviour.

    Returns:
        A `GaussianRandomProjection` with `n_components=projected_dim`.
    '''
    return GaussianRandomProjection(n_components=projected_dim,
                                    random_state=random_state)


def generate_embeddings(text, model_url, random_projection_matrix=None):
    '''Returns the embedding of `text`, optionally randomly projected.

    The encoder is loaded from `model_url` on first use and kept in the
    module-level `embed_fn`. When `random_projection_matrix` is given, it is
    fitted on the embedding, applied to it, and then pickled to the file
    `random_projection_matrix2` in the current directory.
    '''
    global embed_fn
    if embed_fn is None:
        # First call in this process: load the model once and cache it.
        embed_fn = hub.load(model_url)

    vectors = embed_fn(text).numpy()
    if random_projection_matrix is None:
        return vectors

    projector = random_projection_matrix
    print(projector)
    reduced = projector.fit_transform(vectors)
    print("n_components : ",projector.n_components, "n_features_in : ", projector.n_features_in_)
    print('Storing random projection matrix to disk...')
    # Persist the fitted projection object alongside the notebook.
    with open('random_projection_matrix2', 'wb') as handle:
        pickle.dump(projector, handle, protocol=pickle.HIGHEST_PROTOCOL)
    return reduced
    

In [None]:
# Number of components the embeddings are projected down to.
projected_dim = 64
# Read the encoder's output dimension through the shared `embed_fn` cache so
# the model is not loaded from scratch just for this lookup.
if embed_fn is None:
    embed_fn = hub.load(model_url)
original_dim = embed_fn(['']).shape[1]

In [None]:
output_dir = tempfile.mkdtemp()
# Reuse the shared `embed_fn` cache instead of loading the model again.
if embed_fn is None:
    embed_fn = hub.load(model_url)
original_dim = embed_fn(['']).shape[1]
# A falsy `projected_dim` disables the random projection altogether.
random_projection_matrix = (
    generate_random_projection_matrix(projected_dim) if projected_dim else None
)


In [None]:
def build_index(reduced_voc, index_filename, vector_length, metric='angular', num_trees=100):
    '''Builds an ANNOY index over the words of a vocabulary file and saves it.

    Every stripped line of `reduced_voc` is embedded in a single call to
    `generate_embeddings`, using the module-level `model_url` and
    `random_projection_matrix`, and added to the index under its position in
    the file. Two files are written: `index_filename` (the ANNOY index) and
    `index_filename + '.mapping'` (a pickled dict from item id to word).

    Args:
        reduced_voc: path of the text file to read the words from.
        index_filename: path the ANNOY index is saved to.
        vector_length: dimension handed to `annoy.AnnoyIndex`; presumably it
            has to equal the length of every embedding row (TODO confirm).
        metric: metric name forwarded to `annoy.AnnoyIndex`.
        num_trees: number of trees forwarded to `AnnoyIndex.build`.
    '''
    annoy_index = annoy.AnnoyIndex(vector_length, metric=metric)

    # A separate name for the handle keeps the `reduced_voc` path intact.
    with open(reduced_voc, 'r') as voc_file:
        words_list = [line.strip() for line in voc_file.readlines()]

    embeddings = generate_embeddings(words_list, model_url, random_projection_matrix)

    # Mapping between the identifier of an item in the index and its word
    mapping = {}
    for i, embed in enumerate(embeddings):
        mapping[i] = words_list[i]
        annoy_index.add_item(i, embed)
        if i % 10_000 == 0:
            print(f'{i} items loaded to the index')

    print('Building the index with {} trees...'.format(num_trees))
    annoy_index.build(n_trees=num_trees)
    print('Index is successfully built.')

    print('Saving index to disk...')
    annoy_index.save(index_filename)
    print('Index is saved to disk.')
    print("Index file size: {} GB".format(
        round(os.path.getsize(index_filename) / float(1024 ** 3), 5)))
    annoy_index.unload()

    mapping_filename = index_filename + '.mapping'
    print('Saving mapping to disk...')
    with open(mapping_filename, 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Mapping is saved to disk.')
    print("Mapping file size: {} MB".format(
        round(os.path.getsize(mapping_filename) / float(1024 ** 2), 5)))

In [None]:
index_filename = "index2"

!rm {index_filename}
!rm {index_filename}.mapping

%time build_index(reduced_voc, index_filename, projected_dim)
