In [1]:
!pip install annoy
!pip install apache_beam
!pip install tensorflow_hub



In [2]:
import numpy as np
import tensorflow as tf
tf.__version__
import tensorflow_hub as hub
import annoy
from sklearn.random_projection import GaussianRandomProjection
import pickle
from collections import namedtuple
import os
import tqdm

In [3]:
# Encoder handle cached at module level so the model is loaded at most once per process.
embed_fn = None
# Alternative URL for the same model path on a different host:
#model_url = 'https://hub.tensorflow.google.cn/google/universal-sentence-encoder/4'
model_url = 'https://tfhub.dev/google/universal-sentence-encoder/4'

def generate_embeddings(text, model_url, random_projection_matrix=None):
    """Embed a batch of strings and optionally reduce the embedding dimension.

    Args:
        text: batch of strings passed straight to the loaded encoder.
        model_url: location handed to ``hub.load`` the first time the function
            runs in a process.
        random_projection_matrix: optional projector. Either an estimator
            exposing ``fit_transform``/``transform`` (e.g. the imported
            ``GaussianRandomProjection``) or a plain matrix that the
            embeddings are multiplied with via ``.dot``.

    Returns:
        A ``(text, embedding)`` tuple where ``embedding`` is the (possibly
        projected) array produced for ``text``.
    """
    # Beam will run this function in different processes, each of which needs
    # to load the model once; the global caches it for subsequent batches.
    global embed_fn
    if embed_fn is None:
        embed_fn = hub.load(model_url)
    # Must run on every call, not only on the call that loads the model.
    embedding = embed_fn(text).numpy()
    if random_projection_matrix is not None:
        if hasattr(random_projection_matrix, 'fit_transform'):
            if hasattr(random_projection_matrix, 'components_'):
                # Already fitted: reuse the same projection so that all
                # batches land in the same reduced space.
                embedding = random_projection_matrix.transform(embedding)
            else:
                embedding = random_projection_matrix.fit_transform(embedding)
        else:
            # Plain weight matrix of shape (original_dim, projected_dim).
            embedding = embedding.dot(random_projection_matrix)
    return text, embedding

In [31]:
vocabulary = 'lowercase_words.txt'
reduced_voc = 'lowercase_reduced_words.txt'
extension = [
    'es', 's', 'd', 'ed', 'ment', 'ement', 'ive', 'ing', 'ion', 'ions', 'ted',
    'red', 'ded', 'ence', 'rence', 'ly', 'y',
]


def preprocess_vocabulary():
    """Write a reduced copy of the vocabulary file without suffixed variants.

    Reads the stripped lines of ``vocabulary``. Each kept word is written to
    ``reduced_voc``; the run of lines that immediately follow it and that equal
    the kept word (or the kept word minus its last character) plus one of the
    suffixes in ``extension`` is skipped.
    """
    def _derives_from(root, word):
        # A word is a variant when a known suffix is appended to the root
        # itself or to the root with its final character removed.
        stems = (root, root[:-1])
        return any(stem + suffix == word for suffix in extension for stem in stems)

    with open(vocabulary, 'r') as source:
        words = [raw.strip() for raw in source.readlines()]

    total = len(words)
    with open(reduced_voc, 'w') as target:
        cursor = 0
        while cursor < total:
            head = words[cursor]
            following = cursor + 1
            # Skip the consecutive variants of the word just kept.
            while following < total and _derives_from(head, words[following]):
                following += 1
            target.write(head + '\n')
            cursor = following

In [32]:
# Build the reduced vocabulary file: reads `vocabulary` and writes `reduced_voc`.
preprocess_vocabulary()

In [33]:
# Sanity check: print how many lines the reduced vocabulary file contains.
with open(reduced_voc, 'r') as voc : 
    lines = voc.readlines()
    print(len(lines))

40005


In [6]:
# Smoke test on a single word; the first call also loads the model from `model_url`.
generate_embeddings(['elephant'], model_url)



KeyboardInterrupt: 

In [None]:
def to_tf_example(entries):
    """Serialize one batch of texts and embeddings into tf.train.Example records.

    Args:
        entries: a ``(text, embeddings)`` pair as returned by
            ``generate_embeddings``; ``embeddings[i]`` must offer ``.tolist()``
            and belongs to ``text[i]``.

    Returns:
        A list with one serialized ``tf.train.Example`` per text. Each example
        holds a ``'text'`` bytes feature (UTF-8 encoded) and an
        ``'embedding'`` float feature.
    """
    text, embeddings = entries
    examples = []
    for i, word in enumerate(text):
        features = {
            'text': tf.train.Feature(
                bytes_list=tf.train.BytesList(value=[word.encode('utf-8')])),
            # The float_list field must wrap a FloatList; a BytesList rejects
            # float values and does not match the field's type.
            'embedding': tf.train.Feature(
                float_list=tf.train.FloatList(value=embeddings[i].tolist())),
        }
        example = tf.train.Example(features=tf.train.Features(feature=features))
        # Deterministic serialization keeps the map-field ordering stable.
        examples.append(example.SerializeToString(deterministic=True))
    return examples


In [None]:
def generate_random_projection_matrix(projected_dim, original_dim=None, random_state=0):
    """Create a Gaussian random projector, pickle it to disk and return it.

    Args:
        projected_dim: number of output dimensions (``n_components``).
        original_dim: optional input dimensionality. When given, the projector
            is fitted immediately on a dummy row so that its weight matrix
            exists before it is stored and returned.
        random_state: seed forwarded to ``GaussianRandomProjection``. A fixed
            seed makes every (re)fit draw the same weights for the same input
            width; pass ``None`` for non-reproducible weights.

    Returns:
        The ``GaussianRandomProjection`` estimator (fitted only when
        ``original_dim`` was supplied).
    """
    # The estimator itself is returned; it has no `.T` or `.shape` attribute.
    random_projection_matrix = GaussianRandomProjection(
        n_components=projected_dim, random_state=random_state)

    if original_dim is not None:
        # Fitting only needs the number of input features, so a single dummy
        # row is enough. Fitted attributes exist only after this call.
        random_projection_matrix.fit(np.zeros((1, original_dim)))
        print(random_projection_matrix.n_components_, random_projection_matrix.n_features_in_)
        print("A Gaussian random weight matrix was created with shape of {}".format(
            random_projection_matrix.components_.shape))
    else:
        print("A Gaussian random projection to {} dimensions was created; "
              "its weights are drawn on first fit.".format(projected_dim))

    print('Storing random projection matrix to disk...')
    with open('random_projection_matrix', 'wb') as handle:
        pickle.dump(random_projection_matrix, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return random_projection_matrix
    

In [28]:
def run_hub2emb(args):
    """Run the Beam pipeline that turns text lines into embedding TFRecords.

    Args:
        args: dict of pipeline settings. It is forwarded as keyword arguments
            to ``PipelineOptions`` and must contain at least ``runner``,
            ``data_dir``, ``batch_size``, ``model_url``,
            ``random_projection_matrix`` and ``output_dir``.

    The pipeline reads lines from ``data_dir``, groups them into batches of
    exactly ``batch_size`` elements, embeds each batch with
    ``generate_embeddings``, converts the result with ``to_tf_example`` and
    writes ``<output_dir>/emb*.tfrecords``.
    """
    # Imported here because the notebook's import cell does not bring Apache
    # Beam into scope.
    import apache_beam as beam
    from apache_beam.options.pipeline_options import PipelineOptions
    from apache_beam.transforms import util

    options = PipelineOptions(**args)
    # Attribute-style access to the settings without mutating the caller's dict.
    config = namedtuple("options", args.keys())(*args.values())

    # The pipeline runs when the `with` block exits, so the whole expression
    # has to live inside it.
    with beam.Pipeline(config.runner, options=options) as pipeline:
        (
            pipeline
            | 'Read sentences from files' >> beam.io.ReadFromText(
                file_pattern=config.data_dir)
            | 'Batch elements' >> util.BatchElements(
                min_batch_size=config.batch_size, max_batch_size=config.batch_size)
            | 'Generate embeddings' >> beam.Map(
                generate_embeddings, config.model_url, config.random_projection_matrix)
            | 'Encode to tf example' >> beam.FlatMap(to_tf_example)
            | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
                file_path_prefix='{}/emb'.format(config.output_dir),
                file_name_suffix='.tfrecords')
        )

In [None]:
# Target width of the projected embeddings; a falsy value skips the projection downstream.
projected_dim = 64 # Test with None value
# Embed a single empty string and read the second axis of the result to get the encoder's output width.
original_dim = hub.load(model_url)(['']).shape[1]

In [None]:
import tempfile
from datetime import datetime, timezone

# Fresh scratch directory that receives the TFRecord output files.
output_dir = tempfile.mkdtemp()
# `original_dim` is computed in the previous cell; the encoder is not loaded
# a second time here just to re-read the same value.
random_projection_matrix = None

if projected_dim:
    # Call the helper under the name it is defined with in this notebook.
    random_projection_matrix = generate_random_projection_matrix(projected_dim)

args = {
    # Timestamp suffix (UTC) keeps job names unique between runs.
    'job_name': 'hub2emb-{}'.format(datetime.now(timezone.utc).strftime('%y%m%d-%H%M%S')),
    'runner': 'DirectRunner',
    'batch_size': 1024,
    'data_dir': reduced_voc,
    'output_dir': output_dir,
    'model_url': model_url,
    'random_projection_matrix': random_projection_matrix,
}

print("Pipeline args are set.")
args

In [None]:
embed_file = os.path.join(output_dir, 'emb-00000-of-00001.tfrecords')
sample = 5

# When no projection is configured (`projected_dim` is None/0) the stored
# vectors keep the encoder's own width, so fall back to `original_dim`.
embedding_dim = projected_dim or original_dim

# Create a description of the features.
feature_description = {
    'text': tf.io.FixedLenFeature([], tf.string),
    'embedding': tf.io.FixedLenFeature([embedding_dim], tf.float32)
}

def _parse_example(example):
    """Parse one serialized `tf.Example` proto using `feature_description`."""
    return tf.io.parse_single_example(example, feature_description)

# Print the text and the first ten embedding values of the first `sample` records.
dataset = tf.data.TFRecordDataset(embed_file)
for record in dataset.take(sample).map(_parse_example):
    print("{}: {}".format(record['text'].numpy().decode('utf-8'), record['embedding'].numpy()[:10]))

In [31]:
def build_index(embedding_files_pattern, index_filename, vector_length, metric='angular', num_trees=100):
    """Build an Annoy index from embedding TFRecord files and save it to disk.

    Args:
        embedding_files_pattern: glob pattern matching the TFRecord files that
            hold the ``text`` / ``embedding`` examples.
        index_filename: path the Annoy index is saved to; the id-to-text
            mapping is pickled next to it as ``index_filename + '.mapping'``.
        vector_length: dimensionality of the vectors added to the index.
        metric: Annoy distance metric.
        num_trees: number of trees built for the index.

    Raises:
        FileNotFoundError: if no file matches ``embedding_files_pattern``.
    """
    annoy_index = annoy.AnnoyIndex(vector_length, metric=metric)

    # Maps the integer item id used inside the index to its original text.
    mapping = {}

    embed_files = tf.io.gfile.glob(embedding_files_pattern)
    num_files = len(embed_files)
    if num_files == 0:
        raise FileNotFoundError(
            'No embedding files match pattern: {}'.format(embedding_files_pattern))
    print('Found {} embedding file(s).'.format(num_files))

    item_counter = 0
    for i, embed_file in enumerate(embed_files):
        print('Loading embeddings in file {} of {}...'.format(i+1, num_files))
        dataset = tf.data.TFRecordDataset(embed_file)
        # `_parse_example` is the module-level parser defined with the
        # feature description earlier in the notebook.
        for record in dataset.map(_parse_example):
            text = record['text'].numpy().decode("utf-8")
            embedding = record['embedding'].numpy()
            mapping[item_counter] = text
            annoy_index.add_item(item_counter, embedding)
            item_counter += 1
            if item_counter % 100000 == 0:
                print('{} items loaded to the index'.format(item_counter))

    print('A total of {} items added to the index'.format(item_counter))

    print('Building the index with {} trees...'.format(num_trees))
    annoy_index.build(n_trees=num_trees)
    print('Index is successfully built.')

    print('Saving index to disk...')
    annoy_index.save(index_filename)
    print('Index is saved to disk.')
    print("Index file size: {} GB".format(
        round(os.path.getsize(index_filename) / float(1024 ** 3), 2)))
    annoy_index.unload()

    print('Saving mapping to disk...')
    with open(index_filename + '.mapping', 'wb') as handle:
        pickle.dump(mapping, handle, protocol=pickle.HIGHEST_PROTOCOL)
    print('Mapping is saved to disk.')
    print("Mapping file size: {} MB".format(
        round(os.path.getsize(index_filename + '.mapping') / float(1024 ** 2), 2)))