In [1]:
import os
import sys
import annoy
import pickle
import tempfile

import numpy as np
import tensorflow as tf
import apache_beam as beam
import tensorflow_hub as hub

from datetime import datetime
from collections import namedtuple
from apache_beam.transforms import util
from sklearn.random_projection import gaussian_random_matrix

In [2]:
# Report the versions of the core libraries this notebook runs against.
version_report = [
    'TF version: {}'.format(tf.__version__),
    'TF-Hub version: {}'.format(hub.__version__),
    'Apache Beam version: {}'.format(beam.__version__),
]
print('\n'.join(version_report))

TF version: 2.1.0
TF-Hub version: 0.8.0
Apache Beam version: 2.20.0


In [3]:
# NOTE(review): `load_model` is not referenced by any cell visible in this
# notebook -- confirm whether it is still needed.
from tensorflow.keras.models import load_model 
from tensorflow.compat.v1 import ConfigProto
from tensorflow.compat.v1 import InteractiveSession

# Build a TF1-compat session config with GPU `allow_growth` switched on
# (presumably so GPU memory is claimed on demand rather than reserved up
# front -- confirm against the TensorFlow GPU guide).
config = ConfigProto()
config.gpu_options.allow_growth = True
# Create an InteractiveSession that uses this config.
session = InteractiveSession(config = config)

The A Million News Headlines dataset contains news headlines published over a period of 15 years, sourced from the reputable Australian Broadcasting Corp. (ABC). It provides a summarised historical record of noteworthy events around the globe from early 2003 to the end of 2017, with a more granular focus on Australia.

Format: Tab-separated two-column data: 1) publication date and 2) headline text. We are only interested in the headline text.

In [4]:
# Preview the first lines of the raw tab-separated file.
!head raw.tsv

publish_date	headline_text
20030219	"aba decides against community broadcasting licence"
20030219	"act fire witnesses must be aware of defamation"
20030219	"a g calls for infrastructure protection summit"
20030219	"air nz staff in aust strike for pay rise"
20030219	"air nz strike to affect australian travellers"
20030219	"ambitious olsson wins triple jump"
20030219	"antic delighted with record breaking barca"
20030219	"aussie qualifier stosur wastes four memphis match"
20030219	"aust addresses un security council over iraq"


In [5]:
# We only keep the headline

!rm -r corpus
!mkdir corpus

with open('corpus/text.txt', 'w') as out_file:
  with open('raw.tsv', 'r') as in_file:
    for line in in_file:
      headline = line.split('\t')[1].strip().strip('"')
      out_file.write(headline+"\n")

In [6]:
# Inspect the last lines of the extracted headline corpus.
!tail corpus/text.txt

severe storms forecast for nye in south east queensland
snake catcher pleads for people not to kill reptiles
south australia prepares for party to welcome new year
strikers cool off the heat with big win in adelaide
stunning images from the sydney to hobart yacht
the ashes smiths warners near miss liven up boxing day test
timelapse: brisbanes new year fireworks
what 2017 meant to the kids of australia
what the papodopoulos meeting may mean for ausus
who is george papadopoulos the former trump campaign aide


In [7]:
# Embedding extraction method

# Module-level slot holding the loaded TF-Hub module (filled in lazily).
embed_fn = None

def generate_embeddings(text, module_url, random_projection_matrix = None):
  '''Embeds `text` with the TF-Hub module, optionally projecting the result.

  Args:
    text: input handed straight to the loaded module.
    module_url: handle passed to `hub.load` the first time the module is needed.
    random_projection_matrix: when not None, the embedding is multiplied by it.

  Returns:
    A `(text, embedding)` tuple; `text` is returned unchanged.
  '''
  global embed_fn
  # Beam runs this function in different processes; each one has to load the
  # module for itself, but only if it has not been loaded there already.
  if embed_fn is None:
    embed_fn = hub.load(module_url)

  vectors = embed_fn(text).numpy()
  if random_projection_matrix is None:
    return text, vectors
  return text, vectors.dot(random_projection_matrix)

In [8]:
def to_tf_example(entries):
  '''Serializes a batch of texts and embeddings into tf.train.Example strings.

  Args:
    entries: a pair `(text_list, embedding_list)`; item i of each list
      describes the same record.

  Returns:
    A list with one serialized `tf.train.Example` per text. Each example has
    a 'text' bytes feature (the UTF-8 encoded text) and an 'embedding' float
    feature.
  '''
  text_list, embedding_list = entries

  def _serialize(text, embedding):
    # Build the two features, wrap them in an Example and serialize it.
    text_feature = tf.train.Feature(
        bytes_list = tf.train.BytesList(value = [text.encode('utf-8')]))
    embedding_feature = tf.train.Feature(
        float_list = tf.train.FloatList(value = embedding.tolist()))
    record = tf.train.Example(
        features = tf.train.Features(
            feature = {'text': text_feature, 'embedding': embedding_feature}))
    return record.SerializeToString(deterministic = True)

  return [
      _serialize(text, embedding_list[position])
      for position, text in enumerate(text_list)
  ]

In [9]:
# Beam Pipeline

def run_hub2emb(args):
  '''Builds and runs the Beam pipeline that turns text lines into TFRecords.

  Args:
    args: dict of settings. The whole dict is forwarded as keyword arguments
      to Beam's PipelineOptions; the pipeline itself reads `runner`,
      `data_dir`, `batch_size`, `module_url`, `random_projection_matrix`
      and `output_dir` from it.
  '''
  pipeline_options = beam.options.pipeline_options.PipelineOptions(**args)
  # Attribute-style view of the settings dict.
  settings = namedtuple("options", args.keys())(*args.values())

  with beam.Pipeline(settings.runner, options = pipeline_options) as pipeline:
    sentences = pipeline | 'Read sentences from files' >> beam.io.ReadFromText(
        file_pattern = settings.data_dir)
    # min == max requests batches of exactly `batch_size` elements.
    batches = sentences | 'Batch elements' >> util.BatchElements(
        min_batch_size = settings.batch_size,
        max_batch_size = settings.batch_size)
    embedded = batches | 'Generate embeddings' >> beam.Map(
        generate_embeddings,
        settings.module_url,
        settings.random_projection_matrix)
    # Each batch expands into one serialized example per sentence.
    examples = embedded | 'Encode to tf example' >> beam.FlatMap(to_tf_example)
    _ = examples | 'Write to TFRecords files' >> beam.io.WriteToTFRecord(
        file_path_prefix = '{}/emb'.format(settings.output_dir),
        file_name_suffix = '.tfrecords')

Random projection is a simple yet powerful technique used to reduce the dimensionality of a set of points that lie in Euclidean space. For the theoretical background, see the Johnson-Lindenstrauss lemma.

Reducing the dimensionality of the embeddings with random projection means less time needed to build and query the ANN index.

In this tutorial we use Gaussian Random Projection from the Scikit-learn library.

In [10]:
def generate_random_projection_weights(original_dim, projected_dim,
                                       random_state = None):
  '''Creates a Gaussian random projection matrix and pickles it to disk.

  Args:
    original_dim: dimensionality of the embeddings to be projected.
    projected_dim: dimensionality after projection.
    random_state: optional seed forwarded to scikit-learn. Pass a fixed value
      to obtain the same matrix on every run; the default (None) keeps the
      previous unseeded behaviour.

  Returns:
    The projection matrix of shape `(original_dim, projected_dim)`, so that
    `embeddings.dot(matrix)` yields the projected embeddings.
  '''
  # scikit-learn returns (n_components, n_features); transpose so the matrix
  # can right-multiply row-vector embeddings.
  random_projection_matrix = gaussian_random_matrix(
      n_components = projected_dim, n_features = original_dim,
      random_state = random_state).T
  print("A Gaussian random weight matrix was created with shape of {}".format(
      random_projection_matrix.shape))
  print('Storing random projection matrix to disk...')
  # Persisted so query-time code can apply the identical projection.
  with open('random_projection_matrix', 'wb') as handle:
    pickle.dump(random_projection_matrix,
                handle, protocol = pickle.HIGHEST_PROTOCOL)

  return random_projection_matrix

In [11]:
# TF-Hub handle of the text embedding module used throughout the notebook.
module_url = 'https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1' 
# Target dimensionality for the random projection. The cell that builds the
# pipeline args only creates a projection matrix when this value is truthy.
projected_dim = 64

In [12]:
# Fresh temporary directory that will receive the generated TFRecord files.
output_dir = tempfile.mkdtemp()
# Embed a one-element dummy batch to read the module's output dimensionality.
original_dim = hub.load(module_url)(['']).shape[1]

# The projection is optional: without a target dimension no matrix is created.
random_projection_matrix = (
    generate_random_projection_weights(original_dim, projected_dim)
    if projected_dim else None)

# Settings consumed by run_hub2emb (and forwarded to Beam's PipelineOptions).
args = dict(
    job_name = 'hub2emb-{}'.format(datetime.utcnow().strftime('%y%m%d-%H%M%S')),
    runner = 'DirectRunner',
    batch_size = 1024,
    data_dir = 'corpus/*.txt',
    output_dir = output_dir,
    module_url = module_url,
    random_projection_matrix = random_projection_matrix,
)

print("Pipeline args are set.")
args

A Gaussian random weight matrix was creates with shape of (128, 64)
Storing random projection matrix to disk...
Pipeline args are set.




{'job_name': 'hub2emb-200505-040704',
 'runner': 'DirectRunner',
 'batch_size': 1024,
 'data_dir': 'corpus/*.txt',
 'output_dir': '/tmp/tmp6o0hfzko',
 'module_url': 'https://tfhub.dev/google/tf2-preview/nnlm-en-dim128/1',
 'random_projection_matrix': array([[ 0.11777096, -0.18941194,  0.12355215, ..., -0.11935346,
         -0.05472856,  0.10257149],
        [-0.02463811,  0.03498989,  0.15584462, ...,  0.03632531,
         -0.15344418,  0.13499407],
        [ 0.1745208 ,  0.25193234, -0.0140515 , ...,  0.07187349,
          0.05380318,  0.22745183],
        ...,
        [-0.01787598, -0.10541555, -0.0043509 , ..., -0.05776771,
          0.00942593, -0.05365407],
        [-0.0649402 ,  0.11598637,  0.15178003, ..., -0.11255522,
         -0.01757714,  0.13242785],
        [-0.01959453, -0.01784685,  0.07341345, ..., -0.05164322,
         -0.2140977 ,  0.05822046]])}

In [13]:
# Execute the embedding pipeline; %time reports how long the run takes.
print("Running pipeline...")
%time run_hub2emb(args)
print("Pipeline is done.")

Running pipeline...




CPU times: user 4min 40s, sys: 13.1 s, total: 4min 53s
Wall time: 1min 15s
Pipeline is done.


In [14]:
# Locate the TFRecord shards by pattern instead of assuming a single
# 'emb-00000-of-00001' file: the number of output shards is decided by the
# Beam runner, and the index-building step uses the same 'emb-*' pattern.
embed_files = sorted(
    tf.io.gfile.glob(os.path.join(output_dir, 'emb-*.tfrecords')))
if not embed_files:
  raise FileNotFoundError(
      'No embedding files found in {}'.format(output_dir))
# First shard; enough for printing a few sample records.
embed_file = embed_files[0]
sample = 5

# Create a description of the features.
# The stored vectors have `projected_dim` entries only when a projection was
# applied; otherwise they keep the module's original dimensionality.
feature_description = {
    'text': tf.io.FixedLenFeature([], tf.string),
    'embedding': tf.io.FixedLenFeature(
        [projected_dim or original_dim], tf.float32)
}

def _parse_example(example):
  '''Parses one serialized tf.Example into a dict with 'text' and 'embedding'.'''
  # Parse the input `tf.Example` proto using the dictionary above.
  return tf.io.parse_single_example(example, feature_description)

# Show the text and the first 10 embedding components of a few records.
dataset = tf.data.TFRecordDataset(embed_file)
for record in dataset.take(sample).map(_parse_example):
  print("{}: {}".format(record['text'].numpy().decode('utf-8'), record['embedding'].numpy()[:10]))

headline_text: [-0.06242689  0.0451642  -0.12775695  0.13880947 -0.00685298  0.01386029
  0.06293475 -0.05843846 -0.1290109   0.07561249]
aba decides against community broadcasting licence: [-0.02759941 -0.0668483   0.14180553 -0.10934345  0.16544361 -0.11714991
 -0.04192351 -0.16579825  0.07328099  0.05042982]
act fire witnesses must be aware of defamation: [ 0.08266592  0.05657516 -0.09953444 -0.22255915 -0.08982131 -0.09224436
 -0.11571347 -0.15178515 -0.12480546  0.24920128]
a g calls for infrastructure protection summit: [-0.11184522  0.09457187 -0.15731621 -0.05019473 -0.03897159  0.09188722
  0.1212418  -0.02741695 -0.03070022  0.16001716]
air nz staff in aust strike for pay rise: [ 0.04117656 -0.23968488 -0.23157041  0.04474726  0.03047607  0.02460521
  0.1868024  -0.20157345  0.21702358  0.13327442]


ANNOY (Approximate Nearest Neighbors Oh Yeah) is a C++ library with Python bindings to search for points in space that are close to a given query point. It also creates large read-only file-based data structures that are mapped into memory. It is built and used by Spotify for music recommendations.

In [15]:
def build_index(embedding_files_pattern, index_filename, vector_length, 
    metric = 'angular', num_trees = 100):
  '''Builds an ANNOY index from TFRecord embedding files and saves it.

  Two files are written: the index itself at `index_filename`, and a pickled
  dict mapping each integer item id to its text at `index_filename + '.mapping'`.

  Args:
    embedding_files_pattern: glob pattern of the TFRecord files to read.
    index_filename: path the ANNOY index is saved to.
    vector_length: dimensionality passed to `annoy.AnnoyIndex`.
    metric: distance metric passed to `annoy.AnnoyIndex`.
    num_trees: number of trees passed to the index `build` call.
  '''
  ann = annoy.AnnoyIndex(vector_length, metric = metric)
  # Item id -> text, in the order items are added to the index.
  id_to_text = {}

  shard_paths = tf.io.gfile.glob(embedding_files_pattern)
  shard_count = len(shard_paths)
  print('Found {} embedding file(s).'.format(shard_count))

  next_id = 0
  for shard_number, shard_path in enumerate(shard_paths, start = 1):
    print('Loading embeddings in file {} of {}...'.format(shard_number, shard_count))
    for record in tf.data.TFRecordDataset(shard_path).map(_parse_example):
      text = record['text'].numpy().decode("utf-8")
      vector = record['embedding'].numpy()
      id_to_text[next_id] = text
      ann.add_item(next_id, vector)
      next_id += 1
      # Progress report every 100k items.
      if next_id % 100000 == 0:
        print('{} items loaded to the index'.format(next_id))

  print('A total of {} items added to the index'.format(next_id))

  print('Building the index with {} trees...'.format(num_trees))
  ann.build(n_trees = num_trees)
  print('Index is successfully built.')

  print('Saving index to disk...')
  ann.save(index_filename)
  print('Index is saved to disk.')
  index_bytes = os.path.getsize(index_filename)
  print("Index file size: {} GB".format(
    round(index_bytes / float(1024 ** 3), 2)))
  ann.unload()

  mapping_filename = index_filename + '.mapping'
  print('Saving mapping to disk...')
  with open(mapping_filename, 'wb') as handle:
    pickle.dump(id_to_text, handle, protocol = pickle.HIGHEST_PROTOCOL)
  print('Mapping is saved to disk.')
  mapping_bytes = os.path.getsize(mapping_filename)
  print("Mapping file size: {} MB".format(
    round(mapping_bytes / float(1024 ** 2), 2)))

In [16]:
# Pattern matching every TFRecord shard written by the pipeline.
embedding_files = "{}/emb-*.tfrecords".format(output_dir)
# The stored vectors have `projected_dim` entries only when a projection was
# applied; otherwise they keep the module's original dimensionality.
embedding_dimension = projected_dim or original_dim
# Path of the ANNOY index file; the id-to-text mapping is stored next to it.
index_filename = "index"

In [17]:
# Build and save the ANNOY index; %time reports the elapsed time.
%time build_index(embedding_files, index_filename, embedding_dimension)

Found 1 embedding file(s).
Loading embeddings in file 1 of 1...
100000 items loaded to the index
200000 items loaded to the index
300000 items loaded to the index
400000 items loaded to the index
500000 items loaded to the index
600000 items loaded to the index
700000 items loaded to the index
800000 items loaded to the index
900000 items loaded to the index
1000000 items loaded to the index
1100000 items loaded to the index
A total of 1103664 items added to the index
Building the index with 100 trees...
Index is successfully built.
Saving index to disk...
Index is saved to disk.
Index file size: 1.62 GB
Saving mapping to disk...
Mapping is saved to disk.
Mapping file size: 50.61 MB
CPU times: user 7min 54s, sys: 18.2 s, total: 8min 12s
Wall time: 7min 8s


### Use the Index for Similarity Matching
Now we can use the ANN index to find news headlines that are semantically close to an input query.

In [18]:
# Re-open the saved index. The metric is passed explicitly and must match the
# one the index was built with ('angular', build_index's default); relying on
# Annoy's implicit default emits a FutureWarning.
index = annoy.AnnoyIndex(embedding_dimension, metric = 'angular')
index.load(index_filename, prefault = True)
print('Annoy index is loaded.')
# NOTE: pickle.load can execute arbitrary code; only load a mapping file that
# this notebook wrote itself.
with open(index_filename + '.mapping', 'rb') as handle:
  mapping = pickle.load(handle)
print('Mapping file is loaded.')

Annoy index is loaded.
Mapping file is loaded.


  """Entry point for launching an IPython kernel.


In [19]:
# Similarity matching method

def find_similar_items(embedding, num_matches = 5):
  '''Looks up the nearest neighbours of `embedding` in the loaded ANN index.

  Args:
    embedding: query vector passed to the index.
    num_matches: number of neighbours requested from the index.

  Returns:
    A list with the mapped text of each neighbour id returned by the index.
  '''
  neighbour_ids = index.get_nns_by_vector(
      embedding, num_matches, search_k = -1, include_distances = False)
  # Translate the index ids back into the stored texts.
  matches = []
  for item_id in neighbour_ids:
    matches.append(mapping[item_id])
  return matches

In [20]:
# Extract embedding from a given query

# Load the TF-Hub module into the module-level `embed_fn`, which
# `extract_embeddings` reads at call time; %time reports the load time.
print("Loading the TF-Hub module...")
%time embed_fn = hub.load(module_url)
print("TF-Hub module is loaded.")

# Reload the projection matrix from the 'random_projection_matrix' pickle if
# that file exists; otherwise queries are left unprojected.
# NOTE: pickle.load can execute arbitrary code -- only load a file that this
# notebook wrote itself.
random_projection_matrix = None
if os.path.exists('random_projection_matrix'):
  print("Loading random projection matrix...")
  with open('random_projection_matrix', 'rb') as handle:
    random_projection_matrix = pickle.load(handle)
  print('random projection matrix is loaded.')

def extract_embeddings(query):
  '''Embeds a single query, applying the random projection if one is loaded.

  Args:
    query: the query text.

  Returns:
    The embedding of `query`, multiplied by `random_projection_matrix` when
    that module-level matrix is not None.
  '''
  # The module is called with a batch of one; keep its only row.
  vector = embed_fn([query])[0].numpy()
  if random_projection_matrix is None:
    return vector
  return vector.dot(random_projection_matrix)

Loading the TF-Hub module...
CPU times: user 949 ms, sys: 540 ms, total: 1.49 s
Wall time: 1.49 s
TF-Hub module is loaded.
Loading random projection matrix...
random projection matrix is loaded.


In [21]:
# Sanity check: show the first 10 components of a sample query embedding.
extract_embeddings("Hello Machine Learning!")[:10]

array([-0.13628113,  0.01904645,  0.2518048 ,  0.08050381,  0.07141582,
       -0.16103104, -0.02986444,  0.0651799 ,  0.33066507,  0.03870673])

In [22]:
# Enter a query to find the most similar items

# Free-text query to look up in the index.
query = "confronting global challenges" 

# Step 1: embed the query (timed with %time).
print("Generating embedding for the query...")
%time query_embedding = extract_embeddings(query)

# Step 2: ask the ANN index for the 10 nearest items (timed with %time).
print("")
print("Finding relevant items in the index...")
%time items = find_similar_items(query_embedding, 10)

# Step 3: print the matched texts, one per line.
print("")
print("Results:")
print("=========")
for item in items:
  print(item)

Generating embedding for the query...
CPU times: user 5.64 ms, sys: 16.4 ms, total: 22.1 ms
Wall time: 21 ms

Finding relevant items in the index...
CPU times: user 1.58 ms, sys: 0 ns, total: 1.58 ms
Wall time: 906 µs

Results:
confronting global challenges
old wisdom unites to solve global dilemmas
old wisdom unites to solve global dilemmas
nuclear watchdog warns of new global dangers
us consumerism poses global recession threat expert
outback challenge proves lucrative
iea boss warns of global energy crisis
new research challenges global warming theories
