In [0]:
# Run this notebook on Google Colab.
# Enable a GPU runtime before executing the cells below.

In [2]:
# Mount Google Drive so that files stored there are reachable under
# /content/drive (only works inside Google Colab).
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [0]:
from __future__ import absolute_import, division, print_function, unicode_literals

try:
  # %tensorflow_version only exists in Colab.
  %tensorflow_version 2.x
except Exception:
  pass

import re
import os
import io
import time
import json
import unicodedata
import numpy as np
import tensorflow as tf
from tensorflow.keras import preprocessing
from tensorflow.keras.layers import Embedding, LSTM, Bidirectional, Dense, concatenate

In [0]:
# NOTE(review): '# Folder' looks like a placeholder -- presumably it has to be
# replaced with the project folder that contains the 'assets', 'train',
# 'validation' and 'test' directories read by later cells; confirm before running.
%cd # Folder

In [0]:
# Config

# --- Data / batching ---
NUM_EXAMPLES = 175_000
BATCH_SIZE = 64
# Number of full batches that fit in NUM_EXAMPLES.
STEPS_PER_EPOCH = NUM_EXAMPLES // BATCH_SIZE

# --- Embedding widths (one per input stream) ---
WORD_EMB_DIM = 128
POS_EMB_DIM = 128
DEP_EMB_DIM = 128

# --- Vocabulary / dictionary sizes ---
VOCAB_SIZE = 50_002
WORD_DICT_SIZE = 50_002
POS_DICT_SIZE = 53
DEP_DICT_SIZE = 54

# --- Recurrent network ---
NUM_UNITS = 100
NUM_LAYERS = 1

# --- Optimisation ---
LEARNING_RATE = 0.001
DECAY_RATE = 0.99

In [0]:
def read_data(file_directory, file_name):
  """ Load one file of vectorized sentences.

  Every line of the file is treated as one sentence made of
  whitespace-separated integers.

  Args:
    file_directory: The directory that contains the file.
    file_name: The name of the file inside `file_directory`.

  Returns:
    A list with one entry per line of the file; each entry is the
    list of ints found on that line (empty for a blank line).

  """
  source_path = '/'.join((file_directory, file_name))
  with open(source_path) as handle:
    return [[int(token) for token in row.split()] for row in handle]

In [0]:
def read_all_data(file_dir, file_base, words, pos, dep, target, num_files):
  """ Read every numbered vector file of one data split.

      For each file id from 1 to `num_files` four files are read
      (e.g. train-word_vector1.txt, train-label_vector1.txt,
      train-pos_vector1.txt, train-dep_vector1.txt) and their
      sentences are appended to the matching output list.

  Args:
    file_dir: The directory that holds the vector files.
    file_base: Prefix of the file names: 'train', 'val', or 'test'.
    words: List that receives the vectorized words (extended in place).
    pos: List that receives the vectorized parts of speech (extended in place).
    dep: List that receives the dependency relations (extended in place).
    target: List that receives the labels (extended in place).
    num_files: The number of numbered files to read per kind.

  Returns:
    None

  """
  # Read order per file id is kept as: words, labels, parts of speech, deps.
  destinations = (
      ('-word_vector', words),
      ('-label_vector', target),
      ('-pos_vector', pos),
      ('-dep_vector', dep),
  )
  for file_id in range(1, num_files + 1):
    for stem, sink in destinations:
      sink += read_data(file_dir, file_base + stem + str(file_id) + '.txt')

In [0]:
def read_file(file_name, read_json=False, directory='assets'):
  """ Read in a dictionary or the vocabulary.

  Args:
    file_name: The name of the file that holds the dictionary/vocabulary.
    read_json: If True the file is parsed as JSON; otherwise it is read
      as plain text and split on whitespace.
    directory: The directory that contains `file_name`. Defaults to
      'assets', the location this function used before the directory
      became configurable.

  Returns:
    data: The parsed JSON content when `read_json` is True, otherwise
      the list of whitespace-separated tokens found in the file.

  """
  with open(os.path.join(directory, file_name)) as document:
    if read_json:
      return json.load(document)
    # str.split() already yields str tokens, so no per-item conversion is needed.
    return document.read().split()

In [0]:
# Load the vocabulary and the word / part-of-speech / dependency id dictionaries
word_vocab = read_file('word_vocab.txt')
word2id, pos2id, dep2id = (
    read_file(json_name, read_json=True)
    for json_name in ('word_dict.json', 'pos_dict.json', 'dep_dict.json')
)

# Invert word2id so that ids can be mapped back to words
id2word = dict((word_id, word) for word, word_id in word2id.items())
# Force these ids to placeholder tokens (overrides any entry taken from word2id)
id2word[0] = '<delete>'
id2word[50001] = '<unk>'
id2word[50002] = '<unk>'

In [0]:
# Create training set
# The four lists are filled in place by read_all_data (files 1..9 of the
# 'train' directory).
train_words_seq, train_pos_seq, train_dep_seq, train_target_seq = [], [], [], []
read_all_data(file_dir='train', file_base='train',
              words=train_words_seq, pos=train_pos_seq,
              dep=train_dep_seq, target=train_target_seq,
              num_files=9)

In [0]:
# Remove very long sentences
# Drops the last 5000 entries of every training list.
# NOTE(review): this only removes the longest sentences if the files are
# ordered by length -- confirm against the data preparation step.
(train_words_seq,
 train_pos_seq,
 train_dep_seq,
 train_target_seq) = (
    seq[:-5000]
    for seq in (train_words_seq, train_pos_seq, train_dep_seq, train_target_seq)
)

In [0]:
# Create validation set
# The four lists are filled in place by read_all_data (file 1 of the
# 'validation' directory, file names prefixed with 'val').
val_words_seq, val_pos_seq, val_dep_seq, val_target_seq = [], [], [], []
read_all_data(file_dir='validation', file_base='val',
              words=val_words_seq, pos=val_pos_seq,
              dep=val_dep_seq, target=val_target_seq,
              num_files=1)

In [0]:
# Create test set
# The four lists are filled in place by read_all_data (file 1 of the
# 'test' directory).
test_words_seq, test_pos_seq, test_dep_seq, test_target_seq = [], [], [], []
read_all_data(file_dir='test', file_base='test',
              words=test_words_seq, pos=test_pos_seq,
              dep=test_dep_seq, target=test_target_seq,
              num_files=1)

In [0]:
# pad all sequences
# Each list of sentences is padded at the end (padding='post') and turned into
# one array; the training lists are handled first, then the validation lists.
(train_word_tensor,
 train_pos_tensor,
 train_dep_tensor,
 train_target_tensor) = (
    preprocessing.sequence.pad_sequences(seq, padding='post')
    for seq in (train_words_seq, train_pos_seq, train_dep_seq, train_target_seq)
)
(val_word_tensor,
 val_pos_tensor,
 val_dep_tensor,
 val_target_tensor) = (
    preprocessing.sequence.pad_sequences(seq, padding='post')
    for seq in (val_words_seq, val_pos_seq, val_dep_seq, val_target_seq)
)

In [0]:
# Verify that dimensions are all the same
def verify_dim(word_seq, pos_seq, dep_seq, target_seq):
  """ Check that aligned sentences have the same length in all four inputs.

  The four inputs are walked in parallel. At the first position where
  the four entries do not all have the same length, 'FAIL' is printed
  and the check stops. Nothing is printed when every position agrees.

  Args:
    word_seq: Sequence of vectorized word sentences.
    pos_seq: Sequence of vectorized part-of-speech sentences.
    dep_seq: Sequence of vectorized dependency-relation sentences.
    target_seq: Sequence of label sentences.

  Returns:
    None

  """
  for word_sent, *other_sents in zip(word_seq, pos_seq, dep_seq, target_seq):
    expected_len = len(word_sent)
    # any() stops at the first mismatching entry, like the chained comparison.
    if any(len(sent) != expected_len for sent in other_sents):
      print('FAIL')
      break

In [0]:
# Check the padded training arrays; verify_dim prints 'FAIL' on a length mismatch.
verify_dim(train_word_tensor, train_pos_tensor, train_dep_tensor, train_target_tensor)

In [0]:
# Check the padded validation arrays; verify_dim prints 'FAIL' on a length mismatch.
verify_dim(val_word_tensor, val_pos_tensor, val_dep_tensor, val_target_tensor)

In [0]:
# Create a tf.data Dataset
# Step 1: one dataset element per training example, holding its
#         (words, pos, dep, target) rows.
dataset = tf.data.Dataset.from_tensor_slices(
    (train_word_tensor, train_pos_tensor, train_dep_tensor, train_target_tensor)
)
# Step 2: shuffle with a buffer of NUM_EXAMPLES elements.
dataset = dataset.shuffle(buffer_size=NUM_EXAMPLES)
# Step 3: group into batches of BATCH_SIZE, dropping a final partial batch.
dataset = dataset.batch(batch_size=BATCH_SIZE, drop_remainder=True)

In [0]:
# Verify that dimensions are correct
# Pull a single batch from the dataset, run the same length check on it, and
# print the shapes of its four components.
w, p, d, t = next(iter(dataset))
verify_dim(w, p, d, t)
print(w.shape, p.shape, d.shape, t.shape)

In [0]:
# Encoder
class Encoder(tf.keras.Model):
  """ Encoder model that scores every token of an input sentence.

      It receives three id sequences as input: the words, the
      parts of speech, and the dependency relations. Each sequence is
      embedded, the three embeddings are concatenated, and a
      bidirectional LSTM extracts the features from which a single
      sigmoid unit predicts, per token, whether the word is removed
      or kept.

  Args:
    vocab_size: Number of rows of the word embedding table.
    enc_units: The number of units of the LSTM (per direction).
    batch_sz: The batch size. It is stored on the instance but not
      used by any layer.

  """
  def __init__(self, vocab_size, enc_units, batch_sz):
    super(Encoder, self).__init__()
    self.batch_sz = batch_sz
    self.enc_units = enc_units
    # The constructor arguments are honoured here; existing callers pass
    # VOCAB_SIZE and NUM_UNITS, which equal the globals used previously.
    self.word_embedding = Embedding(vocab_size, WORD_EMB_DIM, mask_zero=True, embeddings_initializer='glorot_uniform')
    self.pos_embedding = Embedding(POS_DICT_SIZE, POS_EMB_DIM, mask_zero=True, embeddings_initializer='glorot_uniform')
    self.dep_embedding = Embedding(DEP_DICT_SIZE, DEP_EMB_DIM, mask_zero=True, embeddings_initializer='glorot_uniform')
    self.lstm = LSTM(enc_units, return_sequences=True, return_state=True, dropout=0.5, recurrent_initializer='glorot_uniform')
    self.bidirectional = Bidirectional(self.lstm, merge_mode='concat')
    # Sigmoid activation: the model outputs probabilities, not logits.
    self.fc = Dense(1, activation='sigmoid')

  def call(self, words, pos, dep, training=None):
    """ Compute one sigmoid score per token.

    Args:
      words: Word ids; positions holding id 0 are masked out of the LSTM.
      pos: Part-of-speech ids, aligned with `words`.
      dep: Dependency-relation ids, aligned with `words`.
      training: Optional flag forwarded to the bidirectional LSTM so that
        callers can switch its dropout on explicitly. When left as None
        the framework's default behaviour applies.

    Returns:
      The sigmoid outputs with the trailing size-1 axis squeezed away
      (assumes inputs shaped (batch, time) -- TODO confirm).

    """
    word_emb = self.word_embedding(words)
    pos_emb = self.pos_embedding(pos)
    dep_emb = self.dep_embedding(dep)
    # The padding mask is derived from the word ids, so ask the word embedding
    # for it (the result only depends on which entries of `words` are 0).
    mask = self.word_embedding.compute_mask(words)
    x = concatenate([word_emb, pos_emb, dep_emb])
    # With return_state=True the wrapper also returns the final states of the
    # forward and backward LSTMs; only the per-step outputs are needed here.
    x, _, _, _, _ = self.bidirectional(x, mask=mask, training=training)
    output = tf.squeeze(self.fc(x), -1)
    return output

In [0]:
def loss_function(real, pred):
  """ Binary cross-entropy between keep/remove labels and predictions.

  The labels are binarized first: every non-zero entry of `real`
  becomes 1.0 and every zero entry becomes 0.0. The global
  `loss_object` then scores `pred` against these labels.

  Args:
    real: The ground truth labels (0 marks a removed position).
    pred: The predicted values for the same positions.

  Returns:
    The mean of the loss values produced by `loss_object`.

  """
  keep_labels = tf.cast(tf.math.not_equal(real, 0), tf.float32)
  return tf.reduce_mean(loss_object(keep_labels, pred))

In [0]:
optimizer = tf.keras.optimizers.Adam(learning_rate=LEARNING_RATE)
# The Encoder's final Dense layer already applies a sigmoid, so its outputs are
# probabilities rather than logits. from_logits must therefore be False;
# with True the loss would push values that are already in (0, 1) through a
# second sigmoid. reduction='none' leaves the averaging to loss_function.
loss_object = tf.keras.losses.BinaryCrossentropy(from_logits=False, reduction='none')

In [0]:
# Model instance that train_step optimizes and that the checkpoint tracks.
encoder = Encoder(VOCAB_SIZE, NUM_UNITS, BATCH_SIZE)

In [0]:
# Checkpoint object covering both the optimizer and the encoder; the training
# loop saves it under ./training_checkpoints with the file prefix "ckpt".
checkpoint_dir = './training_checkpoints'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder)

In [0]:
@tf.function
def train_step(words, pos, dep, targ):
  """ Run one optimization step on a single batch.

  Runs the global `encoder` on the batch, accumulates the loss over
  time steps 1..T-1 (time step 0 is skipped), applies the resulting
  gradients with the global `optimizer`, and returns a loss value
  for reporting.

  Args:
    words: The vectorized words for a batch.
    pos: The vectorized parts of speech for a batch.
    dep: The vectorized dependency relations for a batch.
    targ: The labels for a batch; indexed as targ[:, t], i.e. time is
      the second axis.

  Returns:
    The accumulated loss divided by the number of time steps
    (targ.shape[1]).

  """
  loss = 0

  with tf.GradientTape() as tape:
    # NOTE(review): the encoder is called without a `training` argument, so
    # the LSTM dropout configured in Encoder may not be active here -- confirm.
    predictions = encoder(words, pos, dep)

    # don't include '<bos>'
    # calculate loss per batch, per time step
    for t in range(1, targ.shape[1]):
      loss += loss_function(targ[:, t], predictions[:, t])

  # Reported value only: the sum above has targ.shape[1] - 1 terms but is
  # divided by targ.shape[1].
  batch_loss = (loss / int(targ.shape[1]))
  variables = encoder.trainable_variables
  # Gradients are taken of the summed loss, not of batch_loss.
  gradients = tape.gradient(loss, variables)
  optimizer.apply_gradients(zip(gradients, variables))
  return batch_loss

In [0]:
# Train the Model
# Each epoch draws STEPS_PER_EPOCH batches from the dataset and runs one
# train_step per batch.

EPOCHS = 40

for epoch in range(EPOCHS):
  start = time.time()
  # Running sum of the per-batch losses returned by train_step.
  total_loss = 0

  for (batch, (words, pos, dep, targ)) in enumerate(dataset.take(STEPS_PER_EPOCH)):
    batch_loss = train_step(words, pos, dep, targ)
    total_loss += batch_loss

    # Progress report every 100 batches.
    if batch % 100 == 0:
        print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                     batch,
                                                     batch_loss.numpy()))
  # saving (checkpoint) the model every 2 epochs
  if (epoch + 1) % 2 == 0:
    checkpoint.save(file_prefix = checkpoint_prefix)

  # Epoch summary: accumulated loss divided by STEPS_PER_EPOCH, then the
  # wall-clock duration of the epoch.
  print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                      total_loss / STEPS_PER_EPOCH))
  print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))

In [0]:
# Save Model
# Writes only the encoder's weights, in TensorFlow format, under the
# prefix 'model/sc_model'.
encoder.save_weights('model/sc_model', save_format='tf')

In [0]:
# Load Model
# Builds a fresh Encoder with the same constructor arguments and restores the
# weights saved under 'model/sc_model'; `enc` is the model used for evaluation.
enc = Encoder(VOCAB_SIZE, NUM_UNITS, BATCH_SIZE)
enc.load_weights('model/sc_model')

In [0]:
import nltk
from nltk.metrics.distance import edit_distance

In [0]:
def evaluate(encoder, words, pos, dep):
  """ Returns the model predictions.

  The three inputs hold ids, so they are converted to int32 tensors,
  the integer type the training tensors fed to the embedding layers use.

  Args:
    encoder: The Encoder model.
    words: Batch of word id sequences (list of lists or tensor).
    pos: Batch of part-of-speech id sequences.
    dep: Batch of dependency-relation id sequences.

  Returns:
    enc_out: The tensor returned by the encoder, i.e. one predicted
      value per token of every sentence in the batch.

  """
  word_seq = tf.cast(tf.convert_to_tensor(words), tf.int32)
  pos_seq = tf.cast(tf.convert_to_tensor(pos), tf.int32)
  dep_seq = tf.cast(tf.convert_to_tensor(dep), tf.int32)
  enc_out = encoder(word_seq, pos_seq, dep_seq)
  return enc_out


In [0]:
def get_comp_ratio(res):
  """ Get the compression statistics of one predicted sentence.

  A token counts as kept when its predicted value is strictly
  greater than .5.

  Args:
    res: The model output for a batch; only the first sentence
      (res[0]) is inspected. May be a tensor, an array, or a list
      of lists.

  Returns:
    Tuple (ratio, kept_fraction) where
      ratio is len(sentence) / kept tokens, or float('inf') when no
        token is kept, and
      kept_fraction is kept tokens / len(sentence), or 0.0 for an
        empty sentence.

  """
  probs = np.asarray(res[0])
  total = len(probs)
  # use a threshold of .5 to indicate if a word is kept.
  kept_num = int(np.count_nonzero(probs > .5))
  # Guard both divisions so that a sentence with nothing kept (or an empty
  # sentence) does not abort a whole evaluation run.
  ratio = total / kept_num if kept_num else float('inf')
  kept_fraction = kept_num / total if total else 0.0
  return (ratio, kept_fraction)

In [0]:
def get_edit_distance(res, target):
  """ Edit distance between the predicted and the true keep/remove pattern.

      Both sentences are first encoded as bit strings with one
      character per position of res[0]: '1' for a kept token, '0'
      for a removed one.

  Args:
    res: The model output; a position of res[0] counts as kept when
      its value is greater than .5.
    target: The true compressed sentence; a position of target[0]
      counts as kept when its value is not 0.

  Returns:
    Edit distance between the two bit strings.

  """
  positions = range(len(res[0]))
  # bit string of the generated sentence
  res_str = ''.join('1' if res[0][int(i)] > .5 else '0' for i in positions)
  # bit string of the true sentence
  targ_str = ''.join('1' if int(target[0][i]) != 0 else '0' for i in positions)
  return edit_distance(res_str, targ_str)

In [0]:
# Evaluate the reloaded model `enc` on the test set, one sentence at a time.
edit_distances = []
comp = []  # first value returned by get_comp_ratio, per sentence
comp_perc = []  # second value returned by get_comp_ratio, per sentence
for words, pos, dep, targ in zip(test_words_seq, test_pos_seq, test_dep_seq, test_target_seq):
  # Add a leading axis of size 1 so each sentence forms a batch of one.
  word_seq = tf.expand_dims(words, 0)
  pos_seq = tf.expand_dims(pos, 0)
  dep_seq = tf.expand_dims(dep, 0)
  target_seq = tf.expand_dims(targ, 0)
  res = evaluate(enc, word_seq, pos_seq, dep_seq)
  compression = get_comp_ratio(res)
  comp.append(compression[0])
  comp_perc.append(compression[1])
  edit_distances.append(get_edit_distance(res, target_seq))
  

In [0]:
# Test-set mean of the first get_comp_ratio value (sentence length / kept tokens).
np.mean(comp)

In [0]:
# Test-set mean of the second get_comp_ratio value.
np.mean(comp_perc)

In [0]:
# Test-set mean of the edit distance between predicted and true bit strings.
np.mean(edit_distances)

In [0]:
# Create Sample Data
# Takes the first test sentence and adds a leading axis of size 1 to each of
# its four sequences so it can be passed to the model as a batch of one.
sample_word = tf.expand_dims(test_words_seq[0], 0)
sample_pos = tf.expand_dims(test_pos_seq[0], 0)
sample_dep = tf.expand_dims(test_dep_seq[0], 0)
sample_target = tf.expand_dims(test_target_seq[0], 0)

In [0]:
# Model predictions for the sample sentence (this rebinds the global `res`).
res = evaluate(enc, sample_word, sample_pos, sample_dep)

In [0]:
# Print the original sentence
# Every word id greater than 0 is mapped back to its token through id2word.
sample_original = ''
for i in sample_word[0]:
  if i>0:
    sample_original += id2word[int(i)] + ' '
print(sample_original)

In [0]:
# Print sample target
# Target entries greater than 0 are looked up in id2word; entries equal to 0
# are skipped.
sample_truth = ' '
for i in sample_target[0]:
  if i>0:
    sample_truth += id2word[int(i)] + ' '
print(sample_truth)

In [0]:
# Print sample result
# Keeps the words of the sample sentence whose predicted value exceeds .5.
sample_result = ' '
for i in range(len(res[0])):
  if res[0][int(i)]>.5:
    sample_result += id2word[int(sample_word[0][int(i)])] + ' '
print(sample_result)