In [0]:
import os
import sys
import h5py
import random
import pickle
try:
  import Levenshtein
except ImportError:
  ! pip install python-levenshtein
  import Levenshtein
import numpy as np
import seaborn as sns

import keras
import keras.backend as K
from keras.layers import Merge
from keras.utils import np_utils
from keras.models import Sequential
from keras.preprocessing import sequence
from keras.regularizers import l2
from keras.layers.wrappers import TimeDistributed
from keras.layers.recurrent import LSTM
from keras.layers.embeddings import Embedding
from keras.layers.core import (
    Lambda,
    Dense,
    Activation,
    Dropout,
    RepeatVector
)
from sklearn.cross_validation import train_test_split

In [24]:
keras.__version__

'2.1.6'

Input data *X* is made from *MAXLEN_DESC* description words followed by *eos*, followed by the headline words, followed by *eos*.
If the description is shorter than *MAXLEN_DESC*, it is left-padded with *empty*.
If the entire sequence is longer than *MAXLEN* it is clipped, and if it is shorter it is padded.

Labels *Y* are the headline words followed by *eos* and clipped or padded to *MAXLEN_HEAD*.

In [0]:
# --- model hyper-parameters -------------------------------------------------
MAXLEN_DESC = 25                       # description words kept per sample
MAXLEN_HEAD = 25                       # headline words kept per sample
MAXLEN = MAXLEN_DESC + MAXLEN_HEAD     # total input sequence length
RNN_SIZE = 128
RNN_LAYERS = 3
BATCH_NORMALIZATION = False
# Without a description part there is nothing to attend over, hence 0.
ACTIVATION_RNN_SIZE = 40 if MAXLEN_DESC else 0

# --- data locations ---------------------------------------------------------
DATA_FOLDER = 'drive/text_summarization/data'
WEIGHTS_FILENAME = 'train_weights.hdf5'
EMBEDDINGS_FILENAME = 'vocabulary-embedding'

# Both files live directly inside DATA_FOLDER.
WEIGHTS_PATH, VOCABULARY_EMBEDDINGS_PATH = (
    os.path.join(DATA_FOLDER, filename)
    for filename in (WEIGHTS_FILENAME, EMBEDDINGS_FILENAME)
)

The output of the first *ACTIVATION_RNN_SIZE* nodes of the top layer is used for the activation, and the rest is used to select the predicted word.

In [0]:
# model training parameters
SEED = 42

# All dropout rates and the weight decay are disabled here.
LSTM_DROPOUT = LSTM_RECURRENT_DROPOUT = DROPOUT_RATE = WEIGHT_DECLAY = 0

# An L2 regularizer is only created when a non-zero weight decay is set.
REGULARIZER = None
if WEIGHT_DECLAY:
    REGULARIZER = l2(WEIGHT_DECLAY)

OPTIMIZER = 'adam'
BATCH_SIZE = 64
UNKNOWN_WORDS_COUNT = 100

# Seed both the stdlib and the numpy global random generators.
for seed_function in (random.seed, np.random.seed):
    seed_function(SEED)

In [0]:
# NOTE: pickle.load can execute arbitrary code while deserialising; only load
# files that you produced yourself or otherwise trust.

# load embeddings together with the vocabulary lookup tables
embeddings_pickle_path = "{}.pkl".format(VOCABULARY_EMBEDDINGS_PATH)
with open(embeddings_pickle_path, "rb") as embeddings_file:
    (EMBEDDING,
     INDEX_TO_WORD,
     WORD_TO_INDEX,
     GLOVE_INDEX_TO_INDEX) = pickle.load(embeddings_file)

# load all data (descriptions X and headlines Y)
dataset_pickle_path = "{}.data.pkl".format(VOCABULARY_EMBEDDINGS_PATH)
with open(dataset_pickle_path, "rb") as dataset_file:
    X, Y = pickle.load(dataset_file)

# one embedding row per vocabulary word
VOCAB_SIZE, EMBEDDING_SIZE = EMBEDDING.shape

In [28]:
# Sanity-check the loaded objects: dataset size, embedding dimensions and how the
# full word list splits into in-vocabulary, GloVe-substitutable and unknown words.
print ('data size: X = {}, Y = {}'.format(len(X), len(Y)))
print ('embeddings size: {}'.format(EMBEDDING_SIZE))
print ('vocabulary size: ', VOCAB_SIZE, 'the last {} words can be used as place holders for unknown words'.format(UNKNOWN_WORDS_COUNT))
print ('different words count: INDEX_TO_WORD = {}, WORD_TO_INDEX = {}'.format(len(INDEX_TO_WORD), len(WORD_TO_INDEX)))
print ('words outside vocabulary which we can substituted using glove:', len(GLOVE_INDEX_TO_INDEX))
# unknown = every known word minus the vocabulary minus the GloVe-substitutable ones
print ('number of unknown words: ', len(INDEX_TO_WORD) - VOCAB_SIZE - len(GLOVE_INDEX_TO_INDEX))

data size: X = 1000000, Y = 1000000
embeddings size: 100
vocabulary size:  40000 the last 100 words can be used as place holders for unknown words
different words count: INDEX_TO_WORD = 1557454, WORD_TO_INDEX = 1557454
words outside vocabulary which we can substituted using glove: 185463
number of unknown words:  1331991


In [0]:
# unknown words: the last UNKNOWN_WORDS_COUNT vocabulary slots become the
# placeholders <0>, <1>, ... (<0> is the very last slot, VOCAB_SIZE - 1).
for i in range(UNKNOWN_WORDS_COUNT):
    INDEX_TO_WORD[VOCAB_SIZE - i - 1] = "<{}>".format(i)

# out of vocabulary words: tag every index from the first placeholder slot
# upwards with a trailing '^' so they stand out when printed.
# The endswith() guard keeps this cell idempotent: re-running it must not keep
# appending further '^' characters to words that are already tagged.
for i in range(VOCAB_SIZE - UNKNOWN_WORDS_COUNT, len(INDEX_TO_WORD)):
    if not INDEX_TO_WORD[i].endswith('^'):
        INDEX_TO_WORD[i] = INDEX_TO_WORD[i] + '^'

In [30]:
# reduce sample size.
# For training we reduced sample size.
# This is done only because of our restriction on GPU capacity.
# Only the first len(X) // reduce_sample_size examples are used.
reduce_sample_size = 10

new_example_size = len(X) // reduce_sample_size
# 10% of the reduced set is held out; it is passed below as an absolute count.
val_samples_count = int(new_example_size * 0.1)

# random_state=SEED makes the split reproducible across runs.
X_train, X_test, Y_train, Y_test = train_test_split(X[:new_example_size], Y[:new_example_size], test_size=val_samples_count, random_state=SEED)
print("Train size: {}, Test size: {}".format(len(X_train), len(X_test)))

Train size: 90000, Test size: 10000


In [0]:
# handle empty and end of sequence
# Index 0 is the padding ("empty") symbol and index 1 marks end-of-sequence.
empty = 0
eos = 1
# Printable stand-ins used when index sequences are rendered as text.
INDEX_TO_WORD[empty] = '_'
INDEX_TO_WORD[eos] = '~'

In [317]:
# show how data looks like
def print_sample(label, sample):
    """
    Print `label` followed by the words of `sample` on a single line.

    `sample` is an iterable of word indexes; each index is looked up in
    INDEX_TO_WORD and the words are written separated by single spaces.
    """
    print(label + ':', end=' ')
    for word_index in sample:
        word = INDEX_TO_WORD[word_index]
        print(word, end=' ')
    # terminate the line
    print()


print_sample('Head', Y_train[9111])
print_sample('Desc', X_train[9111])

Head: Microsoft Acquires Popular Android App Echo Notification Lockscreen^ 
Desc: Microsoft has acquired Double Labs , an Android app startup which develops the popular Echo Notification Lockscreen^ for Android devices . This app had received between 1 million and 5 million downloads so far in Play Store . 


# Model

## base rnn model

Standard stacked LSTM model, identical to the one used during training.

In [0]:
# Embedding layer initialised from the loaded EMBEDDING matrix; index 0 is
# treated as padding and masked out (mask_zero=True).
embedding_layer = Embedding(
    VOCAB_SIZE,
    EMBEDDING_SIZE,
    input_length=MAXLEN,
    embeddings_regularizer=REGULARIZER,
    weights=[EMBEDDING],
    mask_zero=True,
    name='embedding_1',
)

# Settings shared by every LSTM layer of the stack.
lstm_options = dict(
    return_sequences=True,
    kernel_regularizer=REGULARIZER,
    bias_regularizer=REGULARIZER,
    recurrent_regularizer=REGULARIZER,
    dropout=LSTM_DROPOUT,
    recurrent_dropout=LSTM_RECURRENT_DROPOUT,
)

rnn_model = Sequential()
rnn_model.add(embedding_layer)

# Layers are named lstm_1/dropout_1, lstm_2/dropout_2, ... — the weight loader
# below looks layers up by these names.
for layer_number in range(1, RNN_LAYERS + 1):
    rnn_model.add(LSTM(RNN_SIZE, name='lstm_{}'.format(layer_number), **lstm_options))
    rnn_model.add(Dropout(DROPOUT_RATE, name='dropout_{}'.format(layer_number)))


In [34]:
rnn_model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
embedding_1 (Embedding)      (None, 50, 100)           4000000   
_________________________________________________________________
lstm_1 (LSTM)                (None, 50, 128)           117248    
_________________________________________________________________
dropout_1 (Dropout)          (None, 50, 128)           0         
_________________________________________________________________
lstm_2 (LSTM)                (None, 50, 128)           131584    
_________________________________________________________________
dropout_2 (Dropout)          (None, 50, 128)           0         
_________________________________________________________________
lstm_3 (LSTM)                (None, 50, 128)           131584    
_________________________________________________________________
dropout_3 (Dropout)          (None, 50, 128)           0         
Total para

## load weights

In [0]:
def load_weights(model, filepath):
    """
    Load as many weights as possible from the given HDF5 file into `model`.

    Layers are visited in the order listed in the file's `layer_names`
    attribute. Stored layers without weights are skipped. For every stored
    layer that `model` also contains (matched by name) the stored values are
    assigned to the layer's trainable + non-trainable weights. Loading stops
    at the first weighted layer that `model` does not contain.

    Args:
        model: Keras model to receive the weights.
        filepath: path of the HDF5 weights file.

    Returns:
        The weights of the first stored layer that is missing from `model`,
        as a list of numpy arrays (copied so that they stay valid after the
        file is closed), or an empty list when every weighted layer in the
        file was found in `model`.
    """
    def as_text(value):
        # Depending on the h5py version, attribute strings arrive as bytes or str.
        return value.decode('utf8') if isinstance(value, bytes) else value

    print("Loading {} to {}".format(filepath, model.name))
    unmatched_weights = []
    with h5py.File(filepath, mode='r') as file:
        layer_names = [as_text(n) for n in file.attrs['layer_names']]
        weights_tuples = []
        for name in layer_names:
            print("Loading layer: {}".format(name))
            group = file[name]
            weight_names = [as_text(n) for n in group.attrs['weight_names']]
            if not weight_names:
                # nothing stored for this layer (e.g. dropout)
                continue

            stored_weights = [group[weight_name] for weight_name in weight_names]
            try:
                layer = model.get_layer(name=name)
            except ValueError:
                # Keras raises ValueError when the model has no layer of that name
                layer = None
            if layer is None:
                print("Failed to find layer {}".format(name))
                # materialise before the file is closed
                unmatched_weights = [np.array(w) for w in stored_weights]
                break

            all_weights = layer.trainable_weights + layer.non_trainable_weights
            weights_tuples += zip(all_weights, stored_weights)

        # must run while the file is still open: the values are h5py datasets
        K.batch_set_value(weights_tuples)
    return unmatched_weights

In [36]:
# Load the trained weights into rnn_model. The return value is kept because
# convert_to_probs() later uses weights[0] and weights[1] as the top dense layer.
weights = load_weights(rnn_model, WEIGHTS_PATH)

Loading drive/text_summarization/data/train_weights.hdf5 to sequential_3
Loading layer: embedding_1
Loading layer: lstm_1
Loading layer: dropout_1
Loading layer: lstm_2
Loading layer: dropout_2
Loading layer: lstm_3
Loading layer: dropout_3
Loading layer: simplecontext_1
Loading layer: time_distributed_2
Failed to find layer time_distributed_2


In [37]:
# shape of loaded weights
[w.shape for w in weights]

[(176, 40000), (40000,)]

## final headline model

For each word in the headline part, this layer concatenates the output of the previous layer (RNN) with a weighted average of the outputs of the description part. Only the last RNN_SIZE - ACTIVATION_RNN_SIZE values of each output are used for this. The first ACTIVATION_RNN_SIZE values of each output are used to compute the weights for the averaging.

In [0]:
# Scalar backend variables that scale the two halves of the layer output
# (attention context and headline word features); both start at 1.
context_weight = K.variable(1.)
head_weight = K.variable(1.)

def simple_context(X, mask, size=ACTIVATION_RNN_SIZE, maxlen_desc=MAXLEN_DESC, maxlen_head=MAXLEN_HEAD):
    """
    Attention over the description part for every headline position.

    X is split along axis 1 into the first `maxlen_desc` steps (description)
    and the remaining steps (headline). Along the last axis, the first `size`
    values of each step are the "activations" used to score description steps
    and the rest are the "words" features that get averaged / passed through.
    # assumes X is (batch, maxlen_desc + maxlen_head, features) — TODO confirm

    Args:
        X: output tensor of the preceding layer.
        mask: mask tensor for X, or None.
        size: number of leading features per step used as activations.
        maxlen_desc: number of description steps.
        maxlen_head: number of headline steps.

    Returns:
        For every headline step, the weighted average of the description
        "words" features (scaled by context_weight) concatenated with that
        step's own "words" features (scaled by head_weight).
    """
    desc, head = X[:,:maxlen_desc], X[:,maxlen_desc:]
    head_activations, head_words = head[:,:,:size], head[:,:,size:]
    desc_activations, desc_words = desc[:,:,:size], desc[:,:,size:]
    
    # activation for every head word and every desc word
    activation_energies = K.batch_dot(head_activations, desc_activations, axes=(2,2))
    
    # make sure we don't use description words that are masked out
    # (a very large negative energy is added where the mask is 0, so the
    # softmax below gives those positions ~0 weight)
    if mask is not None:
      activation_energies = activation_energies + \
      -1e20 * K.expand_dims(1.- K.cast(mask[:, :maxlen_desc], 'float32'), 1)
    
    # for every head word compute weights for every desc word
    # (flattened to 2D for the softmax, then reshaped back to 3D)
    activation_energies = K.reshape(activation_energies,(-1, maxlen_desc))
    activation_weights = K.softmax(activation_energies)
    activation_weights = K.reshape(activation_weights,(-1, maxlen_head, maxlen_desc))

    # for every head word compute weighted average of desc words
    desc_avg_word = K.batch_dot(activation_weights, desc_words, axes=(2,1))
    return K.concatenate((context_weight * desc_avg_word, head_weight * head_words))


class SimpleContext(Lambda):
    """
    Lambda layer that applies `simple_context` to its input.

    The layer consumes a sequence of MAXLEN_DESC + MAXLEN_HEAD steps and emits
    only the MAXLEN_HEAD headline steps, so both the mask and the output shape
    are reduced to the headline part.
    """

    def __init__(self, **kwargs):
        super(SimpleContext, self).__init__(simple_context, **kwargs)
        self.supports_masking = True

    def compute_mask(self, input, input_mask=None):
        """Return the headline part of the input mask (None when unmasked)."""
        if input_mask is None:
            # nothing to slice when the previous layer produced no mask
            return None
        return input_mask[:, MAXLEN_DESC:]

    def compute_output_shape(self, input_shape):
        """
        Output shape: (samples, MAXLEN_HEAD, 2 * (RNN_SIZE - ACTIVATION_RNN_SIZE)).

        The last dimension holds the averaged description features concatenated
        with the headline features, each RNN_SIZE - ACTIVATION_RNN_SIZE wide.
        """
        nb_samples = input_shape[0]
        return (nb_samples, MAXLEN_HEAD, 2 * (RNN_SIZE - ACTIVATION_RNN_SIZE))

    # Keras 1 called this hook `get_output_shape_for`; Keras 2 calls
    # `compute_output_shape`. Keep the old name as an alias for compatibility.
    get_output_shape_for = compute_output_shape

In [0]:
# define final model
# The stacked-LSTM model is wrapped as the first layer of a new Sequential model.
model = Sequential()
model.add(rnn_model)

# The attention layer is only added when some RNN outputs are reserved for
# activations (ACTIVATION_RNN_SIZE is 0 when there is no description part).
if ACTIVATION_RNN_SIZE:
    model.add(SimpleContext(name='simplecontext_1'))

In [40]:
model.summary()

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
sequential_3 (Sequential)    (None, 50, 128)           4380416   
_________________________________________________________________
simplecontext_1 (SimpleConte (None, 25, 176)           0         
Total params: 4,380,416
Trainable params: 4,380,416
Non-trainable params: 0
_________________________________________________________________


In [0]:
# NOTE(review): in the cells shown here the model is only used through
# model.predict(); the loss/optimizer choice matters only if it is trained.
model.compile(loss='categorical_crossentropy', optimizer=OPTIMIZER)

## test shapes

In [0]:
def left_padd(x, maxlen_desc=MAXLEN_DESC, eos=eos):
    """
    Build the description part of a model input.

    Keeps at most the last `maxlen_desc` items of `x`, pads on the left with
    `empty` up to `maxlen_desc` items and appends `eos`. With `maxlen_desc`
    equal to 0 the result is just `[eos]`.
    """
    if maxlen_desc == 0:
        return [eos]

    # keep only the tail when the description is too long
    kept = x[-maxlen_desc:] if len(x) > maxlen_desc else x
    padding = [empty] * (maxlen_desc - len(kept))
    return padding + kept + [eos]

In [0]:
# Smoke-test input: an all-empty description followed by eos, right-padded
# (and, if needed, right-truncated) with `empty` to MAXLEN.
test_samples = [[empty] * MAXLEN_DESC + [eos]]
data = sequence.pad_sequences(test_samples, maxlen=MAXLEN, value=empty, padding='post', truncating='post')

In [44]:
data

array([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
        0, 0, 0, 0, 0, 0]], dtype=int32)

In [45]:
# check eos
np.all(data[:,MAXLEN_DESC] == eos)

True

In [46]:
data.shape

(1, 50)

In [47]:
# Run the single test sample through the model and inspect the output shape.
probs = model.predict(data, verbose=0, batch_size=1)
probs.shape

(1, 25, 176)

# Test sample generation

In [0]:
def get_sample(energy, n, temperature):
    """
    Used in Beam search. Sample at most n different elements
    according to their energy.

    Each index i is drawn (without replacement) with probability proportional
    to exp(-energy[i] / temperature), so lower energies are more likely.

    Args:
        energy: sequence of energies (lower is better).
        n: maximum number of indexes to draw.
        temperature: softmax temperature; larger values flatten the distribution.

    Returns:
        List of distinct indexes into `energy`, in the order they were drawn.
        It is shorter than min(n, len(energy)) when fewer elements than that
        have a non-zero probability.
    """
    res = []
    energies = np.asarray(energy, dtype=float)
    if energies.size == 0:
        return res

    # Shift by the lowest energy before exponentiating: the probabilities are
    # unchanged after normalisation, but exp() can no longer underflow to zero
    # for every element when all energies are large (or temperature is small).
    lowest = energies.min()
    if np.isfinite(lowest):
        energies = energies - lowest
    probs = np.exp(-energies / temperature)

    size = min(n, len(probs))
    for _ in range(size):
        total = np.sum(probs)
        if not total > 0:
            # no probability mass left; dividing would produce NaNs
            break
        max_i = np.argmax(np.random.multinomial(1, probs / total, 1))
        res.append(max_i)
        probs[max_i] = 0.0 # select each element only once
    return res

In [0]:
def beamsearch(predict, k, start=[empty] * MAXLEN_DESC + [eos],
               maxsample=MAXLEN, oov=VOCAB_SIZE-1, 
               empty=empty, eos=eos, temperature=1.0):
    """
    Stochastic beam search. Return up to k samples and their NLL scores.

    Every sample begins with `start` and is extended one label at a time until
    it ends with `eos` or grows longer than `maxsample`. In each round the
    candidates to keep are drawn with `get_sample` from all finished samples
    plus every one-label extension of the live samples.

    Args:
        predict: callable `predict(samples, empty=...)` returning, for every
            sample, the probability of each label as a 2D array.
        k: maximum number of candidates kept per round.
        start: initial label sequence (never mutated; copies are extended).
        maxsample: a sample longer than this is considered finished.
        oov: unused; kept for interface compatibility.
        empty: label that must never be generated.
        eos: end-of-sequence label.
        temperature: sampling temperature forwarded to `get_sample`.

    Returns:
        (dead_samples, dead_scores): the finished samples and their scores
        (sum of -log probabilities; lower is better).
    """
    def is_zombie(s):
        # even if len(s) == maxsample we dont want it dead: one last
        # prediction is taken out of it to reach a headline of MAXLEN_HEAD
        return s[-1] == eos or len(s) > maxsample

    dead_samples = []
    dead_scores = []
    live_scores = [0]
    live_samples = [start]

    while live_samples:
        # for every live sample get prob for every label
        probs = np.asarray(predict(live_samples, empty=empty))
        # decode flat candidate indexes with the real number of labels
        # returned by `predict` rather than assuming the global vocabulary size
        n_labels = probs.shape[1]

        # total score for every sample is sum of -log of word probs
        sample_score = np.array(live_scores)[:, None] - np.log(probs)
        # a huge score makes the `empty` label practically unselectable
        sample_score[:, empty] = 1e20

        live_scores = list(sample_score.flatten())

        # choose among all dead samples and all live samples extended by
        # every possible new word
        scores = dead_scores + live_scores
        ranks = get_sample(scores, k, temperature)
        n = len(dead_scores)
        dead_scores = [dead_scores[r] for r in ranks if r < n]
        dead_samples = [dead_samples[r] for r in ranks if r < n]

        # ranks >= n address the flattened (live sample, label) grid
        live_ranks = [r - n for r in ranks if r >= n]
        live_scores = [live_scores[r] for r in live_ranks]
        live_samples = [live_samples[r // n_labels] + [r % n_labels]
                        for r in live_ranks]

        # move zombies to the dead, keep the rest alive (order preserved)
        surviving_scores = []
        surviving_samples = []
        for s, c in zip(live_samples, live_scores):
            if is_zombie(s):
                dead_scores.append(c)
                dead_samples.append(s)
            else:
                surviving_scores.append(c)
                surviving_samples.append(s)
        live_scores = surviving_scores
        live_samples = surviving_samples

    return dead_samples, dead_scores

In [0]:
# top dense of the trained model
def convert_to_probs(output):
    """
    Apply the trained top dense layer followed by a softmax.

    `weights[0]` is used as the kernel and `weights[1]` as the bias; the
    maximum is subtracted before exponentiating for numerical stability.
    """
    logits = np.dot(output, weights[0]) + weights[1]
    exponentials = np.exp(logits - logits.max())
    return exponentials / exponentials.sum()

In [0]:
def custom_predict(samples, empty=empty, model=model, maxlen=MAXLEN):
    """
    For every sample, calculate probability for every possible label.

    Args:
        samples: list of label sequences (description + eos + headline so far).
        empty: padding value used to right-pad the samples.
        model: model whose output is fed to `convert_to_probs`.
        maxlen: length the samples are padded / truncated to.

    Returns:
        numpy array with one row per sample holding the probability of each
        label for the next position of that sample.
    """
    sample_lengths = [len(sample) for sample in samples]
    # honour the `maxlen` argument instead of silently using the global MAXLEN
    data = sequence.pad_sequences(samples, maxlen=maxlen, value=empty, padding='post', truncating='post')
    model_predict = model.predict(data, verbose=0, batch_size=BATCH_SIZE)
    # The model built in this notebook only outputs the headline steps, so
    # output step j belongs to input position MAXLEN_DESC + j; the prediction
    # following the last real token therefore sits at length - MAXLEN_DESC - 1.
    return np.array([
        convert_to_probs(prob[sample_length - MAXLEN_DESC - 1])
        for prob, sample_length in zip(model_predict, sample_lengths)
    ])


In [0]:
def fold_vocabulary(words):
    """
    Map a list of word indexes onto indexes the model knows.

    Indexes below VOCAB_SIZE - UNKNOWN_WORDS_COUNT are kept. Any other index is
    first looked up in GLOVE_INDEX_TO_INDEX (a substitute word inside the
    vocabulary); whatever is still outside afterwards is replaced by one of the
    placeholder slots at the end of the vocabulary, the same slot for every
    occurrence of the same word.
    """
    first_outside = VOCAB_SIZE - UNKNOWN_WORDS_COUNT

    substituted = []
    for index in words:
        if index >= first_outside:
            index = GLOVE_INDEX_TO_INDEX.get(index, index)
        substituted.append(index)

    # Placeholders are handed out in ascending index order, starting from the
    # last vocabulary slot; once the slots run out everything shares the
    # slot at VOCAB_SIZE - UNKNOWN_WORDS_COUNT.
    # NOTE(review): repeated occurrences are ranked individually, so a word
    # that appears several times uses up several ranks — confirm this matches
    # the preprocessing used for training before changing it.
    placeholder_for = {}
    still_outside = sorted(index for index in substituted if index >= first_outside)
    for rank, index in enumerate(still_outside):
        placeholder_for[index] = VOCAB_SIZE - 1 - min(rank, UNKNOWN_WORDS_COUNT - 1)

    return [placeholder_for.get(index, index) for index in substituted]

In [0]:
def unfold_vocabulary(desc, words):
    """
    Reverse operation to fold_vocabulary.

    `desc` holds the original (unfolded) indexes and `words` starts with their
    folded counterparts, position by position. Every placeholder-range index in
    `words` is replaced by the original index found at a matching position
    (the last such position wins); all other indexes are returned unchanged.
    """
    first_outside = VOCAB_SIZE - UNKNOWN_WORDS_COUNT

    original_of = {}
    for position, original_index in enumerate(desc):
        folded_index = words[position]
        if folded_index < first_outside:
            continue
        original_of[folded_index] = original_index

    return [original_of.get(index, index) for index in words]

In [0]:
def gensamples(X=None, X_test=None, Y_test=None, skips=2, k=10, batch_size=BATCH_SIZE, short=True, temperature=1.):
    """
    Generates samples with best score using custom_predict and Beam search

    Args:
        X: description text; whitespace-separated words that must all be
            present in WORD_TO_INDEX (a trailing '^' marker is ignored).
        X_test, Y_test, batch_size: unused; kept for interface compatibility.
        skips: controls how many description prefixes of different length are
            tried (the distance between MAXLEN_DESC and the description length
            is divided into this many steps).
        k: beam size passed to `beamsearch`.
        short: when True, a headline is only printed if it is sufficiently
            different (Jaro similarity below 0.6) from every better-scored one.
        temperature: sampling temperature passed to `beamsearch`.

    Returns:
        List of (sample, start, score) tuples sorted by ascending score, where
        `sample` is the generated (folded) sequence and `start` the unfolded,
        padded description it was generated from.
    """
    x = [WORD_TO_INDEX[w.rstrip('^')] for w in X.split()]

    print('HEADS: ')
    samples = []
    if MAXLEN_DESC == 0:
        skips = [0]
    else:
        shortest, longest = sorted((MAXLEN_DESC, len(x)))
        step = (longest - shortest) // skips + 1
        # When the description is exactly MAXLEN_DESC words long the range is
        # empty; fall back to that single length instead of generating nothing.
        skips = list(range(shortest, longest, step)) or [shortest]

    for s in skips:
        start = left_padd(x[:s])
        fold_start = fold_vocabulary(start)
        beams, beam_scores = beamsearch(predict=custom_predict, start=fold_start,
                                        k=k, temperature=temperature)
        samples += [(beam, start, beam_score)
                    for beam, beam_score in zip(beams, beam_scores)]

    # sort samples, best (lowest) score first
    samples.sort(key=lambda entry: entry[-1])
    codes = []
    for sample, start, score in samples:
        code = ''
        words = []
        # restore the original out-of-vocabulary words and drop the description
        sample = unfold_vocabulary(start, sample)[len(start):]
        for w in sample:
            if w == eos:
                break
            words.append(INDEX_TO_WORD[w])
            # encode every word index as three characters so that headlines
            # can be compared as strings
            code += chr(w // (256 * 256)) + chr((w // 256) % 256) + chr(w % 256)
        if short:
            # negated similarity to the closest headline seen so far
            distance = min([100] + [-Levenshtein.jaro(code,c) for c in codes])
            if distance > -0.6:
                print("{} {}".format(score, ' '.join(words)))
        else:
            print("{} {}".format(score, ' '.join(words)))
        codes.append(code)
    return samples

Examples:

In [0]:
# NOTE(review): X and Y were bound to the full dataset when the pickles were
# loaded; rebinding them to strings here means the earlier sample-reduction /
# split cell can no longer be re-run without reloading the data.
X = "Christopher Nolan 's next movie will be released on July 21st , 2017 . The 45-year-old filmmaker - who 's directed some of the biggest hits in recent years , including Inception and Interstellar - is to make the as-yet-untitled movie for his long-time partner Warner Bros and has instructed everyone connected to the project to keep its details a secret , according to The Hollywood Reporter ."
Y = "Christopher Nolan 's new film gets release date"

In [152]:
samples = gensamples(X=X, k=10, skips=2, batch_size=BATCH_SIZE, temperature=1.)

HEADS: 
18.383082628250122 : to first


In [0]:
X = "Shares of Toronto-Dominion^ Bank ( NYSE : TD ) have been given an average recommendation of “Buy” by the eleven research firms that are currently covering the firm , Market Beat Ratings reports . One equities research analyst has rated the stock with a sell rating , three have assigned a hold rating and six have given a buy rating to the company ."
Y = "Toronto-Dominion^ Bank Receives $ 56.75^ Average Target Price from Analysts ( NYSE : TD )"

In [68]:
samples = gensamples(X=X, k=10, skips=2, batch_size=BATCH_SIZE, temperature=1.)

HEADS: 
34.63547205924988 of Products Market Toronto-Dominion^ , ( Report


In [0]:
X = "Sydney-based^ strategic integrated communications agency , Zadro^ , has moved to the next level ; due to the business expansion , they are commencing a new chapter in their growth by relocating to a bigger , better and brighter office space in Surry Hills . Felicity^ Zadro^ , Managing Director , Zadro^ , says the move is indicative of the way the agency has flourished^ since its inception eight years ago ."
Y = "Integrated communications agency , Zadro^ , moves to the next level "

In [268]:
samples = gensamples(X=X, k=10, skips=2, batch_size=BATCH_SIZE, temperature=1., short=False)


HEADS: 
15.158374309539795 Zadro^ into
22.767340421676636 : man , for
25.752980589866638 Khang Sydney-based^ Your in
31.19711208343506 : man , for in of
33.33200526237488 Khang Sydney-based^ Your in ( )
33.389075756073 Khang Sydney-based^ Your in ( Sydney-based^
37.114909648895264 Khang Sydney-based^ Your in ( Sydney-based^ )
40.732621908187866 Khang Sydney-based^ Your in ( Sydney-based^ ) ,
44.883264899253845 : man , for in of | Week
46.78041231632233 Khang Sydney-based^ Your in ( Sydney-based^ ) , In
48.34776163101196 : man , for in the Wire in Zadro^
48.40708410739899 : man , for in the Wire in in
48.86311888694763 Khang Sydney-based^ Your in ( Sydney-based^ ) , Zadro^ ,
52.33882141113281 : man , for in the Wire in Zadro^ in
56.21885633468628 Khang Sydney-based^ Your in ( Sydney-based^ ) , Zadro^ , Share
56.60913872718811 : man , for in the Wire in Zadro^ in ,
61.65643393993378 Khang Sydney-based^ Your in ( Sydney-based^ ) , Zadro^ , Share '
63.978535175323486 : man , for in the Wir

In [0]:
X = "Microsoft has acquired Double Labs , an Android app startup which develops the popular Echo Notification Lockscreen for Android devices . This app had received between 1 million and 5 million downloads so far in Play Store . "
Y = "Microsoft Acquires Popular Android App Echo Notification Lockscreen^ "

In [358]:
samples = gensamples(X=X, k=10, skips=2, batch_size=BATCH_SIZE, temperature=1., short=False)

HEADS: 
11.164496660232544 : Lockscreen^
17.4766845703125 : Lockscreen^ of Lockscreen^
24.58097004890442 : Lockscreen^ of Lockscreen^ ( )
28.738329887390137 : Lockscreen^ of Lockscreen^ ( ) and
31.96679711341858 : Lockscreen^ of Lockscreen^ ( ) Lockscreen^ to
36.09665632247925 Lockscreen^ : be In next in
39.28780651092529 Lockscreen^ : be In next in Lockscreen^
39.65440595149994 : Lockscreen^ of Lockscreen^ ( ) Lockscreen^ to Citigroup
46.90165972709656 Lockscreen^ : be In next in Lockscreen^ in Lockscreen^
47.96070921421051 : Lockscreen^ of Lockscreen^ ( ) Lockscreen^ to Citigroup Senior
48.542194962501526 : Lockscreen^ of Lockscreen^ ( ) Lockscreen^ in Iran Analysis
51.04038429260254 Lockscreen^ : be In next in Lockscreen^ in Lockscreen^ )
52.64095377922058 Lockscreen^ : be In next in Lockscreen^ for ) ?
59.06521785259247 Lockscreen^ : be In next in Lockscreen^ for ) of 3
59.90263086557388 Lockscreen^ : be In next in Lockscreen^ for ) of Australian
62.37356781959534 : Lockscreen^ of 