In [173]:
from keras import initializers
from keras import backend as K
from keras import regularizers
from keras.layers import Dense, Activation, Bidirectional, Embedding, GRU, Concatenate
from keras.layers.core import Reshape, Dropout
from keras.models import Model, Input, save_model, load_model

# Params
MAX_SEQUENCE_LENGTH = 20000
DISC_HIDDEN_SIZE_LSTM = 64
DISC_HIDDEN_SIZE_DENSE = 4612
dropout = 0.2
weight_decay = 0.25
samples_per_epoch = 12000
learning_rate = 0.002

In [174]:
embeddings_layer_q = Embedding(
            input_dim=20000,
            output_dim=300,
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)

embeddings_layer_d = Embedding(
            input_dim=20000,
            output_dim=300,
            input_length=MAX_SEQUENCE_LENGTH,
            trainable=False)

### AdamW

In [175]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

from six.moves import zip

from keras import backend as K
from keras.legacy import interfaces

from keras.optimizers import Optimizer


class AdamW(Optimizer):
    """AdamW optimizer.
    Default parameters follow those provided in the original paper.
    # Arguments
        lr: float >= 0. Learning rate.
        beta_1: float, 0 < beta < 1. Generally close to 1.
        beta_2: float, 0 < beta < 1. Generally close to 1.
        epsilon: float >= 0. Fuzz factor. If `None`, defaults to `K.epsilon()`.
        decay: float >= 0. Learning rate decay over each update.
        weight_decay: float >= 0. Weight decay (L2 penalty) (default: 0.025).
        batch_size: integer >= 1. Batch size used during training.
        samples_per_epoch: integer >= 1. Number of samples (training points) per epoch.
        epochs: integer >= 1. Total number of epochs for training.
    # References
        - [Adam - A Method for Stochastic Optimization](http://arxiv.org/abs/1412.6980v8)
        - [Fixing Weight Decay Regularization in Adam](https://arxiv.org/abs/1711.05101)
    """

    def __init__(self, lr=0.001, beta_1=0.9, beta_2=0.999,
                 epsilon=None, decay=0., weight_decay=0.025,
                 batch_size=1, samples_per_epoch=1,
                 epochs=1, **kwargs):
        super(AdamW, self).__init__(**kwargs)
        with K.name_scope(self.__class__.__name__):
            self.iterations = K.variable(0, dtype='int64', name='iterations')
            self.lr = K.variable(lr, name='lr')
            self.beta_1 = K.variable(beta_1, name='beta_1')
            self.beta_2 = K.variable(beta_2, name='beta_2')
            self.decay = K.variable(decay, name='decay')
            self.weight_decay = K.variable(weight_decay, name='weight_decay')
            self.batch_size = K.variable(batch_size, name='batch_size')
            self.samples_per_epoch = K.variable(samples_per_epoch, name='samples_per_epoch')
            self.epochs = K.variable(epochs, name='epochs')
        if epsilon is None:
            epsilon = K.epsilon()
        self.epsilon = epsilon
        self.initial_decay = decay

    @interfaces.legacy_get_updates_support
    def get_updates(self, loss, params):
        grads = self.get_gradients(loss, params)
        self.updates = [K.update_add(self.iterations, 1)]

        lr = self.lr
        if self.initial_decay > 0:
            lr = lr * (1. / (1. + self.decay * K.cast(self.iterations,
                                                      K.dtype(self.decay))))

        t = K.cast(self.iterations, K.floatx()) + 1
        '''Bias corrections according to the Adam paper
        '''
        lr_t = lr * (K.sqrt(1. - K.pow(self.beta_2, t)) /
                     (1. - K.pow(self.beta_1, t)))

        ms = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        vs = [K.zeros(K.int_shape(p), dtype=K.dtype(p)) for p in params]
        self.weights = [self.iterations] + ms + vs

        for p, g, m, v in zip(params, grads, ms, vs):
            m_t = (self.beta_1 * m) + (1. - self.beta_1) * g
            v_t = (self.beta_2 * v) + (1. - self.beta_2) * K.square(g)

            '''Schedule multiplier eta_t = 1 for simple AdamW
            According to the AdamW paper, eta_t can be fixed, decay, or 
            also be used for warm restarts (AdamWR to come). 
            '''
            eta_t = 1.
            p_t = p - eta_t * (lr_t * m_t / (K.sqrt(v_t) + self.epsilon))
            if self.weight_decay != 0:
                '''Normalized weight decay according to the AdamW paper
                '''
                w_d = self.weight_decay * K.sqrt(self.batch_size / (self.samples_per_epoch * self.epochs))
                p_t = p_t - eta_t * (w_d * p)

            self.updates.append(K.update(m, m_t))
            self.updates.append(K.update(v, v_t))
            new_p = p_t

            # Apply constraints.
            if getattr(p, 'constraint', None) is not None:
                new_p = p.constraint(new_p)

            self.updates.append(K.update(p, new_p))
        return self.updates

    def get_config(self):
        config = {'lr': float(K.get_value(self.lr)),
                  'beta_1': float(K.get_value(self.beta_1)),
                  'beta_2': float(K.get_value(self.beta_2)),
                  'decay': float(K.get_value(self.decay)),
                  'weight_decay': float(K.get_value(self.weight_decay)),
                  'batch_size': int(K.get_value(self.batch_size)),
                  'samples_per_epoch': int(K.get_value(self.samples_per_epoch)),
                  'epochs': int(K.get_value(self.epochs)),
                  'epsilon': self.epsilon}
        base_config = super(AdamW, self).get_config()
        return dict(list(base_config.items()) + list(config.items()))

In [176]:
adamw = AdamW(batch_size=8, samples_per_epoch=samples_per_epoch,
                      epochs=12000)

### Queries Input

In [177]:
sequence_input_q = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='input_query')
sequence_input_q

<tf.Tensor 'input_query_6:0' shape=(?, 20000) dtype=int32>

In [178]:
embedded_sequences_q = embeddings_layer_q(sequence_input_q)
embeddings_layer_q

<keras.layers.embeddings.Embedding at 0x7f64869341d0>

In [179]:
lstm_q_in = Bidirectional(GRU(DISC_HIDDEN_SIZE_LSTM, return_sequences=True, activation='elu', dropout=dropout, recurrent_dropout=dropout))(embedded_sequences_q)
lstm_q_in

<tf.Tensor 'bidirectional_29/concat:0' shape=(?, ?, 128) dtype=float32>

In [180]:
lstm_q_out = Bidirectional(GRU(DISC_HIDDEN_SIZE_LSTM, return_sequences=False, activation='elu', dropout=dropout, recurrent_dropout=dropout))(lstm_q_in)
lstm_q_out

<tf.Tensor 'bidirectional_30/concat:0' shape=(?, 128) dtype=float32>

### Documents Input

In [181]:
sequence_input_d = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32', name='input_doc')
sequence_input_d

<tf.Tensor 'input_doc_6:0' shape=(?, 20000) dtype=int32>

In [182]:
embedded_sequences_d = embeddings_layer_d(sequence_input_d)
embedded_sequences_d

<tf.Tensor 'embedding_14/embedding_lookup:0' shape=(?, 20000, 300) dtype=float32>

In [183]:
lstm_d_in = Bidirectional(GRU(DISC_HIDDEN_SIZE_LSTM, return_sequences=True, activation='elu', dropout=dropout, recurrent_dropout=dropout))(embedded_sequences_d)
lstm_d_in

<tf.Tensor 'bidirectional_31/concat:0' shape=(?, ?, 128) dtype=float32>

In [184]:
lstm_d_out = Bidirectional(GRU(DISC_HIDDEN_SIZE_LSTM, return_sequences=False, activation='elu', dropout=dropout, recurrent_dropout=dropout))(lstm_d_in)
lstm_d_out

<tf.Tensor 'bidirectional_32/concat:0' shape=(?, 128) dtype=float32>

### Concatenate and then Dense Layers

In [185]:
x = Concatenate()([lstm_q_out, lstm_d_out])
x

<tf.Tensor 'concatenate_8/concat:0' shape=(?, 256) dtype=float32>

In [186]:
x = Dropout(dropout)(x)
x

<tf.Tensor 'dropout_8/cond/Merge:0' shape=(?, 256) dtype=float32>

In [187]:
x = Dense(DISC_HIDDEN_SIZE_DENSE,
          activation='elu',
          kernel_regularizer=regularizers.l2(),
          kernel_initializer=initializers.random_normal(stddev=0.01),
          name='merged_input')(x)
x

<tf.Tensor 'merged_input_7/Elu:0' shape=(?, 4612) dtype=float32>

In [188]:
x = Dense(1, activation='elu')(x)
x

<tf.Tensor 'dense_8/Elu:0' shape=(?, 1) dtype=float32>

In [189]:
score = Reshape([-1])(x)
score

<tf.Tensor 'reshape_7/Reshape:0' shape=(?, ?) dtype=float32>

In [190]:
prob = Activation('sigmoid', name='prob')(score)

In [191]:
model = Model(inputs=[sequence_input_q, sequence_input_d], outputs=[prob])
model

<keras.engine.training.Model at 0x7f6483925f60>

In [192]:
model.summary()

__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_query (InputLayer)        (None, 20000)        0                                            
__________________________________________________________________________________________________
input_doc (InputLayer)          (None, 20000)        0                                            
__________________________________________________________________________________________________
embedding_13 (Embedding)        (None, 20000, 300)   6000000     input_query[0][0]                
__________________________________________________________________________________________________
embedding_14 (Embedding)        (None, 20000, 300)   6000000     input_doc[0][0]                  
__________________________________________________________________________________________________
bidirectio

In [193]:
model.compile(loss='binary_crossentropy',
                      optimizer=adamw,
                      metrics=['accuracy'])
model

<keras.engine.training.Model at 0x7f6483925f60>

In [194]:
inp = model.input
print(inp)
out = model.get_layer("prob").output
print(out)

[<tf.Tensor 'input_query_6:0' shape=(?, 20000) dtype=int32>, <tf.Tensor 'input_doc_6:0' shape=(?, 20000) dtype=int32>]
Tensor("prob_6/Sigmoid:0", shape=(?, ?), dtype=float32)


In [195]:
import nltk
from nltk.corpus import stopwords

from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

MAX_SEQUENCE_LENGTH = 20000
MAX_NUM_WORDS = 20000
EMBEDDING_DIM = 300

WORKDIR = '/home/lukas/git-projects/lstm-irgan'
DOCUMENTS_DIR = WORKDIR + '/data/wikiclir/dev.docs'  #'/data/example/documents/'
QUERIES = WORKDIR + '/data/wikiclir/dev.queries' #'/data/example/queries.txt'
LABELLED_DATA = WORKDIR + '/data/wikiclir/dev.qrel' #'/data/example/labelled_data.txt'

def __get_documents():
    path = DOCUMENTS_DIR
    documents = {}
    doc_ids = []

    with open(path) as f:
        content = f.readlines()
        for line in content[:100]:
            values = line.split("\t", 1)
            id = int(values[0])
            text = values[1]
            documents[id] = text
            doc_ids.append(id)
    return documents, doc_ids


def __get_queries():
    path = QUERIES
    queries = {}
    query_ids = []

    with open(path) as f:
        content = f.readlines()
        for line in content[:100]:
            values = line.split("\t", 1)
            id = int(values[0])
            text = values[1]
            queries[id] = text
            query_ids.append(id)
    return queries, query_ids


def __get_ratings():
    path = LABELLED_DATA
    ratings = {}

    with open(path) as f:
        content = f.readlines()
        for line in content[:100]:
            values = line.split("\t")
            query = int(values[0])
            text = int(values[2])
            rating = float(values[3])

            if query in ratings.keys():
                ratings[query][text] = rating
            else:
                ratings[query] = {text: rating}

    return ratings


def __filter_stop_words(texts, stop_words):
    for i, text in enumerate(texts):
        new_text = [word for word in text.split() if word not in stop_words]
        texts[i] = ' '.join(new_text)
    return texts


def __init_tokenizer(text_data, max_sequence_length):
    texts = list(text_data.values())
    ids = list(text_data.keys())

    nltk.download('stopwords')
    stop_words = set(stopwords.words('english'))
    stop_words.update(['.', ',', '"', "'", ':', ';', '(', ')', '[', ']', '{', '}'])
    texts = __filter_stop_words(texts, stop_words)

    # finally, vectorize the text samples into a 2D integer tensor
    tokenizer = Tokenizer(num_words=MAX_NUM_WORDS)
    tokenizer.fit_on_texts(texts)
    sequences = tokenizer.texts_to_sequences(texts)

    word_index = tokenizer.word_index
    print('Found %s unique tokens.' % len(word_index))

    data = pad_sequences(sequences, maxlen=max_sequence_length)

    text_data_sequenced = {}
    for i, text in enumerate(data):
        text_data_sequenced[ids[i]] = text

    return tokenizer, text_data_sequenced


def get_data():
    documents_data, doc_ids = __get_documents()
    queries_data, query_ids = __get_queries()
    ratings_data = __get_ratings()

    print('Tokenize queries')
    tokenizer_q, queries_data = __init_tokenizer(queries_data, MAX_SEQUENCE_LENGTH)
    print('Tokenize documents')
    tokenizer_d, documents_data = __init_tokenizer(documents_data, MAX_SEQUENCE_LENGTH)

    print('Found %s training data.' % len(ratings_data))

    return query_ids, ratings_data, documents_data, queries_data, tokenizer_q, tokenizer_d

query_ids, ratings_data, documents_data, queries_data, tokenizer_q, tokenizer_d = get_data()

Tokenize queries
[nltk_data] Downloading package stopwords to /home/lukas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Found 1247 unique tokens.
Tokenize documents
[nltk_data] Downloading package stopwords to /home/lukas/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
Found 7709 unique tokens.
Found 1 training data.


In [204]:
docs = [value for key, value in list(documents_data.items())[:3]]
key, value = list(queries_data.items())[0]
queries = [value]*3
prob = [0.2, 0.5, 0,1]
print(docs)
print(queries)

[array([   0,    0,    0, ...,   23, 3388, 1496], dtype=int32), array([   0,    0,    0, ...,  286,  591, 2098], dtype=int32), array([  0,   0,   0, ..., 169, 503, 188], dtype=int32)]
[array([  0,   0,   0, ...,   2, 204,  28], dtype=int32), array([  0,   0,   0, ...,   2, 204,  28], dtype=int32), array([  0,   0,   0, ...,   2, 204,  28], dtype=int32)]


In [205]:
def get_reward(train_data_queries, train_data_documents):
    inputs = model.inputs + [K.learning_phase()]
    print(inputs)
    out = model.get_layer("prob").output
    functor = K.function(inputs, [out])
    print(functor)
    layer_outs = functor([train_data_queries, train_data_documents, 1.])
    return (layer_outs[0] - 0.5) * 2

reward = get_reward(queries, docs)
reward

[<tf.Tensor 'input_query_6:0' shape=(?, 20000) dtype=int32>, <tf.Tensor 'input_doc_6:0' shape=(?, 20000) dtype=int32>, <tf.Tensor 'bidirectional_1/keras_learning_phase:0' shape=() dtype=bool>]
<keras.backend.tensorflow_backend.Function object at 0x7f64855013c8>


array([[-0.00151122],
       [-0.00453931],
       [-0.00172484]], dtype=float32)

In [212]:
def get_preresult(train_data_queries, train_data_documents):
    return (model.predict([train_data_queries, train_data_documents]) - 0.5) * 2
    
preresult = get_preresult(queries, docs)
preresult

array([[-0.00094205],
       [-0.00294149],
       [-0.00079733]], dtype=float32)

In [None]:
import numpy as np

def train(train_data_queries, train_data_documents, train_data_label):
    model.train_on_batch([train_data_queries, train_data_documents], train_data_label)
    
# choose data
choose_queries = np.array(queries)
choose_documents = np.array(docs)

# prepare pos and neg label
pred_data_label = [1.0] * 1
pred_data_label.extend([0.0] * 2)
pred_data_label = np.asarray(pred_data_label)

train(choose_queries, choose_documents, pred_data_label)