<a href="https://colab.research.google.com/github/mrezende/stack_over_flow_qa/blob/master/stack_over_flow_qa.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Keras models for question answering problems.

In [0]:
from __future__ import print_function

from abc import abstractmethod

from keras.engine import Input
from keras.layers import merge, Embedding, Dropout, Conv1D, Lambda, LSTM, Dense, concatenate, TimeDistributed
from keras import backend as K
from keras.models import Model

import numpy as np


class LanguageModel:
    '''Base class for question/answer similarity models trained with a margin loss.

    A subclass implements `build` and returns a `(question_encoding,
    answer_encoding)` pair of tensors derived from `self.question` and
    `self.get_answer()`. This class wraps that pair into:

    * a shared scoring model (`get_qa_model`) that maps a (question, answer)
      pair to a similarity value, and
    * after `compile`, a `training_model` over (question, good answer,
      bad answer) triples and a `prediction_model` over (question, answer)
      pairs. Both reuse the same scoring model.
    '''

    def __init__(self, config):
        '''Create the input placeholders and remember the configuration.

        `config` must provide 'question_len' and 'answer_len'. The optional
        'similarity' sub-dict is read by `get_similarity`, and 'margin' is
        read by `compile`.
        '''
        self.question = Input(shape=(config['question_len'],), dtype='int32', name='question_base')
        self.answer_good = Input(shape=(config['answer_len'],), dtype='int32', name='answer_good_base')
        self.answer_bad = Input(shape=(config['answer_len'],), dtype='int32', name='answer_bad_base')

        self.config = config
        self.params = config.get('similarity', dict())

        # lazily populated by get_answer / get_qa_model / compile
        self._models = None
        self._similarities = None
        self._answer = None
        self._qa_model = None

        self.training_model = None
        self.prediction_model = None

    def get_answer(self):
        '''Return the answer input of the scoring model, creating it on first use.'''
        if self._answer is None:
            self._answer = Input(shape=(self.config['answer_len'],), dtype='int32', name='answer')
        return self._answer

    @abstractmethod
    def build(self):
        '''Return the `(question_encoding, answer_encoding)` tensor pair.

        Implemented by subclasses; the base implementation returns None.
        '''
        return

    def get_similarity(self):
        ''' Specify similarity in configuration under 'similarity' -> 'mode'
        If a parameter is needed for the model, specify it in 'similarity'

        Example configuration:

        config = {
            ... other parameters ...
            'similarity': {
                'mode': 'gesd',
                'gamma': 1,
                'c': 1,
            }
        }

        Supported modes (a and b are the two encodings, l2 = l2_norm(a - b)):

        cosine: dot(a, b) / max(sqrt(dot(a, a) * dot(b, b)), epsilon)
        polynomial: (gamma * dot(a, b) + c) ^ d
        sigmoid: tanh(gamma * dot(a, b) + c)
        rbf: exp(-gamma * l2 ^ 2)
        euclidean: 1 / (1 + l2)
        exponential: exp(-gamma * l2)
        gesd: euclidean * logistic, logistic = 1 / (1 + exp(-gamma * (dot(a, b) + c)))
        aesd: (euclidean + logistic) / 2

        Note that 'gesd' and 'aesd' use the logistic function above, not the
        tanh of the 'sigmoid' mode, and that gamma scales (dot + c) there.

        Returns a function taking a two-element list `[a, b]` of tensors.
        Raises KeyError if 'mode' is missing and ValueError if it is unknown.
        '''

        params = self.params
        similarity = params['mode']

        # The helpers only touch the backend when the returned function is
        # called, i.e. while the Lambda layer is being built.
        dot = lambda a, b: K.batch_dot(a, b, axes=1)
        l2_norm = lambda a, b: K.sqrt(K.sum(K.square(a - b), axis=1, keepdims=True))

        if similarity == 'cosine':
            # the denominator is clamped to epsilon to avoid dividing by zero
            return lambda x: dot(x[0], x[1]) / K.maximum(K.sqrt(dot(x[0], x[0]) * dot(x[1], x[1])), K.epsilon())
        elif similarity == 'polynomial':
            return lambda x: (params['gamma'] * dot(x[0], x[1]) + params['c']) ** params['d']
        elif similarity == 'sigmoid':
            return lambda x: K.tanh(params['gamma'] * dot(x[0], x[1]) + params['c'])
        elif similarity == 'rbf':
            return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]) ** 2)
        elif similarity == 'euclidean':
            return lambda x: 1 / (1 + l2_norm(x[0], x[1]))
        elif similarity == 'exponential':
            return lambda x: K.exp(-1 * params['gamma'] * l2_norm(x[0], x[1]))
        elif similarity == 'gesd':
            euclidean = lambda x: 1 / (1 + l2_norm(x[0], x[1]))
            sigmoid = lambda x: 1 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c'])))
            return lambda x: euclidean(x) * sigmoid(x)
        elif similarity == 'aesd':
            # each term is pre-scaled by 0.5 so the sum is the arithmetic mean
            euclidean = lambda x: 0.5 / (1 + l2_norm(x[0], x[1]))
            sigmoid = lambda x: 0.5 / (1 + K.exp(-1 * params['gamma'] * (dot(x[0], x[1]) + params['c'])))
            return lambda x: euclidean(x) + sigmoid(x)
        else:
            raise ValueError('Invalid similarity: {}'.format(similarity))

    def get_qa_model(self):
        '''Return the (question, answer) -> similarity model, building it once.'''
        if self._models is None:
            self._models = self.build()

        if self._qa_model is None:
            question_output, answer_output = self._models
            similarity = self.get_similarity()
            # NOTE: an earlier variant applied Dropout to both encodings before
            # the similarity; it is currently disabled.
            qa_model = Lambda(similarity, output_shape=lambda _: (None, 1))([question_output,
                                                                             answer_output])
            self._qa_model = Model(inputs=[self.question, self.get_answer()], outputs=qa_model, name='qa_model')

        return self._qa_model

    def compile(self, optimizer, **kwargs):
        '''Build and compile `prediction_model` and `training_model`.

        The training output is relu(margin - sim(q, good) + sim(q, bad)) with
        'margin' taken from the configuration. Both models are compiled with
        a loss that simply returns the model output, so the targets passed
        to Keras are ignored. Extra keyword arguments are forwarded to both
        `Model.compile` calls.
        '''
        qa_model = self.get_qa_model()

        # the same scoring model is applied to the good and the bad answer
        good_similarity = qa_model([self.question, self.answer_good])
        bad_similarity = qa_model([self.question, self.answer_bad])

        loss = Lambda(lambda x: K.relu(self.config['margin'] - x[0] + x[1]),
                      output_shape=lambda x: x[0])([good_similarity, bad_similarity])

        self.prediction_model = Model(inputs=[self.question, self.answer_good], outputs=good_similarity,
                                      name='prediction_model')
        self.prediction_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer=optimizer, **kwargs)

        self.training_model = Model(inputs=[self.question, self.answer_good, self.answer_bad], outputs=loss,
                                    name='training_model')
        self.training_model.compile(loss=lambda y_true, y_pred: y_pred, optimizer=optimizer, **kwargs)

    def fit(self, x, **kwargs):
        '''Fit `training_model` on `x` = [questions, good answers, bad answers].

        Keyword arguments are forwarded to `Model.fit`, whose result is returned.
        '''
        assert self.training_model is not None, 'Must compile the model before fitting data'
        # Keras needs a target array, but the compiled loss ignores y_true
        y = np.zeros(shape=(x[0].shape[0],))
        return self.training_model.fit(x, y, **kwargs)

    def predict(self, x):
        '''Return the similarities for `x` = [questions, answers] as one batch.'''
        assert self.prediction_model is not None and isinstance(self.prediction_model, Model)
        return self.prediction_model.predict_on_batch(x)

    def save_weights(self, file_name, **kwargs):
        '''Save the weights of `prediction_model` to `file_name`.'''
        assert self.prediction_model is not None, 'Must compile the model before saving weights'
        self.prediction_model.save_weights(file_name, **kwargs)

    def load_weights(self, file_name, **kwargs):
        '''Load weights from `file_name` into `prediction_model`.'''
        assert self.prediction_model is not None, 'Must compile the model before loading weights'
        self.prediction_model.load_weights(file_name, **kwargs)


class EmbeddingModel(LanguageModel):
    '''Encoder that embeds the tokens and takes the maximum over axis 1.'''

    def build(self):
        '''Return the pooled (question, answer) encodings.

        Both inputs go through one shared `Embedding` layer whose weights are
        loaded from the file named by config['initial_embed_weights'], and
        then through one shared max-over-axis-1 `Lambda` layer.
        '''
        token_inputs = (self.question, self.get_answer())

        # shared embedding table initialised from the saved weight matrix
        pretrained = np.load(self.config['initial_embed_weights'])
        embed = Embedding(input_dim=self.config['n_words'],
                          output_dim=pretrained.shape[1],
                          mask_zero=True,
                          weights=[pretrained])
        embedded = [embed(tokens) for tokens in token_inputs]

        # reduce axis 1 with max; the declared output shape drops that axis
        pool = Lambda(lambda seq: K.max(seq, axis=1, keepdims=False),
                      output_shape=lambda shape: (shape[0], shape[2]))
        pool.supports_masking = True
        pooled_question, pooled_answer = [pool(seq) for seq in embedded]

        return pooled_question, pooled_answer


class ConvolutionModel(LanguageModel):
    '''Encoder: shared embedding, per-step dense layer, parallel Conv1D banks, max-pool.'''

    def build(self):
        '''Return the pooled (question, answer) encodings.

        Requires config['question_len'] == config['answer_len']. Every layer
        created here is shared between the question and the answer branch.
        '''
        cfg = self.config
        assert cfg['question_len'] == cfg['answer_len']

        q_tokens = self.question
        a_tokens = self.get_answer()

        # shared embedding table initialised from the saved weight matrix
        pretrained = np.load(cfg['initial_embed_weights'])
        embed = Embedding(input_dim=cfg['n_words'],
                          output_dim=pretrained.shape[1],
                          weights=[pretrained])
        q_embedded = embed(q_tokens)
        a_embedded = embed(a_tokens)

        # dense tanh projection applied independently at every time step
        project = TimeDistributed(Dense(200, activation='tanh'))
        q_hidden = project(q_embedded)
        a_hidden = project(a_embedded)

        # one convolution per kernel width; outputs are joined on the last axis
        convs = [Conv1D(filters=1000,
                        kernel_size=width,
                        padding='same',
                        activation='tanh') for width in (2, 3, 5, 7)]
        q_features = concatenate([conv(q_hidden) for conv in convs], axis=-1)
        a_features = concatenate([conv(a_hidden) for conv in convs], axis=-1)

        # reduce axis 1 with max; the declared output shape drops that axis
        pool = Lambda(lambda seq: K.max(seq, axis=1, keepdims=False),
                      output_shape=lambda shape: (shape[0], shape[2]))
        pool.supports_masking = True

        return pool(q_features), pool(a_features)


class ConvolutionalLSTM(LanguageModel):
    '''Encoder: shared embedding, forward/backward LSTMs, parallel Conv1D banks, max-pool.'''

    def build(self):
        '''Return the pooled (question, answer) encodings.

        Every layer created here is shared between the question and the
        answer branch.
        '''
        q_tokens = self.question
        a_tokens = self.get_answer()

        # shared embedding table initialised from the saved weight matrix
        pretrained = np.load(self.config['initial_embed_weights'])
        embed = Embedding(input_dim=self.config['n_words'],
                          output_dim=pretrained.shape[1],
                          weights=[pretrained])
        q_embedded = embed(q_tokens)
        a_embedded = embed(a_tokens)

        # two sequence-returning LSTMs, the second one reading backwards
        forward = LSTM(141, return_sequences=True, implementation=1)
        backward = LSTM(141, return_sequences=True, implementation=1, go_backwards=True)

        q_forward = forward(q_embedded)
        q_backward = backward(q_embedded)
        q_states = concatenate([q_forward, q_backward], axis=-1)

        a_forward = forward(a_embedded)
        a_backward = backward(a_embedded)
        a_states = concatenate([a_forward, a_backward], axis=-1)

        # one convolution per kernel width; outputs are joined on the last axis
        convs = [Conv1D(filters=500,
                        kernel_size=width,
                        padding='same',
                        activation='tanh') for width in (1, 2, 3, 5)]
        q_features = concatenate([conv(q_states) for conv in convs], axis=-1)
        a_features = concatenate([conv(a_states) for conv in convs], axis=-1)

        # reduce axis 1 with max; the declared output shape drops that axis
        pool = Lambda(lambda seq: K.max(seq, axis=1, keepdims=False),
                      output_shape=lambda shape: (shape[0], shape[2]))
        pool.supports_masking = True

        return pool(q_features), pool(a_features)


class AttentionModel(LanguageModel):
    '''Encoder whose answer LSTMs are wrapped with attention over the question encoding.'''

    def build(self):
        '''Return the pooled (question, answer) encodings.

        The question is encoded by a forward and a backward LSTM whose
        max-pooled outputs are concatenated. The same two LSTMs are then
        wrapped in `AttentionLSTMWrapper` (from the project-local
        `attention_lstm` module) together with that question encoding and
        applied to the answer.
        '''
        question = self.question
        answer = self.get_answer()

        # add embedding layers
        weights = np.load(self.config['initial_embed_weights'])
        embedding = Embedding(input_dim=self.config['n_words'],
                              output_dim=weights.shape[1],
                              # mask_zero=True,
                              weights=[weights])
        question_embedding = embedding(question)
        answer_embedding = embedding(answer)

        # question rnn part
        # implementation=1 is the Keras 2 spelling of the legacy
        # consume_less='mem' and matches ConvolutionalLSTM in this file.
        f_rnn = LSTM(141, return_sequences=True, implementation=1)
        b_rnn = LSTM(141, return_sequences=True, implementation=1, go_backwards=True)
        question_f_rnn = f_rnn(question_embedding)
        question_b_rnn = b_rnn(question_embedding)

        # maxpooling
        maxpool = Lambda(lambda x: K.max(x, axis=1, keepdims=False), output_shape=lambda x: (x[0], x[2]))
        maxpool.supports_masking = True
        # concatenate replaces the Keras 1 merge(..., mode='concat', concat_axis=-1)
        question_pool = concatenate([maxpool(question_f_rnn), maxpool(question_b_rnn)], axis=-1)

        # answer rnn part: the question LSTMs are reused inside the wrappers
        from attention_lstm import AttentionLSTMWrapper
        f_rnn = AttentionLSTMWrapper(f_rnn, question_pool, single_attention_param=True)
        b_rnn = AttentionLSTMWrapper(b_rnn, question_pool, single_attention_param=True)

        answer_f_rnn = f_rnn(answer_embedding)
        answer_b_rnn = b_rnn(answer_embedding)
        answer_pool = concatenate([maxpool(answer_f_rnn), maxpool(answer_b_rnn)], axis=-1)

        return question_pool, answer_pool

Stack Overflow evaluation

In [0]:
from __future__ import print_function

import os

import sys
import random
from time import strftime, gmtime, time

import pickle
import json
from keras.preprocessing.text import Tokenizer
from keras import backend as K

import threading
from scipy.stats import rankdata

random.seed(42)

def clear_session():
    '''Call `K.clear_session()` on the Keras backend.

    The main script invokes this after each configuration has been evaluated.
    '''
    K.clear_session()

def log(x):
    '''Print `x` to standard output.

    The `serve` branch of the main script rebinds `log` to a function that
    appends to an in-memory list instead.
    '''
    print(x)


class Evaluator:
    '''Train a language model on the question/answer data and score it.

    The data directory is taken from the STACK_OVER_FLOW_QA environment
    variable. JSON resources ('answers.json', 'train.json', 'test.json',
    'samples_for_tokenizer.json') are read from it on demand.
    '''

    def __init__(self, conf, model, optimizer=None):
        '''Build and compile `model(conf)` and load the answer pool.

        conf: configuration dict, or the path of a JSON file containing one.
            Must contain a 'training' section.
        model: callable taking the configuration and returning an object
            with compile / fit / predict / save_weights / load_weights.
        optimizer: overrides conf['training']['optimizer'] when not None.

        Exits the process with status 1 if STACK_OVER_FLOW_QA is not set.
        '''
        try:
            data_path = os.environ['STACK_OVER_FLOW_QA']
        except KeyError:
            print("STACK_OVER_FLOW_QA is not set. Set it to your clone of https://github.com/mrezende/stack_over_flow_python")
            sys.exit(1)
        if isinstance(conf, str):
            # close the handle deterministically instead of leaking it
            with open(conf, 'rb') as conf_file:
                conf = json.load(conf_file)
        self.model = model(conf)
        self.path = data_path
        self.conf = conf
        self.params = conf['training']
        optimizer = self.params['optimizer'] if optimizer is None else optimizer
        self.model.compile(optimizer)

        self.answers = self.load('answers.json')
        self._vocab = None
        self._reverse_vocab = None
        self._eval_sets = None

    ##### Resources #####

    def load(self, name):
        '''Return the parsed JSON content of file `name` in the data directory.'''
        with open(os.path.join(self.path, name), 'r') as data_file:
            return json.load(data_file)

    def vocab(self):
        '''Return the cached index -> lower-cased word mapping.'''
        if self._vocab is None:
            reverse_vocab = self.reverse_vocab()
            self._vocab = {index: word.lower() for word, index in reverse_vocab.items()}
        return self._vocab

    def reverse_vocab(self):
        '''Return the cached word -> index mapping.

        On first use a Keras `Tokenizer` is fitted on
        'samples_for_tokenizer.json' and its `word_index` is kept.
        '''
        if self._reverse_vocab is None:
            samples = self.load('samples_for_tokenizer.json')

            tokenizer = Tokenizer()
            tokenizer.fit_on_texts(samples)

            self._reverse_vocab = tokenizer.word_index
        return self._reverse_vocab

    ##### Loading / saving #####

    def save_epoch(self, epoch):
        '''Save the model weights to models/weights_epoch_<epoch>.h5.'''
        if not os.path.exists('models/'):
            os.makedirs('models/')
        self.model.save_weights('models/weights_epoch_%d.h5' % epoch, overwrite=True)

    def load_epoch(self, epoch):
        '''Load the weights saved for `epoch`; asserts that the file exists.'''
        assert os.path.exists('models/weights_epoch_%d.h5' % epoch), 'Weights at epoch %d not found' % epoch
        self.model.load_weights('models/weights_epoch_%d.h5' % epoch)

    ##### Converting / reverting #####

    def convert(self, words):
        '''Map words to vocabulary indices; unknown words become 0.

        A string is stripped, lower-cased and split on single spaces first.
        Any other value is iterated as a sequence of words.
        '''
        rvocab = self.reverse_vocab()
        if isinstance(words, str):
            words = words.strip().lower().split(' ')
        return [rvocab.get(w, 0) for w in words]

    def revert(self, indices):
        '''Map indices back to words; unknown indices become 'X'.'''
        vocab = self.vocab()
        return [vocab.get(i, 'X') for i in indices]

    ##### Padding #####

    def padq(self, data):
        '''Pad/truncate `data` to conf['question_len'] (no limit if absent).'''
        return self.pad(data, self.conf.get('question_len', None))

    def pada(self, data):
        '''Pad/truncate `data` to conf['answer_len'] (no limit if absent).'''
        return self.pad(data, self.conf.get('answer_len', None))

    def pad(self, data, len=None):
        '''Post-pad with 0 and post-truncate the sequences in `data` to `len`.

        The parameter keeps its original name `len` for keyword callers,
        even though it shadows the builtin inside this method.
        '''
        from keras.preprocessing.sequence import pad_sequences
        return pad_sequences(data, maxlen=len, padding='post', truncating='post', value=0)

    ##### Training #####

    def get_time(self):
        '''Return the current UTC time formatted as 'YYYY-MM-DD HH:MM:SS'.'''
        return strftime('%Y-%m-%d %H:%M:%S', gmtime())

    def train(self):
        '''Train for params['nb_epoch'] epochs, saving the weights after each.

        Each entry of 'train.json' contributes one (question, answer) pair
        per listed answer. Bad answers are re-sampled from `self.answers`
        at the start of every epoch.

        Returns {'loss': best validation loss, 'epoch': epoch it occurred in}.
        'epoch' is 0 only when no epoch was run.
        '''
        batch_size = self.params['batch_size']
        nb_epoch = self.params['nb_epoch']
        validation_split = self.params['validation_split']

        training_set = self.load('train.json')

        questions = []
        good_answers = []
        for q in training_set:
            # repeat the question once per good answer
            questions += [q['question']] * len(q['answers'])
            good_answers += list(q['answers'])
        log('Began training at %s on %d samples' % (self.get_time(), len(questions)))

        questions = self.padq(questions)
        good_answers = self.pada(good_answers)

        # Start from +inf so the first epoch always becomes the best so far.
        # A finite seed such as 1.0 would leave 'epoch' at 0 whenever no
        # validation loss beats it, and no weights are saved for epoch 0.
        val_loss = {'loss': float('inf'), 'epoch': 0}

        for i in range(1, nb_epoch+1):
            # sample from all answers to get bad answers
            bad_answers = self.pada(random.sample(self.answers, len(good_answers)))

            print('Fitting epoch %d' % i, file=sys.stderr)
            hist = self.model.fit([questions, good_answers, bad_answers], epochs=1, batch_size=batch_size,
                                  validation_split=validation_split, verbose=2)

            if hist.history['val_loss'][0] < val_loss['loss']:
                val_loss = {'loss': hist.history['val_loss'][0], 'epoch': i}
            log('%s -- Epoch %d ' % (self.get_time(), i) +
                'Loss = %.4f, Validation Loss = %.4f ' % (hist.history['loss'][0], hist.history['val_loss'][0]) +
                '(Best: Loss = %.4f, Epoch = %d)' % (val_loss['loss'], val_loss['epoch']))

            self.save_epoch(i)

        return val_loss

    ##### Evaluation #####

    def prog_bar(self, so_far, total, n_bars=20):
        '''Redraw a progress bar on stderr (carriage return, no newline).

        The bar is always `n_bars` characters wide between the brackets and
        is drawn as complete once `n_bars - 1` segments are reached.
        '''
        n_complete = int(so_far * n_bars / total)
        if n_complete >= n_bars - 1:
            bar = '=' * n_bars
        else:
            # the '>' head always takes one cell, including at zero progress,
            # so the width stays constant
            filled = max(n_complete, 1)
            bar = '=' * (filled - 1) + '>' + '.' * (n_bars - filled)
        print('\r[' + bar + ']', end='', file=sys.stderr)

    def eval_sets(self):
        '''Return the cached {file name: parsed data} dict of evaluation sets.'''
        if self._eval_sets is None:
            self._eval_sets = {s: self.load(s) for s in ['test.json']}
        return self._eval_sets

    def get_score(self, verbose=False):
        '''Compute top-1 precision and MRR for every evaluation set.

        Each set is shuffled in place and, if params['n_eval'] is present,
        truncated to that many questions. For each question the model
        scores all of its 'good' and 'bad' answers. `verbose` is currently
        unused.

        Returns (top1_list, mrr_list) with one entry per evaluation set;
        an empty set scores 0.0 for both.
        '''
        top1_ls = []
        mrr_ls = []
        for name, data in self.eval_sets().items():
            print('----- %s -----' % name)

            random.shuffle(data)

            if 'n_eval' in self.params:
                data = data[:self.params['n_eval']]

            c_1, c_2 = 0, 0

            for i, d in enumerate(data):
                self.prog_bar(i, len(data))

                # good answers come first, so indices below n_good are correct ones
                answers = d['good'] + d['bad']
                answers = self.pada(answers)
                question = self.padq([d['question']] * len(answers))

                sims = self.model.predict([question, answers])

                n_good = len(d['good'])
                max_r = np.argmax(sims)            # best-scoring answer overall
                max_n = np.argmax(sims[:n_good])   # best-scoring good answer

                # ascending ranks: the highest similarity gets the largest rank
                r = rankdata(sims, method='max')

                c_1 += 1 if max_r == max_n else 0
                # r[max_r] - r[max_n] + 1 is the position of the best good
                # answer counted from the top
                c_2 += 1 / float(r[max_r] - r[max_n] + 1)

            n_questions = len(data)
            # avoid dividing by zero when there is nothing to evaluate
            top1 = c_1 / float(n_questions) if n_questions else 0.0
            mrr = c_2 / float(n_questions) if n_questions else 0.0

            print('Top-1 Precision: %f' % top1)
            print('MRR: %f' % mrr)
            top1_ls.append(top1)
            mrr_ls.append(mrr)
        return top1_ls, mrr_ls


# Script entry point: optionally serve the training log over HTTP, then train
# and evaluate one model per configuration in stack_over_flow_conf.json.
if __name__ == '__main__':
    if len(sys.argv) >= 2 and sys.argv[1] == 'serve':
        from flask import Flask
        app = Flask(__name__)
        port = 5000
        lines = list()

        # rebinds the module-level log so messages are collected for the web page
        def log(x):
            lines.append(x)

        @app.route('/')
        def home():
            return ('<html><body><h1>Training Log</h1>' +
                    ''.join(['<code>{}</code><br/>'.format(line) for line in lines]) +
                    '</body></html>')

        def start_server():
            app.run(debug=False, use_evalex=False, port=port)

        # run Flask in a background thread so training can proceed
        threading.Thread(target=start_server, args=tuple()).start()
        print('Serving to port %d' % port, file=sys.stderr)

    import numpy as np

    # close the configuration file as soon as it has been parsed
    with open('stack_over_flow_conf.json', 'r') as conf_file:
        confs = json.load(conf_file)

    from keras_models import EmbeddingModel, ConvolutionModel, ConvolutionalLSTM
    for conf in confs:
        print(conf)
        evaluator = Evaluator(conf, model=ConvolutionalLSTM, optimizer='adam')

        # train the model
        best_loss = evaluator.train()

        # evaluate mrr for a particular epoch
        evaluator.load_epoch(best_loss['epoch'])
        top1, mrr = evaluator.get_score(verbose=False)
        log(' - Top-1 Precision:')
        log('   - %.3f on test 1' % top1[0])

        log(' - MRR:')
        log('   - %.3f on test 1' % mrr[0])
        # release the backend state before building the next model
        clear_session()
