In [1]:
# -*- coding: utf8 -*-
from __future__ import print_function
import os
import sys
import time
import json
import argparse
import random
random.seed(49999)
from collections import OrderedDict

import numpy
numpy.random.seed(49999)
import tensorflow
tensorflow.set_random_seed(49999)
import keras
import keras.backend as K
from keras.models import Sequential, Model
from keras.layers import *
from keras.optimizers import Adadelta

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
from utils import *
import inputs
import metrics
from losses import *
from optimizers import *

In [3]:
# Ask TensorFlow to grow GPU memory on demand instead of reserving it up front.
config = tensorflow.ConfigProto()
config.gpu_options.allow_growth = True
sess = tensorflow.Session(config = config)
# Keras builds its own default session unless one is registered with the
# backend, so hand this session over; otherwise the option above is never
# applied to the model built later in the notebook.
K.set_session(sess)

In [4]:
sys.path.append('models/')

from utils.utility import *
from layers.Match import *

class MVLSTM:
    """Builder for the MV-LSTM text matching model (Keras functional API).

    The constructor merges the supplied ``config`` dict over a few defaults
    (``hidden_size``, ``topk``, ``dropout_rate``) and verifies that every key
    listed in ``self.check_list`` is present. ``build()`` then assembles and
    returns the Keras ``Model``.
    """

    # target modes that ``build`` knows how to create an output layer for
    _REGRESSION_LIKE_MODES = ('regression', 'ranking')

    def __init__(self, config):
        """Validate ``config`` and store the merged settings.

        Args:
            config: dict of model settings; must contain every key named in
                ``self.check_list`` that has no default.

        Raises:
            TypeError: if ``config`` is not a dict or a required key is missing.
        """
        self.__name = 'MVLSTM'
        self.config = {}
        self.check_list = [ 'text1_maxlen', 'text2_maxlen',
                   'embed', 'embed_size', 'train_embed',  'vocab_size',
                   'hidden_size', 'topk', 'dropout_rate']
        self.setup(config)
        if not self.check():
            raise TypeError('[MVLSTM] parameter check wrong')
        # Read only after validation so a missing/invalid config is reported
        # by setup()/check() rather than by a bare lookup failure here.
        self.embed_trainable = self.config['train_embed']
        print('[MVLSTM] init done', end='\n')

    def set_default(self, k, v):
        """Set ``self.config[k] = v`` unless ``k`` is already configured."""
        if k not in self.config:
            self.config[k] = v

    def check(self):
        """Return True if every key in ``self.check_list`` is in ``self.config``.

        On the first missing key the key and an error line are printed and
        False is returned.
        """
        for e in self.check_list:
            if e not in self.config:
                print(e, end='\n')
                print('[Model] Error %s not in config' % e, end='\n')
                return False
        return True

    def set_check_list(self, check_list):
        """Replace the list of required config keys used by ``check``."""
        self.check_list = check_list

    def check_list(self, check_list):
        """Legacy setter for the required-key list.

        ``__init__`` stores the key list in an instance attribute of the same
        name, which shadows this method on instances; it is therefore only
        reachable as ``MVLSTM.check_list(instance, keys)``. Prefer
        ``set_check_list``.
        """
        self.set_check_list(check_list)

    def setup(self, config):
        """Install defaults, then overlay the user supplied ``config``.

        Raises:
            TypeError: if ``config`` is not a dict.
        """
        if not isinstance(config, dict):
            raise TypeError('parameter config should be dict:', config)

        self.set_default('hidden_size', 32)
        self.set_default('topk', 100)
        self.set_default('dropout_rate', 0)
        self.config.update(config)

    def build(self):
        """Assemble the MV-LSTM graph and return it as a Keras ``Model``.

        Pipeline: shared embedding -> one bidirectional LSTM per input ->
        ``Match(match_type='dot')`` interaction -> flatten -> top-k values ->
        dropout -> dense output whose shape depends on ``target_mode``.

        Raises:
            ValueError: if ``target_mode`` is missing or not one of
                'classification', 'regression', 'ranking'.
        """
        # Fail early with a clear message; previously an unknown mode left
        # ``out_`` unbound and surfaced as an UnboundLocalError after the
        # whole graph had already been constructed.
        target_mode = self.config.get('target_mode')
        if target_mode != 'classification' and target_mode not in self._REGRESSION_LIKE_MODES:
            raise ValueError('[MVLSTM] unsupported target_mode: %r' % (target_mode,))

        query = Input(name='query', shape=(self.config['text1_maxlen'],))
        show_layer_info('Input', query)
        doc = Input(name='doc', shape=(self.config['text2_maxlen'],))
        show_layer_info('Input', doc)

        # A single Embedding layer is applied to both inputs, so they share weights.
        embedding = Embedding(self.config['vocab_size'], self.config['embed_size'], weights=[self.config['embed']], trainable = self.embed_trainable)
        q_embed = embedding(query)
        show_layer_info('Embedding', q_embed)
        d_embed = embedding(doc)
        show_layer_info('Embedding', d_embed)

        # Two separate Bidirectional LSTM layers: query and doc encoders do not share weights.
        q_rep = Bidirectional(LSTM(self.config['hidden_size'], return_sequences=True, dropout=self.config['dropout_rate']))(q_embed)
        show_layer_info('Bidirectional-LSTM', q_rep)
        d_rep = Bidirectional(LSTM(self.config['hidden_size'], return_sequences=True, dropout=self.config['dropout_rate']))(d_embed)
        show_layer_info('Bidirectional-LSTM', d_rep)

        cross = Match(match_type='dot')([q_rep, d_rep])
        #cross = Dot(axes=[2, 2])([q_embed, d_embed])
        show_layer_info('Match-dot', cross)

        # Flatten everything but the batch axis so top_k ranks all interaction values together.
        cross_reshape = Reshape((-1, ))(cross)
        show_layer_info('Reshape', cross_reshape)

        # top_k returns (values, indices); only the sorted values are kept.
        mm_k = Lambda(lambda x: K.tf.nn.top_k(x, k=self.config['topk'], sorted=True)[0])(cross_reshape)
        show_layer_info('Lambda-topk', mm_k)

        pool1_flat_drop = Dropout(rate=self.config['dropout_rate'])(mm_k)
        show_layer_info('Dropout', pool1_flat_drop)

        if target_mode == 'classification':
            out_ = Dense(2, activation='softmax')(pool1_flat_drop)
        else:
            # 'regression' or 'ranking': a single unbounded score
            out_ = Dense(1)(pool1_flat_drop)
        show_layer_info('Dense', out_)

        #model = Model(inputs=[query, doc, dpool_index], outputs=out_)
        model = Model(inputs=[query, doc], outputs=out_)
        return model

def load_model(config):
    """Build the MVLSTM Keras model described by ``config``.

    The model settings are ``config['model']['setting']`` overlaid with the
    shared input settings ``config['inputs']['share']`` (shared keys win).

    Args:
        config: full experiment configuration dict.

    Returns:
        The Keras model returned by ``MVLSTM.build()``.
    """
    # Merge into a copy: updating config['model']['setting'] in place would
    # leak the shared input keys (including the embedding matrix) into the
    # caller's configuration.
    model_config = dict(config['model']['setting'])
    model_config.update(config['inputs']['share'])
    return MVLSTM(model_config).build()

In [11]:
def train(config):
    """Train the model described by ``config`` and print the loss per iteration.

    Steps: print the configuration, load or randomly initialise the embedding
    matrix, split the input tags into TRAIN / EVAL groups, read the referenced
    corpora, create the data generators, build and compile the model, then run
    ``num_iters`` rounds of ``fit_generator`` with ``display_interval`` steps
    each over every TRAIN generator.

    Args:
        config: experiment configuration dict. Reads ``config['global']``
            (``weights_file``, ``display_interval``, ``num_iters``,
            ``save_weights_iters``, ``learning_rate``), ``config['inputs']``,
            ``config['losses']`` and ``config['metrics']``. The dict is not
            modified.

    Returns:
        None.
    """
    import copy

    print(json.dumps(config, indent=2), end='\n')
    # Work on a private copy: the embedding matrix is stored into the shared
    # input settings below, and leaving a numpy array in the caller's dict
    # makes a second train(config) call fail in json.dumps above.
    config = copy.deepcopy(config)

    # read basic config
    global_conf = config["global"]
#     optimizer = global_conf['optimizer']
#     optimizer = optimizers.get(optimizer)
#     K.set_value(optimizer.lr, global_conf['learning_rate'])
    weights_file = str(global_conf['weights_file']) + '.%d'
    display_interval = int(global_conf['display_interval'])
    num_iters = int(global_conf['num_iters'])
    save_weights_iters = int(global_conf['save_weights_iters'])

    # read input config
    input_conf = config['inputs']
    share_input_conf = input_conf['share']


    # collect embedding
    if 'embed_path' in share_input_conf:
        embed_dict = read_embedding(filename=share_input_conf['embed_path'])
        # the last vocabulary index is used as the padding id and gets a zero vector
        _PAD_ = share_input_conf['vocab_size'] - 1
        embed_dict[_PAD_] = np.zeros((share_input_conf['embed_size'], ), dtype=np.float32)
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = convert_embed_2_numpy(embed_dict, embed = embed)
    else:
        embed = np.float32(np.random.uniform(-0.2, 0.2, [share_input_conf['vocab_size'], share_input_conf['embed_size']]))
        share_input_conf['embed'] = embed
    print('[Embedding] Embedding Load Done.', end='\n')

    # list all input tags and construct tags config
    input_train_conf = OrderedDict()
    input_eval_conf = OrderedDict()
    for tag in input_conf.keys():
        if 'phase' not in input_conf[tag]:
            continue
        if input_conf[tag]['phase'] == 'TRAIN':
            input_train_conf[tag] = {}
            input_train_conf[tag].update(share_input_conf)
            input_train_conf[tag].update(input_conf[tag])
        elif input_conf[tag]['phase'] == 'EVAL':
            input_eval_conf[tag] = {}
            input_eval_conf[tag].update(share_input_conf)
            input_eval_conf[tag].update(input_conf[tag])
    print('[Input] Process Input Tags. %s in TRAIN, %s in EVAL.' % (input_train_conf.keys(), input_eval_conf.keys()), end='\n')

    # collect dataset identification
    dataset = {}
    for tag in input_conf:
        # .get(): tags without a 'phase' (such as 'share') are simply not
        # PREDICT tags; a direct lookup would raise KeyError for them.
        if input_conf[tag].get('phase') == 'PREDICT':
            continue
        if 'text1_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text1_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
        if 'text2_corpus' in input_conf[tag]:
            datapath = input_conf[tag]['text2_corpus']
            if datapath not in dataset:
                dataset[datapath], _ = read_data(datapath)
    print('[Dataset] %s Dataset Load Done.' % len(dataset), end='\n')

    # initial data generator
    train_gen = OrderedDict()
    eval_gen = OrderedDict()

    for tag, conf in input_train_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        train_gen[tag] = generator( config = conf )

    for tag, conf in input_eval_conf.items():
        print(conf, end='\n')
        conf['data1'] = dataset[conf['text1_corpus']]
        conf['data2'] = dataset[conf['text2_corpus']]
        generator = inputs.get(conf['input_type'])
        eval_gen[tag] = generator( config = conf )

    ######### Load Model #########
    model = load_model(config)

    loss = []
    for lobj in config['losses']:
        if lobj['object_name'] in mz_specialized_losses:
            # specialised losses are factories that take their parameter dict
            loss.append(rank_losses.get(lobj['object_name'])(lobj['object_params']))
        else:
            loss.append(rank_losses.get(lobj['object_name']))
    eval_metrics = OrderedDict()
    for mobj in config['metrics']:
        mobj = mobj.lower()
        if '@' in mobj:
            # "name@k" style metric: the part after '@' is passed as an int
            mt_key, mt_val = mobj.split('@', 1)
            eval_metrics[mobj] = metrics.get(mt_key)(int(mt_val))
        else:
            eval_metrics[mobj] = metrics.get(mobj)
    optimizer = Adadelta(lr=config["global"]["learning_rate"], rho=0.95)
    model.compile(optimizer = optimizer, loss=loss)
    print('[Model] Model Compile Done.', end='\n')

    for i_e in range(num_iters):
        for tag, generator in train_gen.items():
            genfun = generator.get_batch_generator()
            print('[%s]\t[Train:%s] ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
            history = model.fit_generator(
                    genfun,
                    steps_per_epoch = display_interval,
                    epochs = 1,
                    shuffle=False,
                    verbose = 0
                ) #callbacks=[eval_map])
            print('Iter:%d\tloss=%.6f' % (i_e, history.history['loss'][0]), end='\n')

    # NOTE: evaluation and weight saving are currently disabled; weights_file,
    # save_weights_iters, eval_gen and eval_metrics are only used by the
    # commented-out block below.
#         for tag, generator in eval_gen.items():
#             genfun = generator.get_batch_generator()
#             print('[%s]\t[Eval:%s] ' % (time.strftime('%m-%d-%Y %H:%M:%S', time.localtime(time.time())), tag), end='')
#             res = dict([[k,0.] for k in eval_metrics.keys()])
#             num_valid = 0
#             for input_data, y_true in genfun:
#                 y_pred = model.predict(input_data, batch_size=len(y_true))
#                 if issubclass(type(generator), inputs.list_generator.ListBasicGenerator):
#                     list_counts = input_data['list_counts']
#                     for k, eval_func in eval_metrics.items():
#                         for lc_idx in range(len(list_counts)-1):
#                             pre = list_counts[lc_idx]
#                             suf = list_counts[lc_idx+1]
#                             res[k] += eval_func(y_true = y_true[pre:suf], y_pred = y_pred[pre:suf])
#                     num_valid += len(list_counts) - 1
#                 else:
#                     for k, eval_func in eval_metrics.items():
#                         res[k] += eval_func(y_true = y_true, y_pred = y_pred)
#                     num_valid += 1
#             generator.reset()
#             print('Iter:%d\t%s' % (i_e, '\t'.join(['%s=%f'%(k,v/num_valid) for k, v in res.items()])), end='\n')
#             sys.stdout.flush()
#         if (i_e+1) % save_weights_iters == 0:
#             model.save_weights(weights_file % (i_e+1))

In [12]:
# Experiment configuration for MV-LSTM on WikiQA, passed to train() below.
# NOTE: this rebinds the name `config`, which an earlier cell used for the
# TensorFlow ConfigProto.
config = {
  "net_name": "MVLSTM",
  # Global training settings. train() in this notebook reads weights_file,
  # display_interval, num_iters, save_weights_iters and learning_rate.
  "global":{
      "model_type": "PY",
      "weights_file": "../examples/wikiqa/weights/mvlstm.wikiqa.weights",
      "save_weights_iters": 10,
      # number of outer training iterations
      "num_iters": 400,
      # used as steps_per_epoch for each fit_generator call
      "display_interval": 10,
      "test_weights_iters": 400,
      # NOTE(review): train() has its optimizer lookup commented out and always
      # constructs Adadelta, so this entry appears to be unused there.
      "optimizer": "adadelta",
      "learning_rate": 1.0
  },
  "inputs": {
    # Settings merged into every TRAIN/EVAL input tag and into the model settings.
    "share": {
        "text1_corpus": "../data/WikiQA/corpus_preprocessed.txt",
        "text2_corpus": "../data/WikiQA/corpus_preprocessed.txt",
        "use_dpool": False,
        "embed_size": 50,
        # when present, train() reads pre-trained embeddings from this path;
        # otherwise the embedding matrix is drawn uniformly from [-0.2, 0.2)
        "embed_path": "../data/WikiQA/embed_glove_d50",
        # index vocab_size - 1 is given an all-zero (padding) embedding by train()
        "vocab_size": 18670,
        "train_embed": False,
        "target_mode": "ranking",
        # lengths of the 'query' and 'doc' model inputs
        "text1_maxlen": 10,
        "text2_maxlen": 40
    },
    # Tags are grouped by "phase": TRAIN and EVAL get generators in train();
    # PREDICT tags are skipped when train() loads the corpora.
    "train": {
        "input_type": "PairGenerator", 
        "phase": "TRAIN",
        "use_iter": False,
        "query_per_iter": 50,
        "batch_per_iter": 5,
        "batch_size": 100,
        "relation_file": "../data/WikiQA/relation_train.txt"
    },
    "valid": {
        "input_type": "ListGenerator", 
        "phase": "EVAL",
        "batch_list": 10,
        "relation_file": "../data/WikiQA/relation_valid.txt"
    },
    "test": {
        "input_type": "ListGenerator", 
        "phase": "EVAL",
        "batch_list": 10,
        "relation_file": "../data/WikiQA/relation_test.txt"
    },
    "predict": {
        "input_type": "ListGenerator", 
        "phase": "PREDICT",
        "batch_list": 10,
        "relation_file": "../data/WikiQA/relation_test.txt"
    }
  },
  "outputs": {
    "predict": {
      "save_format": "TREC",
      "save_path": "predict.test.mvlstm.wikiqa.txt"
    }
  },
  "model": {
    "model_path": "models/",
    "model_py": "mvlstm.MVLSTM",
    # MVLSTM hyper-parameters; these override the class defaults
    # (hidden_size=32, topk=100, dropout_rate=0).
    "setting": {
        "hidden_size": 50,
        "topk": 100,
        "dropout_rate": 0.5
    }
  },
  # Each loss names an object looked up via rank_losses.get(); object_params
  # is passed to it only when the name is in mz_specialized_losses.
  "losses": [ 
    {
       "object_name": "rank_hinge_loss" ,
       "object_params": {
            "margin": 1.0
       }
    }
  ],
  # "name@k" entries are split on '@' and k is passed to the metric as an int.
  "metrics": [ "ndcg@3", "ndcg@5", "map" ]
}


In [None]:
# Select which stage of the experiment this cell runs.
phase = 'train'
# model_file = '../examples/wikiqa/config/mvlstm_wikiqa.config'
# with open(model_file, 'r') as f:
#     config = json.load(f)

# Reject unknown phases first, then dispatch. The callables are only looked
# up inside the branch that actually runs.
if phase not in ('train', 'predict'):
    print('Phase Error.', end='\n')
elif phase == 'train':
    train(config)
else:
    predict(config)

{
  "inputs": {
    "test": {
      "phase": "EVAL", 
      "input_type": "ListGenerator", 
      "relation_file": "../data/WikiQA/relation_test.txt", 
      "batch_list": 10
    }, 
    "predict": {
      "phase": "PREDICT", 
      "input_type": "ListGenerator", 
      "relation_file": "../data/WikiQA/relation_test.txt", 
      "batch_list": 10
    }, 
    "train": {
      "relation_file": "../data/WikiQA/relation_train.txt", 
      "input_type": "PairGenerator", 
      "batch_size": 100, 
      "batch_per_iter": 5, 
      "phase": "TRAIN", 
      "query_per_iter": 50, 
      "use_iter": false
    }, 
    "share": {
      "text2_corpus": "../data/WikiQA/corpus_preprocessed.txt", 
      "vocab_size": 18670, 
      "use_dpool": false, 
      "embed_path": "../data/WikiQA/embed_glove_d50", 
      "text1_maxlen": 10, 
      "embed_size": 50, 
      "target_mode": "ranking", 
      "train_embed": false, 
      "text1_corpus": "../data/WikiQA/corpus_preprocessed.txt", 
      "text2_maxlen":