# 智能问答方案概述
### 1. 使用 gensim 训练词向量 iter=10,min_count=4,size=100,workers=6,negative=8,window=5
### 2. 基于paddle 构建二分类模型，确定 para 筛选特征
### 3. 修改 baseline：加载（1）训练得到的词向量 +（2）para 筛选方式；1 epoch 40+，3 epoch 41.805000

In [1]:
# Check which CUDA and cuDNN versions are installed.
!cat /usr/local/cuda/version.txt
# Print every line of cudnn.h that mentions CUDNN_MAJOR, plus the two lines after each match.
!cat /usr/local/cuda/include/cudnn.h | grep CUDNN_MAJOR -A 2

CUDA Version 8.0.61
#define CUDNN_MAJOR 7
#define CUDNN_MINOR 1
#define CUDNN_PATCHLEVEL 3
--
#define CUDNN_VERSION    (CUDNN_MAJOR * 1000 + CUDNN_MINOR * 100 + CUDNN_PATCHLEVEL)

#include "driver_types.h"


In [221]:
import gensim, logging,json,os
from collections import Counter

## 1. 使用 gensim 训练词向量

In [222]:
# Show INFO-level log records, prefixed with a timestamp and the level name.
logging.basicConfig(
    level=logging.INFO,
    format='%(asctime)s : %(levelname)s : %(message)s',
)

# Directory of the preprocessed DuReader splits; the trailing slash is relied on
# by the plain string concatenation below.
root = '/mnt/datasets/dureader/preprocessed/'

# All corpus files, in the order: train, dev, test (search first, then zhidao).
fs = [root + relative_path for relative_path in (
    'trainset/search.train.json',
    'trainset/zhidao.train.json',
    'devset/search.dev.json',
    'devset/zhidao.dev.json',
    'testset/search.test.json',
    'testset/zhidao.test.json',
)]

# Individual names for the same six paths (s_* = search, z_* = zhidao).
s_train, z_train, s_dev, z_dev, s_test, z_test = fs

class MySentences(object):
    """Re-iterable stream of tokenised sentences read from JSON-lines files.

    Each call to ``iter()`` re-opens the files, so the object can be consumed
    several times while only one line is held in memory at a time.

    For every JSON record the iterator yields, in order: the record's
    ``segmented_question``, then for each entry of ``documents`` its
    ``segmented_title`` followed by every item of its ``segmented_paragraphs``.
    """

    def __init__(self, dirnames):
        """Store the paths to read.

        Args:
            dirnames: iterable of paths to JSON-lines files (despite the name,
                each entry is opened as a file, not a directory).
        """
        self.dirnames = dirnames

    def __iter__(self):
        """Yield one token list per question, title and paragraph.

        Raises:
            OSError: if a file cannot be opened.
            ValueError: if a non-blank line is not valid JSON.
            KeyError: if a record lacks one of the expected keys.
        """
        for fname in self.dirnames:
            # `with` closes the handle even when the consumer stops early;
            # the explicit encoding avoids depending on the locale default.
            with open(fname, encoding='utf-8') as fin:
                for line in fin:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines (e.g. a trailing newline at EOF).
                        continue
                    obj = json.loads(line)
                    yield obj['segmented_question']
                    for doc in obj['documents']:
                        yield doc['segmented_title']
                        for p in doc['segmented_paragraphs']:
                            yield p
# Streaming corpus over all six files: sentences are read from disk line by line
# during iteration instead of being loaded into memory up front.
sents = MySentences(fs) # a memory-friendly iterator

In [7]:
%%time
# Create the Word2Vec model without a corpus, so vocabulary building and
# training can be run as two explicit steps below.
model = gensim.models.Word2Vec(iter=10,min_count=4,size=100,workers=6,negative=8,window=5)  # an empty model, no training yet
# Vocabulary pass over the corpus.
model.build_vocab(sents)  # one full pass over the corpus
# NOTE(review): MySentences re-opens its files on every iteration, so re-creating
# it here looks redundant -- confirm before removing.
sents = MySentences(fs)
# NOTE(review): training is asked to run model.epochs passes, so the corpus
# presumably has to be re-iterable rather than a one-shot generator -- confirm
# against the gensim documentation.
model.train(sents,total_examples=model.corpus_count,epochs=model.epochs)  # multi-epoch training over the same corpus

2018-06-24 07:06:31,498 : INFO : collecting all words and their counts
2018-06-24 07:06:31,500 : INFO : PROGRESS: at sentence #0, processed 0 words, keeping 0 word types
2018-06-24 07:06:31,729 : INFO : PROGRESS: at sentence #10000, processed 417539 words, keeping 34082 word types
2018-06-24 07:06:31,973 : INFO : PROGRESS: at sentence #20000, processed 834472 words, keeping 49902 word types
2018-06-24 07:06:32,219 : INFO : PROGRESS: at sentence #30000, processed 1253373 words, keeping 63265 word types
2018-06-24 07:06:32,457 : INFO : PROGRESS: at sentence #40000, processed 1664600 words, keeping 73866 word types
2018-06-24 07:06:32,700 : INFO : PROGRESS: at sentence #50000, processed 2087604 words, keeping 82949 word types
2018-06-24 07:06:32,953 : INFO : PROGRESS: at sentence #60000, processed 2506449 words, keeping 91513 word types
2018-06-24 07:06:33,195 : INFO : PROGRESS: at sentence #70000, processed 2904472 words, keeping 99080 word types
2018-06-24 07:06:33,437 : INFO : PROGRESS

CPU times: user 5h 5min 32s, sys: 3min 18s, total: 5h 8min 50s
Wall time: 1h 46min 15s


In [None]:
## 持久化词向量之前，删除上一次训练生成的 models 和 vocab, 保证不超过工作区空间限制
!rm -vrf ./data/models/
!rm -vrf ./data/vocab/

In [10]:
%%time
import gzip
# Signature, for reference:
# gzip.open(filename, mode='rb', compresslevel=9, encoding=None, errors=None, newline=None)
# https://docs.python.org/3/library/gzip.html

# Dump the vocabulary as gzip-compressed UTF-8 text: one line per word, holding
# the word followed by its vector components, all separated by single spaces.
with gzip.open('./100_ver_not_pure.bin', mode='wt', compresslevel=9, encoding='utf-8') as out_file:
    for word in model.wv.vocab:
        components = ' '.join(map(str, model.wv[word]))
        out_file.write(word + ' ' + components + '\n')

CPU times: user 4min 29s, sys: 492 ms, total: 4min 30s
Wall time: 4min 30s


## 2.基于paddle 构建二分类模型，确定 para 筛选特征
候选特征：

- `recall_eq`：查询和材料的 word 共现率
- `recall_eq_char`：查询和材料的 char 共现率
- `recall_tq`：材料标题和查询的 word 共现率
- `recall_tq_char`：材料标题和查询的 char 共现率

组合其中一个或几个特征训练模型，根据训练效果筛选出适合 para 筛选的特征；
最终确定组合 `recall_eq` 和 `recall_eq_char` 作为筛选 para 的特征。

**注：kernel 默认环境为 python3，因此先将代码写入文件，再通过命令行调用 python2 训练 paddlepaddle 模型。**

In [172]:
# Source text of a stand-alone Python 2 script. It is kept in a string because the
# notebook kernel is Python 3 while the PaddlePaddle v2 API used here is run with
# python2. Keep the text ASCII-only (the script carries no coding declaration) and
# free of backslashes (they would be interpreted by this outer string literal).
bst_para_str ='''
"""
Feature study for paragraph selection.

For every document of a DuReader dev file, each paragraph is scored by its
overlap with the question, the best-scoring paragraph is kept, and a small
PaddlePaddle binary classifier is trained to predict the is_selected flag of
the document from the chosen overlap feature column(s).
"""
import sys
if sys.version[0] == '2':
    reload(sys)
    sys.setdefaultencoding("utf-8")
import os
import json
from sklearn.model_selection import train_test_split
import paddle.v2 as paddle
from paddle.v2.plot import Ploter
import numpy as np
import random
from collections import Counter

root = '/mnt/datasets/dureader/preprocessed/'
s_train = root+'trainset/search.train.json'
s_dev = root +'devset/search.dev.json'
z_train = root+'trainset/zhidao.train.json'
z_dev = root+'devset/zhidao.dev.json'
s_test = root +'testset/search.test.json'
z_test = root +'testset/zhidao.test.json'

max_p_len = 500


def overlap_recall(items, question_counter, question_len):
    """
    Share of the question items (counted with multiplicity) also found in items.
    Args:
        items: tokens or characters of a paragraph or title
        question_counter: Counter over the question tokens or characters
        question_len: number of question tokens or characters
    Returns:
        0 when nothing is shared, else matched / question_len as a float
    """
    matched = sum((Counter(items) & question_counter).values())
    return 0 if matched == 0 else float(matched) / question_len


def load_dataset(data_path, train=False):
    """
    Loads the dataset
    Args:
        data_path: the data file to load (one JSON record per line)
        train: when True, skip samples without answer spans or whose first
            span ends at or beyond max_p_len, and take the paragraph given by
            most_related_para; when False, rank paragraphs by question overlap
    Returns:
        (data_set, all_paras): the loaded samples, and for every document with
        at least one paragraph the tuple describing its best-ranked paragraph:
        (tokens, mean recall, length, mean recall, recall_eq, recall_eq_char,
        recall_tq, recall_tq_char, is_selected)
    """
    all_paras = []
    data_set = []
    with open(data_path) as fin:
        for lidx, line in enumerate(fin):
            sample = json.loads(line.strip())
            if train:
                if len(sample['answer_spans']) == 0:
                    continue
                if sample['answer_spans'][0][1] >= max_p_len:
                    continue

            if 'answer_docs' in sample:
                sample['answer_passages'] = sample['answer_docs']

            sample['question_tokens'] = sample['segmented_question']

            sample['passages'] = []
            for d_idx, doc in enumerate(sample['documents']):
                if train:
                    most_related_para = doc['most_related_para']
                    sample['passages'].append(
                        {'passage_tokens': doc['segmented_paragraphs'][most_related_para],
                         'is_selected': doc['is_selected']}
                    )
                else:
                    question_tokens = sample['segmented_question']
                    q_char_tokens = list(sample['question'])
                    question_counter = Counter(question_tokens)
                    q_char_counter = Counter(q_char_tokens)

                    # The title belongs to the whole document, so its overlap is
                    # computed once here. Zipping the title together with the
                    # paragraphs would pair each paragraph with one title
                    # character/token and silently drop every paragraph beyond
                    # the title length.
                    recall_tq = overlap_recall(
                        doc['segmented_title'], question_counter, len(question_tokens))
                    recall_tq_char = overlap_recall(
                        list(doc['title']), q_char_counter, len(q_char_tokens))

                    para_infos = []
                    for para_tokens, para_text in zip(doc['segmented_paragraphs'], doc['paragraphs']):
                        recall_eq = overlap_recall(
                            para_tokens, question_counter, len(question_tokens))
                        recall_eq_char = overlap_recall(
                            list(para_text), q_char_counter, len(q_char_tokens))

                        recall_wrt_question = float(recall_eq + recall_eq_char + recall_tq + recall_tq_char) / 4
                        para_infos.append((para_tokens,recall_wrt_question,len(para_tokens),recall_wrt_question,recall_eq,recall_eq_char,recall_tq,recall_tq_char,doc['is_selected']))
                    if not para_infos:
                        # Document without paragraphs: nothing to rank.
                        continue
                    # Highest mean recall first; ties go to the shorter paragraph.
                    para_infos.sort(key=lambda x: (-x[1],x[2]))
                    all_paras.append(para_infos[0])
                    fake_passage_tokens = list(para_infos[0][0])
                    sample['passages'].append({'passage_tokens': fake_passage_tokens})
            data_set.append(sample)
    return (data_set,all_paras)
_,all_paras = load_dataset(s_dev)
train_datas = [list(i) for i in all_paras]
train_datas = np.reshape(train_datas,(len(train_datas),len(train_datas[0])))
# The last column is the is_selected flag of the document: the label to predict.
lbs = train_datas[:,-1].astype(np.int0)

# The column slice chooses the feature(s) under test: 4 recall_eq,
# 5 recall_eq_char, 6 recall_tq, 7 recall_tq_char. 4:5 is recall_eq alone.
X_train, X_test, y_train, y_test = train_test_split(train_datas[:,4:5],lbs, test_size=0.33, random_state=42)
print(X_train.shape,y_train.shape,X_test.shape,y_test.shape)

def paddle_reader(dataCharacter,dataLabel):
    """Wrap a feature matrix and a label vector as a PaddlePaddle reader creator."""
    def reader():
        for i in xrange(len(dataLabel)):
            yield dataCharacter[i,:], int(dataLabel[i])
    return reader

train_reader=paddle_reader(X_train,y_train)
test_reader=paddle_reader(X_test,y_test)

paddle.init(use_gpu=False, trainer_count=4)
features = paddle.layer.data(
    name='features', type=paddle.data_type.dense_vector(X_train.shape[1]))
label = paddle.layer.data(
    name='label', type=paddle.data_type.integer_value(2))

fc1 = paddle.layer.fc(input=features, size=3, act=paddle.activation.Relu())
predict = paddle.layer.fc(input=fc1, size=2,act=paddle.activation.Softmax())

cost = paddle.layer.classification_cost(input=predict, label=label)
parameters = paddle.parameters.create(cost)
optimizer = paddle.optimizer.Momentum(momentum=0)
trainer = paddle.trainer.SGD(cost=cost,
                             parameters=parameters,
                             update_equation=optimizer)
train_title = "Train cost"
test_title = "Test cost"
cost_ploter = Ploter(train_title, test_title)

step = 0
# event_handler that prints the training cost every 10 iterations and the
# test cost at the end of every pass
def event_handler_plot(event):
    global step
    if isinstance(event, paddle.event.EndIteration):
        if step % 10 == 0:
            print(train_title, step, event.cost)
        step += 1
    if isinstance(event, paddle.event.EndPass):
        # save parameters; a tar archive is binary data, hence 'wb'
        with open('params_pass_%d.tar' % event.pass_id, 'wb') as f:
            trainer.save_parameter_to_tar(f)

        result = trainer.test(reader=paddle.batch(
            test_reader, batch_size=4**2))
        print(test_title, step, result.cost)

lists = []
# Alternative handler (not passed to trainer.train below) that also records the
# per-pass test cost and classification error in lists.
def event_handler(event):
    if isinstance(event, paddle.event.EndIteration):
        if event.batch_id % 10 == 0:
            print("Pass %d, Batch %d, Cost %f, %s" % (
                event.pass_id, event.batch_id, event.cost, event.metrics))
    if isinstance(event, paddle.event.EndPass):
        # save parameters; a tar archive is binary data, hence 'wb'
        with open('params_pass_%d.tar' % event.pass_id, 'wb') as f:
            trainer.save_parameter_to_tar(f)

        result = trainer.test(reader=paddle.batch(
            test_reader, batch_size=4**2))
        print("Test with Pass {}, Cost {}, {} ".format(event.pass_id, result.cost, result.metrics))
        lists.append((event.pass_id, result.cost,
                      result.metrics['classification_error_evaluator']))
trainer.train(
    reader=paddle.batch(
        paddle.reader.shuffle(
            train_reader, buf_size=4**4),
        batch_size=4**2),
    #feeding=feeding,
    event_handler=event_handler_plot,
    num_passes=1)

test_data_creator = test_reader()
test_data = []
test_label = []

for item in test_data_creator:
    test_data.append((item[0],))
    test_label.append(item[1])
probs = paddle.infer(
    output_layer=predict,parameters=parameters,input=test_data)

from sklearn.metrics import accuracy_score
# Class 1 is predicted when its probability (last output column) reaches 0.5.
print(accuracy_score(test_label,np.asarray(probs[:,-1]>=0.50,dtype=np.int0)))
'''

In [173]:
# Write the script text to disk so that a separate python2 interpreter can run it.
with open('bst_para.py', 'w', encoding='utf-8') as script_file:
    script_file.write(bst_para_str)

In [174]:
# Run the generated script with the Python 2 interpreter (the kernel itself is Python 3).
!python2 bst_para.py

((15612, 1), (15612,), (7690, 1), (7690,))
I0624 10:59:00.687331   724 Util.cpp:166] commandline:  --use_gpu=False --trainer_count=4 
W0624 10:59:00.687381   724 CpuId.h:112] PaddlePaddle wasn't compiled to use avx instructions, but these are available on your machine and could speed up CPU computations via CMAKE .. -DWITH_AVX=ON
I0624 10:59:00.695169   724 GradientMachine.cpp:94] Initing parameters..
I0624 10:59:00.695209   724 GradientMachine.cpp:101] Init parameters done.
('Train cost', 0, 0.6529005169868469)
('Train cost', 10, 0.734377920627594)
('Train cost', 20, 0.7815766334533691)
('Train cost', 30, 0.6830152273178101)
('Train cost', 40, 0.6783555746078491)
('Train cost', 50, 0.6414922475814819)
('Train cost', 60, 0.7307005524635315)
('Train cost', 70, 0.7159216403961182)
('Train cost', 80, 0.6731569170951843)
('Train cost', 90, 0.6336798071861267)
('Train cost', 100, 0.6371124982833862)
('Train cost', 110, 0.6520087718963623)
('Train cost', 120, 0.7425971031188965)
('Train cost

## 3. 加载 github 中的评估代码 和 修改para 选择的 tf-baseline 代码
baseline 代码只修改两处：
1. 修改 dataset.py 中的 `_load_dataset` 方法：
before:
```
recall_wrt_question = recall_eq
```
after:
```python
recall_wrt_question = float(recall_eq + recall_eq_char) / 2
```
2. 修改 run.py 中的 `prepare` 方法：
before:
```python
vocab.randomly_init_embeddings(args.embed_size)
```
after:
```python
if args.use_embe:
    vocab.load_pretrained_embeddings(embedding_path='./100_ver_not_pure.bin')
else:
    vocab.randomly_init_embeddings(args.embed_size)
```


In [175]:
%%bash
utils_base_url='https://raw.githubusercontent.com/baidu/DuReader/master/utils'
utils_files=("__init__.py" "dureader_eval.py" "get_vocab.py" "preprocess.py" "download_thirdparty.sh")

# download DEST_DIR BASE_URL FILE...
#
# Fetch BASE_URL/FILE into DEST_DIR/FILE for every FILE, creating DEST_DIR if
# needed. Returns non-zero if any download failed; a failed download's partial
# output file is removed so no empty placeholder is left behind.
download() {
    local metric=$1; shift
    local base_url=$1; shift
    # Quoted "$@" keeps every file name as one word (no splitting or globbing).
    local fnames=("$@")
    local fname
    local status=0

    mkdir -p "${metric}"
    for fname in "${fnames[@]}"; do
        printf "downloading: %s\n" "${base_url}/${fname}"
        # NOTE(review): --no-check-certificate disables TLS verification for code
        # that is executed later; drop the flag once the CA bundle allows it.
        if ! wget --no-check-certificate "${base_url}/${fname}" -O "${metric}/${fname}"; then
            printf "failed to download: %s\n" "${base_url}/${fname}" >&2
            rm -f "${metric}/${fname}"
            status=1
        fi
    done
    return "${status}"
}
# prepare utils
download "utils" "${utils_base_url}" "${utils_files[@]}"

downloading: https://raw.githubusercontent.com/baidu/DuReader/master/utils/__init__.py
downloading: https://raw.githubusercontent.com/baidu/DuReader/master/utils/dureader_eval.py
downloading: https://raw.githubusercontent.com/baidu/DuReader/master/utils/get_vocab.py
downloading: https://raw.githubusercontent.com/baidu/DuReader/master/utils/preprocess.py
downloading: https://raw.githubusercontent.com/baidu/DuReader/master/utils/download_thirdparty.sh


--2018-06-24 11:03:38--  https://raw.githubusercontent.com/baidu/DuReader/master/utils/__init__.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 1251 (1.2K) [text/plain]
Saving to: ‘utils/__init__.py’

     0K .                                                     100% 17.0M=0s

2018-06-24 11:03:41 (17.0 MB/s) - ‘utils/__init__.py’ saved [1251/1251]

--2018-06-24 11:03:41--  https://raw.githubusercontent.com/baidu/DuReader/master/utils/dureader_eval.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.228.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.228.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 19017 (19K) [text/plain]
Saving to: ‘utils/dureader_eval.py’

     0K .......... ........                   

In [177]:
# Run the downloaded helper script from inside utils/ (only if the cd succeeds).
!cd utils && bash download_thirdparty.sh

downloading: https://raw.githubusercontent.com/tylin/coco-caption/master/pycocoevalcap/rouge/__init__.py
--2018-06-24 11:06:34--  https://raw.githubusercontent.com/tylin/coco-caption/master/pycocoevalcap/rouge/__init__.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.72.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.72.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 23 [text/plain]
Saving to: ‘rouge_metric/__init__.py’


2018-06-24 11:06:39 (34.7 MB/s) - ‘rouge_metric/__init__.py’ saved [23/23]

downloading: https://raw.githubusercontent.com/tylin/coco-caption/master/pycocoevalcap/rouge/rouge.py
--2018-06-24 11:06:39--  https://raw.githubusercontent.com/tylin/coco-caption/master/pycocoevalcap/rouge/rouge.py
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 151.101.72.133
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|151.101.72.133|:443... connected.
HTTP re

In [178]:
# Remove any existing rc_tf directory; -f keeps this silent when it does not exist.
!rm -rf rc_tf

In [179]:
# Clone the rc_tf repository into ./rc_tf.
!git clone https://github.com/312shan/rc_tf.git

Cloning into 'rc_tf'...
remote: Counting objects: 59, done.[K
remote: Compressing objects: 100% (47/47), done.[K
remote: Total 59 (delta 23), reused 45 (delta 12), pack-reused 0[K
Unpacking objects: 100% (59/59), done.
Checking connectivity... done.


In [180]:
# Copy the downloaded utils directory into the rc_tf checkout.
!cp -r utils rc_tf/

In [181]:
# Show the version of the interpreter that `python` resolves to in the shell.
!python -V

Python 3.5.2 :: Continuum Analytics, Inc.


In [195]:
# Report the on-disk size of the exported word-vector file.
!du -h 100_ver_not_pure.bin

239M	100_ver_not_pure.bin


In [196]:
# Report the disk usage of the ./data/models/ directory.
!du -h ./data/models/

4.0K	./data/models/


In [188]:
# Delete the entire ./data directory, listing every removed entry (-v).
!rm -vr ./data/

removed directory: ‘./data/summary’
removed ‘./data/dev.de-en.de’
removed ‘./data/test.de-en.en’
removed ‘./data/dev.de-en.en’
removed ‘./data/test.de-en.de’
removed ‘./data/train.de-en.de’
removed ‘./data/train.de-en.en’
removed ‘./data/results/test.predicted.json’
removed directory: ‘./data/results’
removed directory: ‘./data’


In [190]:
%%time
# Install the pinned TensorFlow GPU build in the shell's Python environment.
!pip install tensorflow-gpu==1.2
# Run the prepare step of rc_tf with 100-dimensional embeddings.
# NOTE(review): --use_embe 1 presumably selects the pretrained-embedding branch
# described in the run.py modification notes -- confirm against rc_tf/run.py.
!python rc_tf/run.py --prepare --embed_size 100 --use_embe 1

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[31mtensorboard 1.6.0 has requirement markdown>=2.6.8, but you'll have markdown 2.2.0 which is incompatible.[0m
2018-06-24 11:36:10,313 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=32, brc_dir='./data/baidu', dev_files=['/mnt/datasets/dureader/preprocessed/devset/search.dev.json'], dropout_keep_prob=1, embed_size=100, epochs=10, evaluate=False, gpu='0', hidden_size=150, is_restore=0, learning_rate=0.001, log_path=None, max_a_len=200, max_p_len=500, max_p_num=5, max_q_len=60, model_dir='./data/models/', optim='adam', predict=False, prepare=True, result_dir='./data/results/', summary_dir='./data/summary/', test_files=['/mnt/datasets/dureader/preprocessed/testset/search.test.json'], train=False, train_files=['/mnt/datasets/dureader/preprocessed/trainset/search.train.json'], use_embe=1, vocab_dir='./data/vocab/', weight_decay=0)
2018-06-24 11:36:10,313 - brc - INFO - Checking the data files...
2018-06-2

In [201]:
# Report the on-disk size of data/vocab/vocab.data.
!du -h data/vocab/vocab.data

188M	data/vocab/vocab.data


In [202]:
# Report the disk usage of the ./data/models/ directory.
!du -h ./data/models/

4.0K	./data/models/


In [198]:
# Delete the exported word-vector file.
# NOTE(review): presumably safe because the prepare step has already run -- confirm
# nothing later reads this file.
!rm 100_ver_not_pure.bin

In [211]:
%%time
# To reload the previous model, or to resume from an earlier model after an
# interrupted training run, add the argument `--is_restore 1`.
!pip install tensorflow-gpu==1.2
# $z_train and $s_train are filled in by IPython from the notebook variables of the same name.
!python rc_tf/run.py --train --algo BIDAF --epochs 3 --embed_size 100 --batch_size 48 --train_files $z_train $s_train

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[31mtensorboard 1.6.0 has requirement markdown>=2.6.8, but you'll have markdown 2.2.0 which is incompatible.[0m
2018-06-24 23:32:50,868 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=48, brc_dir='./data/baidu', dev_files=['/mnt/datasets/dureader/preprocessed/devset/search.dev.json'], dropout_keep_prob=1, embed_size=100, epochs=2, evaluate=False, gpu='0', hidden_size=150, is_restore=1, learning_rate=0.001, log_path=None, max_a_len=200, max_p_len=500, max_p_num=5, max_q_len=60, model_dir='./data/models/', optim='adam', predict=False, prepare=False, result_dir='./data/results/', summary_dir='./data/summary/', test_files=['/mnt/datasets/dureader/preprocessed/testset/search.test.json'], train=True, train_files=['/mnt/datasets/dureader/preprocessed/trainset/zhidao.train.json', '/mnt/datasets/dureader/preprocessed/trainset/search.train.json'], use_embe=0, vocab_dir='./data/vocab/', weight_decay=0)
2018-06-24

In [38]:
!du -h data/models/

206M	data/models/


In [213]:
!ls data/models/

BIDAF.data-00000-of-00001  BIDAF.index	BIDAF.meta  checkpoint


In [214]:
%%time
# Install the pinned TensorFlow GPU build in the shell's Python environment.
!pip install tensorflow-gpu==1.2
# Run prediction on both test files; $s_test and $z_test are filled in by IPython
# from the notebook variables of the same name.
!python rc_tf/run.py --predict --algo BIDAF --batch_size 48 --embed_size 100 --test_files $s_test $z_test

Looking in indexes: https://pypi.tuna.tsinghua.edu.cn/simple
[31mtensorboard 1.6.0 has requirement markdown>=2.6.8, but you'll have markdown 2.2.0 which is incompatible.[0m
2018-06-25 04:47:05,533 - brc - INFO - Running with args : Namespace(algo='BIDAF', batch_size=48, brc_dir='./data/baidu', dev_files=['/mnt/datasets/dureader/preprocessed/devset/search.dev.json'], dropout_keep_prob=1, embed_size=100, epochs=10, evaluate=False, gpu='0', hidden_size=150, is_restore=0, learning_rate=0.001, log_path=None, max_a_len=200, max_p_len=500, max_p_num=5, max_q_len=60, model_dir='./data/models/', optim='adam', predict=True, prepare=False, result_dir='./data/results/', summary_dir='./data/summary/', test_files=['/mnt/datasets/dureader/preprocessed/testset/search.test.json', '/mnt/datasets/dureader/preprocessed/testset/zhidao.test.json'], train=False, train_files=['/mnt/datasets/dureader/preprocessed/trainset/search.train.json'], use_embe=0, vocab_dir='./data/vocab/', weight_decay=0)
2018-06-25 

In [15]:
# Download the kesci_submit tool and, only if the download succeeds, make it executable.
!wget -nv -O kesci_submit https://cdn.kesci.com/submit_tool/v1/kesci_submit&&chmod +x kesci_submit

2018-06-23 16:31:06 URL:https://cdn.kesci.com/submit_tool/v1/kesci_submit [7842088/7842088] -> "kesci_submit" [1]


In [215]:
!./kesci_submit -token 9ad470335513aaeb -file ./data/results/test.predicted.json