In [1]:
import json
import pandas as pd
import jieba
import vocab
import re
import numpy as np
import preprocess_data
import utils

In [2]:
# Load the training table into `data`.
# The active line reads 'train_df_5000.csv' (presumably a 5000-row subset,
# judging by the file name — confirm); the alternative full-file read is kept
# commented out directly below.
# data = pd.read_csv('data_gen/train_df.csv')
data = pd.read_csv('data_gen/train_df_5000.csv')
# Show one random row as a quick look at the loaded columns.
data.sample(1)

Unnamed: 0,article_id,article_type,article_title,article_content,question_id,question,answer,question_type
1716,42216,防务快讯,"中国周边果真""起火""了吗？",近几个月来，中国周边出现了一些新的动向，有的基本上是由国内矛盾引起的，有的则存在复杂的国际背...,8297bac6-848d-4da6-9611-fbb76c603a28,美国一位知名人士10多年前就说过什么,相信冲突不可避免的想法本身就是冲突的根源,观点型问题


# 数据预处理

In [3]:
# Translation table: full-width digits and the full-width full stop -> ASCII.
_FULLWIDTH_TO_ASCII = str.maketrans('０１２３４５６７８９．', '0123456789.')

# A run of whitespace. U+3000 (ideographic space) is listed explicitly so the
# intent is visible even though `\s` already matches it for str patterns.
_WHITESPACE_RUN = re.compile(r'[\s\u3000]+')


def preprocess_data(data):
    """Normalise whitespace and full-width numerals in a sequence of strings.

    For every string in ``data``:

    * each run of whitespace (including the ideographic space U+3000) is
      collapsed to a single ASCII space;
    * the full-width digits ``０``-``９`` and the full-width full stop ``．``
      are replaced by their ASCII equivalents.

    Note: this is a full-width -> half-width conversion, not a
    traditional/simplified Chinese conversion.

    NOTE(review): the import cell also binds the name ``preprocess_data``
    (as a module); this definition rebinds it to the function.

    Parameters
    ----------
    data : iterable of str
        Texts to normalise. A non-string element (e.g. a NaN coming from an
        empty CSV cell) makes the regex substitution raise ``TypeError``.

    Returns
    -------
    list of str
        Normalised texts, in the same order as the input.
    """
    return [
        _WHITESPACE_RUN.sub(' ', text).translate(_FULLWIDTH_TO_ASCII)
        for text in data
    ]

# Normalise every text field. Title and content are written to new columns
# ('title', 'content'); question and answer are overwritten in place.
for source_col, target_col in (
    ('article_title', 'title'),
    ('article_content', 'content'),
    ('question', 'question'),
    ('answer', 'answer'),
):
    data.loc[:, target_col] = preprocess_data(data[source_col].values)

# Show one random row to inspect the added columns.
data.sample(1)

Unnamed: 0,article_id,article_type,article_title,article_content,question_id,question,answer,question_type,title,content
2115,46547,防务快讯,俄罗斯有意联合巴西研制未来先进战斗机,[据俄新社2013年10月14日报道]为使苏-35战斗机重返F-X2战斗机竞标，俄罗斯打算授...,7dbeb36c-1215-4052-a3ba-59070e437e81,俄罗斯有意联合巴西研制什么,未来先进战斗机,事实型问题,俄罗斯有意联合巴西研制未来先进战斗机,[据俄新社2013年10月14日报道]为使苏-35战斗机重返F-X2战斗机竞标，俄罗斯打算授...


In [4]:
# Remove one unwanted trailing punctuation mark from each answer.
BAD_ANSWER_ENDINGS = ('。', '，', '：', '！', '？')


def _clean_answer(answer):
    """Strip surrounding whitespace and at most one trailing punctuation mark.

    Only a single trailing character from ``BAD_ANSWER_ENDINGS`` is removed.
    ``str.endswith`` is used instead of indexing ``[-1]`` so that an answer
    which is empty after stripping returns ``''`` instead of raising
    ``IndexError``.
    """
    stripped = answer.strip()
    if stripped.endswith(BAD_ANSWER_ENDINGS):
        return stripped[:-1]
    return stripped


answers = [_clean_answer(answer) for answer in data.answer.values]
data.loc[:, 'answer'] = answers
# Show one random row to inspect the cleaned answers.
data.sample(1)

Unnamed: 0,article_id,article_type,article_title,article_content,question_id,question,answer,question_type,title,content
979,4236,新闻,难上加难：沙特“爱国者”反导难挡胡塞“火山”弹道导弹袭击（16）,招致了这么多强国的敌视，胡塞武装遭到外部打击的命运，其实早已注定了。图为2014年9月22...,0c434657-f98e-4751-a221-de79a457f318,一群胡塞武装人员站在几辆缴获自也门陆军第1装甲师的步兵战车上欢呼,一,数值型问题,难上加难：沙特“爱国者”反导难挡胡塞“火山”弹道导弹袭击（16）,招致了这么多强国的敌视，胡塞武装遭到外部打击的命运，其实早已注定了。图为2014年9月22...


In [191]:
# Load the stop-word list (one entry per line) into a set.
stopword_file = 'data/stopwords'
# NOTE(review): no explicit encoding is passed, so the platform default is
# used — confirm the file's encoding and pass `encoding=` if it can differ.
with open(stopword_file, 'r') as file:
    # Strip only the line terminator. Slicing off the last character
    # unconditionally would truncate the final entry when the file does not
    # end with a newline.
    stopwords = {line.rstrip('\n') for line in file}

# 构建数据集

In [35]:

def match(title, content, question):
    """Build a condensed context string for a question from an article.

    The content is split into sentences on the Chinese full stop ('。').
    Each sentence is tokenised with jieba and scored against the question's
    token set:

    * a sentence whose tokens are all contained in the question gets -1
      (treated as "this sentence appears in the question");
    * otherwise the score is the number of distinct shared tokens.

    Every sentence with the maximum score, and every sentence scored -1, is
    selected together with its previous and next sentence. The title always
    comes first. Duplicates are removed, keeping the first occurrence.

    Parameters
    ----------
    title : str
        Article title; always the first piece of the result.
    content : str
        Article body.
    question : str
        Question text used to score the sentences.

    Returns
    -------
    str
        The selected pieces joined with '。'. If the content has no non-blank
        sentence, only the title is returned.
    """
    # Strip before filtering, so whitespace-only segments are dropped and
    # never reach the scorer as empty sentences. An empty sentence is a
    # subset of any question and would be scored -1.
    sentences = [segment.strip() for segment in content.split('。')]
    sentences = [sentence for sentence in sentences if sentence]

    pieces = [title]

    # Guard: max() on an empty score list would raise ValueError.
    if sentences:
        question_tokens = set(jieba.cut(question))

        scores = []
        for sentence in sentences:
            sentence_tokens = set(jieba.cut(sentence))
            if sentence_tokens <= question_tokens:
                # Sentence is fully covered by the question's tokens.
                scores.append(-1)
            else:
                scores.append(len(sentence_tokens & question_tokens))

        best_score = max(scores)
        for idx, score in enumerate(scores):
            if score == best_score or score < 0:
                # Previous, current and next sentence, clipped at the ends.
                pieces.extend(sentences[max(idx - 1, 0):idx + 2])

    # Order-preserving de-duplication.
    seen = set()
    unique_pieces = []
    for piece in pieces:
        if piece not in seen:
            seen.add(piece)
            unique_pieces.append(piece)

    return '。'.join(unique_pieces)

# Pull the four normalised text columns out as arrays.
titles, contents, questions, answers = (
    data[column].values
    for column in ('title', 'content', 'question', 'answer')
)

# Build one condensed context per row and store it in the 'merge' column.
merge = list(map(match, titles, contents, questions))
data.loc[:, 'merge'] = merge

# Show one random row to inspect the result.
data.sample(1)

Unnamed: 0,article_id,article_type,article_title,article_content,question_id,question,answer,question_type,title,content,merge,is_in,merge_len
3272,36391,防务快讯,美媒：美国新入役P-8A反潜巡逻机问题多,核心提示： P-8A是双引擎喷气式飞机，速度为910公里/时。参考消息网2月7日报道 外媒称...,c3d838a5-1759-4a56-a20c-781a3f481af2,P-8A与P-3一样，机翼下有可挂载鱼雷或导弹的挂点。海军飞行员相信，P-8A将与什么一样稳...,P-3,事实型问题,美媒：美国新入役P-8A反潜巡逻机问题多,核心提示： P-8A是双引擎喷气式飞机，速度为910公里/时。参考消息网2月7日报道 外媒称...,美媒：美国新入役P-8A反潜巡逻机问题多。报道称，当然，P-8A也有许多优势。P-8A是双引...,True,176


# 评估数据集构建效果

In [36]:
# Coverage (printed as "accuracy"): share of rows whose answer string occurs
# verbatim in the condensed context.
is_in = [answer in context for context, answer in zip(merge, answers)]
data.loc[:, 'is_in'] = is_in
print('accuracy: %.4f' % (sum(is_in)/len(is_in)))

# Context length, measured in jieba tokens.
merge_len = [len(list(jieba.cut(context))) for context in merge]
data.loc[:, 'merge_len'] = merge_len
print('max length: %d' % max(merge_len))
print('min length: %d' % min(merge_len))
print('mean length: %d' % data.merge_len.mean())

# Cumulative share of rows by context length. The counts are sorted by
# length with sort_index(); indexing with list(range(100000)) asks for labels
# that are mostly absent, which raises KeyError on pandas >= 1.0.
a = data.merge_len.value_counts().sort_index()
num = a.cumsum() / len(data)

# Smallest length whose cumulative share exceeds 98% of the rows.
split_len = num[num > 0.98].index[0]
print('the split length (data > 0.98): %d' % split_len)
print('mean length (data > 0.98): %d' % int(data[data.merge_len <= split_len].merge_len.mean()))


accuracy: 0.9498
max length: 5406
min length: 29
mean length: 123
the split length (data > 0.98): 395
mean length (data > 0.98): 108


395     0.9802
398     0.9804
403     0.9806
406     0.9808
407     0.9810
408     0.9812
410     0.9814
412     0.9816
416     0.9820
419     0.9822
421     0.9824
425     0.9826
428     0.9828
429     0.9830
430     0.9834
431     0.9836
433     0.9838
436     0.9840
439     0.9842
444     0.9844
445     0.9846
446     0.9848
448     0.9852
453     0.9856
458     0.9858
460     0.9860
462     0.9862
463     0.9864
464     0.9866
466     0.9868
         ...  
734     0.9942
743     0.9944
744     0.9946
745     0.9948
770     0.9950
813     0.9952
816     0.9954
846     0.9956
853     0.9958
863     0.9960
917     0.9962
929     0.9964
971     0.9966
1093    0.9968
1130    0.9970
1227    0.9972
1365    0.9974
1412    0.9976
1457    0.9978
1475    0.9980
1495    0.9982
1692    0.9984
1703    0.9986
1751    0.9988
2053    0.9990
2253    0.9992
3500    0.9994
3771    0.9996
4037    0.9998
5406    1.0000
Name: merge_len, Length: 90, dtype: float64

In [None]:
# Results recorded from the evaluation cell above, kept for reference:
# accuracy: 0.9498
# max length: 5406
# min length: 29
# mean length: 123
# the split length (data > 0.98): 395
# mean length (data > 0.98): 108