## Import packages

In [1]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import jieba
import jieba.posseg as pseg
from pyhanlp import *

## Data file path

In [2]:
# Raw dataset files; they are read below with pd.read_json(..., lines=True).
# NOTE(review): these three names are re-bound to the generated .tsv paths in Part II,
# so re-running the JSON loading cells after that point would read the wrong files.
train_path = './data/ai_challenger_oqmrc_trainingset.json' # train set
valid_path = './data/ai_challenger_oqmrc_validationset.json' # validation set
test_path = './data/ai_challenger_oqmrc_testa.json' # test set

## Load a custom dictionary (built from Sogou word lists) to help jieba segment the text

# Register the extra vocabulary with jieba before any text is segmented.
jieba.load_userdict('./my_dict.txt')

##  Read file

In [3]:
# Load the training set: one JSON record per line (lines=True).
train_set = pd.read_json(train_path, orient='records', encoding='utf-8', lines=True)
# Last expression of the cell: displays (rows, columns).
train_set.shape

(250000, 6)

In [4]:
# Load the validation set (same line-delimited JSON layout as the training set).
valid_set = pd.read_json(valid_path, orient='records', encoding='utf-8', lines=True)
# Preview the first records to check the columns.
valid_set.head()

Unnamed: 0,alternatives,answer,passage,query,query_id,url
0,有|没有|无法确定,有,动漫好看的H：爱的魔法，KEY的作品，喧嚣学院，草莓100%，双恋，爱丽丝学园，灼眼的夏娜，...,有没有好看的h,250001,http://iask.sina.com.cn/key/5a18d46b84aedabb5c...
1,能|不能|无法确定,能,醋泡鸡蛋确实具有一定美白嫩化肌肤、提高皮肤亮度、祛斑的效果，因为白醋中含有的醋酸可以加速表皮...,醋泡鸡蛋真能去斑吗,250002,http://www.120ask.com/question/65970789.htm
2,听不懂|听得懂|无法确定,听不懂,人有人言，兽有兽语。动物是不会听懂人说话的,老鼠听得懂人话吗,250003,http://wenwen.sogou.com/z/q166740184.htm
3,无法确定|大|不大,无法确定,1.前期投资约5-10万元设备投资：柜台、门面装修、电脑及简单家具，一次性投入约2万元。2....,开洗车店投资大吗,250004,http://wenwen.sogou.com/z/q705319471.htm
4,会|不会|无法确定,会,性接触没有保护措施，是有感染的几率的，艾滋病没有特异性的症状。,类似性行为会不会感染艾滋病,250005,http://www.169kang.com/question/166710467.html


In [5]:
# Load the test set; per the preview below it has no 'answer' column.
test_set = pd.read_json(test_path, orient='records', encoding='utf-8', lines=True)
print (test_set.shape)
# Preview the first records to check the columns.
test_set.head()

(10000, 5)


Unnamed: 0,alternatives,passage,query,query_id,url
0,能|不能|无法确定,武威公交一体化纪实 10家运输公司中标经营包括凉州区、古浪、民勤、天祝在内的城乡公交线路。经...,武威的公交卡古浪能不能用,280001,http://gsrb.gansudaily.com.cn/system/2009/08/2...
1,能|不能|无法确定,现在这个社会什么买不到，只要你有钱是不是 欢迎光临【深圳平安安防】无线的有线的都有呢，看你喜...,能买到无线偷拍器吗,280002,http://wenwen.sogou.com/z/q701006723.htm
2,是真的|不是真的|无法确定,请问朋友们网上中安信业代款是真的吗？ 【百度反诈骗联盟团队】特别提醒：网上发布的所有只凭身份...,中安信业减免还款是真实的吗,280003,http://wenwen.sogou.com/z/q763575352.htm
3,能|不能|无法确定,对于这些的话也可以咨询一下你的直属上司或者是领导，他们专业的意见也都是可以的。,petct医保报销吗,280004,http://www.mama.cn/ask/q13547252-p1.html
4,慢热|不慢热|无法确定,在巨蟹座当中，慢热型的性格，更是让她们的爱心与细腻，更好的发挥到极致。,巨蟹座慢热么,280005,http://www.d1xz.net/astro/Cancer/art117849.aspx


## Part I: Preprocess

In [6]:
def nlp_seg(text):
    """Segment ``text`` with HanLP's NLPTokenizer and return the token strings as a list."""
    # Look the Java tokenizer class up through pyhanlp's JClass bridge.
    tokenizer = JClass("com.hankcs.hanlp.tokenizer.NLPTokenizer")
    # Each segmented term exposes its surface form as ``.word``.
    return [term.word for term in tokenizer.segment(text)]
print (nlp_seg('甲减能自愈吗'))

['甲减', '能', '自', '愈', '吗']


#### `preprocess`: function to clean a piece of text
Segment the text into words, replace punctuation with spaces, and convert to lower case.

In [7]:
# Characters that preprocess() blanks out. These are the individual punctuation
# characters of the pattern string the function used to test against; the
# letter 's' (which only appeared there as part of a regex-style "\s") is
# deliberately NOT included, so ordinary 's' tokens are kept.
_PUNCTUATION_CHARS = frozenset("[]|\\+.!/_,$%^*(\"'—！，。？、~@#￥…&（）：【】")

# Segmentation hints passed to jieba.suggest_freq before cutting: the negation
# pairs are given as tuples (suggest a split), the last two as whole words.
_SEGMENTATION_HINTS = (
    ('不', '会'),
    ('不', '能'),
    ('不', '行'),
    ('不', '好'),
    ('不', '要'),
    ('不', '是'),
    '不',
    '无法确定',
)


def preprocess(text, aug=False):
    """Segment ``text`` with jieba and return the cleaned tokens joined by spaces.

    text: raw string (passage or query).
    aug: when truthy, each non-punctuation token is additionally blanked with
         probability 0.1 (data augmentation). Uses the global ``random`` state.

    return: a single string of tokens separated by ' '. Tokens consisting only
            of punctuation or whitespace are replaced by ' '; all other tokens
            are lower-cased.
    """
    # NOTE(review): the hints are re-applied on every call, as before; they look
    # like one-time setup and could probably be hoisted — confirm before moving.
    for hint in _SEGMENTATION_HINTS:
        jieba.suggest_freq(hint, tune=True)
    tokens = jieba.lcut(text, HMM=False)

    cleaned = []
    for token in tokens:
        is_blank = (
            not token
            or token.isspace()  # tabs/newlines must not leak into the TSV output
            or all(ch in _PUNCTUATION_CHARS for ch in token)
        )
        if is_blank:
            cleaned.append(' ')
        elif aug and random.random() < 0.1:  # data augmentation
            cleaned.append(' ')
        else:
            # str.lower() returns a new string; keep the result.
            cleaned.append(token.lower())
    return ' '.join(cleaned)

#print (preprocess('塞庚啶好还是扑尔敏好', 0))

In [13]:
# concatenate query and alternatives
# For a modal option that does not literally occur in the question, the
# near-synonyms to look for instead, in priority order.
_MODAL_SYNONYMS = {
    '能': ('可以', '会'),
    '可以': ('能', '会'),
    '会': ('可以', '能'),
}


def query_alt(query, alternatives, a, aug=False):
    '''
    Merge one answer option into the question text and preprocess the result.

    query: line['query'] from original dataframe
    alternatives: line['alternatives'] from original dataframe ('|'-separated options)
    a: current option in alternatives to be merged with query
    aug: forwarded to preprocess() as its ``aug`` flag. Defaults to False;
         previously the ``alternatives`` string was passed in that position,
         which silently enabled random token dropping for every query
         (validation and test included).

    return: query and current option a concatenated (preprocessed)
    '''
    query = query.strip()

    # Candidate options with the "cannot determine" choice taken out (first
    # occurrence only, so records with duplicated options behave as before).
    options = [opt.strip() for opt in alternatives.split('|')]
    for undetermined in ('无法确认', '无法确定'):
        if undetermined in options:
            options.remove(undetermined)

    # endswith() is False for an empty query, where query[-1] would raise.
    if query.endswith(('吗', '么', '嘛', '不')):
        # Question ends with a question particle: drop it, then substitute the
        # option for the matching word inside the question.
        query = query[:-1]

        # Prefer the longer option when one contains the other, so the longer
        # form is matched before its shorter substring.
        if len(options) >= 2:
            if options[0] in options[1]:
                long, short = options[1], options[0]
            else:
                long, short = options[0], options[1]
        elif options:
            # Malformed record with a single usable option.
            long = short = options[0]
        else:
            # Malformed record with no usable option: nothing to match.
            long = short = None

        match = None
        if long is not None:
            if long in query:
                match = long
            elif short in query:
                match = short
            else:
                for synonym in _MODAL_SYNONYMS.get(short, ()):
                    if synonym in query:
                        match = synonym
                        break

        if match:
            query = query.replace(match, a)
        else:
            # Nothing to substitute: prepend the option to the question.
            query = a + query

        return preprocess(query, aug)

    # Otherwise both the positive and the negative word in the question have to be replaced.
    if len(options) >= 2 and options[0] in query and options[1] in query:
        # Both options occur in the question.
        first, second = options[0], options[1]
        if first + second in query:  # adjacent pair, e.g. 有没有 / 会不会
            query = query.replace(first + second, a)
        elif second + first in query:
            query = query.replace(second + first, a)
        else:  # "A or B" style question: keep the chosen side only
            if a == first:
                query = query.replace(second, ' ')
            elif a == second:
                query = query.replace(first, ' ')
            else:  # the "cannot determine" option
                query = query.replace(first, ' ')
                query = query.replace(second, a)
    else:
        # The two options do not both occur verbatim: replace a "whether" marker if present.
        for marker in ('能否', '是否', '可否'):
            if marker in query:
                query = query.replace(marker, a)
                break

    return preprocess(query, aug)

### <font color=red>Some records have wrong data: 三个选项中有重复 (duplicates among the three options)</font>
train: 25635, 59325, 89727, 120004, 143044, 154519, 219549, 249465, 249485

valid: 22156, 23656

In [9]:
# Inspect one of the training records listed above.
print (train_set.iloc[154519])

alternatives                        浴网好|浴板好|无法确定
answer                                       浴网好
passage             婴儿浴盆的选择是要看宝宝的情况的，一般来说浴网比较好用。
query                              婴儿浴盆带浴网好还是浴板好
query_id                                  154520
url             http://www.mama.cn/z/wiki/56092/
Name: 154519, dtype: object


#### write into the tsv file

In [15]:
# Write the training set as TSV: id, cleaned passage, the three option-merged
# queries (in shuffled order) and the index of the correct one.
with open('./data/train1.tsv', 'w', encoding='utf-8') as fw:
    fw.write('\t'.join(['id', 'passage', 'a0', 'a1', 'a2', 'answer']) + '\n')
    for row_idx in tqdm(range(train_set.shape[0])):
        record = train_set.iloc[row_idx]
        fields = [str(record['query_id']), preprocess(record['passage'])]
        question = record['query']
        options = record['alternatives'].split('|')
        random.shuffle(options)  # so the correct answer is not always in the same column
        # NOTE(review): stays 0 when no option equals the answer — verify against the bad records.
        answer_idx = 0
        for pos, raw_option in enumerate(options):
            option = raw_option.strip()
            fields.append(query_alt(query=question, alternatives=record['alternatives'], a=option))
            if option == record['answer'].strip():
                answer_idx = pos
        fields.append(str(answer_idx))
        fw.write('\t'.join(fields) + '\n')


  0%|                                                                                       | 0/250000 [00:00<?, ?it/s]
  0%|                                                                             | 79/250000 [00:00<05:19, 782.96it/s]
  0%|                                                                            | 172/250000 [00:00<05:16, 790.41it/s]
  0%|                                                                            | 252/250000 [00:00<05:16, 789.95it/s]
  0%|                                                                            | 337/250000 [00:00<05:09, 805.96it/s]
  0%|▏                                                                           | 432/250000 [00:00<05:07, 811.15it/s]
  0%|▏                                                                           | 516/250000 [00:00<05:06, 813.44it/s]
  0%|▏                                                                           | 611/250000 [00:00<05:04, 818.92it/s]
  0%|▏                                 

In [16]:
# Write the validation set as TSV: id, cleaned passage, the three option-merged
# queries (in shuffled order) and the index of the correct one.
with open('./data/valid1.tsv', 'w', encoding='utf-8') as fw:
    fw.write('\t'.join(['id', 'passage', 'a0', 'a1', 'a2', 'answer']) + '\n')
    for row_idx in tqdm(range(valid_set.shape[0])):
        record = valid_set.iloc[row_idx]
        fields = [str(record['query_id']), preprocess(record['passage'])]
        question = record['query']
        options = record['alternatives'].split('|')
        random.shuffle(options)  # so the correct answer is not always in the same column
        # NOTE(review): stays 0 when no option equals the answer — verify against the bad records.
        answer_idx = 0
        for pos, raw_option in enumerate(options):
            option = raw_option.strip()
            fields.append(query_alt(query=question, alternatives=record['alternatives'], a=option))
            if option == record['answer'].strip():
                answer_idx = pos
        fields.append(str(answer_idx))
        fw.write('\t'.join(fields) + '\n')

100%|███████████████████████████████████████████████████████████████████████████| 30000/30000 [00:48<00:00, 622.06it/s]


In [17]:
# Write the test set as TSV: id, cleaned passage and the option-merged queries
# in their original order (no shuffling, no answer column).
with open('./data/test1.tsv', 'w', encoding='utf-8') as fw:
    fw.write('\t'.join(['id', 'passage', 'a0', 'a1', 'a2']) + '\n')
    for row_idx in tqdm(range(test_set.shape[0])):
        record = test_set.iloc[row_idx]
        fields = [str(record['query_id']), preprocess(record['passage'])]
        question = record['query']
        for raw_option in record['alternatives'].split('|'):
            option = raw_option.strip()
            fields.append(query_alt(query=question, alternatives=record['alternatives'], a=option))
        fw.write('\t'.join(fields) + '\n')

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:16<00:00, 598.02it/s]


## <font color=red>测试集test A 有两条有误！ (Two records in test set A are malformed!)</font>

289730只有一个选项，无法确定。

289334只有两个选项，不能or无法确定。选不能

## Part II: Add features for words in passage
Originally this step added exact-match, soft-align and POS-tag features for each word, following the SAN model: https://arxiv.org/abs/1712.03556

The soft-align and POS-tag features were later removed, and an option exact-match feature was added instead.

In [18]:
# Fixed sequence lengths used by the feature cells below.
maxlen_p = 150  # passage tokens kept (longer passages keep their last 150 tokens)
maxlen_q = 15   # query tokens kept (longer queries keep their first 15 tokens)

In [19]:
# Paths of the TSV files generated in Part I.
# NOTE(review): this re-binds train_path / valid_path / test_path, which earlier held
# the raw JSON paths; re-running the JSON loading cells after this cell reads the TSVs instead.
train_path = './data/train1.tsv' # train set
valid_path = './data/valid1.tsv' # validation set
test_path = './data/test1.tsv' # test set

In [20]:
# Read the tab-separated files back; the first line of each file is the header.
train = pd.read_csv(train_path, sep='\t', header=0)
valid = pd.read_csv(valid_path, sep='\t', header=0)
test = pd.read_csv(test_path, sep='\t', header=0)

In [22]:
# Sanity check: (rows, columns) of each split.
print (train.shape, valid.shape, test.shape)

(250000, 6) (30000, 6) (10000, 5)


In [20]:
# Token-level features for the training set. For every passage token and every
# query token the feature pair is [exact match with the other text, equals the option].
# NOTE(review): this cell reads 'query' and 'option' columns, but the train1.tsv header
# written in Part I is id/passage/a0/a1/a2/answer. It presumably ran against a
# long-format frame (one row per passage/option pair) — confirm before Restart & Run All.
n_rows = train.shape[0]
pl = np.zeros((n_rows, maxlen_p, 2), dtype=int)  # passage features, zero-padded at the front
ql = np.zeros((n_rows, maxlen_q, 2), dtype=int)  # query features, zero-padded at the end

# Iterate over the columns directly; per-row .iloc access is far slower.
rows = zip(train['query'], train['passage'], train['option'])
for row, (query_text, passage_text, option) in enumerate(tqdm(rows, total=n_rows)):
    q_words = query_text.split()
    p_words = passage_text.split()

    if len(p_words) > maxlen_p: # truncate pre: keep the last maxlen_p tokens
        p_words = p_words[len(p_words) - maxlen_p:]
    if len(q_words) > maxlen_q: # truncate post: keep the first maxlen_q tokens
        q_words = q_words[:maxlen_q]

    # Sets give constant-time membership tests for the exact-match feature.
    q_vocab = set(q_words)
    p_vocab = set(p_words)

    # Empty token lists are skipped: the row simply stays all zeros.
    if p_words:
        pl[row, maxlen_p - len(p_words):] = [[w in q_vocab, w == option] for w in p_words]
    if q_words:
        ql[row, :len(q_words)] = [[w in p_vocab, w == option] for w in q_words]

np.save('./data/train_fea_p', pl)
np.save('./data/train_fea_q', ql)
print (np.shape(pl), np.shape(ql))

100%|█████████████████████████████████████████████████████████████████████████| 750000/750000 [32:30<00:00, 384.44it/s] 88%|███████████████████████████████████████████████████████████████▉         | 656584/750000 [31:47<04:31, 344.12it/s]


(750000, 150, 2) (750000, 15, 2)


In [21]:
pl = []
ql = []
for i in tqdm(range(valid.shape[0])):
    line = valid.iloc[i]
    q_words = line['query'].split()
    p_words = line['passage'].split()
    option = line['option']
    
    if len(p_words) > maxlen_p: # truncate pre
        lt = len(p_words) - maxlen_p
        p_words = p_words[lt:]
    if len(q_words) > maxlen_q: # truncate post
        q_words = q_words[:maxlen_q]

    pfea = []
    for w in p_words:
        # exact match
        if w in q_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        pfea.append([em, om])
        
    qfea = []
    for w in q_words:
        # exact match
        if w in p_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        qfea.append([em, om])
        
    while len(pfea) < maxlen_p: # pad with 0 pre
        pfea.insert(0, [0] * 2)
    pl.append(pfea)
    while len(qfea) < maxlen_q: # pad with 0 post
        qfea.append([0] * 2)
    ql.append(qfea)

pl = np.asarray(pl)
ql = np.asarray(ql)
np.save('./data/valid_fea_p', pl)
np.save('./data/valid_fea_q', ql)
print (np.shape(pl), np.shape(ql))

100%|██████████████████████████████████████████████████████████████████████████| 90000/90000 [00:35<00:00, 2535.87it/s]


(90000, 150, 2) (90000, 15, 2)


In [22]:
pl = []
ql = []
for i in tqdm(range(test.shape[0])):
    line = test.iloc[i]
    q_words = line['query'].split()
    p_words = line['passage'].split()
    option = line['option']
    
    if len(p_words) > maxlen_p: # truncate pre
        lt = len(p_words) - maxlen_p
        p_words = p_words[lt:]
    if len(q_words) > maxlen_q: # truncate post
        q_words = q_words[:maxlen_q]

    pfea = []
    for w in p_words:
        # exact match
        if w in q_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        pfea.append([em, om])
        
    qfea = []
    for w in q_words:
        # exact match
        if w in p_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        qfea.append([em, om])
        
    while len(pfea) < maxlen_p: # pad with 0 pre
        pfea.insert(0, [0] * 2)
    pl.append(pfea)
    while len(qfea) < maxlen_q: # pad with 0 post
        qfea.append([0] * 2)
    ql.append(qfea)

pl = np.asarray(pl)
ql = np.asarray(ql)
np.save('./data/test_fea_p', pl)
np.save('./data/test_fea_q', ql)
print (np.shape(pl), np.shape(ql))

100%|██████████████████████████████████████████████████████████████████████████| 30000/30000 [00:09<00:00, 3117.64it/s]


(30000, 150, 2) (30000, 15, 2)
