## Import packages

In [1]:
import numpy as np
import pandas as pd
import random
from tqdm import tqdm
import jieba
import jieba.posseg as pseg
from pyhanlp import *

## Data file path

In [2]:
train_path = './data/ai_challenger_oqmrc_trainingset.json' # train set
valid_path = './data/ai_challenger_oqmrc_validationset.json' # validation set
test_path = './data/ai_challenger_oqmrc_testa.json' # test set

## Load my own dictionary from sougou to help jieba cut

jieba.load_userdict('./my_dict.txt')

##  Read file

In [3]:
train_set = pd.read_json(train_path, orient='records', encoding='utf-8', lines=True)
train_set.shape

(250000, 6)

In [4]:
valid_set = pd.read_json(valid_path, orient='records', encoding='utf-8', lines=True)
valid_set.head()
#valid_set.shape

Unnamed: 0,alternatives,answer,passage,query,query_id,url
0,有|没有|无法确定,有,动漫好看的H：爱的魔法，KEY的作品，喧嚣学院，草莓100%，双恋，爱丽丝学园，灼眼的夏娜，...,有没有好看的h,250001,http://iask.sina.com.cn/key/5a18d46b84aedabb5c...
1,能|不能|无法确定,能,醋泡鸡蛋确实具有一定美白嫩化肌肤、提高皮肤亮度、祛斑的效果，因为白醋中含有的醋酸可以加速表皮...,醋泡鸡蛋真能去斑吗,250002,http://www.120ask.com/question/65970789.htm
2,听不懂|听得懂|无法确定,听不懂,人有人言，兽有兽语。动物是不会听懂人说话的,老鼠听得懂人话吗,250003,http://wenwen.sogou.com/z/q166740184.htm
3,无法确定|大|不大,无法确定,1.前期投资约5-10万元设备投资：柜台、门面装修、电脑及简单家具，一次性投入约2万元。2....,开洗车店投资大吗,250004,http://wenwen.sogou.com/z/q705319471.htm
4,会|不会|无法确定,会,性接触没有保护措施，是有感染的几率的，艾滋病没有特异性的症状。,类似性行为会不会感染艾滋病,250005,http://www.169kang.com/question/166710467.html


In [5]:
test_set = pd.read_json(test_path, orient='records', encoding='utf-8', lines=True)
print (test_set.shape)
test_set.head()

(10000, 5)


Unnamed: 0,alternatives,passage,query,query_id,url
0,能|不能|无法确定,武威公交一体化纪实 10家运输公司中标经营包括凉州区、古浪、民勤、天祝在内的城乡公交线路。经...,武威的公交卡古浪能不能用,280001,http://gsrb.gansudaily.com.cn/system/2009/08/2...
1,能|不能|无法确定,现在这个社会什么买不到，只要你有钱是不是 欢迎光临【深圳平安安防】无线的有线的都有呢，看你喜...,能买到无线偷拍器吗,280002,http://wenwen.sogou.com/z/q701006723.htm
2,是真的|不是真的|无法确定,请问朋友们网上中安信业代款是真的吗？ 【百度反诈骗联盟团队】特别提醒：网上发布的所有只凭身份...,中安信业减免还款是真实的吗,280003,http://wenwen.sogou.com/z/q763575352.htm
3,能|不能|无法确定,对于这些的话也可以咨询一下你的直属上司或者是领导，他们专业的意见也都是可以的。,petct医保报销吗,280004,http://www.mama.cn/ask/q13547252-p1.html
4,慢热|不慢热|无法确定,在巨蟹座当中，慢热型的性格，更是让她们的爱心与细腻，更好的发挥到极致。,巨蟹座慢热么,280005,http://www.d1xz.net/astro/Cancer/art117849.aspx


## Part I: Preprocess

In [6]:
f=open('./tongyici/tongyici.txt','r', encoding='gbk')
lines=f.readlines()
print (lines[1])
sym_words=[]
for line in lines:
    line=line.replace('\n','')
    items=line.split(' ')
    index=items[0]
    if(index[-1]=='='):
        sym_words.append(items[1:])
f.close()

Aa01A02= 人类 生人 全人类



In [7]:
def expand(l):
    for w in l:
        for each in sym_words:
            if w in each:
                w = each
    return l

#### preprocess function to clean a text 
cut words, remove punctuation, lower case

In [21]:
def preprocess(text, alternatives, aug=False):
    '''
    dic = {}
    for i in alternatives.split('|'):
        if not (i == '无法确定' or i == '无法确认'):
            f = jieba.get_FREQ(i, 0)
            #print (f)
            dic[i] = f
            jieba.add_word(i)
    '''
    sent = jieba.lcut(text, HMM=False)
    #sent = nlp_seg(text)
    '''
    for i in dic:
        jieba.del_word(i)
        jieba.add_word(i, dic[i])
    '''
    for i in range(len(sent)):
        if sent[i] in "[\s+\.\!\/_,$%^*(+\"\']+|[+——！，。？、~@#￥%……&*（）：【】]+":
            sent[i] = ' '
        elif aug and random.random()<0.1: # data augmentation
            sent[i] = ' '
        else:
            sent[i].lower()
    sent = expand(sent)
    sent = ' '.join(sent)
    return sent

#### Merge the query and alternatives
Use the preprocessed query, remove the last 吗 or 么. If same word exists, replace it with the current option. Otherwise put the option in the head 

In [23]:
# concatenate query and alternatives
def query_alt(query, alternatives, a):
    '''
    query: line['query'] from original dataframe
    alternatives: line['alternatives'] from original dataframe
    a: current option in alternatives to be merged with query
    
    return: query and current option a concatenated (preprocessed)
    '''
    
    query = preprocess(query, alternatives).split()
    if query[-1] == "吗" or query[-1] == "么" or query[-1] == "嘛" or query[-1] == "不":
        del query[-1]
    '''
    match = None # 问题里要替换的词
    for i in alternatives.split('|'):
        if i in query:
            match = i
            break
        elif (i == '能') and ('可以' in query):
            match = '可以'
            break
        elif (i == '可以') and ('能' in query):
            match = '能'
            break
        elif (i == '可以') and ('会' in query):
            match = '会'
            break
        elif (i == '会') and ('可以' in query):
            match = '可以'
            break
        elif (i == '会') and ('能' in query):
            match = '能'
            break
        elif (i == '能') and ('会' in query):
            match = '会'
            break
            
    if match:
        ind = query.index(match)
        query[ind] = a
        merged = ' '.join(query)
    else:
        l = [a]
        l.extend(query)
        merged = ' '.join(l)
    '''
    l = [a]
    l.extend(query)
    merged = ' '.join(l)
        
    return merged

#### write into the tsv file

In [22]:
with open('.//data//train.tsv', 'w', encoding='utf-8') as fw:
    fw.write('id' + '\t' + 'passage' + '\t' + 'query' + '\t' + 'option' + '\t' + 'label' + '\n')
    for i in tqdm(range(train_set.shape[0])):
        line = train_set.iloc[i]
        for a in line['alternatives'].split('|'):
            m = query_alt(query=line['query'], alternatives=line['alternatives'], a=a)
            if a == line['answer']:
                fw.write(str(line['query_id'])+ '\t'+ preprocess(line['passage'], line['alternatives'], True)+ '\t'+ m+ '\t'+ a+ '\t'+ '1'+'\n')
            else:
                fw.write(str(line['query_id'])+ '\t'+ preprocess(line['passage'], line['alternatives'], True)+ '\t'+ m+ '\t'+ a+ '\t'+ '0'+'\n')
fw.close()

  0%|                                                                                       | 0/250000 [00:00<?, ?it/s]Building prefix dict from the default dictionary ...
Loading model from cache C:\Users\SJ\AppData\Local\Temp\jieba.cache
Loading model cost 0.587 seconds.
Prefix dict has been built succesfully.
  0%|▏                                                                         | 430/250000 [01:39<10:47:24,  6.42it/s]

KeyboardInterrupt: 

In [9]:
with open('./data/valid.tsv', 'w', encoding='utf-8') as fw:
    fw.write('id' + '\t' + 'passage' + '\t' + 'query' + '\t' + 'option' + '\t' + 'label' + '\n')
    for i in tqdm(range(valid_set.shape[0])):
        line = valid_set.iloc[i]
        for a in line['alternatives'].split('|'):
            m = query_alt(query=line['query'], alternatives=line['alternatives'], a=a)
            if a == line['answer']:
                fw.write(str(line['query_id'])+ '\t'+ preprocess(line['passage'], line['alternatives'])+ '\t'+ m+ '\t'+ a+ '\t'+ '1'+'\n')
            else:
                fw.write(str(line['query_id'])+ '\t'+ preprocess(line['passage'], line['alternatives'])+ '\t'+ m+ '\t'+ a+ '\t'+ '0'+'\n')
fw.close()

100%|███████████████████████████████████████████████████████████████████████████| 30000/30000 [00:40<00:00, 733.97it/s]


In [10]:
with open('./data/test.tsv', 'w', encoding='utf-8') as fw:
    fw.write('id' + '\t' + 'passage' + '\t' + 'query' + '\t'+ 'option'+ '\n')
    for i in tqdm(range(test_set.shape[0])):
        line = test_set.iloc[i]
        for a in line['alternatives'].split('|'):
            m = query_alt(query=line['query'], alternatives=line['alternatives'], a=a)
            fw.write(str(line['query_id'])+ '\t'+ preprocess(line['passage'], line['alternatives'])+ '\t'+ m+ '\t'+ a+ '\n')
fw.close()

100%|███████████████████████████████████████████████████████████████████████████| 10000/10000 [00:13<00:00, 753.40it/s]


## <font color=red>测试集test A 有两条有误！

289730只有一个选项，无法确定。

289334只有两个选项，不能or无法确定。选不能

## Part II: Add features for words in passage
Add Exact match, soft align and Pos tag features. Refer to SAN model: https://arxiv.org/abs/1712.03556

Delete soft-align and pos tag. Add option exact match.

In [11]:
maxlen_p = 150
maxlen_q = 15

In [12]:
train_path = './data/train.tsv' # train set
valid_path = './data/valid.tsv' # validation set
test_path = './data/test.tsv' # test set

In [13]:
train = pd.read_csv(train_path, sep='\t', header=0)
valid = pd.read_csv(valid_path, sep='\t', header=0)
test = pd.read_csv(test_path, sep='\t', header=0)
print (train.shape, valid.shape, test.shape)
print (valid.head())

(750000, 5) (90000, 5) (29997, 4)
       id                                            passage  \
0  250001  动漫 好看 的 H   爱 的 魔法   KEY 的 作品   喧嚣 学院   草莓 100...   
1  250001  动漫 好看 的 H   爱 的 魔法   KEY 的 作品   喧嚣 学院   草莓 100...   
2  250001  动漫 好看 的 H   爱 的 魔法   KEY 的 作品   喧嚣 学院   草莓 100...   
3  250002  醋 泡 鸡蛋 确实 具有 一定 美 白嫩 化 肌肤   提高 皮肤 亮度   祛斑 的 效果...   
4  250002  醋 泡 鸡蛋 确实 具有 一定 美 白嫩 化 肌肤   提高 皮肤 亮度   祛斑 的 效果...   

               query option  label  
0       有 有没有 好看 的 h      有      1  
1      没有 有没有 好看 的 h     没有      0  
2    无法确定 有没有 好看 的 h   无法确定      0  
3   能 醋 泡 鸡蛋 真 能 去 斑      能      1  
4  不能 醋 泡 鸡蛋 真 能 去 斑     不能      0  


In [14]:
pl = []
ql = []
for i in tqdm(range(train.shape[0])):
    line = train.iloc[i]
    q_words = line['query'].split()
    p_words = line['passage'].split()
    option = line['option']
    
    if len(p_words) > maxlen_p: # truncate pre
        lt = len(p_words) - maxlen_p
        p_words = p_words[lt:]
    if len(q_words) > maxlen_q: # truncate post
        q_words = q_words[:maxlen_q]

    pfea = []
    for w in p_words:
        # exact match
        if w in q_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        pfea.append([em, om])
        
    qfea = []
    for w in q_words:
        # exact match
        if w in p_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        qfea.append([em, om])
        
    while len(pfea) < maxlen_p: # pad with 0 pre
        pfea.insert(0, [0] * 2)
    pl.append(pfea)
    while len(qfea) < maxlen_q: # pad with 0 post
        qfea.append([0] * 2)
    ql.append(qfea)

pl = np.asarray(pl)
ql = np.asarray(ql)
np.save('./data/train_fea_p', pl)
np.save('./data/train_fea_q', ql)
print (np.shape(pl), np.shape(ql))

100%|████████████████████████████████████████████████████████████████████████| 750000/750000 [03:19<00:00, 3756.82it/s]


(750000, 150, 2) (750000, 15, 2)


In [15]:
pl = []
ql = []
for i in tqdm(range(valid.shape[0])):
    line = valid.iloc[i]
    q_words = line['query'].split()
    p_words = line['passage'].split()
    option = line['option']
    
    if len(p_words) > maxlen_p: # truncate pre
        lt = len(p_words) - maxlen_p
        p_words = p_words[lt:]
    if len(q_words) > maxlen_q: # truncate post
        q_words = q_words[:maxlen_q]

    pfea = []
    for w in p_words:
        # exact match
        if w in q_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        pfea.append([em, om])
        
    qfea = []
    for w in q_words:
        # exact match
        if w in p_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        qfea.append([em, om])
        
    while len(pfea) < maxlen_p: # pad with 0 pre
        pfea.insert(0, [0] * 2)
    pl.append(pfea)
    while len(qfea) < maxlen_q: # pad with 0 post
        qfea.append([0] * 2)
    ql.append(qfea)

pl = np.asarray(pl)
ql = np.asarray(ql)
np.save('./data/valid_fea_p', pl)
np.save('./data/valid_fea_q', ql)
print (np.shape(pl), np.shape(ql))

100%|██████████████████████████████████████████████████████████████████████████| 90000/90000 [00:22<00:00, 3982.55it/s]


(90000, 150, 2) (90000, 15, 2)


In [16]:
pl = []
ql = []
for i in tqdm(range(test.shape[0])):
    line = test.iloc[i]
    q_words = line['query'].split()
    p_words = line['passage'].split()
    option = line['option']
    
    if len(p_words) > maxlen_p: # truncate pre
        lt = len(p_words) - maxlen_p
        p_words = p_words[lt:]
    if len(q_words) > maxlen_q: # truncate post
        q_words = q_words[:maxlen_q]

    pfea = []
    for w in p_words:
        # exact match
        if w in q_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        pfea.append([em, om])
        
    qfea = []
    for w in q_words:
        # exact match
        if w in p_words:
            em = 1
        else:
            em = 0
        # option match
        if w == option:
            om = 1
        else:
            om = 0
        qfea.append([em, om])
        
    while len(pfea) < maxlen_p: # pad with 0 pre
        pfea.insert(0, [0] * 2)
    pl.append(pfea)
    while len(qfea) < maxlen_q: # pad with 0 post
        qfea.append([0] * 2)
    ql.append(qfea)

pl = np.asarray(pl)
ql = np.asarray(ql)
np.save('./data/test_fea_p', pl)
np.save('./data/test_fea_q', ql)
print (np.shape(pl), np.shape(ql))

100%|██████████████████████████████████████████████████████████████████████████| 29997/29997 [00:08<00:00, 3665.21it/s]


(29997, 150, 2) (29997, 15, 2)
