In [164]:
import os
import sys
import statistics
import collections
import numpy as np
import pandas as pd
import json
import random


In [179]:
# The three WikiQA sentence-level splits sit side by side in one directory;
# only the split name differs between the file names.
_wikiqa_dir = '/data/sulixin/research/contest/WikiQACodePackage/data/wiki'
train_file, dev_file, test_file = (
    '{}/WikiQASent-{}.txt'.format(_wikiqa_dir, split_name)
    for split_name in ('train', 'dev', 'test')
)

In [185]:
def load_file(data_file):
    """Read a tab-separated WikiQA sentence file into a list of records.

    Every line must hold exactly three tab-separated fields
    (query, passage, label) and the label must be '0' or '1'.

    Args:
        data_file: path of the text file to read.

    Returns:
        A list of dicts with the string fields 'query', 'passage' and
        'label', in file order.

    Raises:
        AssertionError: if a line does not have three fields or its label is
            not '0'/'1'. The message names the offending line.
    """
    rv = []
    # The context manager closes the handle even when an assertion fires.
    with open(data_file, encoding='utf-8') as fin:
        for line_no, line in enumerate(fin, 1):
            data = line.strip().split('\t')
            assert len(data) == 3, \
                '{}:{}: expected 3 tab-separated fields, got {}'.format(data_file, line_no, len(data))
            assert data[-1] in ('0', '1'), \
                '{}:{}: unexpected label {!r}'.format(data_file, line_no, data[-1])
            rv.append({'query': data[0], 'passage': data[1], 'label': data[-1]})
    return rv
# Parse the three splits, in train / dev / test order.
train, dev, test = (
    load_file(split_path) for split_path in (train_file, dev_file, test_file)
)

In [4]:
def analysis_by_pandas(data, label=None):
    """Print descriptive statistics for one split.

    Args:
        data: list of dicts with the string fields 'query', 'passage' and
            'label' (label is '0' or '1').
        label: name of the split, shown in the banner line.

    Prints, in order, ``describe()`` of:
        1. the per-passage lengths (in whitespace tokens) and labels,
        2. the per-query means of those columns,
        3. the same per-query means restricted to queries with at least one
           positive passage,
        4. the per-query row counts.
    """
    print('\n========{}======='.format(label))
    df = pd.DataFrame(data)
    df['passage_len'] = df['passage'].apply(lambda x: len(x.split()))
    df['query_len'] = df['query'].apply(lambda x: len(x.split()))
    df['label_int'] = df['label'].apply(int)
    print(df.describe())

    # Average only the numeric columns: taking the mean of the string columns
    # ('passage', 'label') raises TypeError on pandas >= 2.0, and older
    # versions merely dropped them silently.
    numeric_cols = ['passage_len', 'query_len', 'label_int']
    per_query_mean = df.groupby('query')[numeric_cols].mean()
    print(per_query_mean.describe())
    # A non-zero mean label means the query has at least one positive passage.
    print(per_query_mean[per_query_mean['label_int'] != 0].describe())

    per_query_count = df.groupby('query').count()
    print(per_query_count.describe())
# Report the same statistics for every split, in train / dev / test order.
for split_name, split_records in (('train', train), ('dev', dev), ('test', test)):
    analysis_by_pandas(split_records, split_name)


        passage_len     query_len     label_int
count  20360.000000  20360.000000  20360.000000
mean      25.290766      7.117534      0.051081
std       13.019002      2.578649      0.220167
min        1.000000      2.000000      0.000000
25%       16.000000      5.000000      0.000000
50%       23.000000      7.000000      0.000000
75%       32.000000      8.000000      0.000000
max      236.000000     23.000000      1.000000
       passage_len    query_len    label_int
count  2118.000000  2118.000000  2118.000000
mean     25.300579     7.156280     0.079204
std       6.786060     2.570615     0.147173
min       2.250000     2.000000     0.000000
25%      21.166667     5.000000     0.000000
50%      24.785714     7.000000     0.000000
75%      28.692308     8.000000     0.111111
max     111.000000    23.000000     1.000000
       passage_len   query_len   label_int
count   873.000000  873.000000  873.000000
mean     25.779438    6.454754    0.192158
std       7.164447    2.072223   

数据汇总

| train      |     dev |   test   |
| :-------- | --------:| :------: |
|1245/2118 (no answer/all query)   |   170/296 |  390/633  |
| 20360(passages)    |   2733 |  6165  |
| 9.61(passages/query)    |   9.233 |   9.73  |
| 25.3(words/passage)    |  24.6 |  24.95  |
| 7.1(words/query)    |   7.23 |  7.25  |
| 0.192(positive/passages on answerable)    |   0.21 |  0.20  |

In [152]:
# Show the passages and labels of the answerable questions
# (questions with at least one positive passage), over all three splits.
df = pd.DataFrame(train + dev +test )
df['label_int'] = df['label'].apply(int)
# Per-query totals: label_int becomes the number of positive passages of the query.
# NOTE(review): .sum() is applied to every non-key column, so the string columns
# ('passage', 'label') are aggregated as well — presumably unintended; confirm
# before relying on those columns of sub_an / merge_df.
sub_an = df.groupby(['query']).sum()
# Keep only the queries that have at least one positive passage.
sub_an = sub_an[sub_an['label_int']!=0]

def format_qp(x):
    """Return the rows of one query group as a list of {'passage', 'label'} dicts."""
    return x[['passage','label']].to_dict(orient='records')
# One row per query holding the list built by format_qp.
sub_qp = df.groupby(['query']).apply(format_qp).to_frame()

# Inner join on the query index, so only answerable queries survive.
merge_df = pd.merge(sub_an, sub_qp, how='inner', left_index=True, right_index=True)
# One dict per answerable query; the query text is under the 'query' key.
dd = merge_df.reset_index().to_dict(orient='records')


In [136]:
# Save the answerable queries so that passages can be crawled for them on a
# local machine. The records are built before the file is opened, and the
# context manager guarantees the JSON is flushed and the handle closed.
query_records = merge_df.reset_index()[['query']].to_dict(orient='records')
with open('/data/sulixin/research/wikiquery.json', 'w') as fout:
    json.dump(query_records, fout)

In [153]:
# Save the data in the new format and transfer it to "71" (presumably a
# server) for BERT matching.
# an_queries: the distinct answerable queries collected in dd.
an_queries = {record['query'] for record in dd}
print(len(an_queries))

def format_dataset(data, out_file, an_queries):
    """Write the answerable part of a split as JSON lines.

    Args:
        data: list of dicts with 'query', 'passage' and 'label'.
        out_file: path of the JSON-lines file to (over)write.
        an_queries: container of queries to keep; records whose query is not
            in it are skipped.

    Each written line is a JSON object with the keys 'id' (the record's
    position in ``data`` as a string, so ids keep gaps for skipped records),
    'A' (query), 'B' (passage) and 'label'.
    """
    # The context manager flushes and closes the file; the original left the
    # handle open and relied on garbage collection.
    with open(out_file, 'w') as fout:
        for idx, d in enumerate(data):
            if d['query'] not in an_queries:
                continue
            record = {'id': '{}'.format(idx),
                      'A': d['query'], 'B': d['passage'], 'label': d['label']}
            fout.write(json.dumps(record) + '\n')
    
# Export every split next to the notebook, in train / dev / test order.
for split_records, out_name in ((train, 'train.jsonl'), (dev, 'dev.jsonl'), (test, 'test.jsonl')):
    format_dataset(split_records, out_name, an_queries)

1242


In [174]:
# Create the output directory from Python so the cell can be re-run:
# a plain `mkdir` fails once the directory already exists.
os.makedirs('wikiqa_with_sent_1_sep', exist_ok=True)

In [198]:
# Save the new format with the retrieved text added, to be transferred to
# "71" (presumably a server) for BERT matching.
# q_ps_mapping: query -> list of retrieved passages, read from a JSON-lines
# file whose objects carry the keys 'query' and 'passages'.
q_ps_mapping = {}
with open('all_wikiqa_aug.json') as fin:  # closed on exit, also on a parse error
    for line in fin:
        data = json.loads(line)
        q_ps_mapping[data['query']] = data['passages']
    

def format_dataset_with_sent(data, out_file, an_queries, add_sep=True, topn=3):
    """Write a split as JSON lines with retrieved passages appended to the query.

    Args:
        data: list of dicts with 'query', 'passage' and 'label'.
        out_file: path of the JSON-lines file to (over)write; its parent
            directory is created when missing.
        an_queries: container of queries to keep; other records are skipped.
        add_sep: when True the query and the appended text are joined with
            ' SS ', otherwise with a single space.
        topn: number of leading passages from the module-level
            ``q_ps_mapping`` to append.

    Each written line has the keys 'id' (position in ``data`` as a string),
    'A' (expanded query), 'B' (passage) and 'label'. After writing, the mean
    of (tokens in A + tokens in B) is printed; nothing is printed when no
    record was written.

    A query without an entry in ``q_ps_mapping`` is reported once per call
    and written with no appended text. The original printed the warning and
    then raised KeyError on the lookup.
    """
    out_dir = os.path.dirname(out_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    template = '{} SS {}' if add_sep else '{} {}'
    expand_query_lengths = []
    reported_missing = set()
    with open(out_file, 'w') as fout:
        for idx, d in enumerate(data):
            query = d['query']
            if query not in an_queries:
                continue
            if query not in q_ps_mapping and query not in reported_missing:
                print('no retrieval result {}'.format(query))
                reported_missing.add(query)
            ps = q_ps_mapping.get(query, [])
            expand_query = template.format(query, ' '.join(ps[:topn]))
            expand_query_lengths.append(len(expand_query.split()) + len(d['passage'].split()))
            fout.write(json.dumps({'id': '{}'.format(idx),
                                   'A': expand_query, 'B': d['passage'], 'label': d['label']}) + '\n')
    # statistics.mean raises StatisticsError on an empty sequence.
    if expand_query_lengths:
        print(statistics.mean(expand_query_lengths))
    
    
# Export train/dev for every combination of separator flag and number of
# appended passages; each combination gets its own directory.
for add_sep in [True, False]:
    for nb_a in [1, 2, 3, 4]:
        sent_dir = 'wikiqa_with_sent_{}{}'.format(nb_a, '_sep' if add_sep else '')
        train_out = '{}/train.jsonl'.format(sent_dir)
        dev_out = '{}/dev.jsonl'.format(sent_dir)
        print('save {}'.format(train_out))
        print('save {}'.format(dev_out))
        format_dataset_with_sent(train, train_out, an_queries, topn=nb_a, add_sep=add_sep)
        format_dataset_with_sent(dev, dev_out, an_queries, topn=nb_a, add_sep=add_sep)
    

save wikiqa_with_sent_1_sep/train.jsonl
save wikiqa_with_sent_1_sep/dev.jsonl
82.83302583025831
79.82300884955752
save wikiqa_with_sent_2_sep/train.jsonl
save wikiqa_with_sent_2_sep/dev.jsonl
126.96171586715867
126.63097345132743
save wikiqa_with_sent_3_sep/train.jsonl
save wikiqa_with_sent_3_sep/dev.jsonl
172.3197647601476
171.75486725663717
save wikiqa_with_sent_4_sep/train.jsonl
save wikiqa_with_sent_4_sep/dev.jsonl
214.81480627306274
216.29469026548674
save wikiqa_with_sent_1/train.jsonl
save wikiqa_with_sent_1/dev.jsonl
81.83302583025831
78.82300884955752
save wikiqa_with_sent_2/train.jsonl
save wikiqa_with_sent_2/dev.jsonl
125.96171586715867
125.63097345132743
save wikiqa_with_sent_3/train.jsonl
save wikiqa_with_sent_3/dev.jsonl
171.3197647601476
170.75486725663717
save wikiqa_with_sent_4/train.jsonl
save wikiqa_with_sent_4/dev.jsonl
213.81480627306274
215.29469026548674


In [196]:
!head -n2 wikiqa_with_sent_1_sep/train.jsonl wikiqa_with_sent_1/train.jsonl
!head -n2 wikiqa_with_sent_2/train.jsonl wikiqa_with_sent_1/train.jsonl

==> wikiqa_with_sent_1_sep/train.jsonl <==
{"id": "0", "A": "how are glacier caves formed ? A A glacier cave can also be formed by geothermal heat in the ground below a glacier. Warm air during the summer can enter and increase the size of a glacier cave. Where a glacier meets a body of water, wave action can form a glacier cave that may be partially submerged.", "B": "A partly submerged glacier cave on Perito Moreno Glacier .", "label": "0"}
{"id": "1", "A": "how are glacier caves formed ? A A glacier cave can also be formed by geothermal heat in the ground below a glacier. Warm air during the summer can enter and increase the size of a glacier cave. Where a glacier meets a body of water, wave action can form a glacier cave that may be partially submerged.", "B": "The ice facade is approximately 60 m high", "label": "0"}

==> wikiqa_with_sent_1/train.jsonl <==
{"id": "0", "A": "how are glacier caves formed ? A glacier cave can also be formed by geothermal heat in the ground below a gl

In [197]:
# Save the new format with predicted answers added, to be transferred to
# "71" (presumably a server) for BERT matching.
# q_answer_mapping: query -> list of predicted answers, in the order the
# entries appear in predictions.json.
q_answer_mapping = collections.defaultdict(list)
with open('predictions.json') as fin:  # closed as soon as the JSON is parsed
    data = json.load(fin)
for k, v in data.items():
    # Keys are presumably the '<query>--p<passage index>' ids written into
    # wikiqa_squad.json (TODO confirm). Cutting at the LAST '--p' keeps
    # queries that themselves contain '--p' intact.
    query = k.rsplit('--p', 1)[0]
    q_answer_mapping[query].append(v)
    

def format_dataset_with_answer(data, out_file, an_queries, topn=3, add_sep=True):
    """Write a split as JSON lines with predicted answers appended to the query.

    Args:
        data: list of dicts with 'query', 'passage' and 'label'.
        out_file: path of the JSON-lines file to (over)write; its parent
            directory is created when missing.
        an_queries: container of queries to keep; other records are skipped.
        topn: number of leading answers from the module-level
            ``q_answer_mapping`` to append.
        add_sep: when True the query and the appended text are joined with
            ' AA ', otherwise with a single space.

    Each written line has the keys 'id' (position in ``data`` as a string),
    'A' (expanded query), 'B' (passage) and 'label'. After writing, the mean
    of (tokens in A + tokens in B) is printed; nothing is printed when no
    record was written.

    A query without an entry in ``q_answer_mapping`` is reported once per
    call and written with no appended text. The mapping is only read: the
    original indexed the defaultdict, which inserted an empty entry and so
    hid the warning on every later call.
    """
    out_dir = os.path.dirname(out_file)
    # dirname is '' for a bare file name, and os.makedirs('') raises.
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)

    template = '{} AA {}' if add_sep else '{} {}'
    expand_query_lengths = []
    reported_missing = set()
    with open(out_file, 'w') as fout:
        for idx, d in enumerate(data):
            query = d['query']
            if query not in an_queries:
                continue
            if query not in q_answer_mapping and query not in reported_missing:
                print('no retrieval result {}'.format(query))
                reported_missing.add(query)
            # .get() does not trigger the defaultdict factory, so nothing is inserted.
            aug = ' '.join(q_answer_mapping.get(query, [])[:topn])
            expand_query = template.format(query, aug)
            expand_query_lengths.append(len(expand_query.split()) + len(d['passage'].split()))
            fout.write(json.dumps({'id': '{}'.format(idx),
                                   'A': expand_query, 'B': d['passage'], 'label': d['label']}) + '\n')
    # statistics.mean raises StatisticsError on an empty sequence.
    if expand_query_lengths:
        print(statistics.mean(expand_query_lengths))
    

# Export train/dev for every combination of separator flag and number of
# appended answers; each combination gets its own directory.
for add_sep in [True, False]:
    for nb_a in [1, 2, 3, 4]:
        answer_dir = 'wikiqa_with_answer_{}{}'.format(nb_a, '_sep' if add_sep else '')
        train_out = '{}/train.jsonl'.format(answer_dir)
        dev_out = '{}/dev.jsonl'.format(answer_dir)
        print('save {}'.format(train_out))
        print('save {}'.format(dev_out))
        format_dataset_with_answer(train, train_out, an_queries, topn=nb_a, add_sep=add_sep)
        format_dataset_with_answer(dev, dev_out, an_queries, topn=nb_a, add_sep=add_sep)
        

save wikiqa_with_answer_1_sep/train.jsonl
save wikiqa_with_answer_1_sep/dev.jsonl
no retrieval result what is high emotional intelligence ?
39.52709870848709
39.11592920353982
save wikiqa_with_answer_2_sep/train.jsonl
save wikiqa_with_answer_2_sep/dev.jsonl
46.424008302583026
45.727433628318586
save wikiqa_with_answer_3_sep/train.jsonl
save wikiqa_with_answer_3_sep/dev.jsonl
52.583487084870846
51.59557522123894
save wikiqa_with_answer_4_sep/train.jsonl
save wikiqa_with_answer_4_sep/dev.jsonl
59.492273985239855
57.59911504424779
save wikiqa_with_answer_1/train.jsonl
save wikiqa_with_answer_1/dev.jsonl
38.52709870848709
38.11592920353982
save wikiqa_with_answer_2/train.jsonl
save wikiqa_with_answer_2/dev.jsonl
45.424008302583026
44.727433628318586
save wikiqa_with_answer_3/train.jsonl
save wikiqa_with_answer_3/dev.jsonl
51.583487084870846
50.59557522123894
save wikiqa_with_answer_4/train.jsonl
save wikiqa_with_answer_4/dev.jsonl
58.492273985239855
56.59911504424779


In [189]:
!head -n2 wikiqa_with_answer_1_sep/train.jsonl wikiqa_with_answer_1/train.jsonl
!head -n2 wikiqa_with_answer_2/train.jsonl wikiqa_with_answer_1/train.jsonl

==> wikiqa_with_answer_1_sep/train.jsonl <==
{"id": "0", "A": "how are glacier caves formed ? A geothermal heat", "B": "A partly submerged glacier cave on Perito Moreno Glacier .", "label": "0"}
{"id": "1", "A": "how are glacier caves formed ? A geothermal heat", "B": "The ice facade is approximately 60 m high", "label": "0"}

==> wikiqa_with_answer_1/train.jsonl <==
{"id": "0", "A": "how are glacier caves formed ? geothermal heat", "B": "A partly submerged glacier cave on Perito Moreno Glacier .", "label": "0"}
{"id": "1", "A": "how are glacier caves formed ? geothermal heat", "B": "The ice facade is approximately 60 m high", "label": "0"}
==> wikiqa_with_answer_2/train.jsonl <==
{"id": "0", "A": "how are glacier caves formed ? geothermal heat geothermal heat from volcanic vents or hotsprings beneath the ice", "B": "A partly submerged glacier cave on Perito Moreno Glacier .", "label": "0"}
{"id": "1", "A": "how are glacier caves formed ? geothermal heat geothermal heat from volcanic v

In [161]:
# Convert the retrieved passages to SQuAD format: every (query, passage) pair
# becomes one single-paragraph article with one question whose id is
# '<query>--p<passage index>'.
rv = {'data': []}
with open('all_wikiqa_aug.json') as fin:  # input handle is closed after reading
    for line in fin:
        data = json.loads(line)
        query = data['query']
        for pidx, p in enumerate(data['passages']):
            rv['data'].append({"paragraphs": [{"context": p, "qas": [{"id": '{}--p{}'.format(query, pidx), "question": query}]}]})
# Closing the output handle guarantees the JSON is fully flushed to disk.
with open('wikiqa_squad.json', 'w') as fout:
    json.dump(rv, fout)

In [162]:
!realpath wikiqa_squad.json 

/data/sulixin/research/contest/data_repo/wikiqa_squad.json


In [201]:
!head -n2 semeval_query_aug.json

{"query": "Where I can buy good oil for massage?", "passages": ["A solar eclipse occurs when an observer (on Earth) passes through the shadow cast by the Moon which fully or partially blocks (\"occults\") the Sun.This can only happen when the Sun, Moon and Earth are nearly aligned on a straight line in three dimensions during a new moon when the Moon is close to the ecliptic plane. In a total eclipse, the disk of the Sun is fully obscured by the Moon.", "Partial solar eclipses. A partial solar eclipse occurs when only the penumbra (the partial shadow) passes over you. In these cases, a part of the sun always remains in view during the eclipse.", "This is NASA's official solar eclipse page. It contains maps and tables for 5,000 years of lunar eclipses and includes information on eclipse photography and observing tips.", "The third and final solar eclipse of the year will be a \"ring of fire\" eclipse on Dec. 26, and it will be visible from Saudi Arabia, Qatar, India, Sumatra, Borneo, Gu