# 问题描述
```
对于一个给定的问题q和一个篇章p，参赛系统需要根据篇章内容，给出该问题的答案a。数据集中的每个样本，是一个三元组 < q, p, a > ，例如：
Given a question q and a paragraph p, a participating MRC system is expected to output an answer a to the question q based on the evidences in p. Each sample in the dataset is a triplet of < q, p, a >. An example is shown as below:

问题(q) : 乔丹打了多少个赛季
篇章 p : 迈克尔.乔丹在NBA打了15个赛季。他在84年进入nba，期间在1993年10月6日第一次退役改打棒球，95年3月18日重新回归，在99年1月13日第二次退役，后于2001年10月31日复出，在03年最终退役…
参考答案(a): [‘15个’,‘15个赛季’]
Question(q): How many seasons did Jordan play
Paragraph p: Michael Jordan has played in NBA for 15 seasons. He entered the NBA in 1984. On October 6, 1993, he retired for the first time, and started to play baseball. Then he returned to NBA on March 18, 1995. On January 13, 1999, he retired for the second time, and came back on October 31st, 2001. Finally, he retired in 2003 …
Reference answers (a): [‘15’, ‘15 seasons’]
```

# 1.数据分析

In [1]:
import json
import numpy as np
import random
from collections import Counter
from copy import deepcopy
random.seed(42)

def search(pattern, sequence):
    """从 sequence 中寻找子串 pattern
    如果找到，返回第一个下标；否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1

In [2]:
with open('../datasets/test1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

num_qa: 50000

context 平均长度: 318.3483372960297
Counter({156: 253, 103: 180, 116: 170, 100: 169, 107: 166, 118: 160, 106: 154, 101: 153, 104: 151, 120: 149, 146: 147, 105: 146, 113: 146, 102: 145, 112: 144, 108: 144, 111: 144, 123: 141, 115: 141, 110: 139, 127: 139, 139: 136, 132: 135, 117: 132, 109: 132, 129: 130, 114: 128, 122: 128, 119: 127, 138: 127, 121: 127, 124: 125, 141: 125, 135: 122, 145: 122, 133: 119, 134: 118, 144: 116, 130: 115, 128: 114, 126: 114, 125: 114, 136: 114, 150: 113, 149: 112, 142: 108, 143: 106, 148: 104, 140: 104, 153: 102, 160: 102, 155: 102, 137: 102, 162: 100, 154: 99, 180: 96, 169: 96, 203: 95, 201: 94, 131: 94, 147: 93, 205: 93, 211: 91, 157: 90, 173: 89, 152: 88, 158: 87, 164: 87, 221: 86, 177: 86, 178: 86, 171: 86, 207: 86, 166: 85, 174: 84, 175: 83, 159: 83, 233: 81, 219: 81, 179: 81, 213: 81, 151: 81, 161: 80, 251: 80, 209: 80, 172: 80, 257: 80, 249: 80, 165: 78, 170: 76, 259: 76, 185: 76, 253: 76, 191: 76, 225: 76, 241: 76, 235: 75, 229: 75, 227: 74,

In [3]:
with open('../datasets/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于32占比：{len(a_len[a_len>32])/len(a_len)*100}%')

num_qa: 14520

context 平均长度: 282.2969696969697
Counter({111: 108, 103: 105, 113: 104, 104: 103, 100: 102, 120: 101, 107: 100, 105: 98, 130: 98, 106: 97, 101: 97, 116: 96, 115: 95, 122: 92, 112: 90, 121: 90, 108: 89, 123: 88, 139: 87, 118: 87, 110: 86, 102: 86, 143: 85, 134: 84, 132: 84, 129: 83, 125: 82, 137: 82, 135: 82, 138: 81, 141: 80, 114: 79, 133: 77, 109: 76, 117: 75, 126: 75, 119: 75, 124: 75, 127: 74, 136: 74, 142: 71, 144: 70, 131: 70, 128: 69, 166: 69, 177: 68, 150: 67, 157: 67, 148: 67, 154: 66, 152: 66, 140: 66, 151: 65, 145: 65, 155: 64, 153: 64, 199: 63, 156: 62, 147: 61, 169: 61, 162: 61, 159: 59, 211: 58, 146: 58, 168: 57, 149: 57, 165: 55, 181: 55, 173: 54, 175: 54, 158: 53, 176: 53, 160: 53, 179: 52, 171: 52, 187: 51, 193: 51, 188: 51, 164: 50, 167: 49, 170: 49, 191: 49, 183: 48, 189: 48, 174: 47, 192: 46, 185: 46, 178: 46, 180: 45, 198: 45, 203: 45, 163: 45, 202: 45, 161: 44, 227: 43, 190: 43, 221: 41, 172: 41, 222: 40, 210: 40, 228: 40, 208: 40, 201: 39, 218: 39, 1

In [30]:
with open('../datasets/cmrc2018.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0
paras_small = []

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
    new_qas = []
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
#         if len(question) > 32:
#             print(qa)
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))
            if len(answer) <= 64:
                new_qas.append(qa)
    if new_qas:
        p['qas'] = new_qas
        paras_small.append(p)

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于64占比：{len(a_len[a_len>64])/len(a_len)*100}%')

data_small = {'data': [{'paragraphs': paras_small}]}
with open('../datasets/cmrc2018_small.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data_small, ensure_ascii=False, indent=4) + "\n")

num_qa: 14362

context 平均长度: 509.0818363273453
Counter({314: 19, 307: 19, 426: 19, 286: 19, 349: 18, 301: 17, 291: 17, 308: 17, 306: 16, 304: 16, 294: 16, 369: 16, 305: 15, 298: 15, 288: 15, 311: 15, 293: 15, 354: 15, 323: 15, 332: 15, 398: 14, 336: 14, 353: 14, 322: 14, 355: 14, 289: 14, 348: 14, 345: 13, 438: 13, 405: 13, 333: 13, 394: 13, 370: 13, 320: 13, 459: 13, 358: 13, 319: 13, 409: 13, 341: 13, 352: 12, 299: 12, 312: 12, 399: 12, 363: 12, 411: 12, 375: 12, 350: 12, 303: 12, 393: 12, 564: 12, 302: 12, 378: 12, 379: 12, 347: 12, 500: 12, 374: 12, 281: 12, 315: 12, 483: 11, 321: 11, 343: 11, 295: 11, 361: 11, 325: 11, 351: 11, 359: 11, 367: 11, 451: 11, 485: 11, 391: 11, 441: 11, 290: 11, 521: 11, 300: 10, 342: 10, 340: 10, 309: 10, 284: 10, 327: 10, 376: 10, 415: 10, 365: 10, 381: 10, 373: 10, 385: 10, 324: 10, 357: 10, 296: 10, 413: 10, 462: 10, 338: 10, 402: 10, 372: 10, 388: 10, 592: 10, 436: 10, 318: 10, 468: 10, 429: 10, 285: 10, 421: 10, 313: 9, 297: 9, 476: 9, 331: 9, 428

In [35]:
with open('../datasets/cmrc2018_small.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
    if len(p['context']) > 512:
        print(p['context'])
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于64占比：{len(a_len[a_len>64])/len(a_len)*100}%')

形容希望拥有芭比娃娃所代表的外表和生活方式的欲望。虽然芭比症候群多青春期前和青少年女性相关，但亦适用于任何年龄层。有芭比症候群的儿童希望可以尽量对男性显得美丽，而且，虽然芭比的身型比例极端，但她们仍然相信可以像芭比一样美丽。患芭比症候群者努力追求无法达到的身型。此外也有，指努力令自己充满魅力和吸引力的男性，他们会努力令自己像芭比的男伴-{zh:肯尼;zh-hans:肯尼;zh-hk:阿 Ken;zh-tw:肯尼;}-一样受人欢迎。于美国俄亥俄州长大，现居伦敦的辛蒂·杰克森被指为芭比症候群的现实例子，她曾接受了20多次的整容手术 ，花费共5万5千美元元美金，令自己变成活的芭比娃娃，还向英国国税局登记自己为生化女人，后来她以自身经验，在伦敦开设「美容整形网」。她表示过个人哲学：「男人的确会被女人的容貌吸引，他们不能忍受一个生病的女人，更别提一个既病又丑，而且臃肿的妇女，此外，我认为男人也会担心你将死在手术台上，那么就没有人为他煮晚餐了。」共有：在1993年时，辛帝全身上下只有下嘴唇还没有整过形。其他被指为芭比症候群患者的名人包括。永远的芭比 智库文化 M.G. Lord著 阎蕙群译 ISBN 957-9553-36-X
新发田长敦（しばた ながあつ，1538年（天文7年）─1580年（天正8年）。新发田家的祖先是在源平合战中，讨伐越后城资盛的总指挥御家人佐佐木盛纲。1530年，上杉家一族的上条定宪和守护代长尾家相互对立，发生史称的上条之乱。五十公野城城主新发田纲贞（新发田长敦之父）与本庄房长、鲇川清长、水原政家、黑川清实、五十公野景家、加地春纲、竹俣昌纲等等下越国人（即扬北众）一起站到上条定宪一方，与长尾为景交战，在历经一番战战合合后，最终都为上杉谦信的勇武所折服，成为其麾下势力。上杉谦信平定越后之际，新发田纲贞的嫡子新发田长敦继承了家名，成为新发田城城主。新发田长敦侍奉上杉谦信后逐渐头角显露，以优秀的外交手腕著称，担任春日山城门的班职，并在各地的战役中活跃，因此受上杉谦信的重用，列为越后七手组大将之一。第四次川中岛之战中，新发田部队击溃了武田军诸角虎定部，并讨取老将诸角虎定，立下了大功。1568年，武田军进入信州长沼，新发田家受到上杉谦信动员的命令，新发田长敦率同一族的五十公野家出阵信州饭山，这以后他又先后担当了上杉家内政和充分发挥了其优秀的外交手腕，在与武田胜赖的和

In [36]:
with open('../datasets/cmrc2018_small.json', 'r', encoding='utf-8') as f:
    data_aug = json.load(f)
with open('../datasets/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

data["data"][0]["paragraphs"] += data_aug['data'][0]['paragraphs']
random.shuffle(data["data"][0]["paragraphs"])

with open('../datasets/train_aug.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [2]:
with open('../datasets/cmrc2018_small.json', 'r', encoding='utf-8') as f:
    data_aug = json.load(f)
with open('../datasets/train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

paras = data_aug['data'][0]['paragraphs']
random.shuffle(paras)
d1 = data["data"][0]["paragraphs"] + paras[:int(len(paras)/2)]
d2 = data["data"][0]["paragraphs"] + paras[int(len(paras)/2):]
random.shuffle(d1)
random.shuffle(d2)

d1_final = {'data': [{'paragraphs': d1}]}
d2_final = {'data': [{'paragraphs': d2}]}
with open('../datasets/train_aug_d1.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(d1_final, ensure_ascii=False, indent=4) + "\n")
with open('../datasets/train_aug_d2.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(d2_final, ensure_ascii=False, indent=4) + "\n")

In [12]:
def search(pattern, sequence):
    """从 sequence 中寻找子串 pattern
    如果找到，返回第一个下标；否则返回-1。
    """
    n = len(pattern)
    for i in range(len(sequence)):
        if sequence[i:i + n] == pattern:
            return i
    return -1


with open('../datasets/WebQA.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

paras = []

for i,d in enumerate(data):
    question = d['question']
    idx = d['id']
    history = []
    for p in d['passages']:
        if p['passage'] in history:
            continue
        history.append(p['passage'])
        if p["answer"] != '':
            start = search(p["answer"], p['passage'])
            qas = [{'question': question, 'id': idx, 'answers': [{'text': p['answer'], 'answer_start': start}]}]
            paras.append({'context': p['passage'], 'qas': qas})

data = {'data': [{'paragraphs': paras}]}
with open('../datasets/webqa_no_dup.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")
paras = [p for p in paras if len(p['context'])>= 32]
data = {'data': [{'paragraphs': paras}]}
with open('../datasets/webqa_no_dup_minlen32.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")
paras = [p for p in paras if len(p['context'])>= 64]
data = {'data': [{'paragraphs': paras}]}
with open('../datasets/webqa_no_dup_minlen64.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [14]:
with open('../datasets/webqa_no_dup_minlen64.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
#     if len(p['context']) > 512:
#         print(p['context'])
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于64占比：{len(a_len[a_len>64])/len(a_len)*100}%')

num_qa: 108563

context 平均长度: 176.61989812367014
Counter({82: 1431, 68: 1380, 64: 1375, 65: 1369, 84: 1347, 66: 1340, 83: 1330, 69: 1313, 70: 1304, 81: 1302, 85: 1292, 67: 1280, 118: 1247, 80: 1247, 73: 1245, 72: 1239, 71: 1225, 86: 1209, 79: 1208, 74: 1208, 77: 1203, 76: 1198, 75: 1189, 78: 1188, 87: 1113, 119: 1071, 120: 1023, 89: 1008, 88: 1004, 90: 996, 92: 947, 93: 924, 121: 900, 91: 890, 97: 845, 122: 839, 94: 838, 95: 832, 98: 815, 117: 797, 100: 775, 96: 774, 123: 773, 116: 748, 113: 744, 124: 741, 111: 734, 107: 732, 103: 725, 115: 725, 99: 721, 110: 720, 114: 706, 108: 703, 112: 694, 102: 691, 104: 689, 106: 689, 101: 685, 125: 676, 105: 656, 109: 651, 127: 602, 126: 602, 130: 537, 128: 533, 129: 511, 132: 455, 131: 448, 134: 440, 137: 419, 133: 404, 135: 397, 136: 388, 139: 370, 138: 367, 141: 333, 142: 333, 140: 325, 143: 320, 149: 316, 157: 315, 144: 314, 145: 312, 148: 308, 147: 305, 154: 294, 146: 292, 151: 289, 150: 286, 152: 275, 156: 267, 153: 264, 155: 259, 165: 257,

In [15]:
with open('../datasets/SogouQA.json', 'r', encoding='utf-8') as f:
    data = json.load(f)

paras = []

for i,d in enumerate(data):
    question = d['question']
    idx = d['id']
    history = []
    for p in d['passages']:
        if p['passage'] in history:
            continue
        history.append(p['passage'])
        if p["answer"] != '':
            start = search(p["answer"], p['passage'])
            qas = [{'question': question, 'id': idx, 'answers': [{'text': p['answer'], 'answer_start': start}]}]
            paras.append({'context': p['passage'], 'qas': qas})

data = {'data': [{'paragraphs': paras}]}
with open('../datasets/sogouqa_no_dup.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")
paras = [p for p in paras if len(p['context'])>= 32]
data = {'data': [{'paragraphs': paras}]}
with open('../datasets/sogouqa_no_dup_minlen32.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")
paras = [p for p in paras if len(p['context'])>= 64]
data = {'data': [{'paragraphs': paras}]}
with open('../datasets/sogouqa_no_dup_minlen64.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [16]:
with open('../datasets/sogouqa_no_dup_minlen64.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
#     if len(p['context']) > 512:
#         print(p['context'])
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于64占比：{len(a_len[a_len>64])/len(a_len)*100}%')

num_qa: 64441

context 平均长度: 114.39524526310889
Counter({92: 1249, 93: 1248, 91: 1219, 94: 1199, 88: 1187, 96: 1187, 95: 1164, 86: 1138, 89: 1132, 90: 1101, 85: 1088, 87: 1078, 83: 1060, 97: 1054, 98: 1042, 84: 1038, 82: 1003, 99: 935, 100: 923, 81: 907, 101: 876, 80: 859, 103: 823, 64: 820, 79: 808, 65: 800, 78: 793, 66: 791, 104: 785, 102: 778, 67: 764, 77: 764, 68: 761, 106: 757, 69: 743, 105: 741, 70: 725, 71: 725, 76: 681, 73: 675, 107: 654, 75: 647, 72: 646, 108: 642, 74: 634, 109: 603, 111: 602, 110: 593, 112: 549, 115: 496, 116: 490, 113: 487, 119: 479, 114: 479, 118: 470, 117: 460, 120: 455, 122: 416, 121: 394, 124: 390, 123: 383, 126: 374, 127: 351, 131: 333, 125: 321, 128: 314, 130: 312, 129: 310, 304: 300, 134: 294, 133: 281, 132: 276, 135: 268, 136: 249, 139: 243, 140: 242, 137: 241, 138: 229, 141: 216, 143: 205, 148: 203, 150: 201, 142: 200, 147: 197, 149: 193, 146: 192, 145: 191, 144: 185, 153: 172, 154: 171, 151: 152, 152: 151, 155: 151, 161: 150, 158: 146, 159: 146, 15

In [7]:
with open('../datasets/webqa_no_dup.json', 'r', encoding='utf-8') as f:
    webqa = json.load(f)
with open('../datasets/sogouqa_no_dup.json', 'r', encoding='utf-8') as f:
    sogouqa = json.load(f)

paras = webqa['data'][0]['paragraphs'] + sogouqa['data'][0]['paragraphs']
random.shuffle(paras)

data = {'data': [{'paragraphs': paras}]}
with open('../datasets/qa_no_dup.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [10]:
with open('../datasets/webqa_no_dup_minlen32.json', 'r', encoding='utf-8') as f:
    webqa = json.load(f)
with open('../datasets/sogouqa_no_dup_minlen32.json', 'r', encoding='utf-8') as f:
    sogouqa = json.load(f)

paras = webqa['data'][0]['paragraphs'] + sogouqa['data'][0]['paragraphs']
random.shuffle(paras)

data = {'data': [{'paragraphs': paras}]}
with open('../datasets/qa_no_dup_minlen32.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [17]:
with open('../datasets/webqa_no_dup_minlen64.json', 'r', encoding='utf-8') as f:
    webqa = json.load(f)
with open('../datasets/sogouqa_no_dup_minlen64.json', 'r', encoding='utf-8') as f:
    sogouqa = json.load(f)

paras = webqa['data'][0]['paragraphs'] + sogouqa['data'][0]['paragraphs']
random.shuffle(paras)

data = {'data': [{'paragraphs': paras}]}
with open('../datasets/qa_no_dup_minlen64.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [18]:
with open('../datasets/qa_no_dup_minlen64.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
#     if len(p['context']) > 512:
#         print(p['context'])
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于64占比：{len(a_len[a_len>64])/len(a_len)*100}%')

num_qa: 173004

context 平均长度: 153.44229035166816
Counter({82: 2434, 83: 2390, 84: 2385, 85: 2380, 86: 2347, 81: 2209, 92: 2196, 64: 2195, 87: 2191, 88: 2191, 93: 2172, 65: 2169, 68: 2141, 89: 2140, 66: 2131, 91: 2109, 80: 2106, 90: 2097, 69: 2056, 67: 2044, 94: 2037, 70: 2029, 79: 2016, 95: 1996, 78: 1981, 77: 1967, 96: 1961, 71: 1950, 73: 1920, 97: 1899, 72: 1885, 76: 1879, 98: 1857, 74: 1842, 75: 1836, 118: 1717, 100: 1698, 99: 1656, 101: 1561, 119: 1550, 103: 1548, 120: 1478, 104: 1474, 102: 1469, 106: 1446, 105: 1397, 107: 1386, 108: 1345, 111: 1336, 110: 1313, 121: 1294, 117: 1257, 122: 1255, 109: 1254, 112: 1243, 116: 1238, 113: 1231, 115: 1221, 114: 1185, 123: 1156, 124: 1131, 125: 997, 126: 976, 127: 953, 130: 849, 128: 847, 129: 821, 131: 781, 134: 734, 132: 731, 133: 685, 135: 665, 137: 660, 136: 637, 139: 613, 138: 596, 140: 567, 141: 549, 142: 533, 143: 525, 148: 511, 149: 509, 145: 503, 147: 502, 144: 499, 150: 487, 146: 484, 154: 465, 157: 459, 151: 441, 153: 436, 152: 42

In [7]:
idx = 0
paras = []
names = ['法研杯阅读理解_2019.json', '军事阅读理解_2018.json', '军事阅读理解_2019.json']
pre_context = ''
for name in names:
    with open(f'aug_data/{name}', 'r', encoding='utf-8') as f:
        qas = []
        for line in f.readlines():
            sample = json.loads(line.strip())
            if pre_context == '':
                pre_context = sample['context']
                qas = [{'question': sample['question'], 'id': f'aug_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]}]
                continue
            if pre_context == sample['context']:
                qas.append({'question': sample['question'], 'id': f'aug_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]})
            else:
                paras.append({'context': pre_context, 'qas': qas})
                pre_context = sample['context']
                qas = [{'question': sample['question'], 'id': f'aug_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]}]
            idx += 1
        if qas:
            paras.append({'context': pre_context, 'qas': qas})

data = {'data': [{'paragraphs': paras}]}
with open('datasets/new.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [8]:
with open('datasets/new.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
#     if len(p['context']) > 512:
#         print(p['context'])
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于64占比：{len(a_len[a_len>64])/len(a_len)*100}%')

num_qa: 92176

context 平均长度: 699.4060352385532
Counter({999: 65, 941: 63, 510: 59, 537: 58, 517: 57, 504: 57, 597: 55, 879: 55, 503: 54, 626: 54, 273: 54, 625: 53, 938: 53, 873: 53, 862: 52, 571: 52, 501: 52, 530: 51, 672: 51, 993: 51, 548: 51, 556: 51, 532: 51, 630: 51, 665: 51, 526: 51, 997: 50, 980: 50, 497: 50, 890: 50, 590: 49, 423: 49, 338: 49, 988: 49, 871: 49, 888: 49, 528: 48, 506: 48, 900: 48, 439: 48, 940: 48, 971: 47, 491: 47, 923: 47, 350: 47, 566: 47, 460: 47, 558: 47, 905: 46, 302: 46, 966: 46, 348: 46, 514: 46, 297: 46, 978: 46, 990: 46, 910: 46, 522: 46, 563: 46, 535: 46, 594: 46, 669: 46, 950: 46, 870: 46, 924: 45, 738: 45, 391: 45, 328: 45, 986: 45, 425: 45, 994: 45, 339: 45, 716: 45, 957: 45, 733: 45, 312: 45, 596: 45, 1019: 45, 880: 44, 973: 44, 943: 44, 584: 44, 948: 44, 779: 44, 524: 44, 545: 44, 624: 44, 907: 44, 619: 44, 581: 44, 979: 44, 920: 44, 412: 44, 750: 44, 527: 44, 996: 43, 869: 43, 670: 43, 998: 43, 903: 43, 599: 43, 987: 43, 917: 43, 927: 43, 845: 43

In [9]:
with open('datasets/qa_no_dup_minlen32.json', 'r', encoding='utf-8') as f:
    d1 = json.load(f)
with open('datasets/new.json', 'r', encoding='utf-8') as f:
    d2 = json.load(f)

paras = d1['data'][0]['paragraphs'] + d2['data'][0]['paragraphs']
random.shuffle(paras)

data = {'data': [{'paragraphs': paras}]}
with open('datasets/mix.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [2]:
idx = 0
paras = []
names = ['mix_all_286935.json']
pre_context = ''
for name in names:
    with open(f'/home/xyz/AI/nlp/datasets/{name}', 'r', encoding='utf-8') as f:
        qas = []
        for line in f.readlines():
            sample = json.loads(line.strip())
            if pre_context == '':
                pre_context = sample['context']
                qas = [{'question': sample['question'], 'id': f'aug_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]}]
                continue
            if pre_context == sample['context']:
                qas.append({'question': sample['question'], 'id': f'aug_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]})
            else:
                paras.append({'context': pre_context, 'qas': qas})
                pre_context = sample['context']
                qas = [{'question': sample['question'], 'id': f'aug_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]}]
            idx += 1
        if qas:
            paras.append({'context': pre_context, 'qas': qas})

print(idx, len(paras))
random.shuffle(paras)
data = {'data': [{'paragraphs': paras}]}
with open('/home/xyz/AI/nlp/datasets/hq.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

286934 220159


In [3]:
with open('/home/xyz/AI/nlp/datasets/hq.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
#     if len(p['context']) > 512:
#         print(p['context'])
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于64占比：{len(a_len[a_len>64])/len(a_len)*100}%')

num_qa: 286935

context 平均长度: 254.50731970984606
Counter({32: 1887, 34: 1856, 35: 1819, 33: 1818, 55: 1713, 36: 1708, 39: 1694, 82: 1694, 59: 1687, 38: 1679, 85: 1678, 37: 1676, 41: 1674, 57: 1673, 86: 1670, 56: 1664, 40: 1660, 83: 1652, 84: 1634, 43: 1620, 93: 1619, 61: 1602, 64: 1597, 63: 1592, 81: 1590, 42: 1590, 62: 1587, 92: 1584, 52: 1580, 54: 1563, 58: 1545, 88: 1536, 53: 1533, 48: 1532, 87: 1531, 91: 1529, 44: 1528, 80: 1523, 66: 1522, 47: 1519, 51: 1515, 65: 1515, 89: 1514, 60: 1512, 45: 1509, 94: 1496, 90: 1487, 68: 1484, 49: 1473, 69: 1469, 50: 1464, 96: 1436, 70: 1430, 79: 1419, 67: 1417, 78: 1408, 95: 1404, 97: 1391, 46: 1385, 77: 1384, 71: 1354, 73: 1345, 98: 1341, 72: 1336, 76: 1322, 74: 1306, 75: 1298, 100: 1242, 99: 1237, 118: 1228, 101: 1164, 103: 1122, 119: 1089, 102: 1075, 105: 1062, 106: 1060, 120: 1054, 104: 1051, 107: 1008, 111: 992, 110: 983, 108: 978, 121: 927, 116: 920, 117: 906, 109: 900, 115: 896, 122: 895, 113: 890, 112: 889, 114: 888, 123: 839, 124: 822, 1

In [2]:
idx = 0
paras = []
names = ['dureader_2019.json']
pre_context = ''
for name in names:
    with open(f'datasets/{name}', 'r', encoding='utf-8') as f:
        qas = []
        for line in f.readlines():
            sample = json.loads(line.strip())
            if pre_context == '':
                pre_context = sample['context']
                qas = [{'question': sample['question'], 'id': f'dureader_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]}]
                continue
            if pre_context == sample['context']:
                qas.append({'question': sample['question'], 'id': f'dureader_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]})
            else:
                paras.append({'context': pre_context, 'qas': qas})
                pre_context = sample['context']
                qas = [{'question': sample['question'], 'id': f'dureader_{idx}', 'answers': [{'text': sample['answer'], 'answer_start': sample['answer_start']}]}]
            idx += 1
        if qas:
            paras.append({'context': pre_context, 'qas': qas})

data = {'data': [{'paragraphs': paras}]}
with open('datasets/dureader.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [3]:
with open('datasets/dureader.json', 'r', encoding='utf-8') as f:
    data = json.load(f)['data']

paras = data[0]['paragraphs']

q_len = []
p_len = []
a_len = []
num_qa = 0

for i,p in enumerate(paras):
    p_len.append(len(p['context']))
#     if len(p['context']) > 512:
#         print(p['context'])
    for qa in p['qas']:
        num_qa += 1
        question = qa['question']
        q_len.append(len(question))
        answers = qa['answers']
        if answers != []:
            answer = answers[0]['text']
            a_len.append(len(answer))

print('num_qa:', num_qa)
print('\ncontext 平均长度:', np.mean(p_len))
print(Counter(p_len))
p_len = np.array(p_len)
print(f'长度大于32句子占比：{len(p_len[p_len>32])/len(p_len)*100}%')
print(f'长度大于64句子占比：{len(p_len[p_len>64])/len(p_len)*100}%')
print(f'长度大于128句子占比：{len(p_len[p_len>128])/len(p_len)*100}%')
print(f'长度大于384句子占比：{len(p_len[p_len>384])/len(p_len)*100}%')
print(f'长度大于512句子占比：{len(p_len[p_len>512])/len(p_len)*100}%')
print(f'长度大于768句子占比：{len(p_len[p_len>768])/len(p_len)*100}%')
print(f'长度大于1024句子占比：{len(p_len[p_len>1024])/len(p_len)*100}%')

print('\nquestion 平均长度:', np.mean(q_len))
print(Counter(q_len))
q_len = np.array(q_len)
print(f'问题大于32占比：{len(q_len[q_len>32])/len(q_len)*100}%')

print('\nanswer 平均长度:', np.mean(a_len))
print(Counter(a_len))
a_len = np.array(a_len)
print(f'答案大于64占比：{len(a_len[a_len>64])/len(a_len)*100}%')

num_qa: 52246

context 平均长度: 318.7227843684578
Counter({38: 301, 32: 299, 39: 297, 41: 296, 34: 292, 58: 290, 36: 289, 37: 285, 35: 284, 45: 281, 49: 280, 43: 275, 33: 273, 57: 271, 44: 265, 47: 264, 50: 263, 52: 260, 48: 251, 62: 250, 54: 247, 40: 247, 56: 244, 51: 244, 64: 243, 53: 243, 46: 243, 63: 241, 55: 240, 59: 230, 60: 228, 61: 227, 42: 219, 73: 212, 75: 206, 74: 204, 69: 203, 78: 202, 68: 201, 67: 201, 72: 196, 66: 195, 81: 192, 71: 189, 65: 186, 70: 186, 77: 183, 85: 182, 79: 176, 76: 174, 82: 168, 89: 167, 100: 164, 87: 162, 83: 159, 104: 156, 93: 154, 91: 154, 86: 153, 113: 153, 95: 152, 84: 149, 80: 147, 94: 146, 92: 142, 99: 142, 115: 140, 110: 140, 108: 140, 96: 136, 103: 135, 105: 134, 88: 132, 90: 131, 98: 131, 102: 131, 101: 130, 97: 128, 109: 128, 116: 127, 111: 125, 112: 122, 117: 120, 123: 118, 121: 118, 139: 118, 126: 117, 134: 117, 119: 115, 107: 112, 146: 111, 154: 110, 132: 107, 106: 107, 129: 105, 136: 104, 114: 104, 148: 104, 130: 103, 137: 102, 140: 102, 13

In [4]:
with open('datasets/mix.json', 'r', encoding='utf-8') as f:
    d1 = json.load(f)
with open('datasets/dureader.json', 'r', encoding='utf-8') as f:
    d2 = json.load(f)

paras = d1['data'][0]['paragraphs'] + d2['data'][0]['paragraphs']
random.shuffle(paras)

data = {'data': [{'paragraphs': paras}]}
with open('datasets/mix+dureader.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [5]:
with open('datasets/train_aug.json', 'r', encoding='utf-8') as f:
    d1 = json.load(f)
with open('datasets/dureader.json', 'r', encoding='utf-8') as f:
    d2 = json.load(f)

paras = d1['data'][0]['paragraphs'] + d2['data'][0]['paragraphs']
random.shuffle(paras)

data = {'data': [{'paragraphs': paras}]}
with open('datasets/train+cmrc+dureader.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [3]:
with open('datasets/sim_questions_test.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
for k in data:
    data[k]['sim_questions'] = list(set(data[k]['sim_questions']))

with open('datasets/sim_questions_test_set.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=2) + "\n")

In [2]:
with open('datasets/sim_questions_test_set.json', 'r', encoding='utf-8') as f:
    d = json.load(f)
with open('datasets/test1.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
paras = data['data'][0]['paragraphs']
for p in paras:
    new_qas = []
    for qas in p['qas']:
        if qas['id'] in d:
            for i, q in enumerate(d[qas['id']]['sim_questions']):
                tmp = deepcopy(qas)
                tmp['id'] += f'_aug_{i}'
                tmp['question'] = q
                new_qas.append(tmp)
    if new_qas:
        p['qas'] += new_qas
with open('datasets/test1_aug.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=4) + "\n")

In [2]:
with open('datasets/sim_questions_train.json', 'r', encoding='utf-8') as f:
    data = json.load(f)
for k in data:
    data[k]['sim_questions'] = list(set(data[k]['sim_questions']))

with open('datasets/sim_questions_train_set.json', "w", encoding='utf-8') as f:
    f.write(json.dumps(data, ensure_ascii=False, indent=2) + "\n")