In [1]:
import os
import sys
import numpy as np
import pandas as pd
import json
import random

In [2]:
# Directory that holds the HotpotQA JSON files. Set the HOTPOT_DIR environment
# variable to run this notebook on another machine; when it is unset the
# original location is used, so existing setups keep working unchanged.
HOTPOT_DIR = os.environ.get('HOTPOT_DIR', '/data/sulixin/research/contest/hotpot')

# One path per split: training set, full-wiki dev set, distractor dev set.
train_file = os.path.join(HOTPOT_DIR, 'hotpot_train_v1.1.json')
dev_full_file = os.path.join(HOTPOT_DIR, 'hotpot_dev_fullwiki_v1.json')
dev_dis_file = os.path.join(HOTPOT_DIR, 'hotpot_dev_distractor_v1.json')

In [3]:
def load_json(path):
    """Read a UTF-8 encoded JSON file and return the parsed object.

    The file is opened inside a ``with`` block so the handle is closed as soon
    as parsing finishes, and the encoding is stated explicitly so the result
    does not depend on the machine's locale.
    """
    with open(path, encoding='utf-8') as fin:
        return json.load(fin)


train = load_json(train_file)
dev_full = load_json(dev_full_file)
dev_dis = load_json(dev_dis_file)

In [4]:
# Display the first distractor-dev record to see the layout of one example.
dev_dis[0]

{'_id': '5a8b57f25542995d1e6f1371',
 'answer': 'yes',
 'question': 'Were Scott Derrickson and Ed Wood of the same nationality?',
 'supporting_facts': [['Scott Derrickson', 0], ['Ed Wood', 0]],
 'context': [['Ed Wood (film)',
   ['Ed Wood is a 1994 American biographical period comedy-drama film directed and produced by Tim Burton, and starring Johnny Depp as cult filmmaker Ed Wood.',
    " The film concerns the period in Wood's life when he made his best-known films as well as his relationship with actor Bela Lugosi, played by Martin Landau.",
    ' Sarah Jessica Parker, Patricia Arquette, Jeffrey Jones, Lisa Marie, and Bill Murray are among the supporting cast.']],
  ['Scott Derrickson',
   ['Scott Derrickson (born July 16, 1966) is an American director, screenwriter and producer.',
    ' He lives in Los Angeles, California.',
    ' He is best known for directing horror films such as "Sinister", "The Exorcism of Emily Rose", and "Deliver Us From Evil", as well as the 2016 Marvel Cinema

In [5]:
def word_len(text):
    """Return the number of whitespace-separated tokens in ``text``."""
    tokens = text.split()
    return len(tokens)

def avg(li):
    """Return the arithmetic mean of ``li``, or ``0`` when ``li`` is empty.

    The empty case returns the integer ``0`` instead of raising
    ``ZeroDivisionError``; any non-empty input yields a float.
    """
    count = len(li)
    if count == 0:
        return 0
    total = sum(li)
    # Multiplying by 1.0 forces float division regardless of element type.
    return total * 1.0 / count

def sent_cnt(contexts):
    """Return the mean number of sentences per passage in ``contexts``.

    Every passage is indexed as ``passage[1]`` to reach its sentence list;
    the averaging (including the empty case) is delegated to ``avg``.
    """
    sentences_per_passage = []
    for passage in contexts:
        sentences_per_passage.append(len(passage[1]))
    return avg(sentences_per_passage)

def passage_len(passage):
    """Return the total word count of a passage, summed over ``passage[1]``."""
    total_words = 0
    for sentence in passage[1]:
        total_words += word_len(sentence)
    return total_words

def title_len(contexts):
    """Return the mean title length, in words, over the passages in ``contexts``.

    The title is read from ``passage[0]``. Returns ``0`` when ``contexts`` is
    empty.
    """
    if not len(contexts):
        return 0
    title_word_counts = []
    for passage in contexts:
        if not passage:
            # Falsy (e.g. empty) entries are skipped rather than indexed.
            continue
        title_word_counts.append(len(passage[0].split()))
    return avg(title_word_counts)

def avg_p_len(contexts):
    """Return the mean passage length, in words, over ``contexts``."""
    lengths = list(map(passage_len, contexts))
    return avg(lengths)
        
def my_stat(data):
    """Build per-example length/count features and return their summary.

    ``data`` is turned into a DataFrame, one derived column is added per entry
    in the table below, and ``DataFrame.describe()`` of the result is returned.
    """
    frame = pd.DataFrame(data)

    # (output column, source column, per-row function), applied in this order.
    derived_columns = [
        ('问题长度', 'question', word_len),          # question length in words
        ('答案长度', 'answer', word_len),            # answer length in words
        ('文本数量', 'context', len),                # number of context passages
        ('文本长度', 'context', avg_p_len),          # mean passage length in words
        # NOTE(review): this counts entries of `supporting_facts`, not distinct
        # titles, while the label reads "number of evidence documents" --
        # confirm which one is intended.
        ('证据文档数', 'supporting_facts', len),
        ('句子数量', 'context', sent_cnt),           # mean sentences per passage
        ('标题长度', 'context', title_len),          # mean title length in words
    ]
    for column, source, func in derived_columns:
        frame[column] = frame[source].apply(func)

    return frame.describe()

In [6]:
# Summary statistics for the training split.
my_stat(train)

Unnamed: 0,问题长度,答案长度,文本数量,文本长度,证据文档数,句子数量,标题长度
count,90447.0,90447.0,90447.0,90447.0,90447.0,90447.0,90447.0
mean,17.818402,2.226287,9.946897,88.941357,2.384645,4.111189,3.083111
std,9.513881,1.809021,0.590802,24.615028,0.67289,1.095305,0.76629
min,3.0,1.0,2.0,14.5,2.0,1.0,1.0
25%,12.0,1.0,10.0,72.3,2.0,3.4,2.6
50%,15.0,2.0,10.0,86.8,2.0,4.0,3.0
75%,21.0,3.0,10.0,102.9,3.0,4.7,3.5
max,108.0,89.0,10.0,279.2,12.0,14.4,8.5


In [7]:
# Summary statistics for the distractor dev split.
my_stat(dev_dis)

Unnamed: 0,问题长度,答案长度,文本数量,文本长度,证据文档数,句子数量,标题长度
count,7405.0,7405.0,7405.0,7405.0,7405.0,7405.0,7405.0
mean,15.721404,2.461985,9.952735,89.98063,2.431465,4.153713,3.07216
std,5.52239,1.817681,0.558498,24.578971,0.711531,1.105089,0.76605
min,6.0,1.0,2.0,18.7,2.0,1.0,1.0
25%,12.0,1.0,10.0,73.5,2.0,3.4,2.5
50%,15.0,2.0,10.0,87.6,2.0,4.0,3.0
75%,19.0,3.0,10.0,103.9,3.0,4.7,3.5
max,46.0,29.0,10.0,258.7,8.0,14.7,7.7


In [8]:
# Summary statistics for the full-wiki dev split. In the recorded output the
# context count has a minimum of 0, so the empty-list guard in avg() is
# exercised here.
my_stat(dev_full)

Unnamed: 0,问题长度,答案长度,文本数量,文本长度,证据文档数,句子数量,标题长度
count,7405.0,7405.0,7405.0,7405.0,7405.0,7405.0,7405.0
mean,15.721404,2.461985,9.944902,92.361568,2.431465,4.262854,3.123751
std,5.52239,1.817681,0.644505,26.411746,0.711531,1.185284,0.813969
min,6.0,1.0,0.0,0.0,2.0,0.0,0.0
25%,12.0,1.0,10.0,74.9,2.0,3.5,2.5
50%,15.0,2.0,10.0,89.9,2.0,4.1,3.0
75%,19.0,3.0,10.0,107.1,3.0,4.9,3.6
max,46.0,29.0,10.0,266.0,8.0,14.8,8.0
