In [38]:
import collections
import json
import numpy as np
import pandas as pd

from nltk.tokenize import word_tokenize
from itertools import chain
from sklearn.model_selection import StratifiedShuffleSplit
from argparse import Namespace
from tqdm import tqdm_notebook

In [4]:

args = Namespace(
    path="data/SQuAd/BatchSQuAD",
    train_proportion=0.7,
    val_proportion=0.15,
    test_proportion=0.15,
    output_munged_csv="data/SQuAd/SQuAD_with_splits_without_title_id.csv",
    passage_text='data/SQuAd/passage_text',
    seed=1337
)

In [5]:
np.random.seed(args.seed)

In [63]:
QADataset = []
passage = []
bar = tqdm_notebook(desc='paragraph',
                   total = 45,
                    position=1,
                    leave=True)
for i in tqdm_notebook(range(10)):
    with open(args.path+"/batch_{}.json".format(i+1)) as fp:
        batch = json.load(fp)
        for para in batch:
            title = para['title']
            for par in para['paragraphs']:
                text = {}
                text['passage'] = par['context'].lower()
                passage.append(text)
                for question in par['qas']:
                    sample = {}
                    sample['title'] =  title
                    sample['context'] = par['context'].lower()
                    sample['question'] = question['question'].lower()
                    sample['is_impossible'] = question['is_impossible']
                    if not question['is_impossible']:
                        sample['answer_start'] = question['answers'][0]['answer_start']
                        sample['answer_text'] = question['answers'][0]['text'].lower()
                    else:
                        sample['answer_start'] = -1
                        sample['answer_text'] = ''
                    QADataset.append(sample)
            bar.update()
            
            

HBox(children=(IntProgress(value=0, description='paragraph', max=45, style=ProgressStyle(description_width='in…

HBox(children=(IntProgress(value=0, max=10), HTML(value='')))

In [60]:

QADataset = pd.DataFrame(QADataset)
QADataset['split'] = 0
passage = pd.DataFrame(passage)


In [64]:

split = StratifiedShuffleSplit(n_splits=1, test_size=args.test_proportion+args.val_proportion, random_state=args.seed)

for train_index, test_index in split.split(QADataset, QADataset['title']):
    QADataset[train_index]['split'] = 'train'#修改值的常规方式，其他方式可能会在copy上修改，没用
    QADataset[test_index]['split'] = 'test'
    first_index = test_index
                                           
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=args.seed)

for _, test_index in split.split(QADataset[first_index], QADataset[first_index]['title']):
    QADataset[test_index]['split'] = 'val'


TypeError: list indices must be integers or slices, not str

In [31]:
#由Title分层采样  放弃使用 适用于df
split = StratifiedShuffleSplit(n_splits=1, test_size=args.test_proportion+args.val_proportion, random_state=args.seed)

for train_index, test_index in split.split(QADataset, QADataset.title):
    QADataset.iloc[train_index,-1] = 'train'#修改值的常规方式，其他方式可能会在copy上修改，没用
    QADataset.iloc[test_index,-1] = 'test'
                                           
split = StratifiedShuffleSplit(n_splits=1, test_size=0.5, random_state=args.seed)

for _, test_index in split.split(QADataset[QADataset.split=='train'], QADataset[QADataset.split=='train'].title):
    QADataset.iloc[test_index,-1] = 'val'



In [32]:
QADataset.describe()

Unnamed: 0,answer_start
count,130319.0
mean,212.727384
std,264.883218
min,-1.0
25%,-1.0
50%,110.0
75%,357.0
max,3126.0


In [33]:
final_dataset = QADataset

In [34]:
final_dataset.head()

Unnamed: 0,answer_start,answer_text,context,is_impossible,question,title,split
0,269,in the late 1990s,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,when did beyonce start becoming popular?,Beyoncé,train
1,207,singing and dancing,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,what areas did beyonce compete in when she was...,Beyoncé,val
2,526,2003,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,when did beyonce leave destiny's child and bec...,Beyoncé,train
3,166,"houston, texas",beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,in what city and state did beyonce grow up?,Beyoncé,val
4,276,late 1990s,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,in which decade did beyonce become famous?,Beyoncé,train


In [35]:
for i in final_dataset.head().question:
    for word in i:
        print(word)

w
h
e
n
 
d
i
d
 
b
e
y
o
n
c
e
 
s
t
a
r
t
 
b
e
c
o
m
i
n
g
 
p
o
p
u
l
a
r
?
w
h
a
t
 
a
r
e
a
s
 
d
i
d
 
b
e
y
o
n
c
e
 
c
o
m
p
e
t
e
 
i
n
 
w
h
e
n
 
s
h
e
 
w
a
s
 
g
r
o
w
i
n
g
 
u
p
?
w
h
e
n
 
d
i
d
 
b
e
y
o
n
c
e
 
l
e
a
v
e
 
d
e
s
t
i
n
y
'
s
 
c
h
i
l
d
 
a
n
d
 
b
e
c
o
m
e
 
a
 
s
o
l
o
 
s
i
n
g
e
r
?
i
n
 
w
h
a
t
 
c
i
t
y
 
a
n
d
 
s
t
a
t
e
 
d
i
d
 
b
e
y
o
n
c
e
 
 
g
r
o
w
 
u
p
?
 
i
n
 
w
h
i
c
h
 
d
e
c
a
d
e
 
d
i
d
 
b
e
y
o
n
c
e
 
b
e
c
o
m
e
 
f
a
m
o
u
s
?


In [36]:
final_dataset.split.value_counts()

train    59325
val      45612
test     25382
Name: split, dtype: int64

In [37]:
final_dataset.to_csv(args.output_munged_csv, index=False)
passage.to_csv(args.passage_text)

In [40]:
final_dataset.head()

Unnamed: 0,answer_start,answer_text,context,is_impossible,question,title,split
0,269,in the late 1990s,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,when did beyonce start becoming popular?,Beyoncé,train
1,207,singing and dancing,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,what areas did beyonce compete in when she was...,Beyoncé,val
2,526,2003,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,when did beyonce leave destiny's child and bec...,Beyoncé,train
3,166,"houston, texas",beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,in what city and state did beyonce grow up?,Beyoncé,val
4,276,late 1990s,beyoncé giselle knowles-carter (/biːˈjɒnseɪ/ b...,False,in which decade did beyonce become famous?,Beyoncé,train


In [49]:
k = final_dataset.head().to_json(orient='split', force_ascii = False)

In [51]:
with open("data/SQuAd/k.josn", 'w+') as fp:
    json.dump(k, fp)

In [54]:
with open("data/SQuAd/k.josn") as fp:
    s = json.load(fp)
    s = json.loads(s)

In [58]:
from pandas.io.json import json_normalize
S = json_normalize(s)


In [59]:
S

Unnamed: 0,columns,data,index
0,"[answer_start, answer_text, context, is_imposs...","[[269, in the late 1990s, beyoncé giselle know...","[0, 1, 2, 3, 4]"
