In [1]:
import numpy as np
import pandas as pd

# All question data

In [2]:
questions = pd.read_csv('../data/SQUAD_Data/v1.1/question_data/questions.csv')

In [3]:
questions.head()

Unnamed: 0,Question ID,Fold,Answer,Category,Text
0,3009,train,Henry IV of France,History,"With the assistence of his chief minister, the..."
1,200192,train,Great Famine (Ireland),History:Europe,"During this event, Charles Wood instructed Lor..."
2,205242,train,Mass spectrometry,Science:Chemistry,One form of this technique based off the cyclo...
3,202398,train,Ester,Science:Chemistry,A carbon alpha to two carbons with this functi...
4,111634,train,Tyr,Social_Science,"This god is sometimes called Thingsus, which r..."


In [4]:
questions.describe()

Unnamed: 0,Question ID
count,20407.0
mean,115643.94203
std,74456.576812
min,1.0
25%,13039.5
50%,114433.0
75%,193736.5
max,212891.0


In [5]:
questions.columns

Index(['Question ID', 'Fold', 'Answer', 'Category', 'Text'], dtype='object')

In [6]:
questions['Fold'].value_counts()

train    15667
test      2370
dev       2370
Name: Fold, dtype: int64

In [7]:
questions[questions['Question ID']==121407]

Unnamed: 0,Question ID,Fold,Answer,Category,Text
20393,121407,test,Dona Flor and Her Two Husbands,Literature,The second section of this novel opens with a ...


In [8]:
questions

Unnamed: 0,Question ID,Fold,Answer,Category,Text
0,3009,train,Henry IV of France,History,"With the assistence of his chief minister, the..."
1,200192,train,Great Famine (Ireland),History:Europe,"During this event, Charles Wood instructed Lor..."
2,205242,train,Mass spectrometry,Science:Chemistry,One form of this technique based off the cyclo...
3,202398,train,Ester,Science:Chemistry,A carbon alpha to two carbons with this functi...
4,111634,train,Tyr,Social_Science,"This god is sometimes called Thingsus, which r..."
5,189284,train,Johann Sebastian Bach,Fine_Arts,This composer was given the task of improvisin...
6,3873,train,Inca Empire,History,"Their gods included Illapa, the god of thunder..."
7,192507,train,Enthalpy,Science,This quantity remains constant in the Joule-Th...
8,13075,train,Jesus,Social_Science,One religion says that this figure was taken t...
9,107378,train,Gibbs free energy,Science:Chemistry,A Frost diagram plots oxidation state against ...


# JSON files (train & dev sets)

In [9]:
import json

In [10]:
dev = json.load(open('../data/SQUAD_Data/v1.1/dev-v1.1.json', 'rt'))
train = json.load(open('../data/SQUAD_Data/v1.1/train-v1.1.json', 'rt'))

### Data structure
* dev['data']: array of articles
* dev['data'][0]: 1 article
* dev['data'][0]['title']: title of the article
* dev['data'][0]['paragraphs']: list of paragraphs
* dev['data'][0]['paragraphs'][0]: 1 paragraph, including 2 keys: `context` & `qas`
* dev['data'][0]['paragraphs'][0]['context']: text content of the paragraph
* dev['data'][0]['paragraphs'][0]['qas']: list of question-answer sets on the pararaph
* dev['data'][0]['paragraphs'][0]['qas'][0]: 1 set of question-answer, including 3 keys: `answers`, `question` & `id`
* dev['data'][0]['paragraphs'][0]['qas'][0]['id']: ID of the question
* dev['data'][0]['paragraphs'][0]['qas'][0]['question']: question text
* dev['data'][0]['paragraphs'][0]['qas'][0]['answers']: list of 3 answers of the questions. Each answer consists of 2 keys: `answer_start` & `text`
* dev['data'][0]['paragraphs'][0]['qas'][0]['answers']['answer_start']: index of the start character of the answer text in the paragraph
* dev['data'][0]['paragraphs'][0]['qas'][0]['answers']['text']: the answer text

# Stats

### Num of articles

In [11]:
print('Num of train articles: {}'.format(len(train)))
print('Num of dev articles: {}'.format(len(dev)))

Num of train articles: 2
Num of dev articles: 2


### Num of paragraphs

In [12]:
def num_paragraphs(ds):
    return np.sum([len(article['paragraphs']) for article in ds['data']])        

In [13]:
print('Num of train paragraphs: {:,}'.format(num_paragraphs(train)))
print('Num of dev paragraphs: {:,}'.format(num_paragraphs(dev)))

Num of train paragraphs: 18,896
Num of dev paragraphs: 2,067


### Num of questions

In [14]:
def num_questions(ds):
    return np.sum([len(para['qas']) for article in ds['data'] for para in article['paragraphs']])

print('Num of train questions: {:,}'.format(num_questions(train)))
print('Num of dev questions: {:,}'.format(num_questions(dev)))

Num of train questions: 87,599
Num of dev questions: 10,570


* `dev['data'][0]` corresponds to the article https://rajpurkar.github.io/SQuAD-explorer/explore/1.1/dev/Super_Bowl_50.html

# Samples

In [15]:
dev['data'][0].keys()

dict_keys(['title', 'paragraphs'])

In [16]:
len(dev['data'][0]['paragraphs'])

54

In [17]:
dev['data'][0]['paragraphs'][0]['context']

'Super Bowl 50 was an American football game to determine the champion of the National Football League (NFL) for the 2015 season. The American Football Conference (AFC) champion Denver Broncos defeated the National Football Conference (NFC) champion Carolina Panthers 24–10 to earn their third Super Bowl title. The game was played on February 7, 2016, at Levi\'s Stadium in the San Francisco Bay Area at Santa Clara, California. As this was the 50th Super Bowl, the league emphasized the "golden anniversary" with various gold-themed initiatives, as well as temporarily suspending the tradition of naming each Super Bowl game with Roman numerals (under which the game would have been known as "Super Bowl L"), so that the logo could prominently feature the Arabic numerals 50.'

In [18]:
len(dev['data'][0]['paragraphs'][0]['qas'])

30

In [19]:
dev['data'][0]['paragraphs'][0]['qas'][0].keys()

dict_keys(['answers', 'question', 'id'])

In [20]:
dev['data'][0]['paragraphs'][0]['qas']

[{'answers': [{'answer_start': 177, 'text': 'Denver Broncos'},
   {'answer_start': 177, 'text': 'Denver Broncos'},
   {'answer_start': 177, 'text': 'Denver Broncos'}],
  'question': 'Which NFL team represented the AFC at Super Bowl 50?',
  'id': '56be4db0acb8001400a502ec'},
 {'answers': [{'answer_start': 249, 'text': 'Carolina Panthers'},
   {'answer_start': 249, 'text': 'Carolina Panthers'},
   {'answer_start': 249, 'text': 'Carolina Panthers'}],
  'question': 'Which NFL team represented the NFC at Super Bowl 50?',
  'id': '56be4db0acb8001400a502ed'},
 {'answers': [{'answer_start': 403, 'text': 'Santa Clara, California'},
   {'answer_start': 355, 'text': "Levi's Stadium"},
   {'answer_start': 355,
    'text': "Levi's Stadium in the San Francisco Bay Area at Santa Clara, California."}],
  'question': 'Where did Super Bowl 50 take place?',
  'id': '56be4db0acb8001400a502ee'},
 {'answers': [{'answer_start': 177, 'text': 'Denver Broncos'},
   {'answer_start': 177, 'text': 'Denver Broncos'

In [21]:
[q['question'] for q in dev['data'][0]['paragraphs'][0]['qas']]

['Which NFL team represented the AFC at Super Bowl 50?',
 'Which NFL team represented the NFC at Super Bowl 50?',
 'Where did Super Bowl 50 take place?',
 'Which NFL team won Super Bowl 50?',
 'What color was used to emphasize the 50th anniversary of the Super Bowl?',
 'What was the theme of Super Bowl 50?',
 'What day was the game played on?',
 'What is the AFC short for?',
 'What was the theme of Super Bowl 50?',
 'What does AFC stand for?',
 'What day was the Super Bowl played on?',
 'Who won Super Bowl 50?',
 'What venue did Super Bowl 50 take place in?',
 'What city did Super Bowl 50 take place in?',
 'If Roman numerals were used, what would Super Bowl 50 have been called?',
 'Super Bowl 50 decided the NFL champion for what season?',
 'What year did the Denver Broncos secure a Super Bowl title for the third time?',
 'What city did Super Bowl 50 take place in?',
 'What stadium did Super Bowl 50 take place in?',
 'What was the final score of Super Bowl 50? ',
 'What month, day and y

In [22]:
len(dev['data'][0]['paragraphs'][0]['qas'])

30

# Please note that test set is hidden -> no data for it