In [1]:
# Upload dev-v2.0.json through Colab's file picker; it is opened from the working directory further down.
from google.colab import files
files.upload()

Saving dev-v2.0.json to dev-v2.0.json




In [2]:
# Upload train-v2.0.json the same way (`files` is re-imported here although an earlier cell already imports it).
from google.colab import files
files.upload()

Saving train-v2.0.json to train-v2.0.json


In [99]:
import json

# Load the uploaded dev and training sets.
# The encoding is passed explicitly: the contexts contain non-ASCII characters (e.g. '½') and
# open() otherwise falls back to a platform-dependent default encoding.
with open('dev-v2.0.json', encoding='utf-8') as f:
  data = json.load(f)

with open('train-v2.0.json', encoding='utf-8') as f:
  train_data = json.load(f)

In [100]:
import nltk
from nltk.corpus import stopwords, wordnet
from nltk.tokenize import word_tokenize

# Fetch the NLTK resources this notebook relies on: stop words, the 'punkt' tokenizer data and WordNet.
for resource in ('stopwords', 'punkt', 'wordnet'):
  nltk.download(resource)


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


Structure of the data is as follows. 
The training dataset is a dictionary which has 2 key-value pairs.
1. 'version': v2.0
2. 'data': list of 442 elements where each element is another dictionary containing,
*   'title': Single word describing the paragraphs. (Ex. 'IPod')
*   'paragraphs': List of elements where each element is another dictionary containing,
1. 'context': Context paragraph from which the model must extract the answer to each question, or report that no answer is available when the question is unanswerable.
2. 'qas': List of elements where each element is another dictionary containing,
*   'id': Unique ID through which the answers are compared while evaluating our model.
*   'is_impossible': Boolean value which describes whether the answer can be found in the context or not.
*   'question': Question string.
*   'answers': List of elements where each element is another dictionary containing,
1. 'answer_start': The character number of the answer in the context.
2. 'text': The answer string from context paragraph.
*  Generally there is only one answer to every question, so why is 'answers' a list of dictionaries rather than a single dictionary? We think the reason is the dev set: its structure is exactly the same, except that 'answers' contains up to 5 (crowdsourced) dictionaries, and our model is awarded points if its output matches any of those answers.
*  If is_impossible is True, then 'answers' will be an empty list and there will be another key called 'plausible_answers' whose structure is the same as that of 'answers'.

*  We will go through the structure of dataset taking an example.




In [101]:
train_data.keys()

dict_keys(['version', 'data'])

In [102]:
train_data['version']

'v2.0'

In [103]:
# train_data['data'] is a list of 442 entries, each one a dictionary with a 'title' and its 'paragraphs'.
type(train_data['data']), len(train_data['data'])

(list, 442)

In [104]:
train_data['data'][3].keys()

dict_keys(['title', 'paragraphs'])

In [105]:
train_data['data'][3]['title']

'IPod'

In [106]:
# This example has 60 context paragraphs. Detailed statistics on the number of paragraphs per title
# are computed later in the notebook.
len(train_data['data'][3]['paragraphs'])

60

In [107]:
train_data['data'][3]['paragraphs'][0].keys()

dict_keys(['qas', 'context'])

In [108]:
train_data['data'][3]['paragraphs'][0]['context']

'The iPod is a line of portable media players and multi-purpose pocket computers designed and marketed by Apple Inc. The first line was released on October 23, 2001, about 8½ months after iTunes (Macintosh version) was released. The most recent iPod redesigns were announced on July 15, 2015. There are three current versions of the iPod: the ultra-compact iPod Shuffle, the compact iPod Nano and the touchscreen iPod Touch.'

In [109]:
# This paragraph has 10 question-answer pairs. Detailed statistics on the number of questions per
# context are computed later in the notebook.
len(train_data['data'][3]['paragraphs'][0]['qas'])

10

In [110]:
train_data['data'][3]['paragraphs'][0]['qas'][0].keys()

dict_keys(['question', 'id', 'answers', 'is_impossible'])

In [111]:
train_data['data'][3]['paragraphs'][0]['qas'][0]

{'answers': [{'answer_start': 105, 'text': 'Apple'}],
 'id': '56cc55856d243a140015ef0a',
 'is_impossible': False,
 'question': 'Which company produces the iPod?'}

In [112]:
# If is_impossible is True, 'answers' is an empty list and the entry carries an extra key, 'plausible_answers'.
train_data['data'][441]['paragraphs'][0]['qas'][0]

{'answers': [],
 'id': '5a7db48670df9f001a87505f',
 'is_impossible': True,
 'plausible_answers': [{'answer_start': 50,
   'text': 'ordinary matter composed of atoms'}],
 'question': 'What did the term matter include after the 20th century?'}

In [113]:
# An example from the dev set. The only difference is that 'answers' contains up to 5 entries (4 in this case).
# If our output matches any of these answers, we get the points.
data['data'][0]['paragraphs'][0]['qas'][4]

{'answers': [{'answer_start': 671, 'text': '10th century'},
  {'answer_start': 649, 'text': 'the first half of the 10th century'},
  {'answer_start': 671, 'text': '10th'},
  {'answer_start': 671, 'text': '10th'}],
 'id': '56ddde6b9a695914005b962c',
 'is_impossible': False,
 'question': 'What century did the Normans first gain their separate identity?'}

In [114]:
# Brief stats about the dataset.

# Number of context paragraphs under each title.
all_para_cnt = [len(elems['paragraphs']) for elems in train_data['data']]

# Number of questions under each context paragraph, flattened over all titles.
all_q_cnt = [
  len(paras_under_title['qas'])
  for elems in train_data['data']
  for paras_under_title in elems['paragraphs']
]

# A question counts as answerable when its 'is_impossible' flag equals False; every other question is unanswerable.
poss = sum(
  1
  for elems in train_data['data']
  for paras_under_title in elems['paragraphs']
  for each_ques in paras_under_title['qas']
  if each_ques['is_impossible'] == False
)
imposs = sum(all_q_cnt) - poss

In [115]:
# Totals are named once; the averages are still computed inside each message.
total_paragraphs = sum(all_para_cnt)
total_questions = sum(all_q_cnt)

print(f'Total No. context paragraphs = {total_paragraphs}, Average No. of context paragraphs per title = {total_paragraphs/len(all_para_cnt)}')
print(f'Total No. Questions = {total_questions}, Average No. of Questions per context paragraph = {total_questions/len(all_q_cnt)}')
print(f'No. questions which have answers in the context paragraph = {poss}, which do not have answers in the context paragraph = {imposs}')

Total No. context paragraphs = 19035, Average No. of context paragraphs per title = 43.06561085972851
Total No. Questions = 130319, Average No. of Questions per context paragraph = 6.84628316259522
No. questions which have answers in the context paragraph = 86821, which do not have answers in the context paragraph = 43498


In [116]:
# Given a word, get_synonyms(word) returns a list of synonyms of this particular word.

def get_synonyms(word):
    """Return the WordNet synonyms of `word` as a sorted list of lowercase strings.

    Every lemma name of every synset that WordNet returns for `word` is
    normalised: underscores and hyphens become spaces, the text is lowercased
    and every character other than a-z and the space is dropped.

    Guards applied so that a result can be spliced into running text:
      * candidates that are empty after normalisation (e.g. a purely numeric
        lemma name) are skipped;
      * whitespace is collapsed, so no candidate starts or ends with a space
        or contains a double space;
      * `word` itself is excluded case-insensitively ('Happy' never yields
        'happy').
    The result is sorted because the iteration order of a set of strings can
    differ between interpreter runs, which would make a seeded random pick
    from the result irreproducible.

    Args:
        word: the token to look up.

    Returns:
        list[str]: the distinct synonyms, possibly empty.
    """
    allowed_chars = set(' qwertyuiopasdfghjklzxcvbnm')
    synonyms = set()

    for syn in wordnet.synsets(word):
        for lemma in syn.lemmas():
            candidate = lemma.name().replace("_", " ").replace("-", " ").lower()
            candidate = "".join(char for char in candidate if char in allowed_chars)
            # Collapse runs of spaces left behind by dropped characters.
            candidate = " ".join(candidate.split())
            if candidate:
                synonyms.add(candidate)

    # All candidates are lowercase, so compare against the lowercased word.
    synonyms.discard(word.lower())

    return sorted(synonyms)


# English stop words from NLTK; modify_contexts leaves any word found in this set unchanged.
stop_words = set(stopwords.words('english'))
import random
# Characters a token may consist of to be considered for synonym replacement in modify_contexts.
all_chars = ' qwertyuiopasdfghjklzxcvbnmQWERTYUIOPASDFGHJKLZXCVBNM'
# The goal is to replace some words in a context with synonyms so that the meaning does not change; adding these
# modified context paragraphs to the original training dataset is expected to improve performance.
# Every word in the context is replaced with a synonym (if one exists) with probability 0.2 and left unchanged with probability 0.8.
# After tokenizing the context string into individual words and applying this algorithm, we need to untokenize it so that
# a coherent passage string is formed.
# To append these new data samples, we also need the answers to each question for the modified contexts.
# As we have seen in the data analysis phase, this means updating both the 'answer_start' and the 'text' fields.
# For example, if an answer in the original context is 'happy moments', the answer in the modified context may be 'joyful moments'.
# Not only do the words change; the start character position changes as well.
# To retrieve the new answer_starts, the old answer positions are processed in sorted order and one flag is kept per position.
# A flag stores the change in string length accumulated before its position. For example, 'happy moments' may have
# answer_start = 53 in the old context while 'joyful moments' has answer_start = 61 in the new context; the flag is then 8.
# The logic to update a flag is flag <-- flag + len(synonym) - len(old word), applied for every replacement made before that position.
# The flags are collected in flag_list, which is returned as output.


# Takes a context paragraph and the character positions of the answers that belong to this context's questions.
# Outputs the token list with some words replaced by synonyms, and a flag list that holds, for each position in
# ascending order, the number of characters by which that position moves.

def modify_contexts(context, ans_start_list):
  """Randomly swap words of `context` for synonyms and record how far each answer position moves.

  The context is tokenized with word_tokenize. A token is replaced when it consists only of
  characters from `all_chars`, wins the 0.2-probability draw, is not a stop word (compared
  case-insensitively, so capitalised stop words are protected too) and get_synonyms returns
  at least one synonym for it.

  Token positions are looked up in `context` itself rather than estimated from token lengths,
  so a position is shifted exactly when the replaced word ends at or before it.

  Args:
    context: the original paragraph text.
    ans_start_list: character positions in `context`. The list is not modified.

  Returns:
    (output, flag_list): `output` is the token list with the substitutions applied;
    `flag_list[k]` is the accumulated len(synonym) - len(word) of all replacements that end
    at or before the k-th smallest position of `ans_start_list`.
  """
  # Work on a sorted copy so that the caller's list is left untouched.
  sorted_starts = sorted(ans_start_list)
  flag_list = [0] * len(sorted_starts)
  output = []

  # Drawing from this pool yields 0 with p = 0.2
  replace_draw = (0, 1, 1, 1, 1)
  replaceable_chars = set(all_chars)

  # Index in `context` just past the most recently located token.
  cursor = 0
  for word in word_tokenize(context):
    found_at = context.find(word, cursor)
    if found_at != -1:
      cursor = found_at + len(word)
    # Otherwise the token text does not occur verbatim (presumably rewritten by the tokenizer); keep the cursor.
    word_end = cursor

    # If word contains anything other than alphabets, we don't find synonyms
    if not set(word).issubset(replaceable_chars):
      output.append(word)
      continue

    # The draw is made first for every alphabetic token; stop words are then skipped regardless of case.
    if random.choice(replace_draw) != 0 or word.lower() in stop_words:
      output.append(word)
      continue

    temp = get_synonyms(word)
    if len(temp) == 0:
      output.append(word)
      continue

    substitute = random.choice(temp)
    length_change = len(substitute) - len(word)
    for i, start in enumerate(sorted_starts):
      # Only positions located after the replaced word move.
      if start >= word_end:
        flag_list[i] += length_change
    output.append(substitute)

  return output, flag_list

# Input is a list of words and output is a string which contains all the words in a coherent manner (paragraph)
import re
def untokenize(words):
    """Join tokens back into a passage, undoing the spacing introduced by tokenization.

    The tokens are joined with single spaces and the result is cleaned up in
    order: quote markers and spaced ellipses, spaces inside parentheses,
    spaces before punctuation, spaces before apostrophes and "n't", the split
    "can not", back-ticks, ampersands and square brackets.

    "can not" is only merged when it stands as two whole words; a plain
    substring replace would also fuse text such as "American notes" into
    "Americannotes".

    Args:
        words: list of token strings.

    Returns:
        str: the reassembled text, stripped of leading/trailing whitespace.
    """
    text = ' '.join(words)
    step1 = text.replace("`` ", '"').replace(" ''", '"').replace('. . .',  '...')
    step2 = step1.replace(" ( ", " (").replace(" ) ", ") ")
    step3 = re.sub(r' ([.,:;?!%]+)([ \'"`])', r"\1\2", step2)
    step4 = re.sub(r' ([.,:;?!%]+)$', r"\1", step3)
    step5 = step4.replace(" '", "'").replace(" n't", "n't")
    # Word boundaries keep words that merely end in "can" (e.g. "American") intact.
    step5 = re.sub(r"\bcan not\b", "cannot", step5)
    step6 = step5.replace(" ` ", " '")
    step7 = step6.replace(" & ", "&")
    step8 = step7.replace(" [ ", " [").replace(" ] ", "] ")
    return step8.strip()



In [117]:
# Takes the original (unsorted) list of positions and the flag_list computed for those positions in sorted order,
# and returns the flags re-ordered so that entry k belongs to arr[k].
# This helps us update the new 'answer_start' and 'text' easily

def find_new_indices(arr, flag_list):
  """Map flags computed for the sorted positions back onto the original order of `arr`.

  Implemented with the built-in sorted() so the function does not depend on numpy having
  been imported by another cell. Neither argument is modified.

  Args:
    arr: positions in their original order.
    flag_list: flags where flag_list[r] belongs to the r-th smallest value of `arr`.

  Returns:
    list: a copy of `flag_list` in which entry k is the flag of arr[k].
  """
  # Indices of arr in ascending order of value; ties keep their original order.
  sort_index = sorted(range(len(arr)), key=arr.__getitem__)
  flag_list_return = flag_list.copy()
  for rank, elem in enumerate(sort_index):
    flag_list_return[elem] = flag_list[rank]
  return flag_list_return

In [133]:
# Making another copy to be modified and to demonstrate how a paragraph and corresponding answers are changed.
# The file is read again (instead of reusing train_data) so that the two dictionaries share no objects.
# The encoding is explicit because open() otherwise uses a platform-dependent default.
with open('train-v2.0.json', encoding='utf-8') as f:
  train_data2 = json.load(f)

In [134]:
# Iterating and modifying the train_data to generate synonym contexts.
# NOTE: this cell rewrites train_data2 in place; reload train_data2 before running it a second time,
# otherwise the already modified text is modified again.

import numpy

# Seed the generator so that repeated runs draw the same sequence of random choices.
random.seed(0)

for elems442 in train_data2['data']:
  for paras_under_title in elems442['paragraphs']:
    # The record to rewrite for each question: its first answer, or its first plausible answer
    # when the question is marked as impossible.
    answer_records = []
    for each_ques in paras_under_title['qas']:
      answer_key = 'plausible_answers' if each_ques['is_impossible'] else 'answers'
      answer_records.append(each_ques[answer_key][0])

    # Start and end character position of every answer, interleaved: [start0, end0, start1, end1, ...]
    ans_start_arr = []
    for record in answer_records:
      ans_start_arr.append(record['answer_start'])
      ans_start_arr.append(record['answer_start'] + len(record['text']))

    # pass current context and a copy of the positions into modify_contexts method.
    ans, flag_list = modify_contexts(paras_under_title['context'], ans_start_arr.copy())
    # untokenize the list of words into single string
    new_final_ans = untokenize(ans)

    # "UNSORT" the flag values such that they correspond to the positions in ans_start_arr
    fl = find_new_indices(ans_start_arr, flag_list)

    # overwrite the answer text and answer_start in the dataset with new values
    for k, record in enumerate(answer_records):
      new_start = ans_start_arr[2 * k] + fl[2 * k]
      new_end = ans_start_arr[2 * k + 1] + fl[2 * k + 1]
      record['answer_start'] = new_start
      record['text'] = new_final_ans[new_start:new_end]

    # Store the modified paragraph too: the new answer_start values index into this text,
    # not into the original context.
    paras_under_title['context'] = new_final_ans


In [135]:
# This is what the question-answer entries look like after the modification. Notice the change in 'answer_start' and 'text'.
# For reference, the original question-answer entries are printed in the next cell.
train_data2['data'][0]['paragraphs'][0]['qas']

[{'answers': [{'answer_start': 268, 'text': 'in the late 1990s'}],
  'id': '56be85543aeaaa14008c9063',
  'is_impossible': False,
  'question': 'When did Beyonce start becoming popular?'},
 {'answers': [{'answer_start': 204, 'text': 'singing and saltation'}],
  'id': '56be85543aeaaa14008c9065',
  'is_impossible': False,
  'question': 'What areas did Beyonce compete in when she was growing up?'},
 {'answers': [{'answer_start': 534, 'text': '2003'}],
  'id': '56be85543aeaaa14008c9066',
  'is_impossible': False,
  'question': "When did Beyonce leave Destiny's Child and become a solo singer?"},
 {'answers': [{'answer_start': 166, 'text': 'Houston, tx'}],
  'id': '56bf6b0f3aeaaa14008c9601',
  'is_impossible': False,
  'question': 'In what city and state did Beyonce  grow up? '},
 {'answers': [{'answer_start': 275, 'text': 'late 1990s'}],
  'id': '56bf6b0f3aeaaa14008c9602',
  'is_impossible': False,
  'question': 'In which decade did Beyonce become famous?'},
 {'answers': [{'answer_start': 32

In [136]:
train_data['data'][0]['paragraphs'][0]['qas']

[{'answers': [{'answer_start': 269, 'text': 'in the late 1990s'}],
  'id': '56be85543aeaaa14008c9063',
  'is_impossible': False,
  'question': 'When did Beyonce start becoming popular?'},
 {'answers': [{'answer_start': 207, 'text': 'singing and dancing'}],
  'id': '56be85543aeaaa14008c9065',
  'is_impossible': False,
  'question': 'What areas did Beyonce compete in when she was growing up?'},
 {'answers': [{'answer_start': 526, 'text': '2003'}],
  'id': '56be85543aeaaa14008c9066',
  'is_impossible': False,
  'question': "When did Beyonce leave Destiny's Child and become a solo singer?"},
 {'answers': [{'answer_start': 166, 'text': 'Houston, Texas'}],
  'id': '56bf6b0f3aeaaa14008c9601',
  'is_impossible': False,
  'question': 'In what city and state did Beyonce  grow up? '},
 {'answers': [{'answer_start': 276, 'text': 'late 1990s'}],
  'id': '56bf6b0f3aeaaa14008c9602',
  'is_impossible': False,
  'question': 'In which decade did Beyonce become famous?'},
 {'answers': [{'answer_start': 3

In [137]:
# Augmented data in this dictionary: same layout as the SQuAD file, with a shallow copy of every modified entry.
temp_dict = {
  'version': 'v2.0',
  'data': [entry.copy() for entry in train_data2['data']],
}

In [138]:
# replication-factor = 2: append a shallow copy of every modified entry after the original entries.
train_data['data'].extend(entry.copy() for entry in train_data2['data'])

In [139]:
# Serialize the combined training set and write it out as one JSON document.
serialized = json.dumps(train_data)
with open("finaloutputfile.json", "w") as outfile:
    outfile.write(serialized)