## Convert the COVID FAQ data to the SQuAD JSON format

In [1]:
import json
from uuid import uuid4
import pandas as pd

In [2]:
from ast import literal_eval

# Load the question/answer table, using the first CSV column as the index.
df = pd.read_csv("formatted_questions.csv", encoding="utf8", index_col=[0])

# Switch to the field names used further down: "A1" -> "text", "category" -> "title".
df.rename(columns={"A1": "text", "category": "title"}, inplace=True)

# The CSV stores these two columns as Python-literal strings; parse them back
# into real Python objects (lists, judging by the preview below).
df["text"] = df["text"].apply(literal_eval)
df["answer_start"] = df["answer_start"].apply(literal_eval)

In [4]:
# Preview the first rows of the loaded frame.
df.head()

Unnamed: 0,title,question,context,text,answer_start
0,About the Virus FAQs,Am I at risk for COVID-19 from a package or pr...,There is still a lot that is unknown about the...,[very low risk],[620]
1,About the Virus FAQs,Am I likely to get sicker if I'm exposed to mu...,Symptom severity can be influenced by many dif...,[stay at least 6 feet away from everyone and a...,[679]
2,About the Virus FAQs,Are antibiotics effective in preventing or tre...,No. According to the World Health Organization...,[No],[0]
3,About the Virus FAQs,Are there therapies available to treat COVID-19?,Scientists are currently testing different typ...,"[Remdesivir, Dexamethasone, favipiravir, ribav...","[167, 530, 811, 824, 838]"
4,About the Virus FAQs,Are there two strains of the COVID-19 virus?,The existence of an S strain and an L strain r...,[The existence of an S strain and an L strain ...,[0]


In [4]:
# Replace every newline in the contexts with a single space. Each "\n" maps to
# exactly one character, so the stored answer_start offsets remain valid.
# The whole column is assigned at once: per-cell chained indexing
# (df['context'][i] = ...) is not guaranteed by pandas to write through to df,
# and .str.replace also leaves missing contexts untouched instead of raising.
df['context'] = df['context'].str.replace('\n', ' ', regex=False)
    

In [5]:
# Strip newlines from every answer string and store the cleaned lists back in
# the frame. Rebinding the loop variable alone would throw the cleaned string
# away and leave df['text'] unchanged.
# NOTE(review): a space is used as the replacement (not "") so each answer
# still matches the context, where "\n" is also turned into a single space;
# this keeps context[answer_start:answer_start + len(text)] == text.
df['text'] = df['text'].apply(
    lambda answers: [answer.replace('\n', ' ') for answer in answers]
)
    

In [7]:
def extract_answers(series, answer_list):
    """Append one answer dict per (text, answer_start) pair found in a row.

    Parameters
    ----------
    series : mapping-like (e.g. a DataFrame row)
        Must provide "text" and "answer_start" entries holding parallel
        sequences.
    answer_list : list
        Accumulator extended in place with
        ``{"text": ..., "answer_start": ...}`` dicts. Nothing is returned.
    """
    # zip stops at the shorter sequence, so entries without a partner are dropped.
    pairs = zip(series["text"], series["answer_start"])
    answer_list.extend(
        [{"text": answer, "answer_start": offset} for answer, offset in pairs]
    )



In [8]:

def group_by_title(df, data_dict):
    """Fold one (title, question) group into the nested SQuAD-style structure.

    Parameters
    ----------
    df : pd.DataFrame
        Rows of a single group obtained by grouping on "title" and "question".
        The "title", "question", "context", "text" and "answer_start" columns
        are read.
    data_dict : dict
        Accumulator keyed by title and mutated in place. Each value has the
        form ``{"title": ..., "paragraphs": [...]}``; one paragraph (context
        plus a single question with its answers) is appended per call.

    Returns
    -------
    None
    """
    # Group frames keep the index labels of the parent frame, so label 0 only
    # exists in the group that happens to contain that row. Read the first
    # row by position instead of by label.
    title = df["title"].iloc[0]
    context = df["context"].iloc[0]

    qas = {
        "question": df["question"].iloc[0],
        "id": str(uuid4()),  # fresh random ID for every question
        "answers": [],
        "is_impossible": False,
    }

    # Every row of the group contributes its answers to the same question.
    df.apply(extract_answers, axis=1, answer_list=qas["answers"])

    # Create the title entry on first sight, then append the new paragraph.
    entry = data_dict.setdefault(title, {"title": title, "paragraphs": []})
    entry["paragraphs"].append({"qas": [qas], "context": context})


In [9]:
# Accumulator that group_by_title fills in place, keyed by title.
data = {}

In [10]:
# Feed every (title, question) group to group_by_title, which fills `data` in
# place and returns nothing. A plain loop over the groups is used rather than
# GroupBy.apply: each group frame yielded here still contains the "title" and
# "question" columns the function reads, and the function is called exactly
# once per group.
for _, group in df.groupby(["title", "question"]):
    group_by_title(group, data_dict=data)


In [11]:
# Assemble the top-level payload: a version tag plus one entry per title,
# in the order the titles were inserted into `data`.
version = {"version": "v2", "data": list(data.values())}

In [12]:
# Sanity check: the payload should be a plain dict before it is serialized.
type(version)

dict

In [13]:
# Pretty-print the payload to inspect the nesting visually.
print(json.dumps(version, indent=2))

{
  "version": "v2",
  "data": [
    {
      "title": "About the Virus FAQs",
      "paragraphs": [
        {
          "qas": [
            {
              "question": "Am I at risk for COVID-19 from a package or products shipping from China?",
              "id": "aaa89796-01c7-4108-b6a2-ea6990cfc4f5",
              "answers": [
                {
                  "text": "very low risk",
                  "answer_start": 620
                }
              ],
              "is_impossible": false
            }
          ],
          "context": "There is still a lot that is unknown about the newly emerged COVID-19 and how it spreads. Two other coronaviruses have emerged previously to cause severe illness in people (MERS-CoV and SARS-CoV). The virus that causes COVID-19 is more genetically related to SARS-CoV than MERS-CoV, but both are betacoronaviruses with their origins in bats. While we don't know for sure that this virus will behave the same way as SARS-CoV and MERS-CoV, we can us

In [15]:
# Write the payload to disk. The context manager closes the file even when
# serialization raises, and the encoding is pinned so the result does not
# depend on the platform default.
with open("covid2squad_div.json", "w", encoding="utf-8") as out_file:
    json.dump(version, out_file, indent=2)