# Data Collection: FDA COVID-19 Q&A

Source: https://www.fda.gov/emergency-preparedness-and-response/coronavirus-disease-2019-covid-19/covid-19-frequently-asked-questions

In [1]:
import json
import os
from time import sleep

import requests
from bs4 import BeautifulSoup

In [2]:
# Address of the FDA COVID-19 FAQ page that the cells below fetch and parse.
# The literal is split across lines only for readability; adjacent string
# literals are concatenated into a single value.
url = (
    'https://www.fda.gov'
    '/emergency-preparedness-and-response'
    '/coronavirus-disease-2019-covid-19'
    '/covid-19-frequently-asked-questions'
)

In [3]:
# HTTP request headers for the page fetch.
# Only a browser-like User-Agent is set. CORS `Access-Control-*` headers are
# deliberately omitted: they are response headers emitted by servers and carry
# no meaning on an outgoing request.
headers = {
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
}

In [4]:
# Download the FAQ page and parse it into a BeautifulSoup tree.
# `headers` has to be passed by keyword: the second positional parameter of
# requests.get() is `params`, so a positional dict would be appended to the
# URL's query string instead of being sent as HTTP headers.
req = requests.get(url, headers=headers, timeout=30)
# Fail loudly on a 4xx/5xx response rather than silently parsing an error page.
req.raise_for_status()
soup = BeautifulSoup(req.content, 'html.parser')

In [5]:
# Display the parsed document for a quick visual check.
# NOTE(review): this renders the entire page; consider showing only a small
# part (e.g. `soup.title`) to keep the saved notebook small.
soup

<!DOCTYPE html>

<html dir="ltr" lang="en" prefix="content: http://purl.org/rss/1.0/modules/content/  dc: http://purl.org/dc/terms/  foaf: http://xmlns.com/foaf/0.1/  og: http://ogp.me/ns#  rdfs: http://www.w3.org/2000/01/rdf-schema#  schema: http://schema.org/  sioc: http://rdfs.org/sioc/ns#  sioct: http://rdfs.org/sioc/types#  skos: http://www.w3.org/2004/02/skos/core#  xsd: http://www.w3.org/2001/XMLSchema# ">
<head>
<meta charset="utf-8"/>
<script async="" src="https://www.googletagmanager.com/gtag/js?id=UA-22737364-1"></script>
<meta content="The FDA is working with U.S. government partners including the CDC, medical product manufacturers, and international partners to closely monitor and mitigate the effects of COVID-19.  These frequently asked questions are for a general public or consumer audience." name="description"/>
<meta content="COVID-19 Frequently Asked Questions" name="dcterms.title"/>
<meta content="Office of the Commissioner" name="dcterms.creator"/>
<meta content="Th

In [6]:
# Locate the page's main content container; everything extracted below lives
# inside it.
main_dom = soup.select_one('div[role="main"]')
# select_one() returns None when nothing matches. Stop here with a clear message
# instead of hitting an AttributeError on None in a later cell.
if main_dom is None:
    raise RuntimeError('Could not find div[role="main"] in the page fetched from ' + url)

In [15]:
# Collect every tag nested anywhere inside the main container, in document order.
# Despite the variable name, this is NOT limited to immediate children:
# find_all() searches recursively by default, and the extraction loop below
# relies on seeing the nested headings and accordion groups.
directChildren = main_dom.find_all()

In [16]:
# Display the collected tags for inspection.
# NOTE(review): this prints every collected element; a slice such as
# `directChildren[:3]` would keep the output short.
directChildren

[<div><div class="row"> <div class="col-sm-5"> <div type="info">
 <div class="alert alert-info">
 <p><strong>On this page:</strong></p>
 <ul><li><a href="#general">General Information</a></li>
 <li><a href="#biologics">Vaccines, Biologics, Human Tissues, and Blood Products</a></li>
 <li><a href="#drugs">Drugs (Medicines)</a></li>
 <li><a href="#devices">Medical Devices Including Tests for COVID-19</a></li>
 <li><a href="#food">Food Products</a></li>
 <li><a href="#animals">Animals, Pets and Animal Drug Products</a></li>
 </ul></div>
 </div>
 </div>
 <div class="col-sm-7">
 <div alt="Woman looking at computer with man in the background" class="embedded-entity" data-embed-button="media_browser" data-entity-embed-display="media_image" data-entity-type="media" data-entity-uuid="b5e86406-b8f2-4d16-b6e1-1cbff17db696" data-langcode="en" title="COVID-19 FAQs page image"> <img alt="Woman looking at computer with man in the background" class="img-responsive" src="/files/1600x900-covid19-QA-subpa

In [45]:
# Build `all_data`: one entry per FAQ topic, each holding its question/answer pairs.
#
# The loop walks the collected tags in document order. A plain <h2> sets the
# current topic title; the next element carrying the `panel-group` class is
# treated as that topic's accordion, and each `.fda-accordion-panel` inside it
# yields one question (its first <h2>) and one answer (its first `.panel-body`).

# Number of leading characters dropped from every question and answer text.
# NOTE(review): presumably this strips a "Q: " / "A: " style label - confirm
# against the live page before changing it.
PREFIX_LEN = 3

all_data = []
cur_title = ''

for cur_child in directChildren:
    # Tags without a class attribute yield None here; normalise to an empty list.
    child_classes = cur_child.get('class') or []

    if cur_child.name == 'h2' and cur_child.text:
        # An <h2> with class `panel-title` is a question heading inside an
        # accordion, not a topic title, so it must not overwrite cur_title.
        if 'panel-title' not in child_classes:
            cur_title = cur_child.text.strip()
            print('|--- ', cur_title)
        # A non-empty heading is never itself a topic container.
        continue

    if 'panel-group' not in child_classes:
        continue

    # Current topic
    print('* Topic found! ', cur_title)
    cur_faqs = cur_child.select('.fda-accordion-panel')
    print('|---> # FAQs: ', len(cur_faqs))

    cur_data = []
    for cur_faq in cur_faqs:
        question_tag = cur_faq.select_one('h2')
        answer_tag = cur_faq.select_one('.panel-body')
        if question_tag is None or answer_tag is None:
            # A panel missing its heading or body cannot produce a Q&A pair;
            # report it and move on instead of raising IndexError.
            print('|---> ! Skipping malformed FAQ panel in: ', cur_title)
            continue

        cur_data.append({
            'question': question_tag.text.strip()[PREFIX_LEN:],
            'answer': answer_tag.text.strip()[PREFIX_LEN:],
        })

    all_data.append({
        'title': cur_title,
        'url': url,
        'data': cur_data
    })
            

|---  General Information
* Topic found!  General Information
|---> # FAQs:  18
|---  Vaccines, Biologics, Human Tissues, and Blood Products
* Topic found!  Vaccines, Biologics, Human Tissues, and Blood Products
|---> # FAQs:  14
|---  Drugs (Medicines)
* Topic found!  Drugs (Medicines)
|---> # FAQs:  18
|---  Medical Devices Including Tests for COVID-19
* Topic found!  Medical Devices Including Tests for COVID-19
|---> # FAQs:  10
|---  Food Products
* Topic found!  Food Products
|---> # FAQs:  14
|---  Animals, Pets and Animal Drug Products
* Topic found!  Animals, Pets and Animal Drug Products
|---> # FAQs:  13


In [46]:
# Display the extracted topics and their question/answer pairs for inspection.
# NOTE(review): this prints the full dataset; `all_data[0]['data'][:2]` would
# give a shorter preview.
all_data

[{'title': 'General Information',
  'url': 'https://www.fda.gov/emergency-preparedness-and-response/coronavirus-disease-2019-covid-19/covid-19-frequently-asked-questions',
  'data': [{'question': 'What is the FDA doing to respond to the COVID-19 pandemic?',
    'answer': 'The FDA, along with other federal, state, and local agencies and public health officials across the country and internationally, plays a critical role in protecting public health during the COVID-19 pandemic. FDA staff are working around the clock to support development of \xa0medical countermeasures and are providing regulatory advice, guidance, and technical assistance to advance the development and availability of vaccines, therapies, diagnostic tests and other medical devices for use diagnosing, treating, and preventing this novel virus. The FDA continues to monitor the human and animal food supply and take swift action on fraudulent COVID-19 products.'},
   {'question': 'What is an emergency use authorization and

Save the collected Q&A data to a JSON file (`qna/fda.json`).

In [47]:
# Write the collected topics to qna/fda.json.
# The output directory is created first so the cell also works on a fresh
# checkout where `qna/` does not exist yet.
os.makedirs("qna", exist_ok=True)
with open("qna/fda.json", "w", encoding="utf-8") as outfile:
    json.dump(all_data, outfile)