# Data Collection: CDC Covid-19 Q & A

https://www.cdc.gov/coronavirus/2019-ncov/faq.html

In [1]:
from time import sleep
import json  

import requests
from bs4 import BeautifulSoup

In [2]:
def get_url():
    # To update for Vaccine
    url = 'https://www.cdc.gov/coronavirus/2019-ncov/faq.html'
    return url

In [3]:
url = get_url()

In [4]:
headers = {
    'Access-Control-Allow-Origin': '*',
    'Access-Control-Allow-Methods': 'GET',
    'Access-Control-Allow-Headers': 'Content-Type',
    'Access-Control-Max-Age': '3600',
    'User-Agent': 'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0'
    }

### Collect data from the main FAQ page

https://www.cdc.gov/coronavirus/2019-ncov/faq.html

In [5]:
def collect_data(url, prefix_title=""):
    req = requests.get(url, headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    
    all_rows = soup.select('.row .splash-col')
    all_topics = []
    for cur_row in all_rows:
        cur_topic = cur_row.select('.cdc-faq-item')
        if (len(cur_topic) > 0):
            all_topics.append(cur_row)
        
    data_main_topics = []

    for cur_topic in all_topics:

        # Get main topic info
        cur_title = cur_topic.select_one('h2').text.strip()
        
        if prefix_title:
            
            if cur_title:
                cur_title = prefix_title + " : " + cur_title
            else:
                cur_title = prefix_title
        
        
        cur_url = url 

        # ============
        # Get Q&A data for the current main topic
        # ============

        cur_headers = cur_topic.select('.card-header')
        cur_bodies = cur_topic.select('.card-body')

        cur_topic_data = []
        for cur_question, cur_answer in zip(cur_headers, cur_bodies):
            cur_topic_data.append({'question': cur_question.text.strip(), 'answer': cur_answer.text.strip()})

        # ====
        # Assemble data for the current topic
        # ====
        data_main_topics.append({
            'title': cur_title,
            'url': cur_url,
            'data': cur_topic_data
        })
        
    return data_main_topics

In [6]:
all_main_topics = collect_data(url)

In [7]:
all_main_topics

[{'title': 'Basics',
  'url': 'https://www.cdc.gov/coronavirus/2019-ncov/faq.html',
  'data': [{'question': 'What is COVID-19?',
    'answer': 'COVID-19 is a disease caused by a virus called SARS-CoV-2. Most people with COVID-19 have mild symptoms, but some people can become severely ill. Although most people with COVID-19 get better within weeks of illness, some people experience post-COVID conditions. Post-COVID conditions\xa0are a wide range of new, returning, or ongoing health problems people can experience\xa0more than four weeks after first being infected with the virus that causes COVID-19. Older people and those who have certain underlying medical conditions\xa0are more likely to get severely ill from COVID-19. Vaccines\xa0against COVID-19 are safe and effective.'},
   {'question': 'Why is the disease being called coronavirus disease 2019, COVID-19?',
    'answer': 'On February 11, 2020 the World Health Organization announced an official name for the disease that is causing the

## Collect data for additional FAQs

CDC also have a FAQ page for additional areas (e.g. Vaccine, Trave, etc.) and we need to scrape data from those page

In [64]:
# .row.footnotes .card-body li a
url = get_url()
req = requests.get(url, headers)
soup = BeautifulSoup(req.content, 'html.parser')

In [72]:
all_lis = soup.select('.card-body.bg-quaternary li a')
all_topics_more_faqs = []
prefix_url = 'https://www.cdc.gov'

for cur_li in all_lis:
    cur_title = cur_li.text
    cur_url = cur_li.get('href')
    if (not cur_url.startswith(prefix_url)):
        cur_url = prefix_url + cur_url
    
    # Add a small pause to prevent a DoS block 
    sleep(0.5)
    
    # Only collect data for an available faq
    cur_req = requests.get(cur_url, headers)
    cur_soup = BeautifulSoup(cur_req.content, 'html.parser')
    
    if (len(cur_soup.select('.cdc-faq')) > 0):
        all_topics_more_faqs.append({
            'title': cur_title,
            'url': cur_url
        })
        
    

In [92]:
data_more_faqs = []
for cur_more_faq in all_topics_more_faqs:
    cur_theme = cur_more_faq['title']
    cur_url = cur_more_faq['url']

    # Add a small pause to prevent a DoS block 
    sleep(0.5)
    
    req = requests.get(cur_url, headers)
    soup = BeautifulSoup(req.content, 'html.parser')
    
    all_rows = soup.select('.row .splash-col')
    find_topic_rows = None
    for cur_row in all_rows:
        cur_topic = cur_row.select('.cdc-faq-item')
        if (len(cur_topic) > 0):
            find_topic_rows = cur_row
            
    directChildren = find_topic_rows.findChildren()
    
    all_data_more_faqs = []
    cur_title = cur_theme

    for cur_child in directChildren:

        if ((cur_child.name == 'h2') and cur_child.text):
            new_title = cur_child.text.strip()
            
            if new_title:
                cur_title = cur_title + " : " + new_title
            
            is_new_title = True
        else:
            is_new_title = False

        if (not is_new_title):
            cur_data = []

            if cur_child.has_attr('class'):

                if 'cdc-faq' in cur_child.get("class"):
                    # Current topic

                    print('* Topic found! ', cur_title)
                    cur_faqs = cur_child.select('.cdc-faq-item')
                    print('|---> # FAQs: ', len(cur_faqs))
                    for cur_faq in cur_faqs:
                        cur_headers = cur_faq.select('.card-header')
                        cur_question = cur_headers[0].text.strip()

                        cur_bodies = cur_faq.select('.card-body')
                        cur_answer = cur_bodies[0].text.strip()

                        cur_data.append({'question': cur_question, 'answer': cur_answer})

                    data_more_faqs.append({
                        'title': cur_title,
                        'url': cur_url,
                        'data': cur_data
                    })

* Topic found!  Vaccines
|---> # FAQs:  11
* Topic found!  Travel : General
|---> # FAQs:  3
* Topic found!  Travel : General : Domestic Travel
|---> # FAQs:  2
* Topic found!  Travel : General : Domestic Travel : International Travel
|---> # FAQs:  5
* Topic found!  Travel : General : Domestic Travel : International Travel : Air or Cruise Travel
|---> # FAQs:  4
* Topic found!  Healthcare Professionals : COVID-19 Risk
|---> # FAQs:  6
* Topic found!  Healthcare Professionals : COVID-19 Risk : Infection Control
|---> # FAQs:  18
* Topic found!  Healthcare Professionals : COVID-19 Risk : Infection Control : Transmission
|---> # FAQs:  3
* Topic found!  Healthcare Professionals : COVID-19 Risk : Infection Control : Transmission : Testing, Diagnosis, and Notification
|---> # FAQs:  6
* Topic found!  Healthcare Professionals : COVID-19 Risk : Infection Control : Transmission : Testing, Diagnosis, and Notification : Testing in Nursing Homes
|---> # FAQs:  10
* Topic found!  Healthcare Profe

In [96]:
len(data_more_faqs)

32

In [97]:
data_combined = all_main_topics + data_more_faqs

In [98]:
len(data_combined)

48

Save data to .json file

In [99]:
with open("qna/cdc.json", "w") as outfile: 
    json.dump(data_combined, outfile)