# Data Collection: WHO Covid-19 Q & A

https://www.who.int/emergencies/diseases/novel-coronavirus-2019/question-and-answers-hub

In [1]:
import json
from pathlib import Path
from time import sleep
from urllib.parse import urljoin

import requests
from bs4 import BeautifulSoup


See the following blog for details:
https://hackersandslackers.com/scraping-urls-with-beautifulsoup/

In [2]:
def get_url(x):
    """Return the absolute URL on www.who.int for the link target ``x``.

    Parameters
    ----------
    x : str
        A link target as found in an ``href`` attribute. Site-relative
        paths such as ``'/emergencies/...'`` are resolved against
        ``https://www.who.int``; targets that are already absolute URLs are
        returned unchanged instead of being glued onto the host name.

    Returns
    -------
    str
        The absolute URL.

    Raises
    ------
    TypeError
        If ``x`` is not a string (for example ``None`` from a tag that has
        no ``href`` attribute).
    """
    # urljoin() would quietly return the base URL for None, which hides
    # missing links; keep failing loudly as plain string concatenation did.
    if not isinstance(x, str):
        raise TypeError('link target must be a str, got ' + type(x).__name__)
    return urljoin('https://www.who.int', x)

In [3]:
# Site-relative path of the WHO Covid-19 Q&A hub page; get_url() turns it
# into the full address that is requested later in the notebook.
hub_path = '/emergencies/diseases/novel-coronavirus-2019/question-and-answers-hub'
url = get_url(hub_path)

In [4]:
# Display the full hub URL as the cell output.
url

'https://www.who.int/emergencies/diseases/novel-coronavirus-2019/question-and-answers-hub'

In [5]:
# Header fields handed to requests.get() for every page fetched by this notebook.
# NOTE(review): the Access-Control-* entries are normally CORS *response*
# headers set by a server; presumably only 'User-Agent' matters for these
# requests -- confirm before removing the others.
headers = dict([
    ('Access-Control-Allow-Origin', '*'),
    ('Access-Control-Allow-Methods', 'GET'),
    ('Access-Control-Allow-Headers', 'Content-Type'),
    ('Access-Control-Max-Age', '3600'),
    (
        'User-Agent',
        'Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:52.0) Gecko/20100101 Firefox/52.0',
    ),
])

In [6]:
# Download and parse the Q&A hub page.
# The header dict has to be passed by keyword: the second positional argument
# of requests.get() is `params`, which would append the values to the query
# string instead of sending them as HTTP headers.
req = requests.get(url, headers=headers, timeout=30)  # timeout in seconds, so a stalled connection cannot hang the notebook
req.raise_for_status()  # stop here on a 4xx/5xx answer rather than parsing an error page
soup = BeautifulSoup(req.content, 'html.parser')

In [14]:
# Every element of the hub page that carries this CSS class is collected;
# the scraping loop later reads a title and an href from each of them.
topic_item_selector = '.sf-list-vertical__item'
all_topics = soup.select(topic_item_selector)

In [None]:
# Scrape every Covid-19 topic linked from the hub page.
# Result: a list of records shaped like
#   {'title': str, 'url': str, 'data': [{'question': str, 'answer': str}, ...]}
data_all_topics = []

for cur_topic in all_topics:

    # Get main topic info. An item without a title element cannot be
    # classified, so it is skipped instead of raising AttributeError.
    cur_title_tag = cur_topic.select_one('.full-title')
    if cur_title_tag is None:
        continue
    cur_title = cur_title_tag.text.strip()

    # Split on the first colon only, so a sub-title that itself contains a
    # colon is kept whole.
    cur_prefix, cur_sep, cur_subtitle = cur_title.partition(':')

    # Skip topics that are not related to covid-19 general info,
    # e.g. 'In vitro diagnostics: Prequalification'
    if 'covid-19' not in cur_prefix.lower():
        continue

    # Titles without a sub-title are filed under 'General'; the sub-title is
    # stripped so it does not keep the blank that follows the colon.
    cur_topic_title = cur_subtitle.strip() or 'General'

    # An item without a link target has no page to scrape.
    cur_href = cur_topic.get('href')
    if not cur_href:
        continue
    cur_url = get_url(cur_href)

    # ============
    # Get Q&A data for the current main topic
    # ============

    # Add a small pause to prevent a DoS block
    sleep(0.5)

    # Pass the header dict by keyword: the second positional argument of
    # requests.get() is `params`, which would put the values into the query
    # string instead of sending them as HTTP headers.
    cur_req = requests.get(cur_url, headers=headers, timeout=30)
    cur_req.raise_for_status()  # do not record an error page as an empty topic
    cur_soup = BeautifulSoup(cur_req.content, 'html.parser')

    cur_qna_panels = cur_soup.select('.sf-accordion__panel')

    # Collected in its own list rather than rebinding the loop variable.
    cur_qna_list = []
    for cur_qna in cur_qna_panels:
        cur_question_tag = cur_qna.select_one('.sf-accordion__trigger-panel')
        cur_answer_tag = cur_qna.select_one('.sf-accordion__summary')

        # Panels lacking either element (e.g. pages that do not follow the
        # default template) are skipped instead of aborting the whole run.
        if cur_question_tag is None or cur_answer_tag is None:
            continue

        cur_qna_list.append({
            'question': cur_question_tag.text.strip(),
            # decode_contents() returns the element's inner markup as a
            # string, so the answer keeps its HTML rather than plain text.
            'answer': cur_answer_tag.decode_contents(),
        })

    # ====
    # Assemble data for the current topic
    # ====
    data_all_topics.append({
        'title': cur_topic_title,
        'url': cur_url,
        'data': cur_qna_list
    })


In [None]:
# Display the collected topic records as the cell output.
data_all_topics

Save the scraped data to a .json file

In [None]:
# Write the scraped topics to qna/who.json (relative to the working directory).
out_path = Path("qna/who.json")
# Create the output directory first; opening the file would otherwise fail
# with FileNotFoundError when 'qna/' does not exist yet.
out_path.parent.mkdir(parents=True, exist_ok=True)
# An explicit encoding keeps the file independent of the platform default.
with out_path.open("w", encoding="utf-8") as outfile:
    json.dump(data_all_topics, outfile)

### TODO

1. Update the script to correctly get data from pages that do not follow the default template, e.g.
https://www.who.int/emergencies/diseases/novel-coronavirus-2019/question-and-answers-hub/q-a-detail/coronavirus-disease-covid-19-working-in-hotels

2. Clean the data so that it matches the web app format
3. Create a standalone Python program for this
4. Scrape data from CDC: https://www.cdc.gov/coronavirus/2019-ncov/faq.html