# MERS Data Assembly Script
*Maimuna S. Majumder*

In [24]:
# This script utilizes the following libraries: 
# * requests, for making HTTP requests and pulling HTML content 
# * pandas, for organizing the data into an easily manipulatable table and then exporting said table 
# * regular expressions (re), for finding the relevant content on the source data page 
# * time, to suspend the script for a short period of time between each HTTP request to avoid rate limiting
# * sys, for error tracking
import requests,pandas as pd,re,time,sys
from bs4 import BeautifulSoup
from datetime import timedelta
import datetime 

In [25]:
#Read the CSV as it exists currently; the main loop adds its new case rows to this table
#index_col=False tells pandas not to use the first column of the file as the row index
df=pd.read_csv("DataOutput.csv",index_col=False)

In [26]:
#List of source data URLs that the main loop requests and parses, in the order given
#For this work, these data are all sourced from the WHO 
#Each entry is one dated page under /csr/don/ for the 2015 MERS outbreak in Korea (30 May to 9 June)
urls=[
"https://www.who.int/csr/don/30-may-2015-mers-korea/en/",
"https://www.who.int/csr/don/31-may-2015-mers-korea/en/",
"https://www.who.int/csr/don/01-june-2015-mers-korea/en/",
"https://www.who.int/csr/don/04-june-2015-mers-korea/en/",
"https://www.who.int/csr/don/05-june-2015-mers-korea/en/",
"https://www.who.int/csr/don/06-june-2015-mers-korea/en/",
"https://www.who.int/csr/don/08-june-2015-mers-korea/en/",
"https://www.who.int/csr/don/09-june-2015-mers-korea/en/"
]

In [27]:
#Unlike the measles and Ebola case studies, this use case of the data assembly algorithm does not require the use of the pageiterator function due to the simplified page style of the source text

#The WHO reports case data in one of two ways
#If there is more than one case on a given day, the data are reported as an HTML list
#If there is only one case on a given day, the data are reported as a single paragraph with no list formatting
#This detects whether or not there are multiple case entries on a page and passes the individual entries (cases) for further processing
def get_case_entries(input_text):
    """Return the text of every case entry contained in a parsed page section.

    input_text: a parsed HTML node exposing findAll/find (the main loop passes
    the BeautifulSoup element for the page's primary container).

    Returns a list of strings: one per <li> element when the page uses a list,
    otherwise a one-element list holding the text of the first <span> that
    follows the 'section_head1' heading. Prints which layout was detected.
    """
    list_items = input_text.findAll("li")
    if not list_items:
        # Single-case pages have no list; the case text sits after the section heading
        print("NO LIST")
        heading = input_text.find('h3',{'class':'section_head1'})
        return [heading.findNext('span').text]
    print("List Found")
    return [entry.text for entry in list_items]

#Because this work pulls individual-level data for each case, this defines the first entry UUID
#The main loop writes casenum into the 'uuid' column and increments it after every case
#NOTE(review): the counter always starts at 1, even if DataOutput.csv already holds rows - confirm the script is only meant to run against a freshly reset CSV
casenum=1
    
#For this work, pull data for the field "sex"
#For this work, pull data for the field "sex"
def get_field_two(mytext):
    """Return "female" or "male" for the case described in mytext.

    The first sex descriptor that follows the word "old" (as in "68 year-old
    male") decides the result: "female"/"woman" give "female", "male"/"man"
    give "male". Text with no recognizable descriptor returns "male", which
    is the default the original implementation used.
    """
    # "man" is matched only as a whole word, so "54-year-old man" is resolved at the
    # patient's own descriptor instead of at a later mention of a woman or female contact
    regex = r"old.*?(male|female|woman|\bman\b)"
    match = re.search(regex,mytext)
    if match and match.group(1) in ("female","woman"):
        return("female")
    # Covers "male", "man" and the no-match default
    return("male")

#For this work, pull data for the field "age"
def get_field_three(mytext):
    """Return the patient's age as a string of digits taken from mytext.

    Recognizes forms such as "68 year-old", "69-year-old", "68 year old" and
    "7 years old". Returns "No Age Found" when the text holds no such phrase,
    matching the sentinel style of the other field extractors instead of
    raising IndexError and stopping the whole run.
    """
    # At least one digit is required so that a phrase like "a year-old" cannot yield an empty age
    regex = r"([0-9]+).years?[- ]?old"
    match = re.search(regex,mytext)
    if match:
        return(match.group(1))
    return("No Age Found")

#For this work, pull data for the field "date of symptoms"
def get_field_four(mytext):
    """Return the symptom-onset date (e.g. "11 May") found in mytext.

    Two phrasings are tried in order:
      1. "developed symptoms ... on <day> <month>"
      2. "from <day> <month> ... developed symptoms"
    Returns "No Symptoms Development Date Found" when neither matches.
    """
    regex_1 = r"(developed symptoms.*?on )([+-]?[0-9]*[,]?[0-9]+ (\w+))"
    regex_2 = r"from ([+-]?[0-9]*[,]?[0-9]+ (\w+)).*(developed symptoms)"
    match = re.search(regex_1,mytext)
    if match:
        # Group 2 is the full "<day> <month>" date in the first pattern
        return(match.group(2))
    match = re.search(regex_2,mytext)
    if match:
        # In the second pattern the full date is group 1; group 2 is only the month word,
        # which is what the previous index returned
        return(match.group(1))
    return("No Symptoms Development Date Found")

#For this work, pull data for the field "date of diagnosis"
def get_field_five(mytext):
    """Return the diagnosis date (e.g. "20 May") found in mytext.

    Looks for "tested positive ... on <day> <month>" and returns the date part.
    Returns "No Tested Positive Date Found" when the phrase is absent.
    """
    regex_1 = r"(tested positive.*?on )([+-]?[0-9]*[,]?[0-9]+ (\w+))"
    match = re.search(regex_1,mytext)
    if match:
        # Group 2 is the full "<day> <month>" date
        return(match.group(2))
    return("No Tested Positive Date Found")
    
#For this work, pull data for the field "health care worker status"
def get_field_six(mytext):
    """Classify the case as "health care worker" or "not health care worker".

    The case counts as a health care worker when mytext contains either
    "health professional" or "health worker".
    """
    regex = r"(health professional|health worker)"
    mentions_health_role = re.search(regex,mytext) is not None
    return "health care worker" if mentions_health_role else "not health care worker"


In [31]:
# This is where the main execution of the script happens; a for loop runs through the list of URLs, makes requests to the pages, populates the table based on results, and reports each entry as the script iterates  
# New rows are collected in a list and added to the table in one step after the loop;
# DataFrame.append was removed in pandas 2.0 and rebuilt the whole table for every row
new_rows=[]
for url in urls: 
    req_object=requests.get(url,timeout=30)
    # Stop with a clear HTTP error instead of trying to parse an error page
    req_object.raise_for_status()
    # The raw bytes are handed to BeautifulSoup so that it detects the page encoding itself;
    # decoding through the response's guessed text encoding garbled non-ASCII characters
    # (such as dashes) in the case descriptions. The parser is named explicitly so the
    # result does not depend on which optional parsers happen to be installed.
    soup=BeautifulSoup(req_object.content,"html.parser")
    container = soup.find('div',{"id":"primary"})
    for case in get_case_entries(container):
        print(f" ---\n {case} \n --- \n")
        new_rows.append({'uuid':casenum,'sex':get_field_two(case),'age':get_field_three(case),'date of symptoms':get_field_four(case),'date of diagnosis':get_field_five(case),'health care worker status':get_field_six(case),'case_text':case,'source_url':url})
        casenum+=1
    # Short pause between HTTP requests to avoid rate limiting
    time.sleep(1)

# Add all newly extracted cases to the table that was loaded from the CSV
if new_rows:
    df=pd.concat([df,pd.DataFrame(new_rows)],ignore_index=True)

List Found
 ---
 The first case was reported in the DON of 24 May. The patient is a 68 year-old male with the following travel history: 18-29 April, Bahrain; 29-30 April, United Arab Emirates; 30 April to 1 May, Bahrain; 1-2 May, the Kingdom of Saudi Arabia; 2 May, Bahrain; and 2-3 May, Qatar. He arrived at Incheon International airport via Qatar on 4 May. The patient was asymptomatic on arrival. He developed symptoms on 11 May and sought medical care at a local clinic from 12 to 15 May. The patient was then admitted to hospital on 15 May and discharged on 17 May. On the day of discharge, he went to another clinic. Between 17 and 20 May, the patient visited another hospital. He tested positive for MERS-CoV on 20 May and was transferred to the nationally designated treatment facility for isolation. Currently, the patient is in stable condition. He has no history of exposure to known risk factors. Investigation of the source of infection is ongoing.  
 --- 

 ---
 The second case was rep

List Found
 ---
 A 69-year-old male developed symptoms on 1 June while admitted to hospital for an unrelated medical condition since 28 May. The patient shared the room with a laboratory-confirmed MERS-CoV case that was reported in a previous DON on 4 June (case n. 1). He tested positive for MERS-CoV on 3 June. 
 --- 

 ---
 A 54-year-old man developed symptoms on 29 May. After receiving medical care, he did not experience further symptoms. On 15, 22 and 29 May, the patient visited his mother who is a laboratory-confirmed MERS-CoV case that was reported in a previous DON on 30 May (case n. 10). He tested positive for MERS-CoV on 3 June. 
 --- 

 ---
 A 47-year-old male developed symptoms on 21 May. He sought medical care at different health facilities before being admitted to hospital on 1 June. The patient is a friend of a laboratory-confirmed MERS-CoV case (case 2 â see above). He visited his friend's mother on 15 May. The patient, who has no comorbidities, tested positive for MERS

 ---
 A 55-year-old male developed symptoms on 2 June. Between 27 and 28 May, the patient was admitted to hospital in the same zone as a laboratory-confirmed MERS-CoV case between 27 and 28 May. He was isolated at the hospital on 5 June and tested positive for MERS-CoV on 6 June. 
 --- 

 ---
 A 32-year-old, male health worker developed symptoms on 30 May. Between 28 and 29 May, the patient visited the emergency room of a hospital that reported several MERS-CoV cases. He tested positive for MERS-CoV on 6 June. Further investigation is ongoing. 
 --- 

 ---
 A 58-year-old female developed symptoms on 3 June. Between 27 and 28 May, the patient visited the emergency room of a hospital that reported several MERS-CoV cases. She started self-isolation at home on 31 May and tested positive for MERS-CoV on 6 June. 
 --- 

 ---
 A 75-year-old male developed symptoms on 4 June. Between 27 and 29 May, the patient visited the emergency room of a hospital that reported several MERS-CoV cases. Durin

In [29]:
# This simple block renders the content of the table after the script runs
# (a bare expression on the last line of a cell is shown as the cell's output)
df

Unnamed: 0,uuid,sex,age,date of symptoms,date of diagnosis,health care worker status,case_text,source_url
0,1,male,68,11 May,20 May,not health care worker,The first case was reported in the DON of 24 M...,https://www.who.int/csr/don/30-may-2015-mers-k...
1,2,female,64,18 May,20 May,not health care worker,The second case was reported in the DON of 24 ...,https://www.who.int/csr/don/30-may-2015-mers-k...
2,3,male,76,20 May,No Tested Positive Date Found,not health care worker,The third case was reported in the DON of 24 M...,https://www.who.int/csr/don/30-may-2015-mers-k...
3,4,female,46,25 May,25 May,not health care worker,The fourth case is a 46 year-old female who is...,https://www.who.int/csr/don/30-may-2015-mers-k...
4,5,male,50,25 May,26 May,health care worker,"The fifth case is a 50 year-old, male health p...",https://www.who.int/csr/don/30-may-2015-mers-k...
...,...,...,...,...,...,...,...,...
58,59,female,37,1 June,6 June,health care worker,"A 37-year-old, female health worker developed ...",https://www.who.int/csr/don/09-june-2015-mers-...
59,60,male,55,2 June,6 June,not health care worker,A 55-year-old male developed symptoms on 2 Jun...,https://www.who.int/csr/don/09-june-2015-mers-...
60,61,male,32,30 May,6 June,health care worker,"A 32-year-old, male health worker developed sy...",https://www.who.int/csr/don/09-june-2015-mers-...
61,62,female,58,3 June,6 June,not health care worker,A 58-year-old female developed symptoms on 3 J...,https://www.who.int/csr/don/09-june-2015-mers-...


In [30]:
#Write the assembled table to both CSV and Excel; index=False leaves the row index out of the files
#DataOutput.csv is the same file that is read back in at the top of the notebook
df.to_csv("DataOutput.csv",index=False)
df.to_excel("DataExcel.xlsx",index=False)

### Only execute the below to reset the CSV

Uncomment the code before running

In [23]:
#df=pd.DataFrame(columns=["uuid","sex","age","date of symptoms","date of diagnosis","health care worker status","case_text","source_url"])
#df.to_csv("DataOutput.csv",index=False)
#df.to_excel("DataExcel.xlsx",index=False)