# Measles Data Assembly Script
*Maimuna S. Majumder*

In [2]:
# This script utilizes the following libraries: 
# * requests, for making HTTP requests and pulling HTML content 
# * pandas, for organizing the data into an easily manipulatable table and then exporting said table 
# * regular expressions (re), for finding the relevant content on the source data page 
# * time, to suspend the script for a short period of time between each HTTP request to avoid rate limiting
# * sys, for error tracking
import requests,pandas as pd,re,time,sys
from bs4 import BeautifulSoup
from datetime import timedelta
import datetime

In [48]:
#Read the CSV as it exists currently
#parse_dates keeps the "date" column as real timestamps; without it the dates come back as plain strings,
#and adding one day to the last recorded date (done when new rows are appended) raises a TypeError
df=pd.read_csv("DataOutput.csv",index_col=False,parse_dates=["date"])

In [54]:
#List of source data URLs to have the script run on 
#For this work, these data are all sourced from archived tweets via the Samoan Government's Twitter account
urls=[
"http://web.archive.org/web/20201210232948/https://twitter.com/samoagovt/status/1197790948178051074",
"http://web.archive.org/web/20201210233007/https://twitter.com/samoagovt/status/1198133748312629248",
"http://web.archive.org/web/20191127142955/https://twitter.com/samoagovt/status/1198810318136176641",
"http://web.archive.org/web/20201210233135/https://twitter.com/samoagovt/status/1199106622083063808",
"http://web.archive.org/web/20201210233155/https://twitter.com/samoagovt/status/1199490365130076160",
"http://web.archive.org/web/20201210233211/https://twitter.com/samoagovt/status/1199844462479822848",
"http://web.archive.org/web/20201210233315/https://twitter.com/samoagovt/status/1200179578745917442",
"http://web.archive.org/web/20201210232349/https://twitter.com/samoagovt/status/1200591914031927296",
"http://web.archive.org/web/20201125024235/https://twitter.com/samoagovt/status/1200923810439946240",
"http://web.archive.org/web/20201125024433/https://twitter.com/samoagovt/status/1201285534368190464",
"http://web.archive.org/web/20201210233617/https://twitter.com/samoagovt/status/1201637336515112960",
"http://web.archive.org/web/20201210233709/https://twitter.com/samoagovt/status/1201979349374619648",
"http://web.archive.org/web/20201125030102/https://twitter.com/samoagovt/status/1202335691310391296",
"http://web.archive.org/web/20201210233943/https://twitter.com/samoagovt/status/1202707973048418304",
"http://web.archive.org/web/20201210234054/https://twitter.com/samoagovt/status/1203077866646167552",
"http://web.archive.org/web/20201210234205/https://twitter.com/samoagovt/status/1203604804997566464",
"http://web.archive.org/web/20201210234328/https://twitter.com/samoagovt/status/1203793768182235136"
]

In [55]:
# pageiterator searches the parsed page for the regular expression that is passed in
def pageiterator(regex,soup):
    """Return all matches of ``regex`` from the first page element whose text contains one.

    The tweet-text paragraphs are examined first; only if none of them match are
    the page's ``div`` elements examined. Non-breaking spaces (``\\xa0``) are
    replaced by ordinary spaces before matching.

    Parameters:
        regex: pattern handed to ``re.findall``.
        soup: parsed page exposing ``findAll`` (e.g. a BeautifulSoup object).

    Returns:
        The ``re.findall`` result for the first matching element. If nothing
        matches, a one-element list holding the placeholders "not found",
        "not found 2" and "not found 3", so callers can index it like a match.
    """
    tweet_selector=("p",{"class":"TweetTextSize TweetTextSize--jumbo js-tweet-text tweet-text"})
    fallback_selector=("div",)
    for selector in (tweet_selector,fallback_selector):
        for element in soup.findAll(*selector):
            cleaned_text=element.text.replace(u'\xa0', u' ') # Swap non-breaking spaces for plain spaces
            matches=re.findall(regex,cleaned_text)
            if matches:
                return matches
    # Placeholder shaped like a single match so that indexing by callers still works
    return [["not found","not found 2","not found 3"]]

#For this work, pull data for the field "cumulative cases"
def get_field_two(mytext):
    """Return the number that precedes " measles cases" on the parsed page.

    ``mytext`` is the parsed page passed straight through to ``pageiterator``.
    The result is the first captured group of the first match, as a string
    ("not found" when pageiterator returns its placeholder).
    """
    cumulative_cases_pattern = r"([+-]?[0-9]*[,]?[0-9]+)( measles cases)"
    first_match = pageiterator(cumulative_cases_pattern, mytext)[0]
    return next(iter(first_match))

#For this work, pull data for the field "incident cases", which includes only cases that have been recorded in last 24 hours
def get_field_three(mytext):
    """Return the number that precedes " recorded in the last 24 hours" on the parsed page.

    ``mytext`` is the parsed page passed straight through to ``pageiterator``.
    The result is the first captured group of the first match, as a string
    ("not found" when pageiterator returns its placeholder).
    """
    incident_cases_pattern = r"([+-]?[0-9]*[,]?[0-9]+)( recorded in the last 24 hours)"
    first_match = pageiterator(incident_cases_pattern, mytext)[0]
    return next(iter(first_match))

#For this work, pull data for the field "cumulative deaths"
def get_field_four(mytext):
    """Return the number that precedes " measles related deaths" on the parsed page.

    ``mytext`` is the parsed page passed straight through to ``pageiterator``.
    The result is the first captured group of the first match, as a string
    ("not found" when pageiterator returns its placeholder).
    """
    cumulative_deaths_pattern = r"([+-]?[0-9]*[,]?[0-9]+)( measles related deaths)"
    first_match = pageiterator(cumulative_deaths_pattern, mytext)[0]
    return next(iter(first_match))

In [56]:
# This is where the main execution of the script happens; a for loop runs through the list of URLs, makes requests to the pages, collects one table row per page, and reports each URL as the script iterates
# Dates are assigned by position: each successfully processed URL gets the day after the previous row
# NOTE(review): a URL that fails is skipped, so the URLs after it receive dates one day early; check stderr after every run
one_day=pd.to_timedelta(1, unit='D')
if(df.empty): #If the table is completely empty, this script starts populating the table with the first date of reporting (2019-11-22)
    next_date=pd.Timestamp(2019,11,22)
else:
    # Dates reloaded from DataOutput.csv can be plain strings; convert them so that the
    # date arithmetic works and the column keeps a single datetime dtype after new rows are added
    df=df.assign(date=pd.to_datetime(df['date']))
    next_date=df['date'].iloc[-1]+one_day

new_rows=[] # Rows are collected here and added in one step (DataFrame.append no longer exists in pandas 2.x)
for url in urls:
    try:
        req_object=requests.get(url,timeout=30) # A timeout keeps one stalled request from hanging the whole run
        soup=BeautifulSoup(req_object.text,"html.parser") # Name the parser so results do not depend on which parsers are installed
        new_rows.append({'date':next_date,'cumulative_cases':get_field_two(soup),'incident_cases':get_field_three(soup),'cumulative_deaths':get_field_four(soup),'source_url':url})
    except Exception as e:
        # Still best-effort (the loop moves on to the next URL), but the failure is reported instead of being silently discarded
        print("Skipping {}: {!r}".format(url,e),file=sys.stderr)
        continue
    next_date=next_date+one_day
    print(url)
    time.sleep(1) # Pause between requests to avoid rate limiting

if new_rows:
    scraped=pd.DataFrame(new_rows)
    # Starting from the scraped rows when the table is empty avoids concatenating onto an empty frame, which can degrade column dtypes
    df=scraped if df.empty else pd.concat([df,scraped],ignore_index=True)

http://web.archive.org/web/20201210232948/https://twitter.com/samoagovt/status/1197790948178051074
http://web.archive.org/web/20201210233007/https://twitter.com/samoagovt/status/1198133748312629248
http://web.archive.org/web/20191127142955/https://twitter.com/samoagovt/status/1198810318136176641
http://web.archive.org/web/20201210233135/https://twitter.com/samoagovt/status/1199106622083063808
http://web.archive.org/web/20201210233155/https://twitter.com/samoagovt/status/1199490365130076160
http://web.archive.org/web/20201210233211/https://twitter.com/samoagovt/status/1199844462479822848
http://web.archive.org/web/20201210233315/https://twitter.com/samoagovt/status/1200179578745917442
http://web.archive.org/web/20201210232349/https://twitter.com/samoagovt/status/1200591914031927296
http://web.archive.org/web/20201125024235/https://twitter.com/samoagovt/status/1200923810439946240
http://web.archive.org/web/20201125024433/https://twitter.com/samoagovt/status/1201285534368190464
http://web

In [58]:
# Render the content of the table after the script runs so the scraped values can be inspected
# Fields that the regular expressions could not locate on a page show up as "not found"
df

Unnamed: 0,date,cumulative_cases,incident_cases,cumulative_deaths,source_url
0,2019-11-22,1644,202,20,http://web.archive.org/web/20201210232948/http...
1,2019-11-23,1797,153,22,http://web.archive.org/web/20201210233007/http...
2,2019-11-24,2194,144,25,http://web.archive.org/web/20191127142955/http...
3,2019-11-25,2437,243,32,http://web.archive.org/web/20201210233135/http...
4,2019-11-26,2686,249,33,http://web.archive.org/web/20201210233155/http...
5,2019-11-27,2936,250,39,http://web.archive.org/web/20201210233211/http...
6,2019-11-28,3149,213,42,http://web.archive.org/web/20201210233315/http...
7,2019-11-29,not found,not found,not found,http://web.archive.org/web/20201210232349/http...
8,2019-11-30,not found,not found,not found,http://web.archive.org/web/20201125024235/http...
9,2019-12-01,not found,not found,not found,http://web.archive.org/web/20201125024433/http...


In [7]:
#Send dataframe to a CSV and Excel file
#Both files are overwritten on every run; index=False leaves the row index out of the output
df.to_csv("DataOutput.csv",index=False)
df.to_excel("DataExcel.xlsx",index=False)

### Only execute the below to reset the CSV
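Uncomment the code in the cell below before running it. Doing so overwrites DataOutput.csv and DataExcel.xlsx with empty tables that contain only the column headers.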

In [53]:
#df=pd.DataFrame(columns=["date","cumulative_cases","incident_cases","cumulative_deaths", "source_url"])
#df.to_csv("DataOutput.csv",index=False)
#df.to_excel("DataExcel.xlsx",index=False)