# Script for "A Data Scraping Algorithm for Modeling Ebola Virus Disease in the Eastern DR Congo"
*Maimuna S. Majumder*

In [1]:
# This script utilizes the following libraries: 
# * requests, for making HTTP requests and pulling HTML content 
# * pandas, for organizing the data into an easily manipulatable table and then exporting said table 
# * regular expressions (re), for finding the relevant content on the source data page 
# * time, to suspend the script for a short period of time between each HTTP request to avoid rate limiting
# * sys, for error tracking
import requests,pandas as pd,re,time,sys
from bs4 import BeautifulSoup
from datetime import timedelta
import datetime

In [2]:
#Read the CSV as it exists currently (rows written by earlier runs of this notebook)
#parse_dates restores the "date" column to timestamps; read back as plain strings, the
#"last date + 1 day" arithmetic used when adding new rows would raise a TypeError
df=pd.read_csv("DataOutput.csv",index_col=False,parse_dates=["date"])

In [3]:
#List of source data URLs to have the script run on, in chronological order.
#For this work, these data are all sourced from Ministère de la Santé RDC
#NOTE: the scraping loop dates each row one day after the previous one, so the order of this list
#(one entry per consecutive day) determines the date assigned to every row.
urls=[
    "https://mailchi.mp/70213f4262fb/ebola_kivu_6aout",
    "https://mailchi.mp/526091c2d161/ebola_kivu_7aout",
    "https://mailchi.mp/80a23d454958/ebola_kivu_8aout",
    "https://mailchi.mp/4cc2513a9f7d/ebola_kivu_9aout",
    "https://mailchi.mp/68ac0a236b26/ebola_kivu_10aout",
    "https://mailchi.mp/17136ac871cd/ebola_kivu_11aout",
    "https://mailchi.mp/5740eb659e90/ebola_kivu_12aout",
    "https://mailchi.mp/de936e52fb17/ebola_kivu_13aout",
    "https://mailchi.mp/37b7545e3544/ebola_kivu_14aout",
    "https://mailchi.mp/96767bcd0575/ebola_kivu_15aout",
    "https://mailchi.mp/2d5b7a2d510a/ebola_kivu_16aout",
    "https://mailchi.mp/74a121fdd538/ebola_kivu_17aout",
    "https://mailchi.mp/758eb254afd1/ebola_kivu_18aout",
    "https://mailchi.mp/231176da338d/ebola_kivu_19aout",
    "https://mailchi.mp/bad94a83ea28/ebola_kivu_20aout",
    "https://mailchi.mp/58d0398606a4/ebola_kivu_21aout",
    "https://mailchi.mp/9abcbef79b2c/ebola_kivu_22aout",
    "https://mailchi.mp/ad8db275a17f/ebola_kivu_23aout",
    "https://mailchi.mp/970b172dfebc/ebola_kivu_24aout",
    "https://mailchi.mp/c9c7043a5a23/ebola_kivu_25aout",
    "https://mailchi.mp/9feedcd3907b/ebola_kivu_26aout",
    "https://mailchi.mp/4b1528882fd6/ebola_kivu_27aout",
    "https://mailchi.mp/49dc88255f69/ebola_kivu_28aout",
    "https://mailchi.mp/aa63872541fd/ebola_kivu_29aout",
    "https://mailchi.mp/1012b27227bc/ebola_kivu_30aout",
    "https://mailchi.mp/230a80c1c317/ebola_kivu_31aout",
    "https://mailchi.mp/221fb6ed0a42/ebola_kivu_1sept",
    "https://mailchi.mp/fe4fb528194b/ebola_kivu_2sept",
    "https://mailchi.mp/f0e9bc8d9e2e/ebola_kivu_3sept",
    "https://mailchi.mp/b54b63c3f920/ebola_kivu_4sept",
    "https://mailchi.mp/534570505aa4/ebola_kivu_5sept-erratum",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=540896bcd7",
    "https://mailchi.mp/d8019cbd3724/ebola_kivu_7sept",
    "https://mailchi.mp/b0c4027473ad/ebola_kivu_8sept",
    "https://mailchi.mp/363955d77983/ebola_kivu_9sept",
    "https://mailchi.mp/196210a9e992/ebola_kivu_10sept",
    "https://mailchi.mp/ff1463ead572/ebola_kivu_11sept",
    "https://mailchi.mp/1644c6a622ef/ebola_kivu_12sept",
    "https://mailchi.mp/c58e80fafe6f/ebola_kivu_13sept",
    "https://mailchi.mp/820051e5d093/ebola_kivu_14sept",
    "https://mailchi.mp/2e6744c50cf5/ebola_kivu_15sept",
    "https://mailchi.mp/b9f8278091fc/ebola_kivu_16sept",
    "https://mailchi.mp/822fafac8a55/ebola_kivu_17sept",
    "https://mailchi.mp/eec05154b8e8/ebola_kivu_18sept",
    "https://mailchi.mp/695bb330c6a4/ebola_kivu_19sept",
    "https://mailchi.mp/38deffc31ebf/ebola_kivu_20sept",
    "https://mailchi.mp/4bc95ae1768f/ebola_kivu_21sept",
    "https://mailchi.mp/9de66c89895d/ebola_kivu_22sept",
    "https://mailchi.mp/ca153def9618/ebola_kivu_23sept",
    "https://mailchi.mp/efc69f724562/ebola_kivu_24sept",
    "https://mailchi.mp/13493f87482a/ebola_kivu_25sept",
    "https://mailchi.mp/fe4b2f287c8d/ebola_kivu_26sept",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_27sept",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_28sept",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_29sept",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_30sept",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_1oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_2oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_3oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_4oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_5oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_6oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_7oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_8oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_9oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_10oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_11oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_12oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_13oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_14oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_15oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_16oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_17oct",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_18oct",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=70a29bcd89",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=5b0b3867b3",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=b65abcbd17",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=3b2c72b6c8",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=54842c0f1b",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=cd46533fcd",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=c92e4270c7",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=ef9cd2d167",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=9391cd7277",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=b7dfaf6885",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=80be8b0096",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=9c61f6f046",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=eb02714878",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=14196c9020",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=8674d4f7a6",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=baaa79a3de",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=4a9a8054b7",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=42c7af57f7",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=a79161bd59",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=4b5beec831",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=16a2ac3656",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=5458d53cfd",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=4b9848fdef",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=3594a8eb5d",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=3265aed53a",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=dccfabc433",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=5888dc1b5c",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=73b0a7cf24",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=fddf300d12",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=d35154a642",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=5794bf500d",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=133e4b89be",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=97b8d385d1",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=48622b0407",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=e1cf50698c",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=104902cbb9",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=affd57dff1",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=97a997e756",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=248cc081d9",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=55672db117",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=f00e29e648",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=bc5bddcafc",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=63263cf3e0",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=2c6e58f6f7",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=5ffd88f6ad",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=af4f77c907",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=35f603522f",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=e073c72968",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=8a505d9003",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=b7046ae344",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=35b29a9d1c",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=3c3aa04454",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=1779054324",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=04141c7428",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=0da99fb326",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=b19df2d61f",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=4b9dc5a5d9",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=9a337de507",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=dfc4e1bb7b",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=089b5462c7",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=89a4ad0ad2",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=c2608bba60",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=4ea3e9cc12",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=20fedfa6d0",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=900777b642",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=abef727b19",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=8511854d94",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=9958aef9a2",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=b3df097203",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=4ff93ad782",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=a405d11562",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=64b82ce1d7",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=39662f9103",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=f3a40bf680",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=eafcdad12e",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=e297e9c9be",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=d860e4d980",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=43d9c741b2",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=5452b8674b",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=fd4d70a646",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=7af14fa080",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=30f733dd94",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=83a6fcb988",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=e3acf00149",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_11jan19",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=922767f361",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=ca8b2dfd50",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=102ec4a195",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=89e1804c7f",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=1b813bd576",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=938227cc36",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=45b7929f20",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=2b3d226b45",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=2083b3391c",
    "http://eepurl.com/geLB1j",
    "http://eepurl.com/geTokb",
    "http://eepurl.com/ge0801",
    "http://eepurl.com/ge8971",
    "http://eepurl.com/gffeFL",
    "http://eepurl.com/gfjwTL",
    "http://eepurl.com/gfnr-H",
    "http://eepurl.com/gfuDmr",
    "http://eepurl.com/gfCgi9",
    "http://eepurl.com/gfL1OD",
    "http://eepurl.com/gfUGA1",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=377ff1d6b7",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=3115455c58",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=aea9304842",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=bc3267bb6a",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=c88e0bb48e",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=978bf2e04b",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=03dda59271",
    "http://eepurl.com/ggOk61",
    "http://eepurl.com/ggRdIP",
    "http://eepurl.com/ggUjtj",
    "http://eepurl.com/gg2Wof",
    "http://eepurl.com/gg-Oiv",
    "http://eepurl.com/ghgFND",
    "http://eepurl.com/gho8ab",
    "http://eepurl.com/ghxh-D",
    "http://eepurl.com/ghAB9b",
    "http://eepurl.com/ghDoKf",
    "http://eepurl.com/ghK6Fr",
    "http://eepurl.com/ghTapH",
    "http://eepurl.com/gh0kT5",
    "http://eepurl.com/gh9EG1",
    "http://eepurl.com/gie-yP",
    "http://eepurl.com/gii1Rr",
    "http://eepurl.com/gimwsL",
    "http://eepurl.com/giutPn",
    "http://eepurl.com/giEHkb",
    "http://eepurl.com/giQwgX",
    "http://eepurl.com/gi0sNT",
    "http://eepurl.com/gi-JKX",
    "http://eepurl.com/gjdreT",
    "http://eepurl.com/gjgRuX",
    "http://eepurl.com/gjqaeL",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_5mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_6mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_7mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_8mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_9mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_10mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_11mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_12mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_13mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_14mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_15mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_16mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_17mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_18mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_19mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_20mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_21mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_22mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_23mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_24mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_25mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_26mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_27mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_28mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_29mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_30mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_31mar19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_1avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_2avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_3avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_4avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_5avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_6avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_7avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_8avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_9avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_10avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_11avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_12avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_13avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_14avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_15avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_16avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_17avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_18avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_19avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_20avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_21avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_22avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_23avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_24avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_25avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_26avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_27avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_28avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_29avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_30avr19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_1mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_2mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_3mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_4mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_5mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_6mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_7mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_8mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_9mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_10mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_11mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_12mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_13mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_14mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_15mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_16mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_17mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_18mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_19mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_20mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_21mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_22mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_23mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_24mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_25mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_26mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_27mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_28mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_29mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_30mai19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_31mai19",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=5e35fa33ba",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=f056f28558",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=307c1f0c6f",
    "http://eepurl.com/gtSRl1",
    "http://eepurl.com/gtZH3L",
    "http://eepurl.com/gt7YnH",
    "http://eepurl.com/guc9V5",
    "http://eepurl.com/gugiU9",
    "http://eepurl.com/guic8H",
    "http://eepurl.com/guo1hj",
    "http://eepurl.com/guxoy9",
    "http://eepurl.com/guE1DP",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_13juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_14juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_15juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_16juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_17juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_18juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_19juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_20juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_21juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_22juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_23juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_24juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_25juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_26juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_27juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_28juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_29juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_30juin19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_01juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_02juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_03juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_04juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_05juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_06juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_07juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_08juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_09juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_10juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_11juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_12juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_13juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_14juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_15juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_16juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_17juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_18juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_19juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_20juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_21juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_22juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_23juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_24juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_25juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_26juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_27juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_28juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_29juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_30juil19",
    "https://mailchi.mp/sante.gouv.cd/ebola_kivu_31juil19",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=f8f3462d42",
    "https://us13.campaign-archive.com/?u=89e5755d2cca4840b1af93176&id=a28b56946a"
]

In [4]:
# pageiterator scans the text of the page's <li> elements, then its <div> elements, for the regular expression that is passed in
def pageiterator(regex,soup):
    """Return every match of `regex` in the first page element whose text matches.

    All `li` elements are tried first; `div` elements are only consulted when no
    `li` matched. Non-breaking spaces (\\xa0) are turned into ordinary spaces
    before matching.

    Returns the `re.findall` result for the first matching element. When nothing
    matches, returns the placeholder [["not found","not found 2","not found 3"]],
    so that callers indexing into the first "match" get a "not found" label that
    is numbered by the position they asked for.
    """
    for tag_name in ("li","div"):
        for element in soup.findAll(tag_name):
            cleaned_text=element.text.replace(u'\xa0', u' ') # swap non-breaking spaces for plain ones
            matches=re.findall(regex,cleaned_text)
            if matches:
                return matches
    return [["not found","not found 2","not found 3"]]


# If the content we're looking for is stored in a table (the source pages occasionally format content via HTML tables), this function finds the number inside the table instead of inside a list
def tableiterator(soup):
    """Read an integer from the page's image-card tables.

    Looks at the first <strong> inside the first `td.mcnTextContent` of the first
    `table.mcnImageCardBottomContent`, strips non-breaking and ordinary spaces,
    and converts the remainder to int. If that text is not an integer, the same
    lookup is repeated on the second such table.

    Raises IndexError when an expected table/td/strong element is missing, and
    ValueError when the second table's text is not an integer either.
    """
    def _card_number(position):
        # Number shown in the image card at `position` (0-based) on the page.
        card=soup.findAll("table",{"class":"mcnImageCardBottomContent"})[position]
        cell=card.findAll('td',{"class":"mcnTextContent"})[0]
        strong=cell.findAll('strong')[0]
        # Spaces of both kinds are removed before the conversion to int.
        digits=strong.text.replace(u'\xa0', u' ').replace(u' ','')
        return int(digits)

    try:
        return _card_number(0)
    except ValueError:
        # Only a failed int() conversion triggers the fallback to the second card.
        return _card_number(1)

#For this work, pull data for the field "confirmed cumulative cases"
def get_field_two(mytext):
    """Extract the confirmed cumulative case count from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern covers
    two alternative phrasings; the first non-empty captured group of the first
    match is returned with "." replaced by ",". Returns "not found" when
    pageiterator finds no match.
    """
    regex = r"(?:le cumul des cas est de [+-]?[0-9]*[.]?[0-9]+, dont )([+-]?[0-9]*[.]?[0-9]+)|(?:ont été signalés dans la région, dont) ([+-]?[0-9]*[.]?[0-9]+) (?:confirmés)"
    first_hit=pageiterator(regex,mytext)[0]
    # Only one alternative of the pattern matches, so skip the empty group(s).
    captured=next(filter(None,first_hit))
    return captured.replace(".",",")

#For this work, pull data for the field "total cumulative cases", which includes confirmed AND probable cases
def get_field_three(mytext):
    """Extract the total cumulative case count from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern covers
    two alternative phrasings; the first non-empty captured group of the first
    match is returned with "." replaced by ",". Returns "not found" when
    pageiterator finds no match.
    """
    regex = r"(?:le cumul des cas est de) ([+-]?[0-9]*[.]?[0-9]+)|([+-]?[0-9]*[.]?[0-9]+) (?:cas de fièvre hémorragique)"
    first_hit=pageiterator(regex,mytext)[0]
    # Only one alternative of the pattern matches, so skip the empty group(s).
    captured=next(filter(None,first_hit))
    return captured.replace(".",",")

#For this work, pull data for the field "confirmed cumulative deaths"
def get_field_four(mytext):
    """Extract the confirmed cumulative death count from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern covers
    two alternative phrasings; the first non-empty captured group of the first
    match is returned with "." replaced by ",". Returns "not found" when
    pageiterator finds no match.
    """
    regex = r"(?:il y a eu [+-]?[0-9]*[.]?[0-9]+ décès \()([+-]?[0-9]*[.]?[0-9]+)|(?:les [+-]?[0-9]*[.]?[0-9]+ confirmés,) ([+-]?[0-9]*[.]?[0-9]+) (?:sont décédés)"
    first_hit=pageiterator(regex,mytext)[0]
    # Only one alternative of the pattern matches, so skip the empty group(s).
    captured=next(filter(None,first_hit))
    return captured.replace(".",",")

#For this work, pull data for the field "total cumulative deaths", which includes confirmed AND probable deaths
def get_field_five(mytext):
    """Extract the total cumulative death count from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern has two
    capturing groups (the lead-in phrase and the number); the second group of
    the first match is returned with "." replaced by ",". Because the second
    position is read, a page without a match yields "not found 2".
    """
    regex = r"(il y a eu )([+-]?[0-9]*[.]?[0-9]+)"
    first_hit=pageiterator(regex,mytext)[0]
    number=first_hit[1]
    return number.replace(".",",")

#For this work, pull data for the field "cumulative cases recovered"
def get_field_six(mytext):
    """Extract the cumulative recovered count from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern covers
    three alternative phrasings; the first non-empty captured group of the first
    match is returned with "." replaced by ",". Returns "not found" when
    pageiterator finds no match.
    """
    regex = r"([+-]?[0-9]*[.]?[0-9]+) (?:personnes guéries\.)|(?:les [+-]?[0-9]*[.]?[0-9]+ confirmés, [+-]?[0-9]*[.]?[0-9]+ sont décédés,) ([+-]?[0-9]*[.]?[0-9]+) (?:sont guéris)|(?:les [+-]?[0-9]*[.]?[0-9]+ confirmés, [+-]?[0-9]*[.]?[0-9]+ sont décédés et) ([+-]?[0-9]*[.]?[0-9]+) (?:sont guéris)"
    first_hit=pageiterator(regex,mytext)[0]
    # Only one alternative of the pattern matches, so skip the empty group(s).
    captured=next(filter(None,first_hit))
    return captured.replace(".",",")

#For this work, pull data for the field "cumulative vaccinations deployed"
def get_field_seven(mytext):
    """Extract the cumulative vaccination count from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. When the sentence
    pattern matches, the captured number is returned as a string with "."
    replaced by ",". When it does not, the image-card table is tried via
    tableiterator, which returns an int. If that lookup fails as well,
    "not found" is returned.
    """
    regex = r"(?:le début de la vaccination le [0-9] août 2018, )([+-]?[0-9]*[.]?[0-9]+) (?:personnes ont été vaccinées)"
    output=pageiterator(regex,mytext)
    if output==[["not found","not found 2","not found 3"]]:
        try:
            # Use the page that was passed in rather than the notebook-level `soup` variable.
            return(tableiterator(mytext))
        except Exception:
            # Missing table elements or non-numeric text: fall back to the "not found" label.
            return(output[0][0])
    # The pattern has a single capturing group, so each match is already a plain string.
    return(str(output[0]).replace(".",","))

#For this work, pull data for the field "cumulative vaccinations deployed in Beni"
def get_field_eight(mytext):
    """Extract the cumulative vaccination count for Beni from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern has
    three capturing groups; the middle one (the number) of the first match is
    returned with "." replaced by ",". Returns "not found" when pageiterator
    finds no match.
    """
    regex = r"(vaccinées).* ([+-]?[0-9]*[.]?[0-9]+) +(à Beni)"
    matches=pageiterator(regex,mytext)
    placeholder=[["not found","not found 2","not found 3"]]
    if matches!=placeholder:
        return matches[0][1].replace(".",",")
    return matches[0][0]

#For this work, pull data for the field "cumulative vaccinations deployed in Katwa"
def get_field_nine(mytext):
    """Extract the cumulative vaccination count for Katwa from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern has a
    single capturing group, so the first match is the number itself; it is
    returned with "." replaced by ",". Returns "not found" when pageiterator
    finds no match.
    """
    regex = r"(?:vaccinées).* ([+-]?[0-9]*[.]?[0-9]+) +(?:à Katwa)"
    matches=pageiterator(regex,mytext)
    placeholder=[["not found","not found 2","not found 3"]]
    if matches!=placeholder:
        return str(matches[0]).replace(".",",")
    return matches[0][0]

#For this work, pull data for the field "cumulative vaccinations deployed in Mabalako"
def get_field_ten(mytext):
    """Extract the cumulative vaccination count for Mabalako from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern has a
    single capturing group, so the first match is the number itself; it is
    returned with "." replaced by ",". Returns "not found" when pageiterator
    finds no match.
    """
    regex = r"(?:vaccinées).* ([+-]?[0-9]*[.]?[0-9]+) +(?:à Mabalako)"
    matches=pageiterator(regex,mytext)
    placeholder=[["not found","not found 2","not found 3"]]
    if matches!=placeholder:
        return str(matches[0]).replace(".",",")
    return matches[0][0]

#For this work, pull data for the field "cumulative vaccinations deployed in Butembo"
def get_field_eleven(mytext):
    """Extract the cumulative vaccination count for Butembo from a parsed page.

    `mytext` is passed to pageiterator as its soup argument. The pattern has a
    single capturing group, so the first match is the number itself; it is
    returned with "." replaced by ",". Returns "not found" when pageiterator
    finds no match.
    """
    regex = r"(?:vaccinées).* ([+-]?[0-9]*[.]?[0-9]+) +(?:à Butembo)"
    matches=pageiterator(regex,mytext)
    placeholder=[["not found","not found 2","not found 3"]]
    if matches!=placeholder:
        return str(matches[0]).replace(".",",")
    return matches[0][0]

In [5]:
# This is where the main execution of the script happens: every URL is requested in order, each field is extracted from the page, and one row per URL is added to the table.
# Rows are dated consecutively: the first new row is one day after the last date already in the table
# (or 2018-08-06, the first date of reporting, when the table is empty), and each following URL adds one day.

# Output column -> extractor, in the column order of the table.
FIELD_EXTRACTORS={
    'confirmed_cumulative_cases':get_field_two,
    'total_cumulative_cases':get_field_three,
    'confirmed_cumulative_deaths':get_field_four,
    'total_cumulative_deaths':get_field_five,
    'cumulative_recoveries':get_field_six,
    'cumulative_vaccinations':get_field_seven,
    'cumulative_vaccinations_beni':get_field_eight,
    'cumulative_vaccinations_katwa':get_field_nine,
    'cumulative_vaccinations_mabalako':get_field_ten,
    'cumulative_vaccinations_butembo':get_field_eleven,
}
REQUEST_TIMEOUT_SECONDS=30 # a stalled request becomes a "Bad URL" row instead of hanging the notebook
REQUEST_PAUSE_SECONDS=0    # raise this (e.g. to 1) to pause between requests if the server rate-limits

if df.empty:
    report_date=pd.Timestamp(2018,8,6)
else:
    # pd.Timestamp() also accepts the string form a date takes after a CSV round-trip.
    report_date=pd.Timestamp(df['date'].iloc[-1])+pd.Timedelta(days=1)

new_rows=[]
for url in urls:
    try:
        req_object=requests.get(url,timeout=REQUEST_TIMEOUT_SECONDS)
        # Name the parser explicitly so that parsing does not depend on which optional parsers are installed.
        soup=BeautifulSoup(req_object.text,"html.parser")
        row={'date':report_date}
        for column,extract in FIELD_EXTRACTORS.items():
            row[column]=extract(soup)
        row['source_url']=url
        print(url)
    except Exception as error:
        print("Unexpected error:", repr(error))
        print("Bad URL: "+ url)
        # Keep a placeholder row so that the dates of the following URLs stay aligned.
        row={'date':report_date}
        row.update(dict.fromkeys(FIELD_EXTRACTORS,"Bad URL"))
        row['source_url']="Bad URL"
    new_rows.append(row)
    report_date=report_date+pd.Timedelta(days=1)
    if REQUEST_PAUSE_SECONDS:
        time.sleep(REQUEST_PAUSE_SECONDS)

# DataFrame.append no longer exists in pandas 2.x; collect the rows and add them to the table in one step.
if new_rows:
    scraped=pd.DataFrame(new_rows)
    df=scraped if df.empty else pd.concat([df,scraped],ignore_index=True)

https://mailchi.mp/70213f4262fb/ebola_kivu_6aout
https://mailchi.mp/526091c2d161/ebola_kivu_7aout
https://mailchi.mp/80a23d454958/ebola_kivu_8aout
https://mailchi.mp/4cc2513a9f7d/ebola_kivu_9aout
https://mailchi.mp/68ac0a236b26/ebola_kivu_10aout
https://mailchi.mp/17136ac871cd/ebola_kivu_11aout
https://mailchi.mp/5740eb659e90/ebola_kivu_12aout
https://mailchi.mp/de936e52fb17/ebola_kivu_13aout
https://mailchi.mp/37b7545e3544/ebola_kivu_14aout
https://mailchi.mp/96767bcd0575/ebola_kivu_15aout
https://mailchi.mp/2d5b7a2d510a/ebola_kivu_16aout
https://mailchi.mp/74a121fdd538/ebola_kivu_17aout
https://mailchi.mp/758eb254afd1/ebola_kivu_18aout
https://mailchi.mp/231176da338d/ebola_kivu_19aout
https://mailchi.mp/bad94a83ea28/ebola_kivu_20aout
https://mailchi.mp/58d0398606a4/ebola_kivu_21aout
https://mailchi.mp/9abcbef79b2c/ebola_kivu_22aout
https://mailchi.mp/ad8db275a17f/ebola_kivu_23aout
https://mailchi.mp/970b172dfebc/ebola_kivu_24aout
https://mailchi.mp/c9c7043a5a23/ebola_kivu_25aout
http

In [6]:
# Render the table after the scraping loop has run (the bare expression on the last line of the cell is what gets displayed)
df

Unnamed: 0,date,confirmed_cumulative_cases,total_cumulative_cases,confirmed_cumulative_deaths,total_cumulative_deaths,cumulative_recoveries,cumulative_vaccinations,cumulative_vaccinations_beni,cumulative_vaccinations_katwa,cumulative_vaccinations_mabalako,cumulative_vaccinations_butembo,source_url
0,2018-08-06,16,43,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/70213f4262fb/ebola_kivu_6aout
1,2018-08-07,16,43,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/526091c2d161/ebola_kivu_7aout
2,2018-08-08,17,44,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/80a23d454958/ebola_kivu_8aout
3,2018-08-09,17,44,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/4cc2513a9f7d/ebola_kivu_9aout
4,2018-08-10,21,48,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/68ac0a236b26/ebola_kivu_10aout
5,2018-08-11,22,49,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/17136ac871cd/ebola_kivu_11aout
6,2018-08-12,25,52,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/5740eb659e90/ebola_kivu_12aout
7,2018-08-13,30,57,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/de936e52fb17/ebola_kivu_13aout
8,2018-08-14,39,66,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/37b7545e3544/ebola_kivu_14aout
9,2018-08-15,46,73,not found,not found 2,not found,not found,not found,not found,not found,not found,https://mailchi.mp/96767bcd0575/ebola_kivu_15aout


In [7]:
#Send dataframe to a CSV and Excel file
#DataOutput.csv is the same file that is read at the top of the notebook, so it is overwritten with the updated table
#index=False leaves the row index out of both files
df.to_csv("DataOutput.csv",index=False)
df.to_excel("DataExcel.xlsx",index=False)

### Only execute the below to reset the CSV
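Uncomment the code in the cell below before running it. Running it overwrites DataOutput.csv and DataExcel.xlsx with an empty table that contains only the column headers.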

In [8]:
#df=pd.DataFrame(columns=["date","confirmed_cumulative_cases","total_cumulative_cases","confirmed_cumulative_deaths","total_cumulative_deaths","cumulative_recoveries","cumulative_vaccinations","cumulative_vaccinations_beni","cumulative_vaccinations_katwa", "cumulative_vaccinations_mabalako", "cumulative_vaccinations_butembo", "source_url"])
#df.to_csv("DataOutput.csv",index=False)
#df.to_excel("DataExcel.xlsx",index=False)