## Scraping from the Public Website

Unfortunately, the resources are not as well organized in Spiral, so the public website needs to be scraped instead.

In [14]:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime
import re

Create the 'curatedBy' object

In [15]:
def create_curationObject():
    """Build the 'curatedBy' metadata object for this source.

    Returns:
        dict: An Organization record naming the MRC Centre resource listing
        as the curator, with ``curationDate`` set to today's local date
        formatted as ``YYYY-MM-DD``.
    """
    now = datetime.now()
    curatedBy = {
        "@type": "Organization",
        'identifier': 'imperialcollege',
        'url': 'http://www.imperial.ac.uk/mrc-global-infectious-disease-analysis/covid-19/covid-19-scientific-resources/',
        "name": "MRC Centre for Global Infectious Disease Analysis",
        "affiliation": ["Imperial College London"],
        "curationDate": now.strftime("%Y-%m-%d"),
    }
    # Hand the object back to the caller; without this the function yields None.
    return curatedBy

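Unfortunately, items appearing in the resource list do not necessarily have any sort of identifier available. Create an identifier for each resource by abbreviating its description (the first character of each word, prefixed with `icl_`) so that entries can be told apart. Note that two different descriptions can still produce the same abbreviation.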

In [35]:
def create_id(description_text):
    """Derive an ``icl_``-prefixed identifier from a description.

    The description is lower-cased and split on whitespace; the first
    character of each word is kept only if it is alphanumeric, so a word
    that starts with punctuation (for example ``(30-03-2020)``) contributes
    nothing to the identifier.

    Args:
        description_text: Free-text description of the resource.

    Returns:
        str: ``"icl_"`` followed by the retained first characters, in order.
    """
    initials = []
    for token in description_text.lower().split():
        leading = token[0]
        if leading.isalnum():
            initials.append(leading)
    return "icl_" + "".join(initials)

Parse the available content into metadata depending on the type of the resource. Use specific keywords to identify the type of the resource.

In [1]:
def transform_resource_meta(metaobject):
    """Convert one scraped resource block into a metadata record.

    Args:
        metaobject: Parsed HTML element for a single resource. The code looks
            up an ``h3`` with class ``title``, the first ``p``, a ``span``
            with class ``link primary`` and the first ``a`` (for its
            ``href``); all four are expected to be present.

    Returns:
        dict: Record with ``@context``, ``author``, ``name``, ``description``,
        ``identifier`` and ``_id``, plus type-specific fields chosen from the
        text of the ``link primary`` span:

        * contains ``"data"``   -> ``Dataset`` with ``dataDownload``,
          ``species`` and ``infectiousAgent``
        * contains ``"code"``   -> ``SoftwareSourceCode`` with ``downloadUrl``
        * contains ``"survey"`` -> ``Protocol`` with ``url``

        If none of the keywords match, no ``@type`` or URL field is added.
        A ``citedBy`` entry is added when the description references a report.

    Note:
        Relative links are resolved against the module-level ``baseurl``,
        which must be defined before this function is called.
    """
    tmpdict = {
        "@context": {
            "schema": "http://schema.org/",
            "outbreak": "https://discovery.biothings.io/view/outbreak/"
        },
        "author": {
            "@type": "Organization",
            "name": 'Imperial College COVID-19 Response Team',
            "affiliation": ["MRC Centre for Global Infectious Disease Analysis",
                            "Imperial College London"]
        }
    }
    tmpdict['name'] = metaobject.find("h3", {"class": "title"}).get_text()
    tmpdict['description'] = metaobject.find("p").get_text()
    tmpdict['identifier'] = create_id(tmpdict['description'])
    tmpdict['_id'] = tmpdict['identifier']
    basetype = metaobject.find("span", {"class": "link primary"}).get_text()
    tmpurl = metaobject.find("a").get("href")

    # Links that already contain "http" are used as-is; anything else is
    # treated as relative to the site root.
    if "http" in tmpurl:
        url = tmpurl
    else:
        url = baseurl + tmpurl

    # The publication date, when present, is embedded in the description as
    # "(DD-MM-YYYY)". A missing date raises IndexError and an impossible one
    # (e.g. month 13) raises ValueError; both fall back to the placeholder.
    try:
        basedate = re.findall(r"\(\d{2}-\d{2}-\d{4}\)", tmpdict['description'])[0].strip("()")
        datePublished = datetime.strptime(basedate, '%d-%m-%Y').strftime("%Y-%m-%d")
    except (IndexError, ValueError):
        datePublished = "Not Available"

    if "data" in basetype:
        tmpdict['@type'] = "Dataset"
        tmpdict['dataDownload'] = {
            "contentUrl": url,
            "dateModified": datePublished
        }
        # These must be assignments: written with ':' they are bare
        # annotations and nothing is stored in the record.
        tmpdict['species'] = "Homo sapiens"
        tmpdict['infectiousAgent'] = "SARS-CoV-2"
    elif "code" in basetype:
        tmpdict['@type'] = "SoftwareSourceCode"
        tmpdict['downloadUrl'] = url
        tmpdict['datePublished'] = datePublished
    elif "survey" in basetype:
        tmpdict['@type'] = "Protocol"
        tmpdict['url'] = url
        tmpdict['datePublished'] = datePublished
        tmpdict['protocolSetting'] = "public"
        tmpdict["protocolCategory"] = "protocol"

    # Descriptions of the form: ... for "Report N: <title>" name the report
    # this resource supports; record that report's title without the quotes.
    if "for \"Report" in tmpdict['description']:
        report_check = tmpdict['description'].replace("for \"Report", "for|Report").split("|")
        citedByTitle = report_check[1].replace('"', '')
        # NOTE(review): this uses "type" while the author object uses "@type";
        # confirm which key the downstream schema expects.
        tmpdict['citedBy'] = {"name": citedByTitle,
                              "type": "Publication"}
    return tmpdict

In [18]:
# Download the scientific-resources listing and collect one element per
# resource card. A timeout keeps the cell from hanging indefinitely on an
# unresponsive server, and raise_for_status() stops an HTTP error page from
# being parsed as if it were the listing.
url = 'http://www.imperial.ac.uk/mrc-global-infectious-disease-analysis/covid-19/covid-19-scientific-resources/'
response = requests.get(url, timeout=30)
response.raise_for_status()
parsedlisting = BeautifulSoup(response.text, "html.parser")
# Site root used to resolve relative links found in the listing.
baseurl = "http://www.imperial.ac.uk/"
# find_all is the current name of the deprecated findAll alias.
resourceclass = parsedlisting.find_all("div", {"class": "media-item full light-secondary reverse equal-height"})


In [23]:
# Preview run: transform only the second resource block, print it for
# inspection, and keep it in resourcelist so the collected records are
# available after the loop.
resourcelist = []
for eachblock in resourceclass[1:2]:
    tmpdict = transform_resource_meta(eachblock)
    resourcelist.append(tmpdict)
    print(tmpdict)
        

{'@context': {'schema': 'http://schema.org/', 'outbreak': 'https://discovery.biothings.io/view/outbreak/'}, 'author': {'@type': 'Organization', 'name': 'Imperial College COVID-19 Response Team', 'affiliation': ['MRC Centre for Global Infectious Disease Analysis', 'Imperial College London']}, 'name': 'Code (Report 13)', 'description': 'Replication code (30-03-2020) for "Report 13: Estimating the number of infections and the impact of non-pharmaceutical interventions on COVID-19 in 11 European countries"', 'identifier': 'Riclcicl(iclficl"icl1iclEiclticlnicloicliiclaiclticliicloiclnicliicloiclCicliicl1iclEiclc', '@type': 'SoftwareSourceCode', 'downloadUrl': 'https://github.com/ImperialCollegeLondon/covid19model/releases/tag/v1.0', 'datePublished': '2020-03-30', 'citedBy': {'name': 'Report 13: Estimating the number of infections and the impact of non-pharmaceutical interventions on COVID-19 in 11 European countries', 'type': 'Publication'}}


## Parse the analyses

Note that most of the basic information can be obtained automatically, with the exception of `datePublished`. More detailed information is available on the site, but it cannot be parsed reliably (the presentation varies widely and has no discernible structure), so it will require manual curation.

In [36]:
# Download the planning-tools listing. A timeout keeps the cell from hanging
# on an unresponsive server, and raise_for_status() stops an HTTP error page
# from being parsed as if it were the listing.
analysislisturl = 'http://www.imperial.ac.uk/mrc-global-infectious-disease-analysis/covid-19/covid-19-planning-tools/'
analysisresponse = requests.get(analysislisturl, timeout=30)
analysisresponse.raise_for_status()
analysislisting = BeautifulSoup(analysisresponse.text, "html.parser")
# find_all is the current name of the deprecated findAll alias.
analysisclass = analysislisting.find_all("div", {"class": "media-item full light-secondary reverse equal-height"})

# Preview run: build and print the record for the second analysis block only.
for eachblock in analysisclass[1:2]:
    tmpdict = {
      "@context": {
        "schema": "http://schema.org/",
        "outbreak": "https://discovery.biothings.io/view/outbreak/"
      },
      "author": {
        "@type": "Organization",
        "name": 'Imperial College COVID-19 Response Team',
        "affiliation": ["MRC Centre for Global Infectious Disease Analysis",
                        "Imperial College London"]
      }
    }
    tmpdict['name'] = eachblock.find("h3", {"class": "title"}).get_text()
    tmpurl = eachblock.find("a").get("href")
    tmpdict['species'] = "Homo sapiens"
    tmpdict['infectiousAgent'] = "SARS-CoV-2"
    tmpdict['infectiousDisease'] = "COVID-19"
    tmpdict['description'] = eachblock.find("p").get_text()
    tmpdict['identifier'] = create_id(tmpdict['description'])
    tmpdict['_id'] = tmpdict['identifier']
    # Links that already contain "http" are used as-is; anything else is
    # treated as relative to the site root held in baseurl.
    if "http" in tmpurl:
        tmpdict['url'] = tmpurl
    else:
        tmpdict['url'] = baseurl + tmpurl
    # The listing does not expose a publication date for analyses.
    tmpdict['datePublished'] = 'Not Available'
    print(tmpdict, tmpdict['identifier'])

{'@context': {'schema': 'http://schema.org/', 'outbreak': 'https://discovery.biothings.io/view/outbreak/'}, 'author': {'@type': 'Organization', 'name': 'Imperial College COVID-19 Response Team', 'affiliation': ['MRC Centre for Global Infectious Disease Analysis', 'Imperial College London']}, 'name': 'Scenario analysis tool (covidsim.org)', 'species': 'Homo sapiens', 'infectiousAgent': 'SARS-CoV-2', 'infectiousDisease': 'COVID-19', 'description': 'This scenario analysis tool allows the user to make projections of the prevalence of infections each day and the expected number of people requiring hospitalisation and critical care facilities.', 'identifier': 'icl_tsatatutmpotpoiedatenoprhaccf', '_id': 'icl_tsatatutmpotpoiedatenoprhaccf', 'url': 'https://covidsim.org/', 'datePublished': 'Not Available'} icl_tsatatutmpotpoiedatenoprhaccf
