## Spiral Report crawler

Imperial College London has an open-access repository called Spiral.
Spiral exposes more of each report's metadata than the public website does. Pulling the metadata from the public website links requires fetching multiple pages and several levels of parsing, so for the reports it is easier to use the data pulled from Spiral.

In [12]:
import requests
from bs4 import BeautifulSoup
import json
from datetime import datetime

Create the curation object (i.e., the `curatedBy` object)

In [13]:
def create_curationObject():
    """Build the ``curatedBy`` record attached to every report.

    Returns:
        A dict describing the curating organization, with
        ``curationDate`` set to the current local date formatted as
        ``YYYY-MM-DD``.
    """
    today = datetime.now().strftime("%Y-%m-%d")
    return {
        "@type": "Organization",
        "identifier": "imperialcollege",
        "url": "http://www.imperial.ac.uk/mrc-global-infectious-disease-analysis/covid-19/covid-19-reports/",
        "name": "MRC Centre for Global Infectious Disease Analysis",
        "affiliation": ["Imperial College London"],
        "curationDate": today,
    }

Query Spiral for COVID-19 reports and parse the result into a list of URLs of the individual report pages.

In [14]:
def get_report_links(reports_url, timeout=30):
    """Return the URLs of the report pages listed on a Spiral search page.

    Args:
        reports_url: URL of the Spiral search-results page to fetch.
        timeout: Seconds to wait for the server, passed to ``requests.get``.

    Returns:
        A list of absolute URLs, one per ``<a>`` carrying an ``href`` inside
        the first ``<table>`` of the page, in document order. The list is
        empty when the page contains no table.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.Timeout: If the server does not answer within ``timeout``.
    """
    # Imported here so the function does not depend on the notebook's import cell.
    from urllib.parse import urljoin

    spiralbase = "https://spiral.imperial.ac.uk:8443/"
    # Without a timeout a stalled connection would block the crawl forever.
    recordlist = requests.get(reports_url, timeout=timeout)
    # Fail loudly instead of parsing an error page as if it were results.
    recordlist.raise_for_status()
    parsedrecordlist = BeautifulSoup(recordlist.text, "html.parser")
    urlstable = parsedrecordlist.find("table")
    if urlstable is None:
        return []
    # href=True skips anchors without an href; urljoin resolves both
    # "/handle/..." and "handle/..." against the base without doubling the slash.
    return [
        urljoin(spiralbase, eachlink["href"])
        for eachlink in urlstable.findAll("a", href=True)
    ]

Since we're dealing with HTML files, create a function to get the content of a specific set of meta tags.

In [15]:
def get_meta_content(metacontentfield):
    """Extract the ``content`` attribute from a sequence of tags.

    Args:
        metacontentfield: Sequence of objects exposing ``.get("content")``.

    Returns:
        The single ``content`` value when the sequence holds exactly one
        tag; otherwise a list of the ``content`` values (empty when the
        sequence is empty).
    """
    contents = [tag.get("content") for tag in metacontentfield]
    # Exactly one tag collapses to a bare value; callers rely on this.
    return contents[0] if len(contents) == 1 else contents

Pull content from the appropriate metadata tags and format it.

In [27]:
def transform_pub_meta(soupobject):
    """Build the base publication record from a report page's ``<meta>`` tags.

    Args:
        soupobject: Parsed report page; only its ``findAll`` method is used.

    Returns:
        A dict with the fixed publication fields plus ``abstract``, ``name``,
        ``datePublished``, ``url``, ``identifier`` and ``_id``. ``keywords``
        and ``license`` are added only when matching tags exist. Values come
        from ``get_meta_content``, so each is a bare value when exactly one
        tag matched and a list otherwise. When a ``DC.identifier`` tag holds
        a DOI, ``doi`` is added and ``identifier``/``_id`` become ``"icl_"``
        plus the DOI suffix; otherwise they stay the ``DCTERMS.URI`` value.
    """
    def meta_by_name(name):
        # Content of every <meta name=...> tag with the given name.
        return get_meta_content(soupobject.findAll("meta", {"name": name}))

    url = meta_by_name("citation_pdf_url")
    title = meta_by_name("citation_title")
    datePublished = meta_by_name("citation_date")
    abstract = meta_by_name("DCTERMS.abstract")
    defaultid = get_meta_content(
        soupobject.findAll("meta", {"scheme": "DCTERMS.URI"})
    )
    tmpdict = {
        "@context": {
            "schema": "http://schema.org/",
            "outbreak": "https://discovery.biothings.io/view/outbreak/"
        },
        "@type": "Publication",
        "journalName": "Imperial College London",
        "journalNameAbbreviation": "imperialcollege",
        "publicationType": "Report",
        "abstract": abstract,
        "name": title,
        "datePublished": datePublished,
        "url": url,
        "identifier": defaultid
    }

    keywordsfield = soupobject.findAll("meta", {"name": "DC.subject"})
    if keywordsfield:
        tmpdict["keywords"] = get_meta_content(keywordsfield)

    licensefield = soupobject.findAll("meta", {"name": "DC.rights"})
    if licensefield:
        tmpdict["license"] = get_meta_content(licensefield)

    # Resolver prefixes stripped from a DOI URL to leave the bare DOI.
    doi_prefixes = (
        "https://doi.org/",
        "http://doi.org/",
        "https://dx.doi.org/",
        "http://dx.doi.org/",
    )
    for eachitem in soupobject.findAll("meta", {"name": "DC.identifier"}):
        eachitemcontent = eachitem.get("content")
        if not eachitemcontent:
            # A tag without content cannot hold a DOI (and would break `in`).
            continue
        if "doi" in eachitemcontent or "10." in eachitemcontent:
            doi = eachitemcontent
            for prefix in doi_prefixes:
                doi = doi.replace(prefix, "")
            # The record id is "icl_" plus everything after the first "/".
            tmpdict["identifier"] = "icl_" + doi.split('/', 1)[-1]
            tmpdict["doi"] = doi
    tmpdict['_id'] = tmpdict["identifier"]
    return tmpdict

Get the Author information

In [17]:
def get_authors(soupobject):
    """Build the author list from a page's ``citation_author`` meta tags.

    Args:
        soupobject: Parsed report page; only its ``findAll`` method is used.

    Returns:
        A list with one ``outbreak:Person`` dict per author tag, in document
        order. ``familyName`` is added when the name splits on ``","`` into
        exactly two parts and the second part is shorter than three
        characters. Tags with missing or empty ``content`` are skipped.
    """
    authorsfield = soupobject.findAll("meta", {"name": "citation_author"})
    authorlist = []
    # Read each tag directly: a collapsed single-tag value would be a plain
    # string, and looping over it would yield one "author" per character.
    for eachtag in authorsfield:
        eachauthor = eachtag.get("content")
        if not eachauthor:
            continue
        authdict = {'@type': 'outbreak:Person', 'affiliation': [], 'name': eachauthor}
        authparse = eachauthor.split(",")
        if len(authparse) == 2 and len(authparse[1]) < 3:
            authdict['familyName'] = authparse[0]
        authorlist.append(authdict)
    return authorlist

Get the funding information

In [18]:
def get_funding(soupobject):
    """Build the funding list from a page's funder and identifier meta tags.

    Funders come from ``DC.contributor`` tags. Grant identifiers are the
    ``DC.identifier`` values that do not contain ``"https:"``. The two lists
    are paired by position.

    Args:
        soupobject: Parsed report page; only its ``findAll`` method is used.

    Returns:
        A ``(fundlist, fundflag)`` tuple. ``fundlist`` holds one
        ``MonetaryGrant`` dict per funder tag and ``fundflag`` is ``True``
        when at least one funder tag exists. Without funders the result is
        ``([], False)``. A funder with no identifier at its position gets
        ``""`` as its identifier.
    """
    fundersfield = soupobject.findAll("meta", {"name": "DC.contributor"})
    if not fundersfield:
        return ([], False)

    # Read each tag directly so a single funder is still a one-item list
    # rather than a string that would be indexed character by character.
    funders = [eachitem.get("content") for eachitem in fundersfield]

    # NOTE(review): anything without "https:" counts as a grant id, so an
    # "http:" URL or a bare DOI would too - confirm against real pages.
    fundidlist = []
    for eachitem in soupobject.findAll("meta", {"name": "DC.identifier"}):
        eachitemcontent = eachitem.get("content")
        if eachitemcontent is None or "https:" in eachitemcontent:
            continue
        fundidlist.append(eachitemcontent)

    fundlist = []
    # enumerate always advances, so the loop ends after the last funder.
    for i, fundername in enumerate(funders):
        fundict = {"@type": "MonetaryGrant",
                   "funder": {
                       "name": fundername
                   },
                   # Fewer ids than funders must not raise IndexError.
                   "identifier": fundidlist[i] if i < len(fundidlist) else "",
                   "name": ""
                   }
        fundlist.append(fundict)
    return (fundlist, True)

## Main

In [19]:
# Spiral search restricted to items of type "Report" in collection
# 10044/1/78555, 100 results per page.
reports_url = 'https://spiral.imperial.ac.uk:8443/handle/10044/1/78555/simple-search?location=10044%2F1%2F78555&query=&filter_field_1=type&filter_type_1=equals&filter_value_1=Report&rpp=100&sort_by=score&order=DESC&etal=1&submit_search=Update'
url_list = get_report_links(reports_url)
# One curation record is built up front and attached to every report.
curatedBy = create_curationObject()

## Pull the metadata from each report link and throw it into a dictionary
# NOTE(review): the slice limits the run to the first two reports - this
# looks like a preview/debug limit; confirm before a full crawl.
for each_url in url_list[0:2]:
    # A timeout keeps one stalled report page from hanging the whole run.
    record_result = requests.get(each_url, timeout=30)
    parsed_record = BeautifulSoup(record_result.text, "html.parser")
    base_info = transform_pub_meta(parsed_record)
    base_info["curatedBy"] = curatedBy
    author_list = get_authors(parsed_record)
    fund_list, fund_flag = get_funding(parsed_record)
    ## Create the Json
    base_info["author"] = author_list
    # Funding is attached only when the page declared at least one funder.
    if fund_flag:
        base_info["funding"] = fund_list
    print(base_info)

{'@type': 'Organization', 'identifier': 'imperialcollege', 'url': 'http://www.imperial.ac.uk/mrc-global-infectious-disease-analysis/covid-19/covid-19-reports/', 'name': 'MRC Centre for Global Infectious Disease Analysis', 'affiliation': ['Imperial College London'], 'curationDate': '2020-06-25'}
