In [None]:
import html2text
import logging
import re
import requests
from bs4 import BeautifulSoup

# Empty parsed document kept at module level. It is not referenced by any other
# code visible in this notebook.
tagger = BeautifulSoup('', 'html.parser')

# Module-level logger that the scraping helpers in this cell write to.
logger = logging.getLogger(__name__)

def get_protozoa_links():
    """Scrape the protozoa downloads index for links to species pages.

    Returns:
        A list of ``(link_text, absolute_url)`` tuples, one for every anchor
        whose ``href`` contains ``/resources/downloads/protozoa/``.

    Raises:
        requests.HTTPError: if the index page answers with an error status.
    """
    response = requests.get('http://www.sanger.ac.uk/resources/downloads/protozoa/')
    # Fail loudly instead of silently parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors that have no href attribute (e.g. named anchors);
    # indexing a['href'] on those would raise KeyError.
    links = [(a.text, 'http://www.sanger.ac.uk' + a['href'])
             for a in soup.find_all('a', href=True)
             if '/resources/downloads/protozoa/' in a['href']]
    return links

def get_vectors_links():
    """Scrape the vectors downloads index for links to species pages.

    Returns:
        A list of ``(link_text, absolute_url)`` tuples, one for every anchor
        whose ``href`` contains ``/resources/downloads/vectors/``.

    Raises:
        requests.HTTPError: if the index page answers with an error status.
    """
    response = requests.get('http://www.sanger.ac.uk/resources/downloads/vectors/')
    # Fail loudly instead of silently parsing an error page into an empty list.
    response.raise_for_status()
    soup = BeautifulSoup(response.text, "html.parser")
    # href=True skips anchors that have no href attribute (e.g. named anchors);
    # indexing a['href'] on those would raise KeyError.
    links = [(a.text, 'http://www.sanger.ac.uk' + a['href'])
             for a in soup.find_all('a', href=True)
             if '/resources/downloads/vectors/' in a['href']]
    return links

def get_prokaryotes_links():
    """Collect species links from the bacteria downloads index.

    The anchors are read from the first element of each of the classes
    ``col1``, ``col2`` and ``col3``, in that order.

    Returns:
        A list of ``(link_text, url)`` tuples where ``url`` is the index URL
        with the anchor's ``href`` appended.
    """
    index_url = 'http://www.sanger.ac.uk/resources/downloads/bacteria/'
    page = BeautifulSoup(requests.get(index_url).text, "html.parser")
    collected = []
    for column_number in (1, 2, 3):
        column = page.find(class_='col' + str(column_number))
        for anchor in column.find_all('a'):
            collected.append((anchor.text, index_url + anchor['href']))
    return collected

def get_helminth_links():
    """Collect species links from the helminths downloads index.

    Only anchors inside the third ``<ul>`` of the page are used.

    Returns:
        A list of ``(link_text, url)`` tuples where ``url`` is the index URL
        with the anchor's ``href`` appended.
    """
    index_url = 'http://www.sanger.ac.uk/resources/downloads/helminths/'
    page = BeautifulSoup(requests.get(index_url).text, "html.parser")
    species_list = page.find_all('ul')[2]
    return [(anchor.text, index_url + anchor['href'])
            for anchor in species_list.find_all('a')]

def get_virus_links():
    """Collect species links from the viruses downloads index.

    Only anchors inside the third ``<ul>`` of the page are used.

    Returns:
        A list of ``(link_text, url)`` tuples where ``url`` is the index URL
        with the anchor's ``href`` appended.
    """
    index_url = 'http://www.sanger.ac.uk/resources/downloads/viruses/'
    html = requests.get(index_url).text
    third_list = BeautifulSoup(html, "html.parser").find_all('ul')[2]
    pairs = []
    for anchor in third_list.find_all('a'):
        pairs.append((anchor.text, index_url + anchor['href']))
    return pairs

def include_ajax(soup):
    """Inline separately fetched fragments into ``soup`` (modified in place).

    Every element with class ``ajax`` carries a site-relative path in its
    ``title`` attribute. That path is fetched, parsed, and the element with
    id ``main-content`` from the response replaces the placeholder.
    """
    placeholders = soup.find_all(class_='ajax')
    for placeholder in placeholders:
        fragment_url = "http://www.sanger.ac.uk" + placeholder['title']
        logger.debug("  adding content from %s" % fragment_url)
        fragment = BeautifulSoup(requests.get(fragment_url).text, 'html.parser')
        placeholder.replace_with(fragment.find(id='main-content'))
        
def _previous_non_whitespace(start_element):
    previous = start_element.previous_sibling
    for i in range(20):
        if previous is None:
            break
        elif not previous.name is None:
            break
        elif str(previous).strip() != '':
            break
        else:
            previous = previous.previous_sibling
    else:
        logger.error("Something's gone wrong, we shouldn't have looped this often!")
        logger.error("previous: '%s' (%s)" % (previous, type(previous)))
        logger.error("start_element: '%s' (%s)" % (start_element, type(start_element)))
        raise ValueError("Ahhh! FIXME")
    return previous

def _next_non_whitespace(start_element):
    next_element = start_element.next_sibling
    for i in range(20):
        if next_element is None:
            break
        elif not next_element.name is None:
            break
        elif str(next_element).strip() != '':
            break
        else:
            next_element = next_element.next_sibling
    else:
        logger.error("Something's gone wrong, we shouldn't have looped this often!")
        logger.error("next_element: '%s' (%s)" % (next_element, type(next_element)))
        logger.error("start_element: '%s' (%s)" % (start_element, type(start_element)))
        raise ValueError("Ahhh! FIXME")
    return next_element
        
def _remove_title(description_panel):
    """Blank out the panel's title heading and return its text.

    The heading is the first ``<h2>`` in the panel, or the first ``<h3>`` when
    there is no ``<h2>``. It is replaced with an empty string in place.
    """
    logger.debug("  removing species title")
    heading = description_panel.find('h2')
    if not heading:
        heading = description_panel.find('h3')
    heading_text = heading.text
    heading.replace_with("")
    return heading_text

def _remove_bibliography(description_panel):
    """Remove the bibliography heading and its reference list from the panel.

    A bibliography heading is an ``<h4>``/``<h5>`` whose text is exactly
    ``Bibliography`` or an ``<h4>`` whose text is exactly ``References``. When
    the next non-blank sibling is a ``<ul>`` carrying the ``references`` class,
    both heading and list are replaced with empty strings. When there is no
    following sibling at all, only the heading is removed. Anything else is
    logged as an error and left untouched.

    Raises:
        ValueError: if more than one bibliography heading is found.
    """
    wanted = (('h4', 'Bibliography'), ('h5', 'Bibliography'), ('h4', 'References'))
    bibliography_divs = [heading
                         for tag_name, label in wanted
                         for heading in description_panel.find_all(tag_name)
                         if heading.text == label]
    if len(bibliography_divs) == 0:
        logger.debug("  no bibliography to remove")
        return
    # Exactly one heading is expected; the unpacking raises ValueError otherwise.
    (b,) = bibliography_divs
    next_element = _next_non_whitespace(b)
    if next_element is None:
        logger.warning("  found a bibliography title but no bibliography, deleting it anyway")
        b.replace_with('')
    # .get() rather than ["class"]: a <ul> with no class attribute must fall
    # through to the error branch below instead of raising KeyError.
    elif next_element.name == 'ul' and 'references' in (next_element.get("class") or []):
        logger.debug("  removing bibliography")
        b.replace_with('')
        next_element.replace_with('')
    else:
        logger.error("  can't find references adjacent to bibliography")
        logger.error("b: %s" % b)
        logger.error("next_element: %s" % next_element)

def _remove_studies(description_panel):
    """Strip the "Studies" section from the panel, in place.

    Three things are removed when present: the ``<h4>`` whose stripped text is
    ``Studies``, the ``<div class="sub_nav">``, and every ``<div>`` inside
    ``<div class="sub_data">`` whose id contains ``project_``.

    Raises:
        ValueError: if more than one of any of those containers is found.
    """
    logger.debug("  removing study details")

    study_headings = [heading for heading in description_panel.find_all('h4')
                      if heading.text.strip() == 'Studies']
    if study_headings:
        (heading,) = study_headings
        heading.replace_with('')

    nav_divs = description_panel.find_all('div', class_='sub_nav')
    if nav_divs:
        (nav_div,) = nav_divs
        nav_div.replace_with('')

    data_divs = description_panel.find_all('div', class_='sub_data')
    if data_divs:
        (data_div,) = data_divs
        for inner in data_div.find_all('div'):
            if "project_" not in inner.get('id', ''):
                continue
            inner.replace_with('')

def _remove_breaks_in_tables(soup):
    """Remove every ``<br>`` found inside a ``<table>``, in place.

    Any child nodes a ``<br>`` happens to have are kept where the tag was.
    """
    logger.debug("  removing breaks from within tables")
    for table in soup.find_all('table'):
        for br in table.find_all('br'):
            # unwrap() swaps the tag for its children and simply drops it when
            # it has none. The earlier `first, *others = br.contents` raised
            # ValueError for a <br> with no children.
            br.unwrap()

def _add_break_before_tables(soup):
    """Insert a freshly created ``<br>`` immediately before each ``<table>``."""
    logger.debug("  adding a break before tabels")
    tables = soup.find_all('table')
    for table in tables:
        table.insert_before(soup.new_tag('br'))
        
def _remove_spans_in_tables(soup):
    """Unwrap every ``<span>`` found inside a ``<table>``, in place.

    The span's children stay where the span was; only the tag itself goes.
    """
    logger.debug("  removing spans from within tables")
    for table in soup.find_all('table'):
        for span in table.find_all('span'):
            # unwrap() also copes with an empty <span></span>, which made the
            # earlier `first, *others = span.contents` raise ValueError.
            span.unwrap()

def _add_space_between_links(soup):
    """Separate directly adjacent anchors with a ``<span>`` holding one space.

    Only anchors whose immediate next sibling is another ``<a>`` are affected.
    """
    logger.debug("  adding a space between links")
    for anchor in soup.find_all('a'):
        neighbour = anchor.next_sibling
        if not neighbour or neighbour.name != 'a':
            continue
        spacer = soup.new_tag('span')
        spacer.string = ' '
        anchor.insert_after(spacer)
            
def _remove_rhs(description_panel):
    """Blank out the element with id ``rhs`` inside the panel, if there is one."""
    logger.debug("  removing the panel on the rhs")
    side_panel = description_panel.find(id='rhs')
    if not side_panel:
        return
    side_panel.replace_with('')
     
def _get_pubmed_id(periodical_link):
    
    # periodical_link.contents looks like:
    # ['PUBMED: ', <a href="http://ukpmc.ac.uk/abstract/MED/15875012">15875012</a>,
    #  '; PMC: ', <a href="http://ukpmc.ac.uk/articles/PMC1352341">1352341</a>,
    #  '; DOI: ', <a href="http://dx.doi.org/10.1038%2Fnature03481">10.1038/nature03481</a>]
    link_bits = (bit for bit in periodical_link.contents)
    pairs_of_link_bits = zip(link_bits, link_bits)
    (a,) = [a for prefix,a in pairs_of_link_bits if prefix == 'PUBMED: ']
    return a.text

def _get_pubmed_ids(soup):
    """Return the PubMed id of every ``links`` block under a ``periodical``.

    Ids come back in document order: for each element with class
    ``periodical``, one id per descendant with class ``links``.
    """
    logger.debug("  getting pubmed ids")
    link_blocks = []
    for periodical in soup.find_all(class_='periodical'):
        link_blocks.extend(periodical.find_all(class_='links'))
    pubmed_ids = []
    for link_block in link_blocks:
        pubmed_ids.append(_get_pubmed_id(link_block))
    return pubmed_ids

def _split_published_genome_data(description):
    logger.debug("  splitting out primary description")
    try:
        desc, pub_data = re.split(r'^#{2,3} Published Genome Data', description, maxsplit=1, flags=re.MULTILINE)
        return desc.strip(), pub_data.strip()
    except ValueError:
        return description, None       

def _fix_uls(soup):
    logger.debug("  joining adjacent lists")
    previous_ul, *other_uls = soup.find_all('ul')
    for ul in other_uls:
        # Go back and see if the previous (non-whitespace) tag was also a ul
        previous_tag = _previous_non_whitespace(ul)
        if previous_tag == previous_ul:
            if len(ul.find_all('li')) == len(ul.contents):
                for li in ul.find_all('li'):
                    previous_ul.append(li)
                ul.replace_with('')
        else:
            previous_ul = ul
            
def _add_p_before_ul(soup):
    logging.debug("  adding a space before each list")
    for ul in soup.find_all('ul'):
        p = soup.new_tag('p')
        ul.insert_before(p)
    
def get_details(link, expected_title=None):
    """Scrape one species page into a plain dict.

    Args:
        link: URL of the species page.
        expected_title: optional title the page heading is compared against;
            a mismatch is logged as a warning, not raised.

    Returns:
        A dict with the keys ``title`` (text of the panel heading),
        ``description`` (markdown before any "Published Genome Data" heading),
        ``links`` (``{'text', 'url'}`` dicts from the download and related-links
        panels on the right-hand side), ``pubmed_ids`` and
        ``published_data_description`` (markdown after that heading, or
        ``None``).

    Raises:
        requests.HTTPError: if the page answers with an error status.
    """
    logger.info("Building soup from %s" % link)
    r = requests.get(link)
    r.raise_for_status()
    soup = BeautifulSoup(r.text, "html.parser")
    include_ajax(soup)

    # A page may have no right-hand panel at all (_remove_rhs already allows for
    # that); treat it as "no links" instead of failing with AttributeError.
    rhs = soup.find(id='rhs')
    rhs_panels = rhs.find_all(class_='panel') if rhs is not None else []

    download_panels = [panel for panel in rhs_panels if panel.h3 and 'download' in panel.h3.text]
    related_panels = [panel for panel in rhs_panels if panel.h5 and 'links' in panel.h5.text]
    links = []
    # Only the first matching panel of each kind contributes links.
    for panels in (download_panels, related_panels):
        if panels:
            links += [{'text': a.text, 'url': a['href']} for a in panels[0].find_all('a')]

    # Must run before the bibliography is stripped from the page below.
    pubmed_ids = _get_pubmed_ids(soup)

    description_panel = soup.find_all(class_='panel')[0]

    # Clean-up passes; they mutate the soup in place and run in this order.
    _remove_bibliography(description_panel)
    _remove_studies(description_panel)
    _fix_uls(soup)
    _add_p_before_ul(soup)
    _remove_breaks_in_tables(soup)
    _remove_spans_in_tables(soup)
    _add_space_between_links(soup)
    _add_break_before_tables(soup)
    _remove_rhs(description_panel)

    title = _remove_title(description_panel)
    if expected_title is not None and title != expected_title:
        logger.warning("  !!! Expected page title to be %s, got %s" % (expected_title, title))

    h = html2text.HTML2Text()
    h.body_width = 0
    description = h.handle(str(description_panel))
    # Lines consisting solely of **bold** / ***bold*** text become "## " headings.
    description = re.sub(r'^\*{2,3}([^*]+)\*{2,3}$', r'## \1', description, flags=re.MULTILINE)
    # Cap heading depth at three '#'.
    description = re.sub(r'^###+', r'###', description, flags=re.MULTILINE)
    # Put an empty line in front of every heading line ...
    description = re.sub(r'^#', r'\n#', description, flags=re.MULTILINE)
    # ... then squash runs of three or more newlines back down to two.
    description = re.sub(r'\n{3,}', '\n\n', description, flags=re.MULTILINE).strip()
    description, published_data_description = _split_published_genome_data(description)

    return {
        'title': title,
        'description': description,
        'links': links,
        'pubmed_ids': pubmed_ids,
        'published_data_description': published_data_description
    }
    
# (link_text, url) pairs for every virus species page; the scraping loop in the
# next cell iterates over `links`.
links = get_virus_links()
# Last expression of the cell, so the notebook displays how many links were found.
len(links)

In [None]:
import time
import random

logger = logging.getLogger(__name__)

def fail(text):
    """Log ``text`` as an error on the root logger, wrapped in ANSI red."""
    highlighted = "\x1b[31m%s\x1b[0m" % text
    logging.error(highlighted)
    
# Scrape every page listed in `links` (set by the previous cell).
# `data` collects the dicts returned by get_details; `failures` collects
# (title, link, exception) for pages that could not be processed.
data = []
failures = []
logger.setLevel(logging.INFO)
for title, link in links:
    try:
        data.append(get_details(link, title))
        # Pause between successful requests.
        time.sleep(0.5)
    except KeyboardInterrupt:
        # Let a manual interrupt stop the whole loop.
        raise
    except Exception as e:
        # Best effort: report the failing link, remember the error, carry on.
        fail("Issue getting data from %s" % link)
        failures.append((title, link, e))

In [None]:
from jinja2 import Template

# Jinja2 template for a YAML page config. It is rendered with `databases`,
# `description`, `list_data`, `title` and `data`; each item of `data` is read
# through the keys 'title', 'description', 'published_data_description',
# 'links' and 'pubmed_ids'. Items where all of the last four are empty are
# emitted as `<title>: {}`.
template=Template("""\
---
databases:
{%- for database in databases %}
- {{database}}
{%- endfor %}
metadata:
  description: |
    {{description|indent(4)}}
  list_data: {% if list_data %}true{% else %}false{% endif %}
  title: {{ title }}
species:
{%- for datum in data %}
  {%- if datum['description'] or datum['published_data_description'] or datum['links'] or datum['pubmed_ids']%}
  {{datum['title']}}:
    {%- if datum['description'] %}
    description: |
      {{datum['description']|indent(6)}}
    {%- endif %}
    {%- if datum['published_data_description'] %}
    published_data_description: |
      {{datum['published_data_description']|indent(6)}}
    {%- endif %}
    {%- if datum['links'] %}
    links:
    {%- for link in datum['links'] %}
    - url: {{link['url']}}
      text: {{link['text']}}
    {%- endfor %}
    {%- endif %}
    {%- if datum['pubmed_ids'] %}
    pubmed_ids:
    {%- for pubmed_id in datum['pubmed_ids'] %}
    - {{pubmed_id}}
    {%- endfor %}
    {%- endif %}
  {%- else %}
  {{datum['title']}}: {}
  {%- endif %}
{% endfor %}""")

# Render the scraped `data` into a page config and save it.
# `databases`, `description` and `title` are deliberately left blank here.
data = list(data)
config = template.render(data=data, databases=[''],
                description="""""",
                list_data=True, title='')
#print(config)
# Context manager so the file is flushed and closed right away; the handle
# created inline in print(..., file=open(...)) was never closed.
with open('page_config/viruses.yml', 'w') as config_file:
    print(config, file=config_file)

In [None]:
# Debug run: scrape a single page (Salmonella) with DEBUG logging and print the
# config rendered from it. Relies on `template` and `get_details` from earlier
# cells.
logger.setLevel(logging.DEBUG)
link = "http://www.sanger.ac.uk/resources/downloads/bacteria/salmonella.html"
title = "Salmonella"
_data = [get_details(link, title)]
_config = template.render(data=_data, databases=['pathogen_prok_track', 'pathogen_pacbio_track'],
                description="""\
This page provides access to the genome sequence of bacteria sequenced at the Wellcome Trust Sanger Institute.
We have sequenced a large number of bacterial genomes and make all our sequence data available. The Institute's bacterial sequencing effort concentrates on pathogenic bacteria. The data include complete, ongoing and forthcoming sequencing projects.
You may also be interested in the following collaborative projects:
[MetaHit](http://www.sanger.ac.uk/resources/downloads/bacteria/metahit/) - the role of the human intestinal microbiota in health and disease
[NCTC reference collection](http://www.sanger.ac.uk/resources/downloads/bacteria/nctc/) - generation of annotated and assembled genomes for 3,000 bacteria and 500 viruses
[BSAC projects](http://www.sanger.ac.uk/resources/downloads/bacteria/bsac/) - British Society for Antimicrobial Chemotherapy projects""",
                list_data=True, title='Bacterial vector genomes - data download')
print(_config)

In [None]:
# Debug run: same as the Salmonella cell, for the Streptococcus pneumoniae page.
# Rebinds `logger`, `_data` and `_config`.
logger = logging.getLogger(__name__)
logger.setLevel(logging.DEBUG)
_data = [get_details(
        "http://www.sanger.ac.uk/resources/downloads/bacteria/streptococcus-pneumoniae.html",
        "Streptococcus pneumoniae")]
_config = template.render(data=_data, databases=['pathogen_prok_track', 'pathogen_pacbio_track'],
                description="""\
This page provides access to the genome sequence of bacteria sequenced at the Wellcome Trust Sanger Institute.
We have sequenced a large number of bacterial genomes and make all our sequence data available. The Institute's bacterial sequencing effort concentrates on pathogenic bacteria. The data include complete, ongoing and forthcoming sequencing projects.
You may also be interested in the following collaborative projects:
[MetaHit](http://www.sanger.ac.uk/resources/downloads/bacteria/metahit/) - the role of the human intestinal microbiota in health and disease
[NCTC reference collection](http://www.sanger.ac.uk/resources/downloads/bacteria/nctc/) - generation of annotated and assembled genomes for 3,000 bacteria and 500 viruses
[BSAC projects](http://www.sanger.ac.uk/resources/downloads/bacteria/bsac/) - British Society for Antimicrobial Chemotherapy projects""",
                list_data=True, title='Bacterial vector genomes - data download')
print(_config)

In [None]:
import markdown
import logging

# PyYAML is used below but was never imported anywhere in the notebook, so this
# cell failed with NameError on a fresh kernel.
import yaml

# Preview the first species entry of the rendered `config` as an HTML page.
# Keep a copy of the rendered config next to it for inspection.
with open('tmp.md', 'w') as config_dump:
    print(config, file=config_dump)

# safe_load: the config holds plain data only, and yaml.load() without an
# explicit Loader is rejected by current PyYAML releases.
title, d = list(yaml.safe_load(config)['species'].items())[0]

def markup(md):
    """Render markdown (with table support) to an HTML fragment."""
    return markdown.markdown(md, extensions=['markdown.extensions.tables'])

# The template leaves out empty sections (and emits `{}` for empty species), so
# either key may be missing; fall back to an empty string instead of KeyError.
description_md = d.get('description') or ''
more_md = d.get('published_data_description') or ''

content = """\
<html>
<body>
<h1>{title}</h1>
<h2>Description</h2>
{description}
<h2>More details</h2>
{more}
</body>""".format(title=title, description=markup(description_md),
                  more=markup(more_md))
with open('site/tmp.html', 'w') as f:
    f.write(content)
print("Done")
print(description_md)

In [None]:
import requests
from bs4 import BeautifulSoup
# Build {'url', 'text'} dicts for every page linked relatively from the bacteria
# index: hrefs containing '/' (other sections) or '#' (in-page anchors) are
# skipped. Note this rebinds `soup` and `links` from earlier cells.
url = 'http://www.sanger.ac.uk/resources/downloads/bacteria/'
soup = BeautifulSoup(requests.get(url).text, 'html.parser')
# href=True skips anchors without an href attribute; indexing a['href'] on
# those would raise KeyError.
links = [{'url': url + a['href'], 'text': a.text}
         for a in soup.find_all('a', href=True)
         if '/' not in a['href'] and '#' not in a['href']]

In [None]:
import os
import time
# Download every page in `links` into ./tmp_html, one file per page, named after
# the last path component of its URL. Non-200 responses are recorded in
# `failures` as (url, path).
tmp_dir = os.path.join(os.getcwd(), 'tmp_html')
# Create the target directory up front; open() below would fail without it.
os.makedirs(tmp_dir, exist_ok=True)
failures = []
for i,url in enumerate((l['url'] for l in links)):
    fname = os.path.basename(url)
    path = os.path.join(tmp_dir, fname)
    r = requests.get(url)
    if r.status_code == 200:
        # Only create the file once there is something to write, so a failed
        # request no longer leaves an empty file behind for later cells to parse.
        with open(path, 'w') as outfile:
            print(r.text, file=outfile)
    else:
        failures.append((url, path))
    # Throttle after every request, failed ones included.
    time.sleep(1)
    if i % 10 == 0:
        print("Got %s URLs with %s failures" % (i, len(failures)))
print(len(failures), failures)

In [None]:
# Count the anchors inside the element with id 'rhs' of every downloaded page.
# `link_counts` holds (count, path) tuples and is printed in ascending order.
link_counts = []
fnames = os.listdir(tmp_dir)
n = len(fnames)
for i, fname in enumerate(fnames):
    path = os.path.join(tmp_dir, fname)
    # BeautifulSoup reads the handle while parsing, so it can be closed
    # straight afterwards instead of being leaked.
    with open(path, 'r') as page_file:
        soup = BeautifulSoup(page_file, 'html.parser')
    rhs = soup.find(id='rhs')
    # Pages without the element count as zero links rather than raising
    # AttributeError on None.
    rhs_link_count = len(rhs.find_all('a')) if rhs is not None else 0
    link_counts.append((rhs_link_count, path))
    if i%10 == 0:
        print("Done %s of %s: %s" % (i, n, path))
for lc in sorted(link_counts):
    print("%s\t%s" % lc)

In [None]:
# List the live URL of each downloaded page, highest link count first.
# Uses `link_counts` from the previous cell; rebinds `n`, `path` and `url`.
for n, path in sorted(link_counts, reverse=True):
    # The files were saved under the last path component of their URL, so the
    # basename maps straight back onto the bacteria index URL.
    url = 'http://www.sanger.ac.uk/resources/downloads/bacteria/' + os.path.basename(path)
    print(n,url)