In [None]:
from nbdev import *
# default_exp scraping

In [None]:
# export
import urllib.request
import urllib.parse
import re
from bs4 import BeautifulSoup
from pathlib import Path

# Scraping
> Many useful datasets are not available in a ready-to-download format. Instead, they have to be collected from a variety of sub-pages. The following functions are utilities for dealing with these kinds of situations.

In [None]:
# Page used as the live example by the cells below.
example_link = r'https://www.football-data.co.uk/data.php'

In [None]:
# export
def get_html(url, encoding='utf-8', bs=True):
    """Get the html code for a given url.

    url: passed straight to `urllib.request.urlopen`.
    encoding: codec used to decode the response body.
    bs: if True (which is the default), return the parsed BeautifulSoup object,
        otherwise return the decoded html string.
    """
    # Close the response as soon as the body has been read so the underlying
    # connection is not left open until garbage collection.
    with urllib.request.urlopen(url) as response:
        html = response.read().decode(encoding=encoding)
    if bs: return BeautifulSoup(html, features="lxml")
    return html

In [None]:
# bs=False returns the decoded markup as a plain string instead of a parsed object.
example_html = get_html(example_link, bs=False)

# Peek at the first and last ten characters of the page.
example_html[:10], example_html[-10:]

('<HTML>\n\n<H', '\n</HTML>\n\n')

In [None]:
# export
CACHE_DIR = Path('../data/cache')
def cache(url, cache_name):
    """Return the parsed page for `url`, backed by the file `CACHE_DIR/cache_name`.

    If the cache file exists it is parsed and returned without calling `get_html`.
    Otherwise the page is fetched with `get_html`, its markup is written to the
    cache file, and the freshly parsed object is returned.
    """
    cache_path = CACHE_DIR/cache_name
    if cache_path.is_file():
        # read_text opens and closes the file itself, so no handle is left open.
        return BeautifulSoup(cache_path.read_text(encoding='utf-8'), features="lxml")

    bs = get_html(url)
    # The cache directory may not exist yet (e.g. on a fresh checkout).
    cache_path.parent.mkdir(parents=True, exist_ok=True)
    cache_path.write_text(str(bs), encoding='utf-8')
    return bs

For testing purposes, we should reduce the amount of actual HTTP traffic, so we'll cache sites that are only used for testing the library.

In [None]:
# Whether the page was fetched just now or on an earlier run, the cache file must exist afterwards.
chache_html = cache(example_link, 'cache_example')

assert (CACHE_DIR/'cache_example').is_file()

In [None]:
# export
def find_links_by_func(html, func=None, return_href=True):
    """Collect the links (`<a>` tags) of the given html-BeautifulSoup-object.

    Only tags with a non-empty href are considered. A link is kept when
    `func(href)` is truthy; without a func every such link is kept.
    The result is a list of href strings, or a list of the BeautifulSoup
    link objects themselves if return_href=False.
    """
    accept = func if func is not None else (lambda target: True)

    matches = []
    for anchor in html.find_all('a'):
        href = anchor.get('href')
        # Skip anchors without a usable href before consulting the predicate.
        if not href or not accept(href):
            continue
        matches.append(href if return_href else anchor)
    return matches

In [None]:
# Name the parser explicitly, as the exported functions do, so bs4 does not have to guess one.
example_bs = BeautifulSoup(example_html, features="lxml")

all_links = find_links_by_func(example_bs)
print(f'All links: {len(all_links)}')

All links: 209


In [None]:
# Keep only the links whose href contains the site's host name.
absolute_in_site_links = find_links_by_func(example_bs, lambda link: 'www.football-data.co.uk' in link)
print(f'Absolute links in site: {len(absolute_in_site_links)}')

Absolute links in site: 102


In [None]:
# return_href switches between the href string and the link object itself.
print(f'return_href=True: {find_links_by_func(example_bs)[0]}')
print(f'return_href=False: {find_links_by_func(example_bs, return_href=False)[0]}')

return_href=True: https://www.football-data.co.uk/
return_href=False: <a href="https://www.football-data.co.uk/"><img alt="Football Betting - Football Results - Free Bets" border="0" src="https://www.football-data.co.uk/logo2.jpg"/></a>


In [None]:
# export
def find_links_by_pattern(html, pattern, return_href=True):
    """Collect the links of the given html-BeautifulSoup-object whose href matches `pattern`.

    `pattern` is either a compiled regex or a string, which gets compiled here.
    It is applied with `match`, i.e. anchored at the beginning of the href.
    `return_href` is handed on unchanged to `find_links_by_func`.
    """
    regex = re.compile(pattern) if isinstance(pattern, str) else pattern

    def href_matches(target):
        return regex.match(target)

    return find_links_by_func(html, func=href_matches, return_href=return_href)

In [None]:
# The pattern is applied with `match`, so a link qualifies only when its digits
# have at least one character before and after them.
number_links = find_links_by_pattern(example_bs, r'.+\d+.+')
print(f'Links containg at least one number: {len(number_links)}')

Links containg at least one number: 28
