In [1]:
import os
import re
import requests

from bs4 import BeautifulSoup

In [2]:
MAX_DEPTH = 1
DOWNLOAD_DIRECTORY = "downloads"
BASE_URL = "https://www.20minutos.es"
SCRAPE_STARTING_URL = "https://www.20minutos.es/"

In [3]:
# Create download directory if it doesn't exist
if not os.path.exists(DOWNLOAD_DIRECTORY):
    os.makedirs(DOWNLOAD_DIRECTORY)

All internal HTML files start with "https://www.20minutos.es". To avoid missing any, I will also allow relative URLs starting with "/". Further, HTML link do not appear to have a format, but I will allows the optional ".html" termination as well.

In [4]:
INTERNAL_URLS_TO_HTMLS = r"^(?:https:\/\/www\.20minutos\.es)?\/(?!.*\..*\.)([^.]+(?:\.html)?)$"
INTERNAL_URL_REGEX = re.compile(INTERNAL_URLS_TO_HTMLS)

In [5]:
def make_canonical(relative_url):
    """
    Given a relative url, remove trailing slash if present,
    and add index if it is the base path
    """
    canonical_url = re.sub(r"\/$", '', relative_url).strip()
    if canonical_url != "":
        return canonical_url
    return "/index"

def get_file_path_from_url(url):
    """
    Create filepath for storage given a URL
    """
    relative_url = re.sub(f"^{BASE_URL}", '', url)
    canonical_url = make_canonical(relative_url)
    flattened_url = canonical_url.replace("/", '_')
    return f"{DOWNLOAD_DIRECTORY}/{flattened_url}.html"

def get_internal_links(html):
    """
    Get all internal links from an HTML document
    """
    soup = BeautifulSoup(html, 'html.parser')
    return set(a["href"] for a in soup.find_all('a', href=INTERNAL_URL_REGEX))

def scrape(url, depth):
    print(f'Scraping {url}, depth: {depth}')

    file_path = get_file_path_from_url(url)
    html = None

    file_exists = os.path.isfile(file_path)

    if file_exists:  # Read html from file
        with open(file_path, 'r') as f:
            html = f.read()
    else:  # Download html
        response = requests.get(url, headers={'accept': 'application/xml;q=0.9, */*;q=0.8'})
        html = response.text
        # Save html to file
        with open(file_path, 'w') as f:
            f.write(html)

    # Do not recurse if at max depth
    if depth >= MAX_DEPTH: return
    
    links = get_internal_links(html)

    for link in links:
        scrape(link, depth+1)

In [6]:
scrape(SCRAPE_STARTING_URL, 0)

Scraping https://www.20minutos.es/, depth: 0
Scraping https://www.20minutos.es/cinemania/noticias/daniel-radcliffe-producira-un-documental-sobre-su-doble-accion-que-quedo-silla-ruedas-rodando-harry-potter-5184296/, depth: 1
Scraping https://www.20minutos.es/gastronomia/recetas/comidas-trabajo-cinco-recetas-faciles-rapidas-invierno-5183436/, depth: 1
Scraping https://www.20minutos.es/noticia/5184613/0/frenara-por-fin-lagarde-este-jueves-subida-tipos-esto-es-que-esperan-los-analistas-efecto-bolsillo-los-espanoles/, depth: 1
Scraping https://www.20minutos.es/noticia/5184519/0/douglas-melton-ganador-abarca-prize-por-su-avanzada-busqueda-cura-diabetes-tipo-1/, depth: 1
Scraping https://www.20minutos.es/horoscopo/sagitario/, depth: 1
Scraping https://www.20minutos.es/mujer/moda/ana-milan-luc-loren-paula-gonu-nos-cuentan-cual-es-prenda-mas-cara-su-armario-5184741/, depth: 1
Scraping https://www.20minutos.es/television/joaquin-torres-responde-polemica-por-atico-tamara-falco-proyecto-inicial-er

Some of the downloaded files are not articles. I can think of two ways of distinguishing them:
* Articles URLs seem to contain a numeric ID like "5183611".
* Articles have an `<h1 class="article-title">`.

In [7]:
# retrieve all html files from DOWNLOAD_DIRECTORY
html_files = [f for f in os.listdir(DOWNLOAD_DIRECTORY) if f.endswith(".html")]

for file in html_files:
    print(file)
    file_path = os.path.join(DOWNLOAD_DIRECTORY, file)

    with open(file_path, "r", encoding="utf-8") as file:
        content = file.read()
        soup = BeautifulSoup(content)

        title_soup = soup.find("h1", {"class": "article-title"})

        if title_soup is None:  # It is not an article
            continue

        title = title_soup.get_text().strip()
        article_paragraphs = [paragraph.get_text().strip() for paragraph in soup.findAll("p")]

        print("## Titulo")
        print(title)
        print("## Articulo")
        for paragraph in article_paragraphs:
            print(paragraph)
    break

_noticia_5184941_0_quienes-son-las-flos-mariae-por-que-se-dividieron-grupo-pop-cristiano-que-inspiro-mesias-los-javis.html
## Titulo
Quienes son las Flos Mariae y por qué se dividieron: el grupo de pop cristiano que inspiró 'La Mesias' de los Javis
## Articulo
Un trueque con Dios lo cambió todo: María Durán de Bellido sobrevivió a la operación del tumor maligno, y sus hijas, en agradecimiento y cumplimiento con su promesa a la Virgen y el Señor, crearon un grupo musical que propagase la fe católica. Eran siete -todas hermanas- cuando en 2013 crearon Flos Marie. Ahora sos dos grupos, las 4HBD (4 Hermanas Bellido Durán) y Mariah’s Pop, y su historia vuelve a acaparar todos los focos tras el estreno de la serie 'La Mesías' de Los Javis, que parece tener muchas similitudes con esta historia familiar.
María Durán tenía 17 años cuando se casó por primera vez. Vivió seis años en una relación de maltrato y tuvo tres hijos, después, se separó. Más tarde conoció a Xavier, su segundo marido, que 