**Disclaimer: If you want to use the data scraped by running this notebook to build an actual product or service, you need to contact vg.no, as this data is owned by them.**

In [90]:
from bs4 import BeautifulSoup
import requests
import re
from pathlib import Path
import os

Note: To find the ID of a new tag, visit an article that has the tag you're interested in and click the tag at the end of the article; the tag ID should then be visible in the URL.

In [125]:
# Topic name (also used as the output sub-directory name) -> tag UUID that is
# substituted into the `tag` query parameter of the article listing URL.
topic_tags = dict(
    [
        ("krim", "9711b2e2-b098-48aa-98d4-5dfc0244e289"),
        ("politikk", "e8c7541a-9618-4992-a5d0-5607fd771248"),
        ("vær", "a385212b-72cc-4dcf-baad-9c916f7ccd7c"),
        ("sjakk", "315ad1d0-7176-46fa-a592-bccd02216a71"),
    ]
)

In [116]:
# URL template for the vg.no teaser listing; the `offset`, `limit` and `tag`
# placeholders are filled in with str.format() for every request.
articles_url_template = (
    "https://www.vg.no/iris/v1/teasers"
    "?offset={offset}&limit={limit}"
    "&section=&excludedSections="
    "&tag={tag}&story="
)

List all the articles for one particular tag

In [155]:
def get_article_urls(tag, num, batch_size=100, timeout=30):
    """Yield the canonical URLs of articles carrying ``tag``, at most ``num``.

    The listing endpoint described by ``articles_url_template`` is queried
    page by page. Iteration stops as soon as ``num`` URLs have been yielded,
    or when a response has no ``"articles"`` key or an empty article list.

    Args:
        tag: Tag identifier substituted into the ``tag`` query parameter.
        num: Maximum number of URLs to yield.
        batch_size: Maximum number of articles requested per HTTP call.
        timeout: Seconds to wait for each HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Yields:
        str: The ``links.canonicalUrl`` value of each listed article.
    """
    num_yielded = 0
    while num_yielded < num:
        remaining = num - num_yielded
        articles_request = articles_url_template.format(
            offset=num_yielded,
            # Never ask for more than is still needed.
            limit=min(batch_size, remaining),
            tag=tag,
        )
        response = requests.get(articles_request, timeout=timeout)
        articles = response.json().get("articles")
        if not articles:
            # Missing key or empty page: nothing more to list for this tag.
            return
        # Truncate in case the server returns more entries than requested,
        # so that the caller never receives more than `num` URLs.
        for article in articles[:remaining]:
            yield article["links"]["canonicalUrl"]
            num_yielded += 1

Read and parse the article found at the given url

In [156]:
def parse_article(url, timeout=30):
    """Download the page at ``url`` and return the text of its article body.

    The text of every heading (``h`` followed by one digit) and paragraph
    (``p``) element inside the first ``<article>`` element is collected in
    document order, each part followed by a newline.

    Args:
        url: Address of the article page to download.
        timeout: Seconds to wait for the HTTP response before ``requests``
            raises a timeout error instead of blocking indefinitely.

    Returns:
        str: The concatenated text, or ``""`` when the page has no
        ``<article>`` element.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")
    article = soup.find("article")
    if article is None:
        return ""
    # Raw string: "\d" in a normal string literal is an invalid escape sequence.
    text_tag_names = re.compile(r"^(?:h\d|p)$")
    return "".join(part.text + "\n" for part in article.find_all(text_tag_names))

Store articles in this dir

In [157]:
# Root directory for the scraped articles, relative to the working directory.
data_dir = Path("data") / "vg_nyheter"

Note: To find the original article for a scraped file, visit `https://www.vg.no/i/<article_id>`, where `<article_id>` is the file name without the `.txt` extension.

In [158]:
def fetch_articles(tags_dict, num_articles_per_tag):
    """Download articles for every tag and store them as text files.

    For each ``tag_name -> tag`` entry, the URLs produced by
    ``get_article_urls`` are parsed with ``parse_article`` and written to
    ``data_dir / tag_name / <article_id>.txt``. Articles whose file already
    exists are not downloaded again, so the function can be re-run to resume.
    One dot is printed per listed URL as a progress indicator.

    Args:
        tags_dict: Mapping from topic name (used as directory name) to the
            tag identifier passed to ``get_article_urls``.
        num_articles_per_tag: Number of article URLs requested for each tag.
    """
    # Raw string: "\w" in a normal string literal is an invalid escape sequence.
    article_id_pattern = re.compile(r"/i/(?P<id>\w+)/")
    for tag_name, tag in tags_dict.items():
        tag_dir = data_dir / tag_name
        tag_dir.mkdir(parents=True, exist_ok=True)
        print(f"Fetching articles for tag {tag_name} ")
        for url in get_article_urls(tag, num_articles_per_tag):
            print(".", end="")
            id_match = article_id_pattern.search(url)
            if id_match is None:
                # A URL without an "/i/<id>/" segment cannot be mapped to a
                # file name; skip it rather than abort the whole run.
                print(f"\nSkipping {url}: no article id found in url")
                continue
            article_file = tag_dir / (id_match.group("id") + ".txt")
            if not article_file.is_file():
                # The article is parsed before the file is opened, so a failed
                # download does not leave an empty file behind.
                article_file.write_text(parse_article(url), encoding="utf8")
        print()

In [159]:
# Scrape every configured topic, requesting 1000 articles per tag.
fetch_articles(tags_dict=topic_tags, num_articles_per_tag=1000)

Fetching articles for tag krim 
........................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................................