# Import the stuff that we need

In [6]:
from bs4 import BeautifulSoup
import bs4
import requests
import json
from selenium import webdriver
import time
import os
import re
import csv

# Retrieve URLs to the individual news articles

Note that the "Selenium" package is used to open the web page. The reason for this is that the page listing the articles is JavaScript-based, and the JavaScript is what loads the articles. If you had fetched this page directly and parsed it with BeautifulSoup, the articles would not have been loaded.

Note that we also need to scroll down on the web page to get more articles. Selenium can be used for this purpose as well, as it has functionality for all kinds of interaction with the web page.

In [7]:
def get_article_urls(topic, scrolls=10, pause=1):
    """Collect the unique article URLs listed on the VG news page for a topic.

    The listing page is opened in a Selenium-driven Chrome browser and
    scrolled to the bottom repeatedly so that more articles get loaded
    before the links are read.

    Args:
        topic: Topic name appended to ``http://www.vg.no/nyheter/``.
        scrolls: How many times to scroll to the bottom of the page.
        pause: Seconds to sleep after each scroll so new content can load.

    Returns:
        A list of article URLs in page order, without duplicates.
    """
    url = "http://www.vg.no/nyheter/" + topic
    browser = webdriver.Chrome()
    try:
        browser.get(url)

        # Scroll down to get more news articles
        for _ in range(scrolls):
            browser.execute_script("window.scrollTo(0, document.body.scrollHeight);")
            time.sleep(pause)

        # Fetch all the visible article urls
        article_urls = []
        seen = set()  # O(1) duplicate check; the list keeps page order
        for article in browser.find_elements_by_tag_name("article"):
            # The plural lookup returns an empty list instead of raising
            # when an <article> contains no link at all.
            links = article.find_elements_by_tag_name("a")
            if not links:
                continue
            a_url = links[0].get_attribute("href")
            # Skip anchors without an href; they cannot be fetched later.
            if a_url and a_url not in seen:
                seen.add(a_url)
                article_urls.append(a_url)
    finally:
        # Always end the driver session, even if loading or scraping failed.
        browser.quit()

    print("{} articles about {} found".format(len(article_urls), topic))
    return article_urls

# Extracting the article body

The article body consists of three different types of text (on this page):
- **The preamble** - the introduction to the article
- **Headers**
- **Paragraphs** - the main content

We need to extract each of these parts and then concatenate them. Note that each part can consist of several substrings, which is why we need to iterate through all the `strings` of each text element.

In [8]:
def extract_article_body(soup):
    """Return the cleaned main text of an article page.

    Looks up the ``<div itemprop="articleBody">`` element and concatenates
    the strings of its direct child tags that are the preamble
    (``id="preamble"``), paragraphs (``<p>``) or tags whose name starts
    with "h".

    Args:
        soup: Parsed article page (a BeautifulSoup object).

    Returns:
        The article text with normalised whitespace, or an empty string
        when the page has no article body element.
    """
    article_body = soup.find("div", {"itemprop": "articleBody"})
    if article_body is None:
        # Pages without an article body yield no text instead of crashing.
        return ""

    # Collect the different parts of the text
    pieces = []
    for c in article_body.children:
        # Skip bare strings and comments; only tags carry the parts we want.
        if not isinstance(c, bs4.element.Tag):
            continue
        # Read only the preamble, paragraphs and headers.
        # NOTE(review): startswith("h") also matches tags such as <header>
        # or <hr>, not just <h1>-<h6> -- confirm whether that is intended.
        if c.get("id") == "preamble" or c.name == "p" or c.name.startswith("h"):
            pieces.extend(c.strings)
    text = " ".join(pieces)

    # Some text cleaning
    text = re.sub(" (?=[.!?])", "", text)  # drop the space before . ! ?
    text = " ".join(text.split())  # collapse all whitespace runs
    text = re.sub("(?<=[.!?]) –(?= )", "", text)  # drop a dash after sentence end
    return text

# Retrieving an article

In this retrieval method, we retrieve the following parts of the article:
- The **author** of the article
- The **time** the article was created
- The **title** of the article
- The **body** (main text) of the article

In [9]:
def _tag_text(tag):
    """Return the stripped text of *tag*, or None when the tag was not found."""
    if tag is None:
        return None
    return tag.get_text().strip()


def retrieve_article(url, timeout=30):
    """Download one article page and extract its metadata and body text.

    Args:
        url: Address of the article page.
        timeout: Seconds to wait for the server before giving up, so a
            stalled connection cannot hang the whole scrape.

    Returns:
        A dict with the keys ``author``, ``create_time``, ``title`` and
        ``body``. The first three are None when the corresponding element
        is missing from the page.
    """
    response = requests.get(url, timeout=timeout)
    soup = BeautifulSoup(response.text, "html.parser")

    article = {}

    # Get the author of the article
    article["author"] = _tag_text(soup.find("a", {"itemprop": "author"}))

    # Creation time
    article["create_time"] = _tag_text(soup.find("time", {"itemprop": "datePublished"}))

    # Get the title; handled like author and time so a page without the
    # expected <h1> does not abort the scrape.
    article["title"] = _tag_text(soup.find("h1", class_="main-title"))

    # Get the body
    article["body"] = extract_article_body(soup).strip()

    return article

# Getting some articles for specific topics


In [10]:
# Topics to scrape; each one is looked up on the news listing page.
topics = ["Donald Trump", "fotball"]

# Every retrieved article, tagged with the topic it was found under.
articles = []
for topic in topics:
    topic_urls = get_article_urls(topic)
    articles.extend(
        {**retrieve_article(link), "topic": topic} for link in topic_urls
    )

220 articles about Donald Trump found
140 articles about fotball found


# Save the article data

In [176]:
# Create the output directory; exist_ok avoids the check-then-create race.
os.makedirs("data", exist_ok=True)

# The csv module requires newline="" so that it controls line endings itself;
# without it, extra blank rows appear on platforms that translate "\n".
with open(os.path.join("data", "vg_articles.csv"), 'w', encoding='utf-8', newline='') as csvfile:
    columns = ["author", "create_time", "title", "body", "topic"]
    csv_writer = csv.DictWriter(csvfile, fieldnames=columns)
    csv_writer.writeheader()
    # One row per scraped article, in the order they were collected.
    csv_writer.writerows(articles)