In [1]:
# Some initialisation code.

# Load necessary modules.
import os
import sys
import re

# Set the folder in which individual files will be stored.
DOCUMENTS_FOLDER = "./documents"


class Date:
    def __init__(self, day, month, year):
        self.day = day
        self.month = month
        self.year = year

    def __repr__(self):
        return "{} {} {}".format(self.day, self.month, self.year)

    def isValid(self):
        return self.day != -1 and self.month != -1 and self.year != -1

class Document:
    def __init__(self, title, date, body):
        self.title = title
        self.date = date
        self.body = body

    def __repr__(self):
        MAX_LENGTH = 100
        body = self.body if len(self.body) < MAX_LENGTH else self.body[:MAX_LENGTH]
        return "{}, {}: {}".format(self.title, self.date, body)

**The interesting code starts below.**

## Split dataset in individual documents

In [2]:
# Read the downloaded dataset file and store in variable 'lines'.
lines = []
with open('dataset.txt', 'r') as f:
    lines = f.read()
    
print("First 500 characters of the data:\n{}".format(lines[:500]))

First 500 characters of the data:
﻿
                               1 of 200 DOCUMENTS



                                   Het Parool

                             20 april 2016 woensdag

Kohl tart Merkel

BYLINE: SANDER BECKER

SECTION: Nieuws; Blz. 4

LENGTH: 748 woorden


Duitsland: Oud-bondskanselier ontvangt Hongaarse premier

Oud-bondskanselier Helmot Kohl ontving gisteren de Hongaarse premier Viktor
Orbán. Een klap in het gezicht van de huidige bondskanselier Angela Merkel - en
niet voor het eerst.

De voormalige Duitse 


In [3]:
# newDocumentExpression matches text of the form: "123 of 200 DOCUMENTS".
# \d matches digits [0-9].
# For more regular expressions, see RegularExpressions-Example.ipynb
newDocumentExpression = r"\b\d+\b of \b\d+\b DOCUMENTS"

In [4]:
# Split the document in several documents based on the newDocumentExpression.
# Skip the first element (index 0) as it contains all text BEFORE the first document.
# Example:
#     Text before Article
#     1 of 200 DOCUMENTS
#     Title of Article
#     Body of Article
documents = re.split(newDocumentExpression, lines)[1:]

In [5]:
# Check if all documents are found.
# The length (amount) of list of documents should equal the amount of documents.
print("The amount of documents found is: {}".format(len(documents)))

The amount of documents found is: 200


In [6]:
# Remove the leading/trailing whitespace from the documents.
documents = [document.strip() for document in documents]

In [7]:
# Remove all hyperlink-like text from the documents.
hyperlinkExpression = r"https?:\/\/(www\.)?[-a-zA-Z0-9@:%._\+~#=]{2,256}\.[a-z]{2,6}\b(([-a-zA-Z0-9@:%_\+.~#?&//=]|\s*-)*)"
documents = [re.sub(hyperlinkExpression, '', document) for document in documents]

Removes hyperlinks of the form:<br>
http://www.standaard.be/cnt/dmf20161129_02597674

but also:<br>
https://www.rijksoverheid.nl/actueel/nieuws/2016/11/03/minister-president<br>
[WHITESPACE]-rutte-verzorgt-de-preek-van-de-leek

but only the first line of:<br>
http://www.hln.be/hln/nl/4125/Internet/article/detail/2972254/2016/11/10/<br>
[WHITESPACE]President<br>
[WHITESPACE]-Trump-mag-twittervolgers-van-POTUS-houden.dhtml?utm_medium=rss&utm_content=ihln<br>
[WHITESPACE]ophlnbehetallerlaatstenieuwsoverinternetgames

If you can do any better, let me know :)

### Saving the individual documents

In [8]:
# Create the folder where individual documents are stored.
if not os.path.exists(DOCUMENTS_FOLDER):
    os.makedirs(DOCUMENTS_FOLDER)

In [9]:
# Write the documents to individual files as '[number].txt'
for document in documents:
    index = documents.index(document) + 1
    with open('{}/{}.txt'.format(DOCUMENTS_FOLDER, index), 'w+') as writeFile:
        writeFile.write(document)

## Aggregating the documents per year

In [10]:
# Extract the titles of the documents.
titles = []
for document in documents:
    documentSplit = document.split('\n')
    documentSplit = list(filter(None, documentSplit))

    # Find the part that says 'LENGTH:', because...
    lengthItem = next((s for s in documentSplit if 'LENGTH:' in s), None)
    lengthIndex = documentSplit.index(lengthItem)
    # ...the text is stored in the string one before the one that says 'LENGTH: xxx woorden'
    title = documentSplit[lengthIndex-1]
    titles.append(title)
    
print("The first five titles are:\n{}".format(titles[:5]))

The first five titles are:
['SECTION: Nieuws; Blz. 4', 'SECTION: De Verdieping; Blz. 2', 'SECTION: GOBUI1', 'SECTION: GOBUI1', 'SECTION: Buitenland; Blz. 13']


In [11]:
# Extract the bodies of the documents.
bodies = []
for document in documents:
    documentSplit = document.split('\n')
    documentSplit = list(filter(None, documentSplit))

     # Find the part that says 'LENGTH:', because...
    lengthItem = next((s for s in documentSplit if 'LENGTH:' in s), None)
    lengthIndex = documentSplit.index(lengthItem)
    # ...the text is stored in the strings one after the one that says 'LENGTH: xxx woorden'
    text = ' '.join(documentSplit[lengthIndex+1:])
    bodies.append(text)
    
print("The first five bodies are:\n{}".format(bodies[:5]))

The first five bodies are:
['Duitsland: Oud-bondskanselier ontvangt Hongaarse premier Oud-bondskanselier Helmot Kohl ontving gisteren de Hongaarse premier Viktor Orbán. Een klap in het gezicht van de huidige bondskanselier Angela Merkel - en niet voor het eerst. De voormalige Duitse bondskanselier Helmut Kohl (86) leek van het podium verdwenen. Toch wist hij de afgelopen dagen ineens weer alle ogen op zich gericht dankzij twee opmerkelijke acties. Dit weekend leverde hij om te beginnen stevige kritiek op het ruimhartige vluchtelingenbeleid van zijn opvolgster Angela Merkel. En gisteren ontving hij Merkels grootste tegenstander in het verhitte vluchtelingendebat: de Hongaarse premier Viktor Orbán. Vluchtelingen Het waren rake klappen in het gezicht van de zittende bondskanselier, met wie Kohl nog een appeltje te schillen had. Maar volgens veel Duitsers speelde er meer dan alleen een koningsdrama tussen een gewezen politicus en zijn opvolgster. Kohl, een overtuigd Europeaan, zou ook een 

### Finding the distribution dates

In [12]:
# Create lists of all possible months to check for.
MONTHS_DUTCH = ["januari", "februari", "maart", "april", "mei", "juni", "juli", "augustus", "september", "oktober", "november", "december"]
MONTHS_ENGLISH = ["january", "february", "march", "april", "may", "june", "july", "august", "september", "october", "november", "december"]
MONTHS_DUTCH_CAPITAL = [month.capitalize() for month in MONTHS_DUTCH]
MONTHS_ENGLISH_CAPITAL = [month.capitalize() for month in MONTHS_ENGLISH]

MONTHS = list(set(MONTHS_DUTCH + MONTHS_DUTCH_CAPITAL + MONTHS_ENGLISH + MONTHS_ENGLISH_CAPITAL))

# Add a regular subexpression for each month to the general regular expression.
reMonth = []
for month in MONTHS:
    # Expression for dd-mm-yyyy
    reMonth.append(r"\d\d* {} \d\d\d\d".format(month))
    # Expression for mm-dd-yyyy
    reMonth.append(r"{} \d\d* \d\d\d\d".format(month))
dateExpression = "(" + '|'.join(reMonth) + ")"

print("Final date regular expression:\n{}".format(dateExpression))

Final date regular expression:
(\d\d* december \d\d\d\d|december \d\d* \d\d\d\d|\d\d* July \d\d\d\d|July \d\d* \d\d\d\d|\d\d* february \d\d\d\d|february \d\d* \d\d\d\d|\d\d* Februari \d\d\d\d|Februari \d\d* \d\d\d\d|\d\d* September \d\d\d\d|September \d\d* \d\d\d\d|\d\d* November \d\d\d\d|November \d\d* \d\d\d\d|\d\d* october \d\d\d\d|october \d\d* \d\d\d\d|\d\d* March \d\d\d\d|March \d\d* \d\d\d\d|\d\d* september \d\d\d\d|september \d\d* \d\d\d\d|\d\d* mei \d\d\d\d|mei \d\d* \d\d\d\d|\d\d* july \d\d\d\d|july \d\d* \d\d\d\d|\d\d* maart \d\d\d\d|maart \d\d* \d\d\d\d|\d\d* October \d\d\d\d|October \d\d* \d\d\d\d|\d\d* juni \d\d\d\d|juni \d\d* \d\d\d\d|\d\d* august \d\d\d\d|august \d\d* \d\d\d\d|\d\d* January \d\d\d\d|January \d\d* \d\d\d\d|\d\d* Oktober \d\d\d\d|Oktober \d\d* \d\d\d\d|\d\d* August \d\d\d\d|August \d\d* \d\d\d\d|\d\d* juli \d\d\d\d|juli \d\d* \d\d\d\d|\d\d* Mei \d\d\d\d|Mei \d\d* \d\d\d\d|\d\d* Juni \d\d\d\d|Juni \d\d* \d\d\d\d|\d\d* June \d\d\d\d|June \d\d* \d\d\d\d|\d\d

In [13]:
distributionDates = []
for document in documents:
    allDates = re.findall(dateExpression, document)
    # Select the first date in the article and assume it is the distribution date.
    try:
        distributionDate = allDates[0]
    except IndexError:
        distributionDate = ""
    distributionDates.append(distributionDate)

print("The first five distribution dates are:\n{}".format(distributionDates[:5]))

The first five distribution dates are:
['20 april 2016', '20 augustus 2011', '', '', '20 april 2016']


In [14]:
def createDate(dateString):
    """A dateString is of the following format:
        dd month yyyy
    Return a Date that has a selectable day, month and year.
    """
    try:
        dateStringSplit = dateString.split(" ")
        return Date(dateStringSplit[0], dateStringSplit[1], dateStringSplit[2])
    except IndexError:
        return Date(-1, -1, -1)

### Creating and saving the aggregated documents

In [15]:
# Create document objects for each document (for easier access to title, body and date).
documents = [Document(titles[documents.index(document)], createDate(distributionDates[documents.index(document)]), bodies[documents.index(document)]) for document in documents]
# Remove documents with an invalid date, these cannot be labelled properly.
documents = [document for document in documents if document.date.isValid()]

In [16]:
# Create a new folder for the per_year data.
perYearRoot = 'per_year'
if not os.path.exists(perYearRoot):
    os.makedirs(perYearRoot)

# Use the documents' dates to create yearly documents containing the title and
# body of each article.
for document in documents:
    with open("{}/{}.txt".format(perYearRoot, document.date.year), "a+") as outputFile:
        outputFile.write(document.title + "\n\n")
        outputFile.write(document.body + "\n\n\n")

We are done with preprocessing.

The `documents` and `per_year` folder now contain documents that can be processed with for example NLTK.

See `NLTK-Example.ipynb`.