# Filtering the dataset

Create a small subset of the full dataset.

In [1]:
import sys
import os
import pandas as pd
import json

# Make the parent of the current working directory importable
module_path = os.path.abspath("..")
if module_path not in sys.path:
    sys.path.append(module_path)

In [44]:
# Every input and output file lives under one data directory; change DATA_DIR
# to point the whole notebook at a different location.
DATA_DIR = os.path.join("..", "data")

# Input: metadata CSV
METADATA_PATH = os.path.join(DATA_DIR, "metadata.csv")
# Input: directory holding the article JSON files
ARTICLES_CLEAN_DIR = os.path.join(DATA_DIR, "articles_clean")

# Output: CSV the filtered metadata is stored to
FILTERED_METADATA_PATH = os.path.join(DATA_DIR, "filtered_metadata.csv")

Available metadata fields

In [3]:
# Read the metadata CSV and convert the publication timestamps to datetimes
metadata = pd.read_csv(METADATA_PATH).assign(
    published_at=lambda frame: pd.to_datetime(frame["published_at"])
)
metadata.head()

Unnamed: 0,id,filename,published_at,author,title,category,section,word_count,financial_crisis,sustainability,fake_news,ai,digitalization,local_journalism,covid,demographics,innovation,valid_indicator
0,948de0b1-b3b7-4c45-b22a-a074d3761cfc,unerbittliche-unvernunft.json,2010-06-09 18:41:00,Walter Hämmerle,Unerbittliche Unvernunft,Leitartikel,Meinung,284,0.387,0.4209,0.3082,0.3251,0.3121,0.3207,0.3231,0.3609,0.3064,False
1,d3da5c1b-9f10-4603-a648-d511cee9c14e,englisches-fruhstuck.json,2016-02-17 17:50:00,WZ-Korrespondentin Martyna Czarnowska,Englisches Frühstück,Politik,Nachrichten,411,0.0158,0.2556,0.3991,0.303,0.0306,0.1843,0.4453,0.1941,0.1053,False
2,a23cee8a-a2e3-43b7-b62b-6e5c3c0f2e87,gewinne-sprudeln-im-osten.json,2008-03-26 18:57:00,Helmut Dité,Gewinne sprudeln im Osten,Wirtschaft,Nachrichten,383,0.4757,0.3447,0.2938,0.2853,0.2373,0.2838,0.3346,0.324,0.3013,False
3,9fa0d59f-06ed-434f-9a7e-aabeff65fcbf,starke-ansage-leere-drohung.json,2015-12-17 18:15:00,Michael Schmölzer,Starke Ansage - leere Drohung?,Politik,Nachrichten,766,0.4811,0.5247,0.4703,0.4721,0.5028,0.4828,0.5063,0.508,0.4979,False
4,dda1a4cf-76b9-41b0-a637-d0b7e12625d9,wirtschaftspolitik-als-waffe.json,2022-11-14 09:30:00,Peter De Coensel,Wirtschaftspolitik als Waffe,Gastkommentare,Meinung,760,0.2336,0.271,0.1838,0.2259,0.1745,0.1517,0.2287,0.243,0.1161,False


Analysis

In [7]:
# Summary of the category column: non-null count, number of distinct values,
# most frequent value and how often it occurs
metadata["category"].describe()

count       87754
unique         47
top       Politik
freq        31919
Name: category, dtype: object

In [8]:
# Number of rows per category
metadata["category"].value_counts()  # 47 different categories

category
Politik                                  31919
Wirtschaft                               16512
Kommentare                               11408
Gastkommentare                            7253
Wissen                                    5632
Europaarchiv                              5341
Leitartikel                               3292
Analysen                                  2478
Reflexionen                               2308
Recht                                      652
Auf Justitias Spuren                       196
Leserforum                                 170
Klimawandel                                 52
Wiener Zeitung - seit 1703                  47
Sterbehilfe                                 44
1914                                        39
100 Jahre Republik                          38
Stadtentwicklung                            28
Wald                                        27
100 Jahre Verfassung                        26
EU für mich                                 26
Asyl

In [9]:
# Number of rows per section
metadata["section"].value_counts()

section
Nachrichten    56371
Meinung        22123
Archiv          7931
Themen           961
Dossiers         368
Name: count, dtype: int64

In [10]:
# Summary statistics (count, mean, std, quartiles, min/max) of the word_count column
metadata["word_count"].describe()

count    87754.000000
mean       520.058379
std        357.316632
min          0.000000
25%        286.000000
50%        428.000000
75%        642.000000
max       5995.000000
Name: word_count, dtype: float64

In [11]:
# Missing data: number of rows whose author is null
metadata["author"].isnull().sum()

1449

Filter the metadata DataFrame according to your needs

Hint: Keep the dataset small enough to process comfortably (experiment a little to get a feeling for what your machine can handle).

In [46]:
# Keep only the rows that satisfy every active condition below.
# Optional filters are commented out; uncomment a line to enable it (each one
# ends with `&` so it chains onto the conditions that follow).
filtered_metadata = metadata[
    # Date: from 2007-01-01 (inclusive) up to 2012-01-01 (exclusive)
    (metadata["published_at"] >= "2007-01-01") &
    (metadata["published_at"] < "2012-01-01") &

    # Authors
    # (metadata["author"].isin(["Christine Zeiner", "Silke Farmer"])) &

    # Category (filters on the "category" column, not "section")
    # (metadata["category"].isin(["Politik", "Wirtschaft"])) &  # ('Politik', 'Wirtschaft', 'Kommentare', 'Gastkommentare', 'Wissen', see above for all categories)

    # Section
    # (metadata["section"].isin(["Nachrichten", "Meinung"])) &  # ('Meinung', 'Nachrichten', 'Archiv', 'Themen', 'Dossiers')

    # Word count
    # (metadata["word_count"] >= 100) &

    # Tags
    (metadata["valid_indicator"]) &  # estimate of whether the probabilities assigned to the tags are actually valid; recommended, but it reduces the dataset size
    (metadata["financial_crisis"] > 0.8) # available tag columns: ('financial_crisis', 'sustainability', 'fake_news', 'ai', 'digitalization', 'local_journalism', 'covid', 'demographics', 'innovation')
]
# Number of rows that passed the filter
print(f"Expected number of articles: {filtered_metadata.shape[0]}")

Expected number of articles: 423


Get filtered articles

In [29]:
def filter_articles(filtered_metadata: pd.DataFrame, articles_dir: str) -> dict:
    """
    Load the JSON article file for every row of the filtered metadata.

    Args:
        filtered_metadata: DataFrame with a "filename" column holding the file
            name of each article's JSON file.
        articles_dir: Directory that contains the article JSON files.

    Returns:
        Dict mapping each filename to the parsed JSON content of that file, in
        metadata row order. A filename that occurs in several rows keeps a
        single entry.

    Raises:
        FileNotFoundError: If a listed file does not exist in ``articles_dir``.
        json.JSONDecodeError: If a file does not contain valid JSON.
    """
    articles = {}
    # Only the "filename" column is needed, so iterate over it directly instead
    # of materialising a full Series per row with iterrows().
    for filename in filtered_metadata["filename"]:
        article_path = os.path.join(articles_dir, filename)
        with open(article_path, "r", encoding="utf-8") as file:
            articles[filename] = json.load(file)
    return articles

In [None]:
# Load the JSON file of every article listed in the filtered metadata
filtered_articles = filter_articles(filtered_metadata, ARTICLES_CLEAN_DIR)

In [47]:
# Check one article from the filtered sample (change sample_id to inspect another row)
sample_id = 0

# Look the article up by the filename of the selected metadata row, so the
# metadata and the article printed below always belong to the same article.
# Indexing the dict keys by position only lines up with the row position when
# every filename is unique.
sample_metadata = filtered_metadata.iloc[sample_id]
sample_article = filtered_articles[sample_metadata["filename"]]

print(f"Number of articles: {len(filtered_articles)}\n")
print(f"Sample article metadata:\n {sample_metadata}\n")
print(f"Sample article:\n {sample_article}")

Number of articles: 545

Sample article metadata:
 id                  fd343f2b-736d-4c86-920d-35fc4e3b4a2e
filename                      die-gotter-des-geldes.json
published_at                         2011-05-10 18:31:00
author                                   Reinhard Göweil
title                              Die Götter des Geldes
category                                     Leitartikel
section                                          Meinung
word_count                                           326
financial_crisis                                  0.8968
sustainability                                    0.6328
fake_news                                         0.4778
ai                                                0.5007
digitalization                                     0.351
local_journalism                                  0.3519
covid                                             0.5301
demographics                                      0.5913
innovation                           

In [49]:
# Store the filtered metadata to csv; ask first because an existing file is overwritten
answer = input("Do you want to store (overwrite) the filtered metadata to csv? (y/n): ")
# Normalise the answer so "Y" or " y " also confirm; anything else skips the write
if answer.strip().lower() == "y":
    filtered_metadata.to_csv(FILTERED_METADATA_PATH, index=False)