# Metadata source

NOTE: This notebook is not intended to be re-run by you; it is included for reference only, to document how the metadata was created.

In [1]:
import sys
import os
import pandas as pd
import json
from tqdm import tqdm

# Put the parent directory on the import path (once) so that modules located
# one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

Set up the paths to the data sources

In [30]:
# Both locations live in the "data" folder one level above this notebook.
_data_root = os.path.join("..", "data")
# Folder holding the cleaned article JSON files
ARTICLES_CLEAN_DIR = os.path.join(_data_root, "articles_clean")
# Destination CSV for the assembled metadata table
METADATA_PATH = os.path.join(_data_root, "metadata.csv")

Extract metadata

In [3]:
# Only *.json entries are article records. os.listdir also returns hidden files
# and sub-directories, which would otherwise make json.load / open fail.
articles = [name for name in os.listdir(ARTICLES_CLEAN_DIR) if name.endswith(".json")]

# One dict per article: the JSON content plus the file name it was read from.
article_ls = []
for name in tqdm(articles):
    with open(os.path.join(ARTICLES_CLEAN_DIR, name), "r", encoding="utf-8") as file:
        article = json.load(file)
    # The file name is the key used later to join the per-article topic scores.
    article["filename"] = name
    article_ls.append(article)

# Build the frame once from the list of records.
article_df = pd.DataFrame(article_ls)

In [11]:
# add word count (number of whitespace-separated tokens in the article text);
# a missing/null text counts as 0 words instead of raising on `.split()`
article_df["word_count"] = (
    article_df["text"].str.split().str.len().fillna(0).astype(int)
)

In [12]:
# Preview the assembled article table (one row per loaded article file)
article_df

Unnamed: 0,id,published_at,author,title,category,section,text,filename,word_count
0,948de0b1-b3b7-4c45-b22a-a074d3761cfc,2010-06-09 18:41,Walter Hämmerle,Unerbittliche Unvernunft,Leitartikel,Meinung,In Deutschland gibt es einige. Einer von ihnen...,unerbittliche-unvernunft.json,284
1,d3da5c1b-9f10-4603-a648-d511cee9c14e,2016-02-17 17:50,WZ-Korrespondentin Martyna Czarnowska,Englisches Frühstück,Politik,Nachrichten,"Brüssel. ""Ein ""englisches Frühstück"": Das könn...",englisches-fruhstuck.json,411
2,a23cee8a-a2e3-43b7-b62b-6e5c3c0f2e87,2008-03-26 18:57,Helmut Dité,Gewinne sprudeln im Osten,Wirtschaft,Nachrichten,Im Gegenteil: Nachdem der Gewinn im CEE-Raum 2...,gewinne-sprudeln-im-osten.json,383
3,9fa0d59f-06ed-434f-9a7e-aabeff65fcbf,2015-12-17 18:15,Michael Schmölzer,Starke Ansage - leere Drohung?,Politik,Nachrichten,Wien/Berlin/Brüssel. Im Streit um die EU-weite...,starke-ansage-leere-drohung.json,766
4,dda1a4cf-76b9-41b0-a637-d0b7e12625d9,2022-11-14 09:30,Peter De Coensel,Wirtschaftspolitik als Waffe,Gastkommentare,Meinung,Von einer vorübergehenden Inflation im Zusamme...,wirtschaftspolitik-als-waffe.json,760
...,...,...,...,...,...,...,...,...,...
87749,4ed72709-b2e1-40a0-bb37-7ceb31f61c14,2016-10-11 17:48,Gregor Kucera,Explosives Smartphone-Business,Wirtschaft,Nachrichten,Seoul/San Francisco. Es begann mit einer klein...,explosives-smartphone-business.json,939
87750,824b2248-4ca9-4ab4-978a-b12f3da4b3f0,2021-04-26 06:45,"Michael Ortner, Christian Rösner und Thomas Se...",Geldspritze für Corona-Vakzinforschung?,Wirtschaft,Nachrichten,Schnelle Schlagzeilen zu generieren gehört zum...,geldspritze-fur-corona-vakzinforschung.json,1132
87751,ca712be9-260d-47a4-89eb-4e77af2c4447,2015-09-02 17:04,Tamara Arthofer,"Antiquierte Regeln, schwammig gemacht",Kommentare,Meinung,Gut gemeint ist noch lange nicht gut durchdach...,antiquierte-regeln-schwammig-gemacht.json,249
87752,4a2ea920-3f28-4c41-8e2f-3c6c47d16654,2018-11-28 17:38,Brigitte Pechar,Der Erste muss nicht Erster sein,Politik,Nachrichten,Wien. Im Mai kommenden Jahres finden die EU-Wa...,der-erste-muss-nicht-erster-sein.json,714


In [10]:
import os
import glob
import json

import string
# Hugging Face cache directory (placeholder path - set it to a writable location)
cache_dir="/path/to/my/huggingface/cache"
# --- Configuration ---
# Folder scanned for the per-article '*.json' files that get scored
DATA_DIR = "/export/share/krausef99dm/AIM_delete/data/2025_02_25_wienerzeitung_archiv/content"
OUTPUT_DIR = "/export/share/krausef99dm/AIM_delete/data/2025_02_25_wienerzeitung_archiv/batch_results"   # Folder to store batch output files
BATCH_SIZE = 100               # Number of article files per batch; adjust according to memory constraints
# Maximum number of tokens per (premise, hypothesis) pair passed to the tokenizer
MAX_LENGTH = 512

# Topic labels scored for every article.
# Only the dictionary KEYS are used (they are inserted into the hypothesis
# sentence and become the score column names). The values - lists of related
# terms - are not read anywhere in the visible code.
CANDIDATE_LABELS = {
    'Finanzkrise': "Finanzkrise",
    'Nachhaltigkeit': "Nachhaltigkeit Umweltschutz Ressourcenschonung Zukunftsfähigkeit Sustainability",
    'Fake News': "Fake News Desinformation Falschnachrichten Irreführung",
    'Künstliche Intelligenz': "Künstliche Intelligenz KI Maschinelles Lernen Deep Learning Automatisierung Artificial Intelligence openAI",
    'Digitalisierung': 'Digitalisierung Digitaler Wandel digitale Transformation Technologisierung',
    'lokaler Journalismus': 'lokaler Journalismus Regionaljournalismus Stadtjournalismus kommunaler Journalismus',
    'COVID': "COVID Coronavirus Pandemie SARS-CoV-2 Corona-Pandemie",
    'Demografie': "Demografie Bevölkerungsentwicklung Altersstruktur demographischer Wandel",
    'Innovation': "Innovation Neuerung Erneuerung technologischer Fortschritt"
}

# Make sure the folder for the per-batch result files exists before processing starts
os.makedirs(OUTPUT_DIR, exist_ok=True)

In [None]:
# --- Setup device and model ---
import torch
from transformers import AutoModelForSequenceClassification, AutoTokenizer
from nltk.corpus import stopwords

# Zero-shot topic scoring via a multilingual NLI model:
# https://huggingface.co/joeddav/xlm-roberta-large-xnli
# Fun fact: processing took 4,5h on an A100 GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_name = 'joeddav/xlm-roberta-large-xnli'
# Model and tokenizer share the cache location from the configuration cell
# (`cache_dir`) instead of a hard-coded, machine-specific path.
nli_model = AutoModelForSequenceClassification.from_pretrained(model_name, cache_dir=cache_dir).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name, cache_dir=cache_dir)

In [62]:
# --- Cleaning function ---
# German stop words as a set (fast membership tests); read by clean_text below.
# NOTE(review): presumably requires the NLTK "stopwords" corpus to be downloaded beforehand - confirm.
stop_words = set(stopwords.words("german"))
# Translation table mapping every ASCII punctuation character to a space.
_PUNCTUATION_TO_SPACE = str.maketrans(string.punctuation, " " * len(string.punctuation))


def clean_text(text, max_length=10000, stopword_set=None):
    """Normalise an article text before it is used as an NLI premise.

    Steps: lowercase, replace ASCII punctuation (``string.punctuation``) with
    spaces, split on whitespace, drop stop words, re-join with single spaces
    and cut the result to ``max_length`` characters.

    Args:
        text: Raw article text.
        max_length: Maximum number of characters (not words or tokens) kept.
            The cut may fall inside a word.
        stopword_set: Optional collection of lowercase words to remove. When
            omitted, the module-level ``stop_words`` set is used.

    Returns:
        The cleaned, truncated string (empty if nothing remains).
    """
    if stopword_set is None:
        stopword_set = stop_words
    # str.translate does the per-character replacement in a single pass.
    words = text.lower().translate(_PUNCTUATION_TO_SPACE).split()
    cleaned = " ".join(word for word in words if word not in stopword_set)
    return cleaned[:max_length]

# --- Helper function to process a batch ---
def process_batch(file_list):
    """Score every article in ``file_list`` against all candidate labels.

    Each file is read as JSON; files without a truthy ``text`` or without an
    ``id`` are skipped, and files that cannot be read or parsed are reported
    with ``print`` and skipped. The cleaned text of each remaining article is
    paired with one hypothesis per key of ``CANDIDATE_LABELS`` and all pairs
    are run through the NLI model in a single forward pass.

    Args:
        file_list: Iterable of paths to article JSON files.

    Returns:
        dict mapping the file's base name (e.g. ``"some-article.json"``) to a
        dict ``{label: probability}``. Empty if no file yielded usable text.
    """
    results = {}
    premises = []
    ids = []
    # Load and clean texts
    for file_path in file_list:
        try:
            with open(file_path, 'r', encoding='utf-8') as f:
                data = json.load(f)
            article_id = data.get('id')
            text = data.get('text', '')
            if not text or article_id is None:
                continue
            cleaned = clean_text(text)
        except Exception as e:
            # Best effort: one unreadable file must not abort the whole batch.
            print(f"Error processing {file_path}: {e}")
            continue

        # Results are keyed by file name (not by the article's 'id' field).
        # os.path.basename handles the platform's path separator, unlike
        # splitting on "/".
        ids.append(os.path.basename(file_path))
        premises.append(cleaned)

    if not premises:
        return results

    batch_size = len(premises)
    label_names = list(CANDIDATE_LABELS.keys())

    # The model expects (premise, hypothesis) pairs. Pairs are laid out
    # article-major (all labels of article 0, then article 1, ...), which is
    # what the reshape below relies on.
    input_pairs = [
        (text, f"Dieser Artikel handelt von {label}")
        for text in premises
        for label in label_names
    ]

    # Tokenize in batch; over-long pairs are truncated to MAX_LENGTH tokens
    encodings = tokenizer.batch_encode_plus(
        input_pairs,
        return_tensors='pt',
        truncation=True,
        max_length=MAX_LENGTH,
        padding=True
    )
    encodings = {k: v.to(device) for k, v in encodings.items()}

    # Forward pass (inference only, no gradients)
    with torch.no_grad():
        logits = nli_model(**encodings)[0]

    # We discard the "neutral" logit (dim 1) and softmax over the remaining
    # two columns, taking the probability for "entailment" (dim 2).
    # NOTE(review): column 0 is presumably "contradiction" - confirm against the model card.
    entail_contradiction_logits = logits[:, [0, 2]]
    probs = torch.softmax(entail_contradiction_logits, dim=1)
    # Reshape: each text has len(CANDIDATE_LABELS) predictions
    probs = probs[:, 1].reshape(batch_size, len(label_names)).cpu().tolist()

    # Map probabilities to each article file name and candidate label
    for article_key, article_probs in zip(ids, probs):
        results[article_key] = dict(zip(label_names, article_probs))
    return results

# --- Main processing loop ---
# Indices of batches that raised; reported after the loop.
failed_batches = []
# Get all JSON files. Sorting keeps the batch-index -> file-slice mapping
# stable between runs (as long as the set of files does not change), which the
# skip-if-exists resume logic below depends on.
file_paths = sorted(glob.glob(os.path.join(DATA_DIR, '*.json')))

total_files = len(file_paths)

# Ceiling division: the last batch may contain fewer than BATCH_SIZE files
num_batches = (total_files + BATCH_SIZE - 1) // BATCH_SIZE

# Process each batch
for batch_idx in tqdm(range(num_batches)):
    output_file = os.path.join(OUTPUT_DIR, f"batch_{batch_idx}.json")
    # Skip batch if already processed (allows resuming an interrupted run)
    if os.path.exists(output_file):
        print(f"Skipping batch {batch_idx} (already processed).")
        continue

    start = batch_idx * BATCH_SIZE
    end = min(start + BATCH_SIZE, total_files)
    batch_files = file_paths[start:end]
    print(f"Processing batch {batch_idx} (files {start} to {end-1})...")

    # Results are written to a temporary sibling file and renamed into place
    # only after the dump succeeded. A run interrupted mid-write therefore
    # never leaves a truncated batch_N.json that would be skipped on resume.
    tmp_file = output_file + ".tmp"
    try:
        batch_results = process_batch(batch_files)
        # Save intermediate results
        with open(tmp_file, 'w', encoding='utf-8') as f_out:
            json.dump(batch_results, f_out, indent=2)
        os.replace(tmp_file, output_file)
    except Exception as e:
        # Record the failure and carry on with the next batch
        print(f"Error in batch {batch_idx}: {e}")
        failed_batches.append(batch_idx)
        continue

if failed_batches:
    print("Failed batches:", failed_batches)

Aggregation

In [None]:
import json
import glob

# Glob pattern selecting every per-batch result file
# NOTE(review): presumably a local copy of the batch output folder - confirm.
folder_path = "../data/batch_results/*.json"

# Combined scores of all batches. Each batch file holds a dictionary with one
# entry per article (key -> {label: score}); merging them yields one flat
# dictionary covering all articles.
aggregated_data = {}
for batch_path in glob.glob(folder_path):
    with open(batch_path, "r", encoding="utf-8") as handle:
        aggregated_data.update(json.load(handle))

# One row per article key, one column per label, scores rounded to 4 decimals
df = pd.DataFrame.from_dict(aggregated_data, orient="index").round(4)

In [None]:
# Attach the topic scores to the article metadata.
# The full text is dropped first; the scores are looked up by file name
# (the index of `df`). A left join keeps every article, scored or not.
article_metadata = article_df.drop(columns=["text"])
metadata_df = pd.merge(
    article_metadata,
    df,
    how="left",
    left_on="filename",
    right_index=True,
)

In [44]:
# Preview the article metadata joined with the per-label scores
metadata_df

Unnamed: 0,id,published_at,author,title,category,section,filename,Finanzkrise,Nachhaltigkeit,Fake News,Künstliche Intelligenz,Digitalisierung,lokaler Journalismus,COVID,Demografie,Innovation
0,948de0b1-b3b7-4c45-b22a-a074d3761cfc,2010-06-09 18:41,Walter Hämmerle,Unerbittliche Unvernunft,Leitartikel,Meinung,unerbittliche-unvernunft.json,0.3870,0.4209,0.3082,0.3251,0.3121,0.3207,0.3231,0.3609,0.3064
1,d3da5c1b-9f10-4603-a648-d511cee9c14e,2016-02-17 17:50,WZ-Korrespondentin Martyna Czarnowska,Englisches Frühstück,Politik,Nachrichten,englisches-fruhstuck.json,0.0158,0.2556,0.3991,0.3030,0.0306,0.1843,0.4453,0.1941,0.1053
2,a23cee8a-a2e3-43b7-b62b-6e5c3c0f2e87,2008-03-26 18:57,Helmut Dité,Gewinne sprudeln im Osten,Wirtschaft,Nachrichten,gewinne-sprudeln-im-osten.json,0.4757,0.3447,0.2938,0.2853,0.2373,0.2838,0.3346,0.3240,0.3013
3,9fa0d59f-06ed-434f-9a7e-aabeff65fcbf,2015-12-17 18:15,Michael Schmölzer,Starke Ansage - leere Drohung?,Politik,Nachrichten,starke-ansage-leere-drohung.json,0.4811,0.5247,0.4703,0.4721,0.5028,0.4828,0.5063,0.5080,0.4979
4,dda1a4cf-76b9-41b0-a637-d0b7e12625d9,2022-11-14 09:30,Peter De Coensel,Wirtschaftspolitik als Waffe,Gastkommentare,Meinung,wirtschaftspolitik-als-waffe.json,0.2336,0.2710,0.1838,0.2259,0.1745,0.1517,0.2287,0.2430,0.1161
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
195,5abf187f-bcfb-46cf-9c1e-eb45bf6ef65b,2012-10-11 16:44,Alexandra Grass,Die Zukunft heißt Open Access,Wissen,Nachrichten,die-zukunft-heisst-open-access.json,0.4628,0.5383,0.5284,0.5539,0.5406,0.4309,0.5533,0.5492,0.5467
196,912e57bb-67fb-4656-a56b-e6841780f268,2009-01-14 19:28,WZ-Korrespondent Wolfgang Tucek,"""Wettlauf um die größte Lüge""",Europaarchiv,Archiv,wettlauf-um-die-grosste-luge.json,0.1397,0.5269,0.5147,0.2606,0.3049,0.4183,0.4946,0.1543,0.4410
197,13915d2a-9f46-404b-aeca-c760943e9e2b,2002-08-26 00:00,Christine Zeiner,Harry Potter ist schon wieder out,Wirtschaft,Nachrichten,harry-potter-ist-schon-wieder-out.json,0.2424,0.2684,0.2550,0.2872,0.2539,0.2783,0.2718,0.2823,0.2725
198,bb2d220c-0d59-41b2-92e8-ccf8cb0c9c0f,2006-06-16 00:00,WZ-Korrespondent Wolfgang Tucek,Endspurt für den Ratsvorsitz,Europaarchiv,Archiv,endspurt-fur-den-ratsvorsitz.json,0.1647,0.1621,0.1464,0.1581,0.1143,0.1495,0.1807,0.1299,0.1417


Check tags

In [14]:
# Estimating if values are valid, or model was triggered by some other reason and all categories are high

def detect_valid(row, range_threshold=0.1, certainty_threshold=0.6, labels=None, saturation_count=4):
    """Decide whether a row's label scores look like a meaningful detection.

    A row is considered valid only if all of the following hold:
      * the scores are not (nearly) uniform, i.e. at least one score deviates
        from the row mean by ``range_threshold`` or more;
      * at least one score is above ``certainty_threshold``;
      * fewer than ``saturation_count`` scores are above
        ``certainty_threshold`` (many high scores at once suggest the model
        was triggered by something other than the topics).

    Rows whose scores are all NaN fail every comparison and are reported as
    invalid.

    Args:
        row: pandas Series (e.g. one DataFrame row) containing the score columns.
        range_threshold: Maximum deviation from the mean for a row to count as uniform.
        certainty_threshold: Score above which a label counts as confidently detected.
        labels: Names of the score columns to inspect. Defaults to the keys of
            the module-level ``CANDIDATE_LABELS``.
        saturation_count: Number of confidently detected labels from which on
            the row is rejected.

    Returns:
        bool: True if the row passes all checks, otherwise False.
    """
    if labels is None:
        labels = CANDIDATE_LABELS.keys()
    probs = row[list(labels)].tolist()
    if not probs:
        # Nothing to judge (and avoids dividing by zero below).
        return False
    avg = sum(probs) / len(probs)

    # Check uniformity: every score within range_threshold of the mean
    if all(abs(p - avg) < range_threshold for p in probs):
        return False

    # Count the confidently detected labels once; valid means at least one
    # but fewer than saturation_count of them.
    n_certain = sum(p > certainty_threshold for p in probs)
    return bool(0 < n_certain < saturation_count)

In [15]:
# Flag every row (article) with the outcome of detect_valid
metadata_df["valid_indicator"] = metadata_df.apply(detect_valid, axis="columns")

In [61]:
# extract only rows where valid_indicator is True
metadata_df[metadata_df["valid_indicator"]]
# 111 out of 1k -> around 10% valid detected

Unnamed: 0,id,published_at,author,title,category,section,filename,Finanzkrise,Nachhaltigkeit,Fake News,Künstliche Intelligenz,Digitalisierung,lokaler Journalismus,COVID,Demografie,Innovation,valid_indicator
13,71074161-4c49-48c4-90e7-bb54a5ff64d9,2009-05-26 16:50,Stephanie Dirnbacher,Einfachere Teilnahme an Hauptversammlungen,Wirtschaft,Nachrichten,einfachere-teilnahme-an-hauptversammlungen.json,0.5162,0.3391,0.3757,0.3756,0.5980,0.2286,0.4004,0.3723,0.7886,True
29,f4b32c2a-7901-4914-b046-1f4351278e9a,2020-12-18 09:33,,Kogler übernimmt Karenzvertretung für Zadic,Politik,Nachrichten,kogler-ubernimmt-karenzvertretung-fur-zadic.json,0.0197,0.2611,0.2416,0.3610,0.9844,0.1648,0.4519,0.2147,0.8954,True
40,e28da953-cd6e-44f2-8590-0018eee95b4c,2020-11-23 09:04,,Nationalrat muss am Donnerstag fürs Budget nac...,Politik,Nachrichten,nationalrat-muss-am-donnerstag-furs-budget-nac...,0.7802,0.2539,0.3318,0.3408,0.1902,0.3427,0.4710,0.1811,0.2701,True
59,fd343f2b-736d-4c86-920d-35fc4e3b4a2e,2011-05-10 18:31,Reinhard Göweil,Die Götter des Geldes,Leitartikel,Meinung,die-gotter-des-geldes.json,0.8968,0.6328,0.4778,0.5007,0.3510,0.3519,0.5301,0.5913,0.2105,True
66,d879acdd-83d8-4095-b067-8c1bc0b7c101,2017-07-20 17:07,Eva Stanzl,Ausbildung senkt Demenz-Risiko,Wissen,Nachrichten,ausbildung-senkt-demenz-risiko.json,0.3650,0.5477,0.5038,0.4485,0.4430,0.5213,0.5388,0.6145,0.5106,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
951,e4cc5c58-d1e1-4807-8743-d062fc5457b9,2004-04-05 00:00,Barbara Ottawa,Differenzen als Anregung,Wissen,Nachrichten,differenzen-als-anregung.json,0.3582,0.3442,0.3813,0.3925,0.2495,0.0269,0.2810,0.4806,0.7525,True
959,eb5ed91b-5b70-4d4b-8b6e-96a208cbddbb,2008-07-04 11:16,WZ Online,Biosprit macht Lebensmittel unerschwinglich,Wirtschaft,Nachrichten,biosprit-macht-lebensmittel-unerschwinglich.json,0.3969,0.9046,0.1630,0.1679,0.0457,0.0057,0.1699,0.4397,0.0376,True
961,9a54296f-ad17-4f7e-9781-41104902a37b,2002-01-21 00:00,Veronika Gasser,Mit Wasser Geld verdienen,Wirtschaft,Nachrichten,mit-wasser-geld-verdienen.json,0.1022,0.6996,0.3574,0.2303,0.0325,0.4152,0.1418,0.2225,0.3491,True
964,623dffad-f2cd-4907-bbd3-9a7c8e0ccc89,2022-11-15 16:12,Christina Böck,Vom Spektakel zur Fußnote,Kommentare,Meinung,vom-spektakel-zur-fussnote.json,0.0115,0.9575,0.1502,0.2521,0.0077,0.0207,0.1477,0.0811,0.0478,True


In [17]:
# rename the (German) label columns to english and use python naming convention.
# An explicit old -> new mapping is used instead of assigning a positional list
# to `.columns`: the positional form silently mislabels columns when this cell
# is re-run (the reorder below changes the column order) or when the column
# order differs for any other reason. Names not present are simply left alone,
# so re-running this cell is harmless.
metadata_df = metadata_df.rename(columns={
    'Finanzkrise': 'financial_crisis',
    'Nachhaltigkeit': 'sustainability',
    'Fake News': 'fake_news',
    'Künstliche Intelligenz': 'ai',
    'Digitalisierung': 'digitalization',
    'lokaler Journalismus': 'local_journalism',
    'COVID': 'covid',
    'Demografie': 'demographics',
    'Innovation': 'innovation',
})

# restructure columns so that filename comes right after id
# (raises KeyError if an expected column, e.g. word_count, is missing)
metadata_df = metadata_df[['id', 'filename', 'published_at', 'author', 'title', 'category', 'section', 'word_count',
       'financial_crisis', 'sustainability', 'fake_news',
       'ai', 'digitalization', 'local_journalism',
       'covid', 'demographics', 'innovation', 'valid_indicator']]

In [18]:
# Show the first two rows to check the renamed and reordered columns
metadata_df.head(2)

Unnamed: 0,id,filename,published_at,author,title,category,section,word_count,financial_crisis,sustainability,fake_news,ai,digitalization,local_journalism,covid,demographics,innovation,valid_indicator
0,948de0b1-b3b7-4c45-b22a-a074d3761cfc,unerbittliche-unvernunft.json,2010-06-09 18:41,Walter Hämmerle,Unerbittliche Unvernunft,Leitartikel,Meinung,284,0.387,0.4209,0.3082,0.3251,0.3121,0.3207,0.3231,0.3609,0.3064,False
1,d3da5c1b-9f10-4603-a648-d511cee9c14e,englisches-fruhstuck.json,2016-02-17 17:50,WZ-Korrespondentin Martyna Czarnowska,Englisches Frühstück,Politik,Nachrichten,411,0.0158,0.2556,0.3991,0.303,0.0306,0.1843,0.4453,0.1941,0.1053,False


In [None]:
# Write the metadata table to CSV (without the index column) unless a file is
# already present at METADATA_PATH - an existing file is never overwritten.
if not os.path.exists(METADATA_PATH):
    metadata_df.to_csv(METADATA_PATH, index=False)
else:
    print("Metadata file already exists.")