### Dataset basic operations

In [None]:
import sys
import os
import zipfile
import json
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed

# Resolve the parent of the current working directory and register it on
# sys.path (only once) so the `src` package imported below can be found.
module_path = os.path.abspath(os.pardir)
if not any(entry == module_path for entry in sys.path):
    sys.path.append(module_path)

from src.articles import process_raw_article

Set up the paths to the data sources

In [None]:
# --- Input archive -----------------------------------------------------------
ARCHIVE_FILENAME = "2025_02_25_wienerzeitung_archiv.zip"
archive_path = os.path.join("..", "data", "archive", ARCHIVE_FILENAME)

# --- Raw articles --------------------------------------------------------------
# ARCHIVE_EXTRACT_DIR is where the ZIP is unpacked; ARTICLES_RAW_DIR is the
# sub-folder below it that is listed and read as the raw article JSON files.
ARCHIVE_EXTRACT_DIR = os.path.join("..", "data", "articles_raw")
ARTICLES_RAW_DIR = os.path.join(
    ARCHIVE_EXTRACT_DIR, "2025_02_25_wienerzeitung_archiv", "content"
)

# --- Cleaned articles ----------------------------------------------------------
ARTICLES_CLEAN_DIR = os.path.join("..", "data", "articles_clean")

Extract the ZIP archive into raw JSON files

In [None]:
# Unpack every member of the ZIP archive below ARCHIVE_EXTRACT_DIR
with zipfile.ZipFile(archive_path, mode="r") as zip_handle:
    zip_handle.extractall(path=ARCHIVE_EXTRACT_DIR)

In [None]:
# Count the directory entries found in the raw articles folder
raw_article_count = len(os.listdir(ARTICLES_RAW_DIR))
print(f"Number of articles in archive: {raw_article_count}")

Clean the raw JSON files, keeping only the key content

In [None]:
def process_article_file(article_file):
    """Clean one raw article and store the result under the same file name.

    Args:
        article_file: File name (not a full path) of a JSON file located in
            ``ARTICLES_RAW_DIR``.

    Returns:
        The ``article_file`` value that was passed in, once the cleaned JSON
        has been written to ``ARTICLES_CLEAN_DIR``.
    """
    source_path = os.path.join(ARTICLES_RAW_DIR, article_file)
    target_path = os.path.join(ARTICLES_CLEAN_DIR, article_file)

    # Load the raw JSON document
    with open(source_path, encoding="utf-8") as raw_handle:
        raw_payload = json.load(raw_handle)

    # Delegate the actual cleaning to the project helper
    cleaned_payload = process_raw_article(raw_payload)

    # Persist the cleaned document; ensure_ascii=False keeps non-ASCII
    # characters as-is in the output instead of \u escapes
    with open(target_path, mode="w", encoding="utf-8") as clean_handle:
        json.dump(cleaned_payload, clean_handle, ensure_ascii=False, indent=4)

    return article_file

# Ensure the output directory exists. exist_ok=True replaces the separate
# existence check, which could race with another process creating the folder.
os.makedirs(ARTICLES_CLEAN_DIR, exist_ok=True)

# List of raw article files. os.listdir returns entries in arbitrary order,
# so sort them to make anything indexing into this list reproducible.
articles_raw = sorted(os.listdir(ARTICLES_RAW_DIR))

# File names whose processing raised, collected for a summary after the run
failed_articles = []

# Use ThreadPoolExecutor for concurrent processing
with ThreadPoolExecutor(max_workers=8) as executor:
    # Map each future back to its file name so failures can be attributed
    futures = {executor.submit(process_article_file, article_file): article_file for article_file in articles_raw}

    for future in tqdm(as_completed(futures), total=len(futures)):
        try:
            # result() re-raises any exception thrown inside the worker
            future.result()
        except Exception as e:
            failed_articles.append(futures[future])
            # tqdm.write prints without corrupting the active progress bar
            tqdm.write(f"Error processing {futures[future]}: {e}")

if failed_articles:
    print(f"{len(failed_articles)} of {len(articles_raw)} articles failed to process")


Example of a raw article

In [None]:
# Showcase: load the first entry of the raw article listing
first_raw_name = articles_raw[0]
raw_article_path = os.path.join(ARTICLES_RAW_DIR, first_raw_name)
with open(raw_article_path, encoding="utf-8") as raw_handle:
    raw_article = json.load(raw_handle)
# Bare expression so the notebook renders the parsed JSON as the cell output
raw_article

Example of a cleaned article

In [None]:
# os.listdir returns entries in arbitrary order; sorting makes the chosen
# sample the same on every run.
articles_clean = sorted(os.listdir(ARTICLES_CLEAN_DIR))

# Keep the file name and the parsed content under separate names so that
# `sample_article` only ever refers to the loaded dictionary.
sample_article_name = articles_clean[0]
sample_article_path = os.path.join(ARTICLES_CLEAN_DIR, sample_article_name)

with open(sample_article_path, "r", encoding="utf-8") as file:
    sample_article = json.load(file)

# Print each top-level field on its own paragraph for readability
for key, value in sample_article.items():
    print(f"{key}: {value} \n")