### Filtering the dataset

In [None]:
import sys
import os
import pandas as pd
import json
from dotenv import load_dotenv
from datetime import datetime

# Put the parent directory on the import path (only once) so that modules
# located one level above this notebook can be imported.
module_path = os.path.abspath(os.pardir)
if module_path not in sys.path:
    sys.path.append(module_path)

# A falsy result from load_dotenv() is treated as a failed .env load.
env_loaded = load_dotenv()
if not env_loaded:
    raise Exception('Error loading .env file. Make sure to place a valid OPEN_AI_KEY in the .env file.')

In [None]:
# Both inputs live in the data directory one level above this notebook.
DATA_DIR = os.path.join("..", "data")
# CSV file with one metadata row per article.
METADATA_PATH = os.path.join(DATA_DIR, "metadata.csv")
# Directory holding the cleaned article files.
ARTICLES_CLEAN_DIR = os.path.join(DATA_DIR, "articles_clean")

Available metadata fields

In [None]:
# Read the metadata table and convert "published_at" to datetimes so it can
# be compared against dates when filtering.
metadata = pd.read_csv(METADATA_PATH).assign(
    published_at=lambda frame: pd.to_datetime(frame["published_at"])
)
# Preview the first rows to see which fields are available.
metadata.head()

Filter the metadata DataFrame according to your needs (edit or remove the conditions below)

In [None]:
# Keep only the rows that satisfy every condition below. Edit or remove
# individual conditions to change the selection.
filtered_metadata = metadata[
    # Authors
    (metadata["author"].isin(["Christine Zeiner", "Silke Farmer"])) &
    # Date
    (metadata["published_at"] >= "2000-01-01") &
    # Word count
    (metadata["words_count"] >= 100) &
    # Tags (na=False: rows whose tags value is missing count as "no match"
    # explicitly instead of relying on implicit NaN coercion in the mask)
    (metadata["tags"].str.contains("demographics", na=False))
]

Get filtered articles

In [None]:
def filter_articles(filtered_metadata: pd.DataFrame, articles_dir: str) -> list:
    """
    Load the JSON article file referenced by each row of the filtered metadata.

    Args:
        filtered_metadata: DataFrame with a "filename" column naming one
            article file per row.
        articles_dir: Directory that contains the article files.

    Returns:
        A list with the parsed JSON content of each article, in the same
        order as the rows of ``filtered_metadata``. Empty if there are no rows.

    Raises:
        KeyError: If ``filtered_metadata`` has no "filename" column.
        OSError: If an article file cannot be opened (e.g. it does not exist).
        json.JSONDecodeError: If an article file is not valid JSON.
    """
    articles = []
    # Only the "filename" column is needed, so iterate over it directly
    # instead of building a full row Series per record with iterrows().
    for filename in filtered_metadata["filename"]:
        article_path = os.path.join(articles_dir, filename)
        with open(article_path, "r", encoding="utf-8") as file:
            articles.append(json.load(file))
    return articles

In [None]:
# Load the article files selected by the metadata filter.
filtered_articles = filter_articles(filtered_metadata, ARTICLES_CLEAN_DIR)
print(f"Number of articles: {len(filtered_articles)}\n")
# Indexing position 0 raises IndexError when the filter matched nothing,
# so only show a sample when at least one article was loaded.
if filtered_articles:
    print(f"Sample article metadata:\n {filtered_metadata.iloc[0]}\n")
    print(f"Sample article:\n {filtered_articles[0]}")
else:
    print("No articles match the current filter.")